113 files changed, 4408 insertions, 1632 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm new file mode 100644 index 000000000000..73e653ee2481 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm | |||
| @@ -0,0 +1,52 @@ | |||
| 1 | What: /sys/kernel/mm/ksm | ||
| 2 | Date: September 2009 | ||
| 3 | KernelVersion: 2.6.32 | ||
| 4 | Contact: Linux memory management mailing list <linux-mm@kvack.org> | ||
| 5 | Description: Interface for Kernel Samepage Merging (KSM) | ||
| 6 | |||
| 7 | What: /sys/kernel/mm/ksm/full_scans | ||
| 8 | What: /sys/kernel/mm/ksm/pages_shared | ||
| 9 | What: /sys/kernel/mm/ksm/pages_sharing | ||
| 10 | What: /sys/kernel/mm/ksm/pages_to_scan | ||
| 11 | What: /sys/kernel/mm/ksm/pages_unshared | ||
| 12 | What: /sys/kernel/mm/ksm/pages_volatile | ||
| 13 | What: /sys/kernel/mm/ksm/run | ||
| 14 | What: /sys/kernel/mm/ksm/sleep_millisecs | ||
| 15 | Date: September 2009 | ||
| 16 | Contact: Linux memory management mailing list <linux-mm@kvack.org> | ||
| 17 | Description: Kernel Samepage Merging daemon sysfs interface | ||
| 18 | |||
| 19 | full_scans: how many times all mergeable areas have been | ||
| 20 | scanned. | ||
| 21 | |||
| 22 | pages_shared: how many shared pages are being used. | ||
| 23 | |||
| 24 | pages_sharing: how many more sites are sharing them, i.e. how | ||
| 25 | much is saved. | ||
| 26 | |||
| 27 | pages_to_scan: how many present pages to scan before ksmd goes | ||
| 28 | to sleep. | ||
| 29 | |||
| 30 | pages_unshared: how many pages are unique but repeatedly checked | ||
| 31 | for merging. | ||
| 32 | |||
| 33 | pages_volatile: how many pages are changing too fast to be placed | ||
| 34 | in a tree. | ||
| 35 | |||
| 36 | run: write 0 to disable ksm, read 0 while ksm is disabled. | ||
| 37 | write 1 to run ksm, read 1 while ksm is running. | ||
| 38 | write 2 to disable ksm and unmerge all its pages. | ||
| 39 | |||
| 40 | sleep_millisecs: how many milliseconds ksm should sleep between | ||
| 41 | scans. | ||
| 42 | |||
| 43 | See Documentation/vm/ksm.txt for more information. | ||
| 44 | |||
| 45 | What: /sys/kernel/mm/ksm/merge_across_nodes | ||
| 46 | Date: January 2013 | ||
| 47 | KernelVersion: 3.9 | ||
| 48 | Contact: Linux memory management mailing list <linux-mm@kvack.org> | ||
| 49 | Description: Control merging pages across different NUMA nodes. | ||
| 50 | |||
| 51 | When it is set to 0, only pages from the same node are merged; | ||
| 52 | otherwise pages from all nodes can be merged together (default). | ||
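The ABI text above implies a fixed sequence for changing merge_across_nodes: stop and unmerge with run=2, change the knob, then restart merging with run=1. The following userspace C sketch (not part of the patch, minimal error handling, root privileges assumed) illustrates that sequence against the documented sysfs paths.

/* Hedged illustration: change merge_across_nodes per the ABI described above. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_knob(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0) {
		perror(path);
		if (fd >= 0)
			close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	/* run=2: stop ksmd and unmerge all currently merged pages. */
	if (write_knob("/sys/kernel/mm/ksm/run", "2"))
		return 1;
	/* Only now may the knob be changed (0 = merge within one node only). */
	if (write_knob("/sys/kernel/mm/ksm/merge_across_nodes", "0"))
		return 1;
	/* run=1: restart ksmd so pages are remerged under the new policy. */
	return write_knob("/sys/kernel/mm/ksm/run", "1") ? 1 : 0;
}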
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9aa8ff3e54dc..766087781ecd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
| @@ -1640,6 +1640,42 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
| 1640 | that the amount of memory usable for all allocations | 1640 | that the amount of memory usable for all allocations |
| 1641 | is not too small. | 1641 | is not too small. |
| 1642 | 1642 | ||
| 1643 | movablemem_map=acpi | ||
| 1644 | [KNL,X86,IA-64,PPC] This parameter is similar to | ||
| 1645 | memmap except it specifies the memory map of | ||
| 1646 | ZONE_MOVABLE. | ||
| 1647 | This option informs the kernel to use the Hot Pluggable | ||
| 1648 | bit in the SRAT flags provided by the ACPI BIOS to | ||
| 1649 | determine which memory devices can be hotplugged. The | ||
| 1650 | corresponding memory ranges will be set as ZONE_MOVABLE. | ||
| 1651 | NOTE: Whatever node the kernel resides in will always | ||
| 1652 | be un-hotpluggable. | ||
| 1653 | |||
| 1654 | movablemem_map=nn[KMG]@ss[KMG] | ||
| 1655 | [KNL,X86,IA-64,PPC] This parameter is similar to | ||
| 1656 | memmap except it specifies the memory map of | ||
| 1657 | ZONE_MOVABLE. | ||
| 1658 | If the user specifies memory ranges, the information in | ||
| 1659 | the SRAT will be ignored, and the option works as follows: | ||
| 1660 | - If several ranges all fall within one node, everything | ||
| 1661 | from the lowest ss to the end of that node will be | ||
| 1662 | ZONE_MOVABLE. | ||
| 1663 | - If a range lies within a node, everything from ss to | ||
| 1664 | the end of that node will be ZONE_MOVABLE. | ||
| 1665 | - If a range covers two or more nodes, everything from ss | ||
| 1666 | to the end of the first node will be ZONE_MOVABLE, and | ||
| 1667 | all of the remaining nodes will contain only ZONE_MOVABLE. | ||
| 1668 | If memmap is specified at the same time, movablemem_map | ||
| 1669 | will be limited to the memmap areas. If kernelcore or | ||
| 1670 | movablecore is also specified, movablemem_map takes | ||
| 1671 | priority. The administrator should therefore take care | ||
| 1672 | that the total movablemem_map area is not too large; | ||
| 1673 | otherwise the kernel will not have enough memory to boot. | ||
| 1674 | NOTE: Users are not prevented from marking the node the | ||
| 1675 | kernel resides in as hotpluggable, so that this | ||
| 1676 | option can be used as a workaround for firmware | ||
| 1677 | bugs. | ||
| 1678 | |||
| 1643 | MTD_Partition= [MTD] | 1679 | MTD_Partition= [MTD] |
| 1644 | Format: <name>,<region-number>,<size>,<offset> | 1680 | Format: <name>,<region-number>,<size>,<offset> |
| 1645 | 1681 | ||
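For illustration only (this is not kernel code and the helper name parse_size_suffix is hypothetical), the nn[KMG]@ss[KMG] syntax described above can be decoded with a small suffix-aware parser; the kernel's own command-line parsing uses its internal helpers instead.

/* Hedged sketch: decode "nn[KMG]@ss[KMG]" as used by movablemem_map=. */
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical helper: parse a size with an optional K/M/G suffix. */
static unsigned long long parse_size_suffix(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'G': case 'g': v <<= 10;	/* fall through */
	case 'M': case 'm': v <<= 10;	/* fall through */
	case 'K': case 'k': v <<= 10; (*end)++; break;
	}
	return v;
}

int main(void)
{
	const char *arg = "4G@8G";	/* example: 4 GiB movable starting at 8 GiB */
	char *p;
	unsigned long long size = parse_size_suffix(arg, &p);
	unsigned long long start = (*p == '@') ? parse_size_suffix(p + 1, &p) : 0;

	printf("ZONE_MOVABLE candidate: [%#llx, %#llx)\n", start, start + size);
	return 0;
}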
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index b392e496f816..f34a8ee6f860 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt | |||
| @@ -58,6 +58,21 @@ sleep_millisecs - how many milliseconds ksmd should sleep before next scan | |||
| 58 | e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs" | 58 | e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs" |
| 59 | Default: 20 (chosen for demonstration purposes) | 59 | Default: 20 (chosen for demonstration purposes) |
| 60 | 60 | ||
| 61 | merge_across_nodes - specifies whether pages from different NUMA nodes can be merged. | ||
| 62 | When set to 0, ksm merges only pages which physically | ||
| 63 | reside in the memory area of the same NUMA node. That | ||
| 64 | brings lower latency when accessing shared pages. Systems | ||
| 65 | with more nodes, at significant NUMA distances, are likely | ||
| 66 | to benefit from the lower latency of setting 0. Smaller | ||
| 67 | systems, which need to minimize memory usage, are likely to | ||
| 68 | benefit from the greater sharing of setting 1 (default). | ||
| 69 | You may wish to compare how your system performs under each | ||
| 70 | setting before deciding which to use. The merge_across_nodes | ||
| 71 | setting can be changed only when there are no ksm shared | ||
| 72 | pages in the system: set run to 2 to unmerge pages first, | ||
| 73 | then set it to 1 after changing merge_across_nodes, to remerge according to the new setting. | ||
| 74 | Default: 1 (merging across nodes as in earlier releases) | ||
| 75 | |||
| 61 | run - set 0 to stop ksmd from running but keep merged pages, | 76 | run - set 0 to stop ksmd from running but keep merged pages, |
| 62 | set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run", | 77 | set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run", |
| 63 | set 2 to stop ksmd and unmerge all pages currently merged, | 78 | set 2 to stop ksmd and unmerge all pages currently merged, |
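For context, the pages that ksmd merges are only those an application has opted in with madvise(MADV_MERGEABLE); the knobs documented above then control how ksmd treats them. A minimal example (the 64 MiB mapping size is arbitrary):

/* Hedged example: mark an anonymous mapping as mergeable by ksmd. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 64 << 20;	/* 64 MiB, arbitrary size for the example */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0, len);	/* identical content: a good merge candidate */
	if (madvise(p, len, MADV_MERGEABLE)) {
		perror("madvise(MADV_MERGEABLE)");
		return 1;
	}
	/* With run=1, ksmd will scan and merge these pages over time. */
	pause();
	return 0;
}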
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f4dd585898c5..224b44ab534e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c | |||
| @@ -434,4 +434,7 @@ int __meminit vmemmap_populate(struct page *start_page, | |||
| 434 | return 0; | 434 | return 0; |
| 435 | } | 435 | } |
| 436 | #endif /* CONFIG_ARM64_64K_PAGES */ | 436 | #endif /* CONFIG_ARM64_64K_PAGES */ |
| 437 | void vmemmap_free(struct page *memmap, unsigned long nr_pages) | ||
| 438 | { | ||
| 439 | } | ||
| 437 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | 440 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ |
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 1516d1dc11fd..80dab509dfb0 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c | |||
| @@ -93,7 +93,7 @@ void show_mem(unsigned int filter) | |||
| 93 | printk(KERN_INFO "%d pages swap cached\n", total_cached); | 93 | printk(KERN_INFO "%d pages swap cached\n", total_cached); |
| 94 | printk(KERN_INFO "Total of %ld pages in page table cache\n", | 94 | printk(KERN_INFO "Total of %ld pages in page table cache\n", |
| 95 | quicklist_total_size()); | 95 | quicklist_total_size()); |
| 96 | printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages()); | 96 | printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); |
| 97 | } | 97 | } |
| 98 | 98 | ||
| 99 | 99 | ||
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index c641333cd997..c2e955ee79a8 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c | |||
| @@ -666,7 +666,7 @@ void show_mem(unsigned int filter) | |||
| 666 | printk(KERN_INFO "%d pages swap cached\n", total_cached); | 666 | printk(KERN_INFO "%d pages swap cached\n", total_cached); |
| 667 | printk(KERN_INFO "Total of %ld pages in page table cache\n", | 667 | printk(KERN_INFO "Total of %ld pages in page table cache\n", |
| 668 | quicklist_total_size()); | 668 | quicklist_total_size()); |
| 669 | printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages()); | 669 | printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); |
| 670 | } | 670 | } |
| 671 | 671 | ||
| 672 | /** | 672 | /** |
| @@ -822,4 +822,8 @@ int __meminit vmemmap_populate(struct page *start_page, | |||
| 822 | { | 822 | { |
| 823 | return vmemmap_populate_basepages(start_page, size, node); | 823 | return vmemmap_populate_basepages(start_page, size, node); |
| 824 | } | 824 | } |
| 825 | |||
| 826 | void vmemmap_free(struct page *memmap, unsigned long nr_pages) | ||
| 827 | { | ||
| 828 | } | ||
| 825 | #endif | 829 | #endif |
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index b755ea92aea7..20bc967c7209 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c | |||
| @@ -688,6 +688,24 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
| 688 | 688 | ||
| 689 | return ret; | 689 | return ret; |
| 690 | } | 690 | } |
| 691 | |||
| 692 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 693 | int arch_remove_memory(u64 start, u64 size) | ||
| 694 | { | ||
| 695 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
| 696 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
| 697 | struct zone *zone; | ||
| 698 | int ret; | ||
| 699 | |||
| 700 | zone = page_zone(pfn_to_page(start_pfn)); | ||
| 701 | ret = __remove_pages(zone, start_pfn, nr_pages); | ||
| 702 | if (ret) | ||
| 703 | pr_warn("%s: Problem encountered in __remove_pages() as" | ||
| 704 | " ret=%d\n", __func__, ret); | ||
| 705 | |||
| 706 | return ret; | ||
| 707 | } | ||
| 708 | #endif | ||
| 691 | #endif | 709 | #endif |
| 692 | 710 | ||
| 693 | /* | 711 | /* |
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 95a45293e5ac..7e2246fb2f31 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c | |||
| @@ -297,5 +297,10 @@ int __meminit vmemmap_populate(struct page *start_page, | |||
| 297 | 297 | ||
| 298 | return 0; | 298 | return 0; |
| 299 | } | 299 | } |
| 300 | |||
| 301 | void vmemmap_free(struct page *memmap, unsigned long nr_pages) | ||
| 302 | { | ||
| 303 | } | ||
| 304 | |||
| 300 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | 305 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ |
| 301 | 306 | ||
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 40df7c8f2096..f1f7409a4183 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c | |||
| @@ -133,6 +133,18 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
| 133 | 133 | ||
| 134 | return __add_pages(nid, zone, start_pfn, nr_pages); | 134 | return __add_pages(nid, zone, start_pfn, nr_pages); |
| 135 | } | 135 | } |
| 136 | |||
| 137 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 138 | int arch_remove_memory(u64 start, u64 size) | ||
| 139 | { | ||
| 140 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
| 141 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
| 142 | struct zone *zone; | ||
| 143 | |||
| 144 | zone = page_zone(pfn_to_page(start_pfn)); | ||
| 145 | return __remove_pages(zone, start_pfn, nr_pages); | ||
| 146 | } | ||
| 147 | #endif | ||
| 136 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 148 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
| 137 | 149 | ||
| 138 | /* | 150 | /* |
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ae672f41c464..49ce6bb2c641 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c | |||
| @@ -228,4 +228,16 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
| 228 | vmem_remove_mapping(start, size); | 228 | vmem_remove_mapping(start, size); |
| 229 | return rc; | 229 | return rc; |
| 230 | } | 230 | } |
| 231 | |||
| 232 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 233 | int arch_remove_memory(u64 start, u64 size) | ||
| 234 | { | ||
| 235 | /* | ||
| 236 | * There is no hardware or firmware interface which could trigger a | ||
| 237 | * hot memory remove on s390. So there is nothing that needs to be | ||
| 238 | * implemented. | ||
| 239 | */ | ||
| 240 | return -EBUSY; | ||
| 241 | } | ||
| 242 | #endif | ||
| 231 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 243 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 79699f46a443..e21aaf4f5cb6 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c | |||
| @@ -268,6 +268,10 @@ out: | |||
| 268 | return ret; | 268 | return ret; |
| 269 | } | 269 | } |
| 270 | 270 | ||
| 271 | void vmemmap_free(struct page *memmap, unsigned long nr_pages) | ||
| 272 | { | ||
| 273 | } | ||
| 274 | |||
| 271 | /* | 275 | /* |
| 272 | * Add memory segment to the segment list if it doesn't overlap with | 276 | * Add memory segment to the segment list if it doesn't overlap with |
| 273 | * an already present segment. | 277 | * an already present segment. |
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 82cc576fab15..105794037143 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c | |||
| @@ -558,4 +558,21 @@ int memory_add_physaddr_to_nid(u64 addr) | |||
| 558 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | 558 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); |
| 559 | #endif | 559 | #endif |
| 560 | 560 | ||
| 561 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 562 | int arch_remove_memory(u64 start, u64 size) | ||
| 563 | { | ||
| 564 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
| 565 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
| 566 | struct zone *zone; | ||
| 567 | int ret; | ||
| 568 | |||
| 569 | zone = page_zone(pfn_to_page(start_pfn)); | ||
| 570 | ret = __remove_pages(zone, start_pfn, nr_pages); | ||
| 571 | if (unlikely(ret)) | ||
| 572 | pr_warn("%s: Failed, __remove_pages() == %d\n", __func__, | ||
| 573 | ret); | ||
| 574 | |||
| 575 | return ret; | ||
| 576 | } | ||
| 577 | #endif | ||
| 561 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 578 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index dde85ef1c56d..48e0c030e8f5 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c | |||
| @@ -57,7 +57,7 @@ void show_mem(unsigned int filter) | |||
| 57 | printk("Mem-info:\n"); | 57 | printk("Mem-info:\n"); |
| 58 | show_free_areas(filter); | 58 | show_free_areas(filter); |
| 59 | printk("Free swap: %6ldkB\n", | 59 | printk("Free swap: %6ldkB\n", |
| 60 | nr_swap_pages << (PAGE_SHIFT-10)); | 60 | get_nr_swap_pages() << (PAGE_SHIFT-10)); |
| 61 | printk("%ld pages of RAM\n", totalram_pages); | 61 | printk("%ld pages of RAM\n", totalram_pages); |
| 62 | printk("%ld free pages\n", nr_free_pages()); | 62 | printk("%ld free pages\n", nr_free_pages()); |
| 63 | } | 63 | } |
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 5c2c6e61facb..1588d33d5492 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c | |||
| @@ -2235,6 +2235,11 @@ void __meminit vmemmap_populate_print_last(void) | |||
| 2235 | node_start = 0; | 2235 | node_start = 0; |
| 2236 | } | 2236 | } |
| 2237 | } | 2237 | } |
| 2238 | |||
| 2239 | void vmemmap_free(struct page *memmap, unsigned long nr_pages) | ||
| 2240 | { | ||
| 2241 | } | ||
| 2242 | |||
| 2238 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | 2243 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ |
| 2239 | 2244 | ||
| 2240 | static void prot_init_common(unsigned long page_none, | 2245 | static void prot_init_common(unsigned long page_none, |
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index 3cfa98bf9125..743c951c61b0 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c | |||
| @@ -130,7 +130,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, | |||
| 130 | if (!retval) { | 130 | if (!retval) { |
| 131 | unsigned long addr = MEM_USER_INTRPT; | 131 | unsigned long addr = MEM_USER_INTRPT; |
| 132 | addr = mmap_region(NULL, addr, INTRPT_SIZE, | 132 | addr = mmap_region(NULL, addr, INTRPT_SIZE, |
| 133 | MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE, | ||
| 134 | VM_READ|VM_EXEC| | 133 | VM_READ|VM_EXEC| |
| 135 | VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0); | 134 | VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0); |
| 136 | if (addr > (unsigned long) -PAGE_SIZE) | 135 | if (addr > (unsigned long) -PAGE_SIZE) |
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index ef29d6c5e10e..2749515a0547 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c | |||
| @@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size) | |||
| 935 | { | 935 | { |
| 936 | return -EINVAL; | 936 | return -EINVAL; |
| 937 | } | 937 | } |
| 938 | |||
| 939 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 940 | int arch_remove_memory(u64 start, u64 size) | ||
| 941 | { | ||
| 942 | /* TODO */ | ||
| 943 | return -EBUSY; | ||
| 944 | } | ||
| 945 | #endif | ||
| 938 | #endif | 946 | #endif |
| 939 | 947 | ||
| 940 | struct kmem_cache *pgd_cache; | 948 | struct kmem_cache *pgd_cache; |
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index de0de0c0e8a1..b3b4972c2451 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c | |||
| @@ -61,7 +61,7 @@ void show_mem(unsigned int filter) | |||
| 61 | global_page_state(NR_PAGETABLE), | 61 | global_page_state(NR_PAGETABLE), |
| 62 | global_page_state(NR_BOUNCE), | 62 | global_page_state(NR_BOUNCE), |
| 63 | global_page_state(NR_FILE_PAGES), | 63 | global_page_state(NR_FILE_PAGES), |
| 64 | nr_swap_pages); | 64 | get_nr_swap_pages()); |
| 65 | 65 | ||
| 66 | for_each_zone(zone) { | 66 | for_each_zone(zone) { |
| 67 | unsigned long flags, order, total = 0, largest_order = -1; | 67 | unsigned long flags, order, total = 0, largest_order = -1; |
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 52560a2038e1..1b99ee5c9f00 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h | |||
| @@ -57,8 +57,8 @@ static inline int numa_cpu_node(int cpu) | |||
| 57 | #endif | 57 | #endif |
| 58 | 58 | ||
| 59 | #ifdef CONFIG_NUMA | 59 | #ifdef CONFIG_NUMA |
| 60 | extern void __cpuinit numa_set_node(int cpu, int node); | 60 | extern void numa_set_node(int cpu, int node); |
| 61 | extern void __cpuinit numa_clear_node(int cpu); | 61 | extern void numa_clear_node(int cpu); |
| 62 | extern void __init init_cpu_to_node(void); | 62 | extern void __init init_cpu_to_node(void); |
| 63 | extern void __cpuinit numa_add_cpu(int cpu); | 63 | extern void __cpuinit numa_add_cpu(int cpu); |
| 64 | extern void __cpuinit numa_remove_cpu(int cpu); | 64 | extern void __cpuinit numa_remove_cpu(int cpu); |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index e6423002c10b..567b5d0632b2 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
| @@ -351,6 +351,7 @@ static inline void update_page_count(int level, unsigned long pages) { } | |||
| 351 | * as a pte too. | 351 | * as a pte too. |
| 352 | */ | 352 | */ |
| 353 | extern pte_t *lookup_address(unsigned long address, unsigned int *level); | 353 | extern pte_t *lookup_address(unsigned long address, unsigned int *level); |
| 354 | extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase); | ||
| 354 | extern phys_addr_t slow_virt_to_phys(void *__address); | 355 | extern phys_addr_t slow_virt_to_phys(void *__address); |
| 355 | 356 | ||
| 356 | #endif /* !__ASSEMBLY__ */ | 357 | #endif /* !__ASSEMBLY__ */ |
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index cfc755dc1607..230c8ea878e5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c | |||
| @@ -696,6 +696,10 @@ EXPORT_SYMBOL(acpi_map_lsapic); | |||
| 696 | 696 | ||
| 697 | int acpi_unmap_lsapic(int cpu) | 697 | int acpi_unmap_lsapic(int cpu) |
| 698 | { | 698 | { |
| 699 | #ifdef CONFIG_ACPI_NUMA | ||
| 700 | set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); | ||
| 701 | #endif | ||
| 702 | |||
| 699 | per_cpu(x86_cpu_to_apicid, cpu) = -1; | 703 | per_cpu(x86_cpu_to_apicid, cpu) = -1; |
| 700 | set_cpu_present(cpu, false); | 704 | set_cpu_present(cpu, false); |
| 701 | num_processors--; | 705 | num_processors--; |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 915f5efefcf5..9c857f05cef0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
| @@ -1056,6 +1056,15 @@ void __init setup_arch(char **cmdline_p) | |||
| 1056 | setup_bios_corruption_check(); | 1056 | setup_bios_corruption_check(); |
| 1057 | #endif | 1057 | #endif |
| 1058 | 1058 | ||
| 1059 | /* | ||
| 1060 | * In the memory hotplug case, the kernel needs info from SRAT to | ||
| 1061 | * determine which memory is hotpluggable before allocating memory | ||
| 1062 | * using memblock. | ||
| 1063 | */ | ||
| 1064 | acpi_boot_table_init(); | ||
| 1065 | early_acpi_boot_init(); | ||
| 1066 | early_parse_srat(); | ||
| 1067 | |||
| 1059 | #ifdef CONFIG_X86_32 | 1068 | #ifdef CONFIG_X86_32 |
| 1060 | printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", | 1069 | printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", |
| 1061 | (max_pfn_mapped<<PAGE_SHIFT) - 1); | 1070 | (max_pfn_mapped<<PAGE_SHIFT) - 1); |
| @@ -1101,10 +1110,6 @@ void __init setup_arch(char **cmdline_p) | |||
| 1101 | /* | 1110 | /* |
| 1102 | * Parse the ACPI tables for possible boot-time SMP configuration. | 1111 | * Parse the ACPI tables for possible boot-time SMP configuration. |
| 1103 | */ | 1112 | */ |
| 1104 | acpi_boot_table_init(); | ||
| 1105 | |||
| 1106 | early_acpi_boot_init(); | ||
| 1107 | |||
| 1108 | initmem_init(); | 1113 | initmem_init(); |
| 1109 | memblock_find_dma_reserve(); | 1114 | memblock_find_dma_reserve(); |
| 1110 | 1115 | ||
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index b299724f6e34..2d19001151d5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
| @@ -862,6 +862,18 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
| 862 | 862 | ||
| 863 | return __add_pages(nid, zone, start_pfn, nr_pages); | 863 | return __add_pages(nid, zone, start_pfn, nr_pages); |
| 864 | } | 864 | } |
| 865 | |||
| 866 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 867 | int arch_remove_memory(u64 start, u64 size) | ||
| 868 | { | ||
| 869 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
| 870 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
| 871 | struct zone *zone; | ||
| 872 | |||
| 873 | zone = page_zone(pfn_to_page(start_pfn)); | ||
| 874 | return __remove_pages(zone, start_pfn, nr_pages); | ||
| 875 | } | ||
| 876 | #endif | ||
| 865 | #endif | 877 | #endif |
| 866 | 878 | ||
| 867 | /* | 879 | /* |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3eba7f429880..474e28f10815 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
| @@ -707,6 +707,343 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
| 707 | } | 707 | } |
| 708 | EXPORT_SYMBOL_GPL(arch_add_memory); | 708 | EXPORT_SYMBOL_GPL(arch_add_memory); |
| 709 | 709 | ||
| 710 | #define PAGE_INUSE 0xFD | ||
| 711 | |||
| 712 | static void __meminit free_pagetable(struct page *page, int order) | ||
| 713 | { | ||
| 714 | struct zone *zone; | ||
| 715 | bool bootmem = false; | ||
| 716 | unsigned long magic; | ||
| 717 | unsigned int nr_pages = 1 << order; | ||
| 718 | |||
| 719 | /* bootmem page has reserved flag */ | ||
| 720 | if (PageReserved(page)) { | ||
| 721 | __ClearPageReserved(page); | ||
| 722 | bootmem = true; | ||
| 723 | |||
| 724 | magic = (unsigned long)page->lru.next; | ||
| 725 | if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { | ||
| 726 | while (nr_pages--) | ||
| 727 | put_page_bootmem(page++); | ||
| 728 | } else | ||
| 729 | __free_pages_bootmem(page, order); | ||
| 730 | } else | ||
| 731 | free_pages((unsigned long)page_address(page), order); | ||
| 732 | |||
| 733 | /* | ||
| 734 | * SECTION_INFO pages and MIX_SECTION_INFO pages | ||
| 735 | * are all allocated by bootmem. | ||
| 736 | */ | ||
| 737 | if (bootmem) { | ||
| 738 | zone = page_zone(page); | ||
| 739 | zone_span_writelock(zone); | ||
| 740 | zone->present_pages += nr_pages; | ||
| 741 | zone_span_writeunlock(zone); | ||
| 742 | totalram_pages += nr_pages; | ||
| 743 | } | ||
| 744 | } | ||
| 745 | |||
| 746 | static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) | ||
| 747 | { | ||
| 748 | pte_t *pte; | ||
| 749 | int i; | ||
| 750 | |||
| 751 | for (i = 0; i < PTRS_PER_PTE; i++) { | ||
| 752 | pte = pte_start + i; | ||
| 753 | if (pte_val(*pte)) | ||
| 754 | return; | ||
| 755 | } | ||
| 756 | |||
| 757 | /* free a pte table */ | ||
| 758 | free_pagetable(pmd_page(*pmd), 0); | ||
| 759 | spin_lock(&init_mm.page_table_lock); | ||
| 760 | pmd_clear(pmd); | ||
| 761 | spin_unlock(&init_mm.page_table_lock); | ||
| 762 | } | ||
| 763 | |||
| 764 | static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) | ||
| 765 | { | ||
| 766 | pmd_t *pmd; | ||
| 767 | int i; | ||
| 768 | |||
| 769 | for (i = 0; i < PTRS_PER_PMD; i++) { | ||
| 770 | pmd = pmd_start + i; | ||
| 771 | if (pmd_val(*pmd)) | ||
| 772 | return; | ||
| 773 | } | ||
| 774 | |||
| 775 | /* free a pmd table */ | ||
| 776 | free_pagetable(pud_page(*pud), 0); | ||
| 777 | spin_lock(&init_mm.page_table_lock); | ||
| 778 | pud_clear(pud); | ||
| 779 | spin_unlock(&init_mm.page_table_lock); | ||
| 780 | } | ||
| 781 | |||
| 782 | /* Return true if pgd is changed, otherwise return false. */ | ||
| 783 | static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) | ||
| 784 | { | ||
| 785 | pud_t *pud; | ||
| 786 | int i; | ||
| 787 | |||
| 788 | for (i = 0; i < PTRS_PER_PUD; i++) { | ||
| 789 | pud = pud_start + i; | ||
| 790 | if (pud_val(*pud)) | ||
| 791 | return false; | ||
| 792 | } | ||
| 793 | |||
| 794 | /* free a pud table */ | ||
| 795 | free_pagetable(pgd_page(*pgd), 0); | ||
| 796 | spin_lock(&init_mm.page_table_lock); | ||
| 797 | pgd_clear(pgd); | ||
| 798 | spin_unlock(&init_mm.page_table_lock); | ||
| 799 | |||
| 800 | return true; | ||
| 801 | } | ||
| 802 | |||
| 803 | static void __meminit | ||
| 804 | remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, | ||
| 805 | bool direct) | ||
| 806 | { | ||
| 807 | unsigned long next, pages = 0; | ||
| 808 | pte_t *pte; | ||
| 809 | void *page_addr; | ||
| 810 | phys_addr_t phys_addr; | ||
| 811 | |||
| 812 | pte = pte_start + pte_index(addr); | ||
| 813 | for (; addr < end; addr = next, pte++) { | ||
| 814 | next = (addr + PAGE_SIZE) & PAGE_MASK; | ||
| 815 | if (next > end) | ||
| 816 | next = end; | ||
| 817 | |||
| 818 | if (!pte_present(*pte)) | ||
| 819 | continue; | ||
| 820 | |||
| 821 | /* | ||
| 822 | * We mapped [0,1G) memory as identity mapping when | ||
| 823 | * initializing, in arch/x86/kernel/head_64.S. These | ||
| 824 | * pagetables cannot be removed. | ||
| 825 | */ | ||
| 826 | phys_addr = pte_val(*pte) + (addr & PAGE_MASK); | ||
| 827 | if (phys_addr < (phys_addr_t)0x40000000) | ||
| 828 | return; | ||
| 829 | |||
| 830 | if (IS_ALIGNED(addr, PAGE_SIZE) && | ||
| 831 | IS_ALIGNED(next, PAGE_SIZE)) { | ||
| 832 | /* | ||
| 833 | * Do not free direct mapping pages since they were | ||
| 834 | * freed when offlining, or simply not in use. | ||
| 835 | */ | ||
| 836 | if (!direct) | ||
| 837 | free_pagetable(pte_page(*pte), 0); | ||
| 838 | |||
| 839 | spin_lock(&init_mm.page_table_lock); | ||
| 840 | pte_clear(&init_mm, addr, pte); | ||
| 841 | spin_unlock(&init_mm.page_table_lock); | ||
| 842 | |||
| 843 | /* For non-direct mappings, the pages count means nothing. */ | ||
| 844 | pages++; | ||
| 845 | } else { | ||
| 846 | /* | ||
| 847 | * If we are here, we are freeing vmemmap pages since | ||
| 848 | * direct mapped memory ranges to be freed are aligned. | ||
| 849 | * | ||
| 850 | * If we are not removing the whole page, it means | ||
| 851 | * other page structs in this page are being used and | ||
| 852 | * we cannot remove them. So fill the unused page structs | ||
| 853 | * with 0xFD, and remove the page when it is wholly | ||
| 854 | * filled with 0xFD. | ||
| 855 | */ | ||
| 856 | memset((void *)addr, PAGE_INUSE, next - addr); | ||
| 857 | |||
| 858 | page_addr = page_address(pte_page(*pte)); | ||
| 859 | if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { | ||
| 860 | free_pagetable(pte_page(*pte), 0); | ||
| 861 | |||
| 862 | spin_lock(&init_mm.page_table_lock); | ||
| 863 | pte_clear(&init_mm, addr, pte); | ||
| 864 | spin_unlock(&init_mm.page_table_lock); | ||
| 865 | } | ||
| 866 | } | ||
| 867 | } | ||
| 868 | |||
| 869 | /* Call free_pte_table() in remove_pmd_table(). */ | ||
| 870 | flush_tlb_all(); | ||
| 871 | if (direct) | ||
| 872 | update_page_count(PG_LEVEL_4K, -pages); | ||
| 873 | } | ||
| 874 | |||
| 875 | static void __meminit | ||
| 876 | remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, | ||
| 877 | bool direct) | ||
| 878 | { | ||
| 879 | unsigned long next, pages = 0; | ||
| 880 | pte_t *pte_base; | ||
| 881 | pmd_t *pmd; | ||
| 882 | void *page_addr; | ||
| 883 | |||
| 884 | pmd = pmd_start + pmd_index(addr); | ||
| 885 | for (; addr < end; addr = next, pmd++) { | ||
| 886 | next = pmd_addr_end(addr, end); | ||
| 887 | |||
| 888 | if (!pmd_present(*pmd)) | ||
| 889 | continue; | ||
| 890 | |||
| 891 | if (pmd_large(*pmd)) { | ||
| 892 | if (IS_ALIGNED(addr, PMD_SIZE) && | ||
| 893 | IS_ALIGNED(next, PMD_SIZE)) { | ||
| 894 | if (!direct) | ||
| 895 | free_pagetable(pmd_page(*pmd), | ||
| 896 | get_order(PMD_SIZE)); | ||
| 897 | |||
| 898 | spin_lock(&init_mm.page_table_lock); | ||
| 899 | pmd_clear(pmd); | ||
| 900 | spin_unlock(&init_mm.page_table_lock); | ||
| 901 | pages++; | ||
| 902 | } else { | ||
| 903 | /* If here, we are freeing vmemmap pages. */ | ||
| 904 | memset((void *)addr, PAGE_INUSE, next - addr); | ||
| 905 | |||
| 906 | page_addr = page_address(pmd_page(*pmd)); | ||
| 907 | if (!memchr_inv(page_addr, PAGE_INUSE, | ||
| 908 | PMD_SIZE)) { | ||
| 909 | free_pagetable(pmd_page(*pmd), | ||
| 910 | get_order(PMD_SIZE)); | ||
| 911 | |||
| 912 | spin_lock(&init_mm.page_table_lock); | ||
| 913 | pmd_clear(pmd); | ||
| 914 | spin_unlock(&init_mm.page_table_lock); | ||
| 915 | } | ||
| 916 | } | ||
| 917 | |||
| 918 | continue; | ||
| 919 | } | ||
| 920 | |||
| 921 | pte_base = (pte_t *)pmd_page_vaddr(*pmd); | ||
| 922 | remove_pte_table(pte_base, addr, next, direct); | ||
| 923 | free_pte_table(pte_base, pmd); | ||
| 924 | } | ||
| 925 | |||
| 926 | /* Call free_pmd_table() in remove_pud_table(). */ | ||
| 927 | if (direct) | ||
| 928 | update_page_count(PG_LEVEL_2M, -pages); | ||
| 929 | } | ||
| 930 | |||
| 931 | static void __meminit | ||
| 932 | remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, | ||
| 933 | bool direct) | ||
| 934 | { | ||
| 935 | unsigned long next, pages = 0; | ||
| 936 | pmd_t *pmd_base; | ||
| 937 | pud_t *pud; | ||
| 938 | void *page_addr; | ||
| 939 | |||
| 940 | pud = pud_start + pud_index(addr); | ||
| 941 | for (; addr < end; addr = next, pud++) { | ||
| 942 | next = pud_addr_end(addr, end); | ||
| 943 | |||
| 944 | if (!pud_present(*pud)) | ||
| 945 | continue; | ||
| 946 | |||
| 947 | if (pud_large(*pud)) { | ||
| 948 | if (IS_ALIGNED(addr, PUD_SIZE) && | ||
| 949 | IS_ALIGNED(next, PUD_SIZE)) { | ||
| 950 | if (!direct) | ||
| 951 | free_pagetable(pud_page(*pud), | ||
| 952 | get_order(PUD_SIZE)); | ||
| 953 | |||
| 954 | spin_lock(&init_mm.page_table_lock); | ||
| 955 | pud_clear(pud); | ||
| 956 | spin_unlock(&init_mm.page_table_lock); | ||
| 957 | pages++; | ||
| 958 | } else { | ||
| 959 | /* If here, we are freeing vmemmap pages. */ | ||
| 960 | memset((void *)addr, PAGE_INUSE, next - addr); | ||
| 961 | |||
| 962 | page_addr = page_address(pud_page(*pud)); | ||
| 963 | if (!memchr_inv(page_addr, PAGE_INUSE, | ||
| 964 | PUD_SIZE)) { | ||
| 965 | free_pagetable(pud_page(*pud), | ||
| 966 | get_order(PUD_SIZE)); | ||
| 967 | |||
| 968 | spin_lock(&init_mm.page_table_lock); | ||
| 969 | pud_clear(pud); | ||
| 970 | spin_unlock(&init_mm.page_table_lock); | ||
| 971 | } | ||
| 972 | } | ||
| 973 | |||
| 974 | continue; | ||
| 975 | } | ||
| 976 | |||
| 977 | pmd_base = (pmd_t *)pud_page_vaddr(*pud); | ||
| 978 | remove_pmd_table(pmd_base, addr, next, direct); | ||
| 979 | free_pmd_table(pmd_base, pud); | ||
| 980 | } | ||
| 981 | |||
| 982 | if (direct) | ||
| 983 | update_page_count(PG_LEVEL_1G, -pages); | ||
| 984 | } | ||
| 985 | |||
| 986 | /* start and end are both virtual addresses. */ | ||
| 987 | static void __meminit | ||
| 988 | remove_pagetable(unsigned long start, unsigned long end, bool direct) | ||
| 989 | { | ||
| 990 | unsigned long next; | ||
| 991 | pgd_t *pgd; | ||
| 992 | pud_t *pud; | ||
| 993 | bool pgd_changed = false; | ||
| 994 | |||
| 995 | for (; start < end; start = next) { | ||
| 996 | next = pgd_addr_end(start, end); | ||
| 997 | |||
| 998 | pgd = pgd_offset_k(start); | ||
| 999 | if (!pgd_present(*pgd)) | ||
| 1000 | continue; | ||
| 1001 | |||
| 1002 | pud = (pud_t *)pgd_page_vaddr(*pgd); | ||
| 1003 | remove_pud_table(pud, start, next, direct); | ||
| 1004 | if (free_pud_table(pud, pgd)) | ||
| 1005 | pgd_changed = true; | ||
| 1006 | } | ||
| 1007 | |||
| 1008 | if (pgd_changed) | ||
| 1009 | sync_global_pgds(start, end - 1); | ||
| 1010 | |||
| 1011 | flush_tlb_all(); | ||
| 1012 | } | ||
| 1013 | |||
| 1014 | void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages) | ||
| 1015 | { | ||
| 1016 | unsigned long start = (unsigned long)memmap; | ||
| 1017 | unsigned long end = (unsigned long)(memmap + nr_pages); | ||
| 1018 | |||
| 1019 | remove_pagetable(start, end, false); | ||
| 1020 | } | ||
| 1021 | |||
| 1022 | static void __meminit | ||
| 1023 | kernel_physical_mapping_remove(unsigned long start, unsigned long end) | ||
| 1024 | { | ||
| 1025 | start = (unsigned long)__va(start); | ||
| 1026 | end = (unsigned long)__va(end); | ||
| 1027 | |||
| 1028 | remove_pagetable(start, end, true); | ||
| 1029 | } | ||
| 1030 | |||
| 1031 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 1032 | int __ref arch_remove_memory(u64 start, u64 size) | ||
| 1033 | { | ||
| 1034 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
| 1035 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
| 1036 | struct zone *zone; | ||
| 1037 | int ret; | ||
| 1038 | |||
| 1039 | zone = page_zone(pfn_to_page(start_pfn)); | ||
| 1040 | kernel_physical_mapping_remove(start, start + size); | ||
| 1041 | ret = __remove_pages(zone, start_pfn, nr_pages); | ||
| 1042 | WARN_ON_ONCE(ret); | ||
| 1043 | |||
| 1044 | return ret; | ||
| 1045 | } | ||
| 1046 | #endif | ||
| 710 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 1047 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
| 711 | 1048 | ||
| 712 | static struct kcore_list kcore_vsyscall; | 1049 | static struct kcore_list kcore_vsyscall; |
| @@ -1019,6 +1356,66 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) | |||
| 1019 | return 0; | 1356 | return 0; |
| 1020 | } | 1357 | } |
| 1021 | 1358 | ||
| 1359 | #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) | ||
| 1360 | void register_page_bootmem_memmap(unsigned long section_nr, | ||
| 1361 | struct page *start_page, unsigned long size) | ||
| 1362 | { | ||
| 1363 | unsigned long addr = (unsigned long)start_page; | ||
| 1364 | unsigned long end = (unsigned long)(start_page + size); | ||
| 1365 | unsigned long next; | ||
| 1366 | pgd_t *pgd; | ||
| 1367 | pud_t *pud; | ||
| 1368 | pmd_t *pmd; | ||
| 1369 | unsigned int nr_pages; | ||
| 1370 | struct page *page; | ||
| 1371 | |||
| 1372 | for (; addr < end; addr = next) { | ||
| 1373 | pte_t *pte = NULL; | ||
| 1374 | |||
| 1375 | pgd = pgd_offset_k(addr); | ||
| 1376 | if (pgd_none(*pgd)) { | ||
| 1377 | next = (addr + PAGE_SIZE) & PAGE_MASK; | ||
| 1378 | continue; | ||
| 1379 | } | ||
| 1380 | get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); | ||
| 1381 | |||
| 1382 | pud = pud_offset(pgd, addr); | ||
| 1383 | if (pud_none(*pud)) { | ||
| 1384 | next = (addr + PAGE_SIZE) & PAGE_MASK; | ||
| 1385 | continue; | ||
| 1386 | } | ||
| 1387 | get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); | ||
| 1388 | |||
| 1389 | if (!cpu_has_pse) { | ||
| 1390 | next = (addr + PAGE_SIZE) & PAGE_MASK; | ||
| 1391 | pmd = pmd_offset(pud, addr); | ||
| 1392 | if (pmd_none(*pmd)) | ||
| 1393 | continue; | ||
| 1394 | get_page_bootmem(section_nr, pmd_page(*pmd), | ||
| 1395 | MIX_SECTION_INFO); | ||
| 1396 | |||
| 1397 | pte = pte_offset_kernel(pmd, addr); | ||
| 1398 | if (pte_none(*pte)) | ||
| 1399 | continue; | ||
| 1400 | get_page_bootmem(section_nr, pte_page(*pte), | ||
| 1401 | SECTION_INFO); | ||
| 1402 | } else { | ||
| 1403 | next = pmd_addr_end(addr, end); | ||
| 1404 | |||
| 1405 | pmd = pmd_offset(pud, addr); | ||
| 1406 | if (pmd_none(*pmd)) | ||
| 1407 | continue; | ||
| 1408 | |||
| 1409 | nr_pages = 1 << (get_order(PMD_SIZE)); | ||
| 1410 | page = pmd_page(*pmd); | ||
| 1411 | while (nr_pages--) | ||
| 1412 | get_page_bootmem(section_nr, page++, | ||
| 1413 | SECTION_INFO); | ||
| 1414 | } | ||
| 1415 | } | ||
| 1416 | } | ||
| 1417 | #endif | ||
| 1418 | |||
| 1022 | void __meminit vmemmap_populate_print_last(void) | 1419 | void __meminit vmemmap_populate_print_last(void) |
| 1023 | { | 1420 | { |
| 1024 | if (p_start) { | 1421 | if (p_start) { |
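A standalone sketch of the PAGE_INUSE (0xFD) bookkeeping used by remove_pte_table() and remove_pmd_table() above: a partially unused vmemmap page is poisoned byte by byte, and the backing page is only freed once every byte reads back as 0xFD. The fully_poisoned() helper stands in for the kernel's memchr_inv(); this is an illustration, not kernel code.

/* Hedged sketch of the PAGE_INUSE (0xFD) poisoning scheme, not kernel code. */
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#define PAGE_INUSE 0xFD
#define PAGE_SIZE  4096

/* Stand-in for the kernel's memchr_inv(): true iff every byte is PAGE_INUSE. */
static bool fully_poisoned(const unsigned char *page)
{
	for (size_t i = 0; i < PAGE_SIZE; i++)
		if (page[i] != PAGE_INUSE)
			return false;
	return true;
}

/*
 * Mark [off, off + len) of a vmemmap page as no longer used.  The backing
 * page may only be freed once no other struct page in it is live, i.e.
 * once the whole page reads back as PAGE_INUSE.
 */
static bool release_range(unsigned char *page, size_t off, size_t len)
{
	memset(page + off, PAGE_INUSE, len);
	return fully_poisoned(page);	/* caller frees the page if true */
}

int main(void)
{
	static unsigned char page[PAGE_SIZE];

	(void)release_range(page, 0, PAGE_SIZE / 2);	/* not freeable yet */
	return release_range(page, PAGE_SIZE / 2, PAGE_SIZE / 2) ? 0 : 1;
}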
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 8504f3698753..dfd30259eb89 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c | |||
| @@ -56,7 +56,7 @@ early_param("numa", numa_setup); | |||
| 56 | /* | 56 | /* |
| 57 | * apicid, cpu, node mappings | 57 | * apicid, cpu, node mappings |
| 58 | */ | 58 | */ |
| 59 | s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | 59 | s16 __apicid_to_node[MAX_LOCAL_APIC] = { |
| 60 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | 60 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE |
| 61 | }; | 61 | }; |
| 62 | 62 | ||
| @@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map); | |||
| 78 | DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); | 78 | DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); |
| 79 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); | 79 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); |
| 80 | 80 | ||
| 81 | void __cpuinit numa_set_node(int cpu, int node) | 81 | void numa_set_node(int cpu, int node) |
| 82 | { | 82 | { |
| 83 | int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); | 83 | int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); |
| 84 | 84 | ||
| @@ -101,7 +101,7 @@ void __cpuinit numa_set_node(int cpu, int node) | |||
| 101 | set_cpu_numa_node(cpu, node); | 101 | set_cpu_numa_node(cpu, node); |
| 102 | } | 102 | } |
| 103 | 103 | ||
| 104 | void __cpuinit numa_clear_node(int cpu) | 104 | void numa_clear_node(int cpu) |
| 105 | { | 105 | { |
| 106 | numa_set_node(cpu, NUMA_NO_NODE); | 106 | numa_set_node(cpu, NUMA_NO_NODE); |
| 107 | } | 107 | } |
| @@ -213,10 +213,9 @@ static void __init setup_node_data(int nid, u64 start, u64 end) | |||
| 213 | * Allocate node data. Try node-local memory and then any node. | 213 | * Allocate node data. Try node-local memory and then any node. |
| 214 | * Never allocate in DMA zone. | 214 | * Never allocate in DMA zone. |
| 215 | */ | 215 | */ |
| 216 | nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); | 216 | nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); |
| 217 | if (!nd_pa) { | 217 | if (!nd_pa) { |
| 218 | pr_err("Cannot find %zu bytes in node %d\n", | 218 | pr_err("Cannot find %zu bytes in any node\n", nd_size); |
| 219 | nd_size, nid); | ||
| 220 | return; | 219 | return; |
| 221 | } | 220 | } |
| 222 | nd = __va(nd_pa); | 221 | nd = __va(nd_pa); |
| @@ -561,10 +560,12 @@ static int __init numa_init(int (*init_func)(void)) | |||
| 561 | for (i = 0; i < MAX_LOCAL_APIC; i++) | 560 | for (i = 0; i < MAX_LOCAL_APIC; i++) |
| 562 | set_apicid_to_node(i, NUMA_NO_NODE); | 561 | set_apicid_to_node(i, NUMA_NO_NODE); |
| 563 | 562 | ||
| 564 | nodes_clear(numa_nodes_parsed); | 563 | /* |
| 564 | * Do not clear numa_nodes_parsed or zero numa_meminfo here, because | ||
| 565 | * SRAT was parsed earlier in early_parse_srat(). | ||
| 566 | */ | ||
| 565 | nodes_clear(node_possible_map); | 567 | nodes_clear(node_possible_map); |
| 566 | nodes_clear(node_online_map); | 568 | nodes_clear(node_online_map); |
| 567 | memset(&numa_meminfo, 0, sizeof(numa_meminfo)); | ||
| 568 | WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); | 569 | WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); |
| 569 | numa_reset_distance(); | 570 | numa_reset_distance(); |
| 570 | 571 | ||
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a1b1c88f9caf..ca1f1c2bb7be 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
| @@ -529,21 +529,13 @@ out_unlock: | |||
| 529 | return do_split; | 529 | return do_split; |
| 530 | } | 530 | } |
| 531 | 531 | ||
| 532 | static int split_large_page(pte_t *kpte, unsigned long address) | 532 | int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase) |
| 533 | { | 533 | { |
| 534 | unsigned long pfn, pfninc = 1; | 534 | unsigned long pfn, pfninc = 1; |
| 535 | unsigned int i, level; | 535 | unsigned int i, level; |
| 536 | pte_t *pbase, *tmp; | 536 | pte_t *tmp; |
| 537 | pgprot_t ref_prot; | 537 | pgprot_t ref_prot; |
| 538 | struct page *base; | 538 | struct page *base = virt_to_page(pbase); |
| 539 | |||
| 540 | if (!debug_pagealloc) | ||
| 541 | spin_unlock(&cpa_lock); | ||
| 542 | base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); | ||
| 543 | if (!debug_pagealloc) | ||
| 544 | spin_lock(&cpa_lock); | ||
| 545 | if (!base) | ||
| 546 | return -ENOMEM; | ||
| 547 | 539 | ||
| 548 | spin_lock(&pgd_lock); | 540 | spin_lock(&pgd_lock); |
| 549 | /* | 541 | /* |
| @@ -551,10 +543,11 @@ static int split_large_page(pte_t *kpte, unsigned long address) | |||
| 551 | * up for us already: | 543 | * up for us already: |
| 552 | */ | 544 | */ |
| 553 | tmp = lookup_address(address, &level); | 545 | tmp = lookup_address(address, &level); |
| 554 | if (tmp != kpte) | 546 | if (tmp != kpte) { |
| 555 | goto out_unlock; | 547 | spin_unlock(&pgd_lock); |
| 548 | return 1; | ||
| 549 | } | ||
| 556 | 550 | ||
| 557 | pbase = (pte_t *)page_address(base); | ||
| 558 | paravirt_alloc_pte(&init_mm, page_to_pfn(base)); | 551 | paravirt_alloc_pte(&init_mm, page_to_pfn(base)); |
| 559 | ref_prot = pte_pgprot(pte_clrhuge(*kpte)); | 552 | ref_prot = pte_pgprot(pte_clrhuge(*kpte)); |
| 560 | /* | 553 | /* |
| @@ -601,17 +594,27 @@ static int split_large_page(pte_t *kpte, unsigned long address) | |||
| 601 | * going on. | 594 | * going on. |
| 602 | */ | 595 | */ |
| 603 | __flush_tlb_all(); | 596 | __flush_tlb_all(); |
| 597 | spin_unlock(&pgd_lock); | ||
| 604 | 598 | ||
| 605 | base = NULL; | 599 | return 0; |
| 600 | } | ||
| 606 | 601 | ||
| 607 | out_unlock: | 602 | static int split_large_page(pte_t *kpte, unsigned long address) |
| 608 | /* | 603 | { |
| 609 | * If we dropped out via the lookup_address check under | 604 | pte_t *pbase; |
| 610 | * pgd_lock then stick the page back into the pool: | 605 | struct page *base; |
| 611 | */ | 606 | |
| 612 | if (base) | 607 | if (!debug_pagealloc) |
| 608 | spin_unlock(&cpa_lock); | ||
| 609 | base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); | ||
| 610 | if (!debug_pagealloc) | ||
| 611 | spin_lock(&cpa_lock); | ||
| 612 | if (!base) | ||
| 613 | return -ENOMEM; | ||
| 614 | |||
| 615 | pbase = (pte_t *)page_address(base); | ||
| 616 | if (__split_large_page(kpte, address, pbase)) | ||
| 613 | __free_page(base); | 617 | __free_page(base); |
| 614 | spin_unlock(&pgd_lock); | ||
| 615 | 618 | ||
| 616 | return 0; | 619 | return 0; |
| 617 | } | 620 | } |
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index cdd0da9dd530..79836d01f789 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c | |||
| @@ -141,11 +141,126 @@ static inline int save_add_info(void) {return 1;} | |||
| 141 | static inline int save_add_info(void) {return 0;} | 141 | static inline int save_add_info(void) {return 0;} |
| 142 | #endif | 142 | #endif |
| 143 | 143 | ||
| 144 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
| 145 | static void __init | ||
| 146 | handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) | ||
| 147 | { | ||
| 148 | int overlap, i; | ||
| 149 | unsigned long start_pfn, end_pfn; | ||
| 150 | |||
| 151 | start_pfn = PFN_DOWN(start); | ||
| 152 | end_pfn = PFN_UP(end); | ||
| 153 | |||
| 154 | /* | ||
| 155 | * For movablemem_map=acpi: | ||
| 156 | * | ||
| 157 | * SRAT: |_____| |_____| |_________| |_________| ...... | ||
| 158 | * node id: 0 1 1 2 | ||
| 159 | * hotpluggable: n y y n | ||
| 160 | * movablemem_map: |_____| |_________| | ||
| 161 | * | ||
| 162 | * Using movablemem_map, we can prevent memblock from allocating memory | ||
| 163 | * on ZONE_MOVABLE at boot time. | ||
| 164 | * | ||
| 165 | * Before parsing the SRAT, memblock has already reserved some memory | ||
| 166 | * ranges for other purposes, such as the kernel image. We cannot | ||
| 167 | * prevent the kernel from using this memory, so we need to exclude | ||
| 168 | * it even if it is hotpluggable. | ||
| 169 | * Furthermore, to ensure the kernel has enough memory to boot, we make | ||
| 170 | * all the memory on the node which the kernel resides in | ||
| 171 | * un-hotpluggable. | ||
| 172 | */ | ||
| 173 | if (hotpluggable && movablemem_map.acpi) { | ||
| 174 | /* Exclude ranges reserved by memblock. */ | ||
| 175 | struct memblock_type *rgn = &memblock.reserved; | ||
| 176 | |||
| 177 | for (i = 0; i < rgn->cnt; i++) { | ||
| 178 | if (end <= rgn->regions[i].base || | ||
| 179 | start >= rgn->regions[i].base + | ||
| 180 | rgn->regions[i].size) | ||
| 181 | continue; | ||
| 182 | |||
| 183 | /* | ||
| 184 | * If the memory range overlaps the memory reserved by | ||
| 185 | * memblock, then the kernel resides in this node. | ||
| 186 | */ | ||
| 187 | node_set(node, movablemem_map.numa_nodes_kernel); | ||
| 188 | |||
| 189 | goto out; | ||
| 190 | } | ||
| 191 | |||
| 192 | /* | ||
| 193 | * If the kernel resides in this node, then the whole node | ||
| 194 | * should not be hotpluggable. | ||
| 195 | */ | ||
| 196 | if (node_isset(node, movablemem_map.numa_nodes_kernel)) | ||
| 197 | goto out; | ||
| 198 | |||
| 199 | insert_movablemem_map(start_pfn, end_pfn); | ||
| 200 | |||
| 201 | /* | ||
| 202 | * numa_nodes_hotplug nodemask represents which nodes are put | ||
| 203 | * into movablemem_map.map[]. | ||
| 204 | */ | ||
| 205 | node_set(node, movablemem_map.numa_nodes_hotplug); | ||
| 206 | goto out; | ||
| 207 | } | ||
| 208 | |||
| 209 | /* | ||
| 210 | * For movablemem_map=nn[KMG]@ss[KMG]: | ||
| 211 | * | ||
| 212 | * SRAT: |_____| |_____| |_________| |_________| ...... | ||
| 213 | * node id: 0 1 1 2 | ||
| 214 | * user specified: |__| |___| | ||
| 215 | * movablemem_map: |___| |_________| |______| ...... | ||
| 216 | * | ||
| 217 | * Using movablemem_map, we can prevent memblock from allocating memory | ||
| 218 | * on ZONE_MOVABLE at boot time. | ||
| 219 | * | ||
| 220 | * NOTE: In this case, SRAT info will be ignored. | ||
| 221 | */ | ||
| 222 | overlap = movablemem_map_overlap(start_pfn, end_pfn); | ||
| 223 | if (overlap >= 0) { | ||
| 224 | /* | ||
| 225 | * If part of this range is in movablemem_map, we need to | ||
| 226 | * add the range after it to extend the range to the end | ||
| 227 | * of the node, because everything from the lowest address | ||
| 228 | * specified to the end of the node will be ZONE_MOVABLE. | ||
| 229 | */ | ||
| 230 | start_pfn = max(start_pfn, | ||
| 231 | movablemem_map.map[overlap].start_pfn); | ||
| 232 | insert_movablemem_map(start_pfn, end_pfn); | ||
| 233 | |||
| 234 | /* | ||
| 235 | * Set the nodemask, so that if the address range on one node | ||
| 236 | * is not contiguous, we can add the subsequent ranges on the | ||
| 237 | * same node into movablemem_map. | ||
| 238 | */ | ||
| 239 | node_set(node, movablemem_map.numa_nodes_hotplug); | ||
| 240 | } else { | ||
| 241 | if (node_isset(node, movablemem_map.numa_nodes_hotplug)) | ||
| 242 | /* | ||
| 243 | * Insert the range if we already have movable ranges | ||
| 244 | * on the same node. | ||
| 245 | */ | ||
| 246 | insert_movablemem_map(start_pfn, end_pfn); | ||
| 247 | } | ||
| 248 | out: | ||
| 249 | return; | ||
| 250 | } | ||
| 251 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | ||
| 252 | static inline void | ||
| 253 | handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) | ||
| 254 | { | ||
| 255 | } | ||
| 256 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | ||
| 257 | |||
| 144 | /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ | 258 | /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ |
| 145 | int __init | 259 | int __init |
| 146 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | 260 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) |
| 147 | { | 261 | { |
| 148 | u64 start, end; | 262 | u64 start, end; |
| 263 | u32 hotpluggable; | ||
| 149 | int node, pxm; | 264 | int node, pxm; |
| 150 | 265 | ||
| 151 | if (srat_disabled()) | 266 | if (srat_disabled()) |
| @@ -154,7 +269,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | |||
| 154 | goto out_err_bad_srat; | 269 | goto out_err_bad_srat; |
| 155 | if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) | 270 | if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) |
| 156 | goto out_err; | 271 | goto out_err; |
| 157 | if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) | 272 | hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; |
| 273 | if (hotpluggable && !save_add_info()) | ||
| 158 | goto out_err; | 274 | goto out_err; |
| 159 | 275 | ||
| 160 | start = ma->base_address; | 276 | start = ma->base_address; |
| @@ -174,9 +290,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | |||
| 174 | 290 | ||
| 175 | node_set(node, numa_nodes_parsed); | 291 | node_set(node, numa_nodes_parsed); |
| 176 | 292 | ||
| 177 | printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", | 293 | printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n", |
| 178 | node, pxm, | 294 | node, pxm, |
| 179 | (unsigned long long) start, (unsigned long long) end - 1); | 295 | (unsigned long long) start, (unsigned long long) end - 1, |
| 296 | hotpluggable ? "Hot Pluggable": ""); | ||
| 297 | |||
| 298 | handle_movablemem(node, start, end, hotpluggable); | ||
| 180 | 299 | ||
| 181 | return 0; | 300 | return 0; |
| 182 | out_err_bad_srat: | 301 | out_err_bad_srat: |
diff --git a/block/genhd.c b/block/genhd.c index 3993ebf4135f..5f73c2435fde 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/mutex.h> | 18 | #include <linux/mutex.h> |
| 19 | #include <linux/idr.h> | 19 | #include <linux/idr.h> |
| 20 | #include <linux/log2.h> | 20 | #include <linux/log2.h> |
| 21 | #include <linux/pm_runtime.h> | ||
| 21 | 22 | ||
| 22 | #include "blk.h" | 23 | #include "blk.h" |
| 23 | 24 | ||
| @@ -534,6 +535,14 @@ static void register_disk(struct gendisk *disk) | |||
| 534 | return; | 535 | return; |
| 535 | } | 536 | } |
| 536 | } | 537 | } |
| 538 | |||
| 539 | /* | ||
| 540 | * Avoid a possible deadlock caused by allocating memory with | ||
| 541 | * GFP_KERNEL in the runtime_resume callback of any of its | ||
| 542 | * ancestor devices. | ||
| 543 | */ | ||
| 544 | pm_runtime_set_memalloc_noio(ddev, true); | ||
| 545 | |||
| 537 | disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); | 546 | disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); |
| 538 | disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); | 547 | disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); |
| 539 | 548 | ||
| @@ -663,6 +672,7 @@ void del_gendisk(struct gendisk *disk) | |||
| 663 | disk->driverfs_dev = NULL; | 672 | disk->driverfs_dev = NULL; |
| 664 | if (!sysfs_deprecated) | 673 | if (!sysfs_deprecated) |
| 665 | sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); | 674 | sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); |
| 675 | pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); | ||
| 666 | device_del(disk_to_dev(disk)); | 676 | device_del(disk_to_dev(disk)); |
| 667 | } | 677 | } |
| 668 | EXPORT_SYMBOL(del_gendisk); | 678 | EXPORT_SYMBOL(del_gendisk); |
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 034d3e72aa92..da1f82b445e0 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c | |||
| @@ -280,9 +280,11 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) | |||
| 280 | 280 | ||
| 281 | static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device) | 281 | static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device) |
| 282 | { | 282 | { |
| 283 | int result = 0; | 283 | int result = 0, nid; |
| 284 | struct acpi_memory_info *info, *n; | 284 | struct acpi_memory_info *info, *n; |
| 285 | 285 | ||
| 286 | nid = acpi_get_node(mem_device->device->handle); | ||
| 287 | |||
| 286 | list_for_each_entry_safe(info, n, &mem_device->res_list, list) { | 288 | list_for_each_entry_safe(info, n, &mem_device->res_list, list) { |
| 287 | if (info->failed) | 289 | if (info->failed) |
| 288 | /* The kernel does not use this memory block */ | 290 | /* The kernel does not use this memory block */ |
| @@ -295,7 +297,9 @@ static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device) | |||
| 295 | */ | 297 | */ |
| 296 | return -EBUSY; | 298 | return -EBUSY; |
| 297 | 299 | ||
| 298 | result = remove_memory(info->start_addr, info->length); | 300 | if (nid < 0) |
| 301 | nid = memory_add_physaddr_to_nid(info->start_addr); | ||
| 302 | result = remove_memory(nid, info->start_addr, info->length); | ||
| 299 | if (result) | 303 | if (result) |
| 300 | return result; | 304 | return result; |
| 301 | 305 | ||
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 33e609f63585..59844ee149be 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c | |||
| @@ -282,10 +282,10 @@ acpi_table_parse_srat(enum acpi_srat_type id, | |||
| 282 | handler, max_entries); | 282 | handler, max_entries); |
| 283 | } | 283 | } |
| 284 | 284 | ||
| 285 | int __init acpi_numa_init(void) | 285 | static int srat_mem_cnt; |
| 286 | { | ||
| 287 | int cnt = 0; | ||
| 288 | 286 | ||
| 287 | void __init early_parse_srat(void) | ||
| 288 | { | ||
| 289 | /* | 289 | /* |
| 290 | * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= | 290 | * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= |
| 291 | * SRAT cpu entries could have different order with that in MADT. | 291 | * SRAT cpu entries could have different order with that in MADT. |
| @@ -295,21 +295,24 @@ int __init acpi_numa_init(void) | |||
| 295 | /* SRAT: Static Resource Affinity Table */ | 295 | /* SRAT: Static Resource Affinity Table */ |
| 296 | if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { | 296 | if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { |
| 297 | acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, | 297 | acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, |
| 298 | acpi_parse_x2apic_affinity, 0); | 298 | acpi_parse_x2apic_affinity, 0); |
| 299 | acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, | 299 | acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, |
| 300 | acpi_parse_processor_affinity, 0); | 300 | acpi_parse_processor_affinity, 0); |
| 301 | cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, | 301 | srat_mem_cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, |
| 302 | acpi_parse_memory_affinity, | 302 | acpi_parse_memory_affinity, |
| 303 | NR_NODE_MEMBLKS); | 303 | NR_NODE_MEMBLKS); |
| 304 | } | 304 | } |
| 305 | } | ||
| 305 | 306 | ||
| 307 | int __init acpi_numa_init(void) | ||
| 308 | { | ||
| 306 | /* SLIT: System Locality Information Table */ | 309 | /* SLIT: System Locality Information Table */ |
| 307 | acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); | 310 | acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); |
| 308 | 311 | ||
| 309 | acpi_numa_arch_fixup(); | 312 | acpi_numa_arch_fixup(); |
| 310 | 313 | ||
| 311 | if (cnt < 0) | 314 | if (srat_mem_cnt < 0) |
| 312 | return cnt; | 315 | return srat_mem_cnt; |
| 313 | else if (!parsed_numa_memblks) | 316 | else if (!parsed_numa_memblks) |
| 314 | return -ENOENT; | 317 | return -ENOENT; |
| 315 | return 0; | 318 | return 0; |
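The split above lets architecture setup code consume SRAT memory affinities before the page-allocator layout is decided, while SLIT parsing and the error check stay in acpi_numa_init(). A hedged sketch of the intended ordering (the surrounding function is assumed, only the two calls come from the patch):

    #include <linux/acpi.h>

    /* Sketch of the expected boot-time ordering, not an actual arch hook. */
    void __init example_arch_numa_setup(void)
    {
    	early_parse_srat();	/* fills parsed_numa_memblks / srat_mem_cnt early */

    	/* ... memblock / ZONE_MOVABLE decisions can now see hotpluggable ranges ... */

    	acpi_numa_init();	/* SLIT parsing, acpi_numa_arch_fixup(), error check */
    }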
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index cbf1f122666b..df34bd04ae62 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c | |||
| @@ -45,6 +45,7 @@ | |||
| 45 | #include <linux/cpuidle.h> | 45 | #include <linux/cpuidle.h> |
| 46 | #include <linux/slab.h> | 46 | #include <linux/slab.h> |
| 47 | #include <linux/acpi.h> | 47 | #include <linux/acpi.h> |
| 48 | #include <linux/memory_hotplug.h> | ||
| 48 | 49 | ||
| 49 | #include <asm/io.h> | 50 | #include <asm/io.h> |
| 50 | #include <asm/cpu.h> | 51 | #include <asm/cpu.h> |
| @@ -641,6 +642,7 @@ static int acpi_processor_remove(struct acpi_device *device) | |||
| 641 | 642 | ||
| 642 | per_cpu(processors, pr->id) = NULL; | 643 | per_cpu(processors, pr->id) = NULL; |
| 643 | per_cpu(processor_device_array, pr->id) = NULL; | 644 | per_cpu(processor_device_array, pr->id) = NULL; |
| 645 | try_offline_node(cpu_to_node(pr->id)); | ||
| 644 | 646 | ||
| 645 | free: | 647 | free: |
| 646 | free_cpumask_var(pr->throttling.shared_cpu_map); | 648 | free_cpumask_var(pr->throttling.shared_cpu_map); |
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 83d0b17ba1c2..a51007b79032 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
| @@ -693,6 +693,12 @@ int offline_memory_block(struct memory_block *mem) | |||
| 693 | return ret; | 693 | return ret; |
| 694 | } | 694 | } |
| 695 | 695 | ||
| 696 | /* return true if the memory block is offlined, otherwise, return false */ | ||
| 697 | bool is_memblock_offlined(struct memory_block *mem) | ||
| 698 | { | ||
| 699 | return mem->state == MEM_OFFLINE; | ||
| 700 | } | ||
| 701 | |||
| 696 | /* | 702 | /* |
| 697 | * Initialize the sysfs support for memory devices... | 703 | * Initialize the sysfs support for memory devices... |
| 698 | */ | 704 | */ |
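A short sketch of how a hot-remove path might gate on the new helper; the wrapper and its warning text are assumptions, only is_memblock_offlined() comes from the patch.

    #include <linux/memory.h>
    #include <linux/memory_hotplug.h>

    /* Sketch: refuse to tear down a block that has not been offlined yet. */
    static int example_check_block(struct memory_block *mem)
    {
    	if (!is_memblock_offlined(mem)) {
    		pr_warn("memory block is still online, cannot remove it\n");
    		return -EBUSY;
    	}
    	return 0;
    }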
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 3148b10dc2e5..1244930e3d7a 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c | |||
| @@ -124,6 +124,76 @@ unsigned long pm_runtime_autosuspend_expiration(struct device *dev) | |||
| 124 | } | 124 | } |
| 125 | EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration); | 125 | EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration); |
| 126 | 126 | ||
| 127 | static int dev_memalloc_noio(struct device *dev, void *data) | ||
| 128 | { | ||
| 129 | return dev->power.memalloc_noio; | ||
| 130 | } | ||
| 131 | |||
| 132 | /* | ||
| 133 | * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag. | ||
| 134 | * @dev: Device to handle. | ||
| 135 | * @enable: True for setting the flag and False for clearing the flag. | ||
| 136 | * | ||
| 137 | * Set the flag for all devices in the path from the device to the | ||
| 138 | * root device in the device tree if @enable is true, otherwise clear | ||
| 139 | * the flag for devices in the path whose siblings don't set the flag. | ||
| 140 | * | ||
| 141 | * The function should only be called by block device or network | ||
| 142 | * device drivers to solve the deadlock problem during runtime | ||
| 143 | * resume/suspend: | ||
| 144 | * | ||
| 145 | * If memory allocation with GFP_KERNEL is called inside runtime | ||
| 146 | * resume/suspend callback of any one of its ancestors (or the | ||
| 147 | * block device itself), the deadlock may be triggered inside the | ||
| 148 | * memory allocation since it might not complete until the block | ||
| 149 | * device becomes active and the involved page I/O finishes. The | ||
| 150 | * situation was first pointed out by Alan Stern. Network devices | ||
| 151 | * are involved in the iSCSI kind of situation. | ||
| 152 | * | ||
| 153 | * The lock of dev_hotplug_mutex is held in the function for handling | ||
| 154 | * hotplug race because pm_runtime_set_memalloc_noio() may be called | ||
| 155 | * in async probe(). | ||
| 156 | * | ||
| 157 | * The function should be called between device_add() and device_del() | ||
| 158 | * on the affected device(block/network device). | ||
| 159 | */ | ||
| 160 | void pm_runtime_set_memalloc_noio(struct device *dev, bool enable) | ||
| 161 | { | ||
| 162 | static DEFINE_MUTEX(dev_hotplug_mutex); | ||
| 163 | |||
| 164 | mutex_lock(&dev_hotplug_mutex); | ||
| 165 | for (;;) { | ||
| 166 | bool enabled; | ||
| 167 | |||
| 168 | /* hold power lock since bitfield is not SMP-safe. */ | ||
| 169 | spin_lock_irq(&dev->power.lock); | ||
| 170 | enabled = dev->power.memalloc_noio; | ||
| 171 | dev->power.memalloc_noio = enable; | ||
| 172 | spin_unlock_irq(&dev->power.lock); | ||
| 173 | |||
| 174 | /* | ||
| 175 | * no need to enable ancestors any more if the device | ||
| 176 | * has already been enabled. | ||
| 177 | */ | ||
| 178 | if (enabled && enable) | ||
| 179 | break; | ||
| 180 | |||
| 181 | dev = dev->parent; | ||
| 182 | |||
| 183 | /* | ||
| 184 | * clear flag of the parent device only if all the | ||
| 185 | * children don't set the flag because ancestor's | ||
| 186 | * flag was set by any one of the descendants. | ||
| 187 | */ | ||
| 188 | if (!dev || (!enable && | ||
| 189 | device_for_each_child(dev, NULL, | ||
| 190 | dev_memalloc_noio))) | ||
| 191 | break; | ||
| 192 | } | ||
| 193 | mutex_unlock(&dev_hotplug_mutex); | ||
| 194 | } | ||
| 195 | EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio); | ||
| 196 | |||
| 127 | /** | 197 | /** |
| 128 | * rpm_check_suspend_allowed - Test whether a device may be suspended. | 198 | * rpm_check_suspend_allowed - Test whether a device may be suspended. |
| 129 | * @dev: Device to test. | 199 | * @dev: Device to test. |
| @@ -278,7 +348,24 @@ static int rpm_callback(int (*cb)(struct device *), struct device *dev) | |||
| 278 | if (!cb) | 348 | if (!cb) |
| 279 | return -ENOSYS; | 349 | return -ENOSYS; |
| 280 | 350 | ||
| 281 | retval = __rpm_callback(cb, dev); | 351 | if (dev->power.memalloc_noio) { |
| 352 | unsigned int noio_flag; | ||
| 353 | |||
| 354 | /* | ||
| 355 | * Deadlock might be caused if memory allocation with | ||
| 356 | * GFP_KERNEL happens inside runtime_suspend and | ||
| 357 | * runtime_resume callbacks of one block device's | ||
| 358 | * ancestor or the block device itself. Network | ||
| 359 | * devices might be regarded as part of an iSCSI block | ||
| 360 | * device, so a network device and its ancestors should | ||
| 361 | * be marked as memalloc_noio too. | ||
| 362 | */ | ||
| 363 | noio_flag = memalloc_noio_save(); | ||
| 364 | retval = __rpm_callback(cb, dev); | ||
| 365 | memalloc_noio_restore(noio_flag); | ||
| 366 | } else { | ||
| 367 | retval = __rpm_callback(cb, dev); | ||
| 368 | } | ||
| 282 | 369 | ||
| 283 | dev->power.runtime_error = retval; | 370 | dev->power.runtime_error = retval; |
| 284 | return retval != -EACCES ? retval : -EIO; | 371 | return retval != -EACCES ? retval : -EIO; |
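A sketch of the intended caller pattern for the new flag: a block or network driver marks its device after device_add() and clears the mark before device_del(). The two example functions are assumptions; only pm_runtime_set_memalloc_noio() and the device_add()/device_del() bracketing come from the patch.

    #include <linux/device.h>
    #include <linux/pm_runtime.h>

    /* Sketch: mark a newly added block/network device as memalloc_noio. */
    static int example_register(struct device *dev)
    {
    	int err = device_add(dev);

    	if (err)
    		return err;
    	pm_runtime_set_memalloc_noio(dev, true);	/* must come after device_add() */
    	return 0;
    }

    static void example_unregister(struct device *dev)
    {
    	pm_runtime_set_memalloc_noio(dev, false);	/* must come before device_del() */
    	device_del(dev);
    }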
diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 90723e65b081..0b5b5f619c75 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/types.h> | 21 | #include <linux/types.h> |
| 22 | #include <linux/bootmem.h> | 22 | #include <linux/bootmem.h> |
| 23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
| 24 | #include <linux/mm.h> | ||
| 24 | 25 | ||
| 25 | /* | 26 | /* |
| 26 | * Data types ------------------------------------------------------------------ | 27 | * Data types ------------------------------------------------------------------ |
| @@ -52,6 +53,9 @@ static ssize_t start_show(struct firmware_map_entry *entry, char *buf); | |||
| 52 | static ssize_t end_show(struct firmware_map_entry *entry, char *buf); | 53 | static ssize_t end_show(struct firmware_map_entry *entry, char *buf); |
| 53 | static ssize_t type_show(struct firmware_map_entry *entry, char *buf); | 54 | static ssize_t type_show(struct firmware_map_entry *entry, char *buf); |
| 54 | 55 | ||
| 56 | static struct firmware_map_entry * __meminit | ||
| 57 | firmware_map_find_entry(u64 start, u64 end, const char *type); | ||
| 58 | |||
| 55 | /* | 59 | /* |
| 56 | * Static data ----------------------------------------------------------------- | 60 | * Static data ----------------------------------------------------------------- |
| 57 | */ | 61 | */ |
| @@ -79,7 +83,52 @@ static const struct sysfs_ops memmap_attr_ops = { | |||
| 79 | .show = memmap_attr_show, | 83 | .show = memmap_attr_show, |
| 80 | }; | 84 | }; |
| 81 | 85 | ||
| 82 | static struct kobj_type memmap_ktype = { | 86 | /* Firmware memory map entries. */ |
| 87 | static LIST_HEAD(map_entries); | ||
| 88 | static DEFINE_SPINLOCK(map_entries_lock); | ||
| 89 | |||
| 90 | /* | ||
| 91 | * For memory hotplug, there is no way to free memory map entries allocated | ||
| 92 | * by boot mem after the system is up. So when we hot-remove memory whose | ||
| 93 | * map entry is allocated by bootmem, we need to remember the storage and | ||
| 94 | * reuse it when the memory is hot-added again. | ||
| 95 | */ | ||
| 96 | static LIST_HEAD(map_entries_bootmem); | ||
| 97 | static DEFINE_SPINLOCK(map_entries_bootmem_lock); | ||
| 98 | |||
| 99 | |||
| 100 | static inline struct firmware_map_entry * | ||
| 101 | to_memmap_entry(struct kobject *kobj) | ||
| 102 | { | ||
| 103 | return container_of(kobj, struct firmware_map_entry, kobj); | ||
| 104 | } | ||
| 105 | |||
| 106 | static void __meminit release_firmware_map_entry(struct kobject *kobj) | ||
| 107 | { | ||
| 108 | struct firmware_map_entry *entry = to_memmap_entry(kobj); | ||
| 109 | |||
| 110 | if (PageReserved(virt_to_page(entry))) { | ||
| 111 | /* | ||
| 112 | * Remember the storage allocated by bootmem, and reuse it when | ||
| 113 | * the memory is hot-added again. The entry will be added to | ||
| 114 | * map_entries_bootmem here, and deleted from &map_entries in | ||
| 115 | * firmware_map_remove_entry(). | ||
| 116 | */ | ||
| 117 | if (firmware_map_find_entry(entry->start, entry->end, | ||
| 118 | entry->type)) { | ||
| 119 | spin_lock(&map_entries_bootmem_lock); | ||
| 120 | list_add(&entry->list, &map_entries_bootmem); | ||
| 121 | spin_unlock(&map_entries_bootmem_lock); | ||
| 122 | } | ||
| 123 | |||
| 124 | return; | ||
| 125 | } | ||
| 126 | |||
| 127 | kfree(entry); | ||
| 128 | } | ||
| 129 | |||
| 130 | static struct kobj_type __refdata memmap_ktype = { | ||
| 131 | .release = release_firmware_map_entry, | ||
| 83 | .sysfs_ops = &memmap_attr_ops, | 132 | .sysfs_ops = &memmap_attr_ops, |
| 84 | .default_attrs = def_attrs, | 133 | .default_attrs = def_attrs, |
| 85 | }; | 134 | }; |
| @@ -88,13 +137,6 @@ static struct kobj_type memmap_ktype = { | |||
| 88 | * Registration functions ------------------------------------------------------ | 137 | * Registration functions ------------------------------------------------------ |
| 89 | */ | 138 | */ |
| 90 | 139 | ||
| 91 | /* | ||
| 92 | * Firmware memory map entries. No locking is needed because the | ||
| 93 | * firmware_map_add() and firmware_map_add_early() functions are called | ||
| 94 | * in firmware initialisation code in one single thread of execution. | ||
| 95 | */ | ||
| 96 | static LIST_HEAD(map_entries); | ||
| 97 | |||
| 98 | /** | 140 | /** |
| 99 | * firmware_map_add_entry() - Does the real work to add a firmware memmap entry. | 141 | * firmware_map_add_entry() - Does the real work to add a firmware memmap entry. |
| 100 | * @start: Start of the memory range. | 142 | * @start: Start of the memory range. |
| @@ -118,11 +160,25 @@ static int firmware_map_add_entry(u64 start, u64 end, | |||
| 118 | INIT_LIST_HEAD(&entry->list); | 160 | INIT_LIST_HEAD(&entry->list); |
| 119 | kobject_init(&entry->kobj, &memmap_ktype); | 161 | kobject_init(&entry->kobj, &memmap_ktype); |
| 120 | 162 | ||
| 163 | spin_lock(&map_entries_lock); | ||
| 121 | list_add_tail(&entry->list, &map_entries); | 164 | list_add_tail(&entry->list, &map_entries); |
| 165 | spin_unlock(&map_entries_lock); | ||
| 122 | 166 | ||
| 123 | return 0; | 167 | return 0; |
| 124 | } | 168 | } |
| 125 | 169 | ||
| 170 | /** | ||
| 171 | * firmware_map_remove_entry() - Does the real work to remove a firmware | ||
| 172 | * memmap entry. | ||
| 173 | * @entry: the entry to remove. | ||
| 174 | * | ||
| 175 | * The caller must hold map_entries_lock, and release it properly. | ||
| 176 | **/ | ||
| 177 | static inline void firmware_map_remove_entry(struct firmware_map_entry *entry) | ||
| 178 | { | ||
| 179 | list_del(&entry->list); | ||
| 180 | } | ||
| 181 | |||
| 126 | /* | 182 | /* |
| 127 | * Add memmap entry on sysfs | 183 | * Add memmap entry on sysfs |
| 128 | */ | 184 | */ |
| @@ -144,6 +200,78 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry) | |||
| 144 | return 0; | 200 | return 0; |
| 145 | } | 201 | } |
| 146 | 202 | ||
| 203 | /* | ||
| 204 | * Remove memmap entry on sysfs | ||
| 205 | */ | ||
| 206 | static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry) | ||
| 207 | { | ||
| 208 | kobject_put(&entry->kobj); | ||
| 209 | } | ||
| 210 | |||
| 211 | /* | ||
| 212 | * firmware_map_find_entry_in_list() - Search memmap entry in a given list. | ||
| 213 | * @start: Start of the memory range. | ||
| 214 | * @end: End of the memory range (exclusive). | ||
| 215 | * @type: Type of the memory range. | ||
| 216 | * @list: The list in which to find the entry. | ||
| 217 | * | ||
| 218 | * This function finds the memmap entry of a given memory range in a | ||
| 219 | * given list. The caller must hold map_entries_lock, and must not release | ||
| 220 | * the lock until the processing of the returned entry has completed. | ||
| 221 | * | ||
| 222 | * Return: Pointer to the entry to be found on success, or NULL on failure. | ||
| 223 | */ | ||
| 224 | static struct firmware_map_entry * __meminit | ||
| 225 | firmware_map_find_entry_in_list(u64 start, u64 end, const char *type, | ||
| 226 | struct list_head *list) | ||
| 227 | { | ||
| 228 | struct firmware_map_entry *entry; | ||
| 229 | |||
| 230 | list_for_each_entry(entry, list, list) | ||
| 231 | if ((entry->start == start) && (entry->end == end) && | ||
| 232 | (!strcmp(entry->type, type))) { | ||
| 233 | return entry; | ||
| 234 | } | ||
| 235 | |||
| 236 | return NULL; | ||
| 237 | } | ||
| 238 | |||
| 239 | /* | ||
| 240 | * firmware_map_find_entry() - Search memmap entry in map_entries. | ||
| 241 | * @start: Start of the memory range. | ||
| 242 | * @end: End of the memory range (exclusive). | ||
| 243 | * @type: Type of the memory range. | ||
| 244 | * | ||
| 245 | * This function finds the memmap entry of a given memory range. | ||
| 246 | * The caller must hold map_entries_lock, and must not release the lock | ||
| 247 | * until the processing of the returned entry has completed. | ||
| 248 | * | ||
| 249 | * Return: Pointer to the entry to be found on success, or NULL on failure. | ||
| 250 | */ | ||
| 251 | static struct firmware_map_entry * __meminit | ||
| 252 | firmware_map_find_entry(u64 start, u64 end, const char *type) | ||
| 253 | { | ||
| 254 | return firmware_map_find_entry_in_list(start, end, type, &map_entries); | ||
| 255 | } | ||
| 256 | |||
| 257 | /* | ||
| 258 | * firmware_map_find_entry_bootmem() - Search memmap entry in map_entries_bootmem. | ||
| 259 | * @start: Start of the memory range. | ||
| 260 | * @end: End of the memory range (exclusive). | ||
| 261 | * @type: Type of the memory range. | ||
| 262 | * | ||
| 263 | * This function is similar to firmware_map_find_entry() except that it finds the | ||
| 264 | * given entry in map_entries_bootmem. | ||
| 265 | * | ||
| 266 | * Return: Pointer to the entry to be found on success, or NULL on failure. | ||
| 267 | */ | ||
| 268 | static struct firmware_map_entry * __meminit | ||
| 269 | firmware_map_find_entry_bootmem(u64 start, u64 end, const char *type) | ||
| 270 | { | ||
| 271 | return firmware_map_find_entry_in_list(start, end, type, | ||
| 272 | &map_entries_bootmem); | ||
| 273 | } | ||
| 274 | |||
| 147 | /** | 275 | /** |
| 148 | * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do | 276 | * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do |
| 149 | * memory hotplug. | 277 | * memory hotplug. |
| @@ -161,9 +289,19 @@ int __meminit firmware_map_add_hotplug(u64 start, u64 end, const char *type) | |||
| 161 | { | 289 | { |
| 162 | struct firmware_map_entry *entry; | 290 | struct firmware_map_entry *entry; |
| 163 | 291 | ||
| 164 | entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC); | 292 | entry = firmware_map_find_entry_bootmem(start, end, type); |
| 165 | if (!entry) | 293 | if (!entry) { |
| 166 | return -ENOMEM; | 294 | entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC); |
| 295 | if (!entry) | ||
| 296 | return -ENOMEM; | ||
| 297 | } else { | ||
| 298 | /* Reuse storage allocated by bootmem. */ | ||
| 299 | spin_lock(&map_entries_bootmem_lock); | ||
| 300 | list_del(&entry->list); | ||
| 301 | spin_unlock(&map_entries_bootmem_lock); | ||
| 302 | |||
| 303 | memset(entry, 0, sizeof(*entry)); | ||
| 304 | } | ||
| 167 | 305 | ||
| 168 | firmware_map_add_entry(start, end, type, entry); | 306 | firmware_map_add_entry(start, end, type, entry); |
| 169 | /* create the memmap entry */ | 307 | /* create the memmap entry */ |
| @@ -196,6 +334,36 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type) | |||
| 196 | return firmware_map_add_entry(start, end, type, entry); | 334 | return firmware_map_add_entry(start, end, type, entry); |
| 197 | } | 335 | } |
| 198 | 336 | ||
| 337 | /** | ||
| 338 | * firmware_map_remove() - remove a firmware mapping entry | ||
| 339 | * @start: Start of the memory range. | ||
| 340 | * @end: End of the memory range. | ||
| 341 | * @type: Type of the memory range. | ||
| 342 | * | ||
| 343 | * Removes a firmware mapping entry. | ||
| 344 | * | ||
| 345 | * Returns 0 on success, or -EINVAL if the entry was not found. | ||
| 346 | **/ | ||
| 347 | int __meminit firmware_map_remove(u64 start, u64 end, const char *type) | ||
| 348 | { | ||
| 349 | struct firmware_map_entry *entry; | ||
| 350 | |||
| 351 | spin_lock(&map_entries_lock); | ||
| 352 | entry = firmware_map_find_entry(start, end - 1, type); | ||
| 353 | if (!entry) { | ||
| 354 | spin_unlock(&map_entries_lock); | ||
| 355 | return -EINVAL; | ||
| 356 | } | ||
| 357 | |||
| 358 | firmware_map_remove_entry(entry); | ||
| 359 | spin_unlock(&map_entries_lock); | ||
| 360 | |||
| 361 | /* remove the memmap entry */ | ||
| 362 | remove_sysfs_fw_map_entry(entry); | ||
| 363 | |||
| 364 | return 0; | ||
| 365 | } | ||
| 366 | |||
| 199 | /* | 367 | /* |
| 200 | * Sysfs functions ------------------------------------------------------------- | 368 | * Sysfs functions ------------------------------------------------------------- |
| 201 | */ | 369 | */ |
| @@ -217,8 +385,10 @@ static ssize_t type_show(struct firmware_map_entry *entry, char *buf) | |||
| 217 | return snprintf(buf, PAGE_SIZE, "%s\n", entry->type); | 385 | return snprintf(buf, PAGE_SIZE, "%s\n", entry->type); |
| 218 | } | 386 | } |
| 219 | 387 | ||
| 220 | #define to_memmap_attr(_attr) container_of(_attr, struct memmap_attribute, attr) | 388 | static inline struct memmap_attribute *to_memmap_attr(struct attribute *attr) |
| 221 | #define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj) | 389 | { |
| 390 | return container_of(attr, struct memmap_attribute, attr); | ||
| 391 | } | ||
| 222 | 392 | ||
| 223 | static ssize_t memmap_attr_show(struct kobject *kobj, | 393 | static ssize_t memmap_attr_show(struct kobject *kobj, |
| 224 | struct attribute *attr, char *buf) | 394 | struct attribute *attr, char *buf) |
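A sketch of how the add/remove pair is expected to be used by the memory hot-plug path: the entry added at hot-add time is dropped again at hot-remove time with the same range and type. The wrapper names are assumptions, and "System RAM" is the conventional type string, treated here as an assumption rather than something stated in the hunk.

    #include <linux/firmware-map.h>

    /* Sketch: keep the firmware memmap in sync across hot-add/hot-remove. */
    static void example_hotadd(u64 start, u64 size)
    {
    	firmware_map_add_hotplug(start, start + size, "System RAM");
    }

    static void example_hotremove(u64 start, u64 size)
    {
    	/* firmware_map_remove() internally matches on [start, end - 1]. */
    	firmware_map_remove(start, start + size, "System RAM");
    }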
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index d247a35da3c6..7b17a1fdeaf9 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c | |||
| @@ -25,8 +25,8 @@ struct shadow_info { | |||
| 25 | /* | 25 | /* |
| 26 | * It would be nice if we scaled with the size of transaction. | 26 | * It would be nice if we scaled with the size of transaction. |
| 27 | */ | 27 | */ |
| 28 | #define HASH_SIZE 256 | 28 | #define DM_HASH_SIZE 256 |
| 29 | #define HASH_MASK (HASH_SIZE - 1) | 29 | #define DM_HASH_MASK (DM_HASH_SIZE - 1) |
| 30 | 30 | ||
| 31 | struct dm_transaction_manager { | 31 | struct dm_transaction_manager { |
| 32 | int is_clone; | 32 | int is_clone; |
| @@ -36,7 +36,7 @@ struct dm_transaction_manager { | |||
| 36 | struct dm_space_map *sm; | 36 | struct dm_space_map *sm; |
| 37 | 37 | ||
| 38 | spinlock_t lock; | 38 | spinlock_t lock; |
| 39 | struct hlist_head buckets[HASH_SIZE]; | 39 | struct hlist_head buckets[DM_HASH_SIZE]; |
| 40 | }; | 40 | }; |
| 41 | 41 | ||
| 42 | /*----------------------------------------------------------------*/ | 42 | /*----------------------------------------------------------------*/ |
| @@ -44,7 +44,7 @@ struct dm_transaction_manager { | |||
| 44 | static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) | 44 | static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) |
| 45 | { | 45 | { |
| 46 | int r = 0; | 46 | int r = 0; |
| 47 | unsigned bucket = dm_hash_block(b, HASH_MASK); | 47 | unsigned bucket = dm_hash_block(b, DM_HASH_MASK); |
| 48 | struct shadow_info *si; | 48 | struct shadow_info *si; |
| 49 | struct hlist_node *n; | 49 | struct hlist_node *n; |
| 50 | 50 | ||
| @@ -71,7 +71,7 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) | |||
| 71 | si = kmalloc(sizeof(*si), GFP_NOIO); | 71 | si = kmalloc(sizeof(*si), GFP_NOIO); |
| 72 | if (si) { | 72 | if (si) { |
| 73 | si->where = b; | 73 | si->where = b; |
| 74 | bucket = dm_hash_block(b, HASH_MASK); | 74 | bucket = dm_hash_block(b, DM_HASH_MASK); |
| 75 | spin_lock(&tm->lock); | 75 | spin_lock(&tm->lock); |
| 76 | hlist_add_head(&si->hlist, tm->buckets + bucket); | 76 | hlist_add_head(&si->hlist, tm->buckets + bucket); |
| 77 | spin_unlock(&tm->lock); | 77 | spin_unlock(&tm->lock); |
| @@ -86,7 +86,7 @@ static void wipe_shadow_table(struct dm_transaction_manager *tm) | |||
| 86 | int i; | 86 | int i; |
| 87 | 87 | ||
| 88 | spin_lock(&tm->lock); | 88 | spin_lock(&tm->lock); |
| 89 | for (i = 0; i < HASH_SIZE; i++) { | 89 | for (i = 0; i < DM_HASH_SIZE; i++) { |
| 90 | bucket = tm->buckets + i; | 90 | bucket = tm->buckets + i; |
| 91 | hlist_for_each_entry_safe(si, n, tmp, bucket, hlist) | 91 | hlist_for_each_entry_safe(si, n, tmp, bucket, hlist) |
| 92 | kfree(si); | 92 | kfree(si); |
| @@ -115,7 +115,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, | |||
| 115 | tm->sm = sm; | 115 | tm->sm = sm; |
| 116 | 116 | ||
| 117 | spin_lock_init(&tm->lock); | 117 | spin_lock_init(&tm->lock); |
| 118 | for (i = 0; i < HASH_SIZE; i++) | 118 | for (i = 0; i < DM_HASH_SIZE; i++) |
| 119 | INIT_HLIST_HEAD(tm->buckets + i); | 119 | INIT_HLIST_HEAD(tm->buckets + i); |
| 120 | 120 | ||
| 121 | return tm; | 121 | return tm; |
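The rename is presumably meant to keep the driver-local constant out of the generic namespace; <linux/hashtable.h> provides a function-like HASH_SIZE() macro, and an unprefixed object-like definition would clash once both are visible in the same translation unit. That motivation is an assumption here, not stated in the hunk.

    #include <linux/hashtable.h>	/* defines the generic HASH_SIZE(name) macro */

    /*
     * With an unprefixed "#define HASH_SIZE 256" the two definitions would
     * collide; a DM_ prefix keeps the local constant distinct.
     */
    #define DM_HASH_SIZE 256
    #define DM_HASH_MASK (DM_HASH_SIZE - 1)

    static struct hlist_head example_buckets[DM_HASH_SIZE];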
diff --git a/drivers/staging/zcache/zbud.c b/drivers/staging/zcache/zbud.c index 328c397ea5dc..fdff5c6a0239 100644 --- a/drivers/staging/zcache/zbud.c +++ b/drivers/staging/zcache/zbud.c | |||
| @@ -404,7 +404,7 @@ static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage, | |||
| 404 | else | 404 | else |
| 405 | zbud_pers_pageframes--; | 405 | zbud_pers_pageframes--; |
| 406 | zbudpage_spin_unlock(zbudpage); | 406 | zbudpage_spin_unlock(zbudpage); |
| 407 | reset_page_mapcount(page); | 407 | page_mapcount_reset(page); |
| 408 | init_page_count(page); | 408 | init_page_count(page); |
| 409 | page->index = 0; | 409 | page->index = 0; |
| 410 | return page; | 410 | return page; |
diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c index 06f73a93a44d..e78d262c5249 100644 --- a/drivers/staging/zsmalloc/zsmalloc-main.c +++ b/drivers/staging/zsmalloc/zsmalloc-main.c | |||
| @@ -472,7 +472,7 @@ static void reset_page(struct page *page) | |||
| 472 | set_page_private(page, 0); | 472 | set_page_private(page, 0); |
| 473 | page->mapping = NULL; | 473 | page->mapping = NULL; |
| 474 | page->freelist = NULL; | 474 | page->freelist = NULL; |
| 475 | reset_page_mapcount(page); | 475 | page_mapcount_reset(page); |
| 476 | } | 476 | } |
| 477 | 477 | ||
| 478 | static void free_zspage(struct page *first_page) | 478 | static void free_zspage(struct page *first_page) |
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 1775ad471edd..5480352f984d 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c | |||
| @@ -5177,6 +5177,7 @@ int usb_reset_device(struct usb_device *udev) | |||
| 5177 | { | 5177 | { |
| 5178 | int ret; | 5178 | int ret; |
| 5179 | int i; | 5179 | int i; |
| 5180 | unsigned int noio_flag; | ||
| 5180 | struct usb_host_config *config = udev->actconfig; | 5181 | struct usb_host_config *config = udev->actconfig; |
| 5181 | 5182 | ||
| 5182 | if (udev->state == USB_STATE_NOTATTACHED || | 5183 | if (udev->state == USB_STATE_NOTATTACHED || |
| @@ -5186,6 +5187,17 @@ int usb_reset_device(struct usb_device *udev) | |||
| 5186 | return -EINVAL; | 5187 | return -EINVAL; |
| 5187 | } | 5188 | } |
| 5188 | 5189 | ||
| 5190 | /* | ||
| 5191 | * Don't allocate memory with GFP_KERNEL in current | ||
| 5192 | * context to avoid possible deadlock if usb mass | ||
| 5193 | * storage interface or usbnet interface (iSCSI case) | ||
| 5194 | * is included in the current configuration. The easiest | ||
| 5195 | * approach is to do it for every device reset, | ||
| 5196 | * because the device 'memalloc_noio' flag may have | ||
| 5197 | * not been set before resetting the usb device. | ||
| 5198 | */ | ||
| 5199 | noio_flag = memalloc_noio_save(); | ||
| 5200 | |||
| 5189 | /* Prevent autosuspend during the reset */ | 5201 | /* Prevent autosuspend during the reset */ |
| 5190 | usb_autoresume_device(udev); | 5202 | usb_autoresume_device(udev); |
| 5191 | 5203 | ||
| @@ -5230,6 +5242,7 @@ int usb_reset_device(struct usb_device *udev) | |||
| 5230 | } | 5242 | } |
| 5231 | 5243 | ||
| 5232 | usb_autosuspend_device(udev); | 5244 | usb_autosuspend_device(udev); |
| 5245 | memalloc_noio_restore(noio_flag); | ||
| 5233 | return ret; | 5246 | return ret; |
| 5234 | } | 5247 | } |
| 5235 | EXPORT_SYMBOL_GPL(usb_reset_device); | 5248 | EXPORT_SYMBOL_GPL(usb_reset_device); |
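The save/restore pair used above follows a simple bracketing pattern: inside the bracketed region, allocations are implicitly degraded to GFP_NOIO so they cannot re-enter block I/O on the device being reset or resumed. A minimal sketch (the helper function is assumed, only the two memalloc_noio calls come from this series):

    #include <linux/sched.h>	/* memalloc_noio_save() / memalloc_noio_restore() */
    #include <linux/slab.h>

    /* Sketch: allocate without triggering I/O on the device we are servicing. */
    static void *example_alloc_noio(size_t len)
    {
    	unsigned int noio_flag;
    	void *buf;

    	noio_flag = memalloc_noio_save();
    	buf = kmalloc(len, GFP_KERNEL);	/* effectively GFP_NOIO in this region */
    	memalloc_noio_restore(noio_flag);
    	return buf;
    }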
diff --git a/fs/aio.c b/fs/aio.c --- a/fs/aio.c +++ b/fs/aio.c | |||
| @@ -101,7 +101,7 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
| 101 | struct aio_ring *ring; | 101 | struct aio_ring *ring; |
| 102 | struct aio_ring_info *info = &ctx->ring_info; | 102 | struct aio_ring_info *info = &ctx->ring_info; |
| 103 | unsigned nr_events = ctx->max_reqs; | 103 | unsigned nr_events = ctx->max_reqs; |
| 104 | unsigned long size; | 104 | unsigned long size, populate; |
| 105 | int nr_pages; | 105 | int nr_pages; |
| 106 | 106 | ||
| 107 | /* Compensate for the ring buffer's head/tail overlap entry */ | 107 | /* Compensate for the ring buffer's head/tail overlap entry */ |
| @@ -129,7 +129,8 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
| 129 | down_write(&ctx->mm->mmap_sem); | 129 | down_write(&ctx->mm->mmap_sem); |
| 130 | info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, | 130 | info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, |
| 131 | PROT_READ|PROT_WRITE, | 131 | PROT_READ|PROT_WRITE, |
| 132 | MAP_ANONYMOUS|MAP_PRIVATE, 0); | 132 | MAP_ANONYMOUS|MAP_PRIVATE, 0, |
| 133 | &populate); | ||
| 133 | if (IS_ERR((void *)info->mmap_base)) { | 134 | if (IS_ERR((void *)info->mmap_base)) { |
| 134 | up_write(&ctx->mm->mmap_sem); | 135 | up_write(&ctx->mm->mmap_sem); |
| 135 | info->mmap_size = 0; | 136 | info->mmap_size = 0; |
| @@ -147,6 +148,8 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
| 147 | aio_free_ring(ctx); | 148 | aio_free_ring(ctx); |
| 148 | return -EAGAIN; | 149 | return -EAGAIN; |
| 149 | } | 150 | } |
| 151 | if (populate) | ||
| 152 | mm_populate(info->mmap_base, populate); | ||
| 150 | 153 | ||
| 151 | ctx->user_id = info->mmap_base; | 154 | ctx->user_id = info->mmap_base; |
| 152 | 155 | ||
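As the aio hunk shows, do_mmap_pgoff() no longer populates MAP_POPULATE/MAP_LOCKED mappings itself: it reports the amount to populate through the new out-parameter, and the caller invokes mm_populate() after dropping mmap_sem. A hedged sketch of that calling convention (everything except the two mm calls is assumed):

    #include <linux/err.h>
    #include <linux/mm.h>
    #include <linux/mman.h>
    #include <linux/sched.h>

    /* Sketch: anonymous mapping with deferred population, mirroring aio_setup_ring(). */
    static unsigned long example_map_anon(unsigned long len)
    {
    	unsigned long addr, populate;

    	down_write(&current->mm->mmap_sem);
    	addr = do_mmap_pgoff(NULL, 0, len, PROT_READ | PROT_WRITE,
    			     MAP_ANONYMOUS | MAP_PRIVATE, 0, &populate);
    	up_write(&current->mm->mmap_sem);

    	if (!IS_ERR_VALUE(addr) && populate)
    		mm_populate(addr, populate);	/* fault in pages outside mmap_sem */
    	return addr;
    }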
diff --git a/fs/buffer.c b/fs/buffer.c index 2ea9cd44aeae..62169c192c21 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
| @@ -3227,7 +3227,7 @@ static struct kmem_cache *bh_cachep __read_mostly; | |||
| 3227 | * Once the number of bh's in the machine exceeds this level, we start | 3227 | * Once the number of bh's in the machine exceeds this level, we start |
| 3228 | * stripping them in writeback. | 3228 | * stripping them in writeback. |
| 3229 | */ | 3229 | */ |
| 3230 | static int max_buffer_heads; | 3230 | static unsigned long max_buffer_heads; |
| 3231 | 3231 | ||
| 3232 | int buffer_heads_over_limit; | 3232 | int buffer_heads_over_limit; |
| 3233 | 3233 | ||
| @@ -3343,7 +3343,7 @@ EXPORT_SYMBOL(bh_submit_read); | |||
| 3343 | 3343 | ||
| 3344 | void __init buffer_init(void) | 3344 | void __init buffer_init(void) |
| 3345 | { | 3345 | { |
| 3346 | int nrpages; | 3346 | unsigned long nrpages; |
| 3347 | 3347 | ||
| 3348 | bh_cachep = kmem_cache_create("buffer_head", | 3348 | bh_cachep = kmem_cache_create("buffer_head", |
| 3349 | sizeof(struct buffer_head), 0, | 3349 | sizeof(struct buffer_head), 0, |
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ac8ed96c4199..499e957510e7 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c | |||
| @@ -151,7 +151,7 @@ get_nfs4_file(struct nfs4_file *fi) | |||
| 151 | } | 151 | } |
| 152 | 152 | ||
| 153 | static int num_delegations; | 153 | static int num_delegations; |
| 154 | unsigned int max_delegations; | 154 | unsigned long max_delegations; |
| 155 | 155 | ||
| 156 | /* | 156 | /* |
| 157 | * Open owner state (share locks) | 157 | * Open owner state (share locks) |
| @@ -700,8 +700,8 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num) | |||
| 700 | num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION); | 700 | num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION); |
| 701 | 701 | ||
| 702 | spin_lock(&nfsd_drc_lock); | 702 | spin_lock(&nfsd_drc_lock); |
| 703 | avail = min_t(int, NFSD_MAX_MEM_PER_SESSION, | 703 | avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, |
| 704 | nfsd_drc_max_mem - nfsd_drc_mem_used); | 704 | nfsd_drc_max_mem - nfsd_drc_mem_used); |
| 705 | num = min_t(int, num, avail / slotsize); | 705 | num = min_t(int, num, avail / slotsize); |
| 706 | nfsd_drc_mem_used += num * slotsize; | 706 | nfsd_drc_mem_used += num * slotsize; |
| 707 | spin_unlock(&nfsd_drc_lock); | 707 | spin_unlock(&nfsd_drc_lock); |
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index de23db255c69..07a473fd49bc 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h | |||
| @@ -56,8 +56,8 @@ extern struct svc_version nfsd_version2, nfsd_version3, | |||
| 56 | extern u32 nfsd_supported_minorversion; | 56 | extern u32 nfsd_supported_minorversion; |
| 57 | extern struct mutex nfsd_mutex; | 57 | extern struct mutex nfsd_mutex; |
| 58 | extern spinlock_t nfsd_drc_lock; | 58 | extern spinlock_t nfsd_drc_lock; |
| 59 | extern unsigned int nfsd_drc_max_mem; | 59 | extern unsigned long nfsd_drc_max_mem; |
| 60 | extern unsigned int nfsd_drc_mem_used; | 60 | extern unsigned long nfsd_drc_mem_used; |
| 61 | 61 | ||
| 62 | extern const struct seq_operations nfs_exports_op; | 62 | extern const struct seq_operations nfs_exports_op; |
| 63 | 63 | ||
| @@ -106,7 +106,7 @@ static inline int nfsd_v4client(struct svc_rqst *rq) | |||
| 106 | * NFSv4 State | 106 | * NFSv4 State |
| 107 | */ | 107 | */ |
| 108 | #ifdef CONFIG_NFSD_V4 | 108 | #ifdef CONFIG_NFSD_V4 |
| 109 | extern unsigned int max_delegations; | 109 | extern unsigned long max_delegations; |
| 110 | void nfs4_state_init(void); | 110 | void nfs4_state_init(void); |
| 111 | int nfsd4_init_slabs(void); | 111 | int nfsd4_init_slabs(void); |
| 112 | void nfsd4_free_slabs(void); | 112 | void nfsd4_free_slabs(void); |
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index cee62ab9d4a3..be7af509930c 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c | |||
| @@ -59,8 +59,8 @@ DEFINE_MUTEX(nfsd_mutex); | |||
| 59 | * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. | 59 | * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. |
| 60 | */ | 60 | */ |
| 61 | spinlock_t nfsd_drc_lock; | 61 | spinlock_t nfsd_drc_lock; |
| 62 | unsigned int nfsd_drc_max_mem; | 62 | unsigned long nfsd_drc_max_mem; |
| 63 | unsigned int nfsd_drc_mem_used; | 63 | unsigned long nfsd_drc_mem_used; |
| 64 | 64 | ||
| 65 | #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) | 65 | #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) |
| 66 | static struct svc_stat nfsd_acl_svcstats; | 66 | static struct svc_stat nfsd_acl_svcstats; |
| @@ -342,7 +342,7 @@ static void set_max_drc(void) | |||
| 342 | >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; | 342 | >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; |
| 343 | nfsd_drc_mem_used = 0; | 343 | nfsd_drc_mem_used = 0; |
| 344 | spin_lock_init(&nfsd_drc_lock); | 344 | spin_lock_init(&nfsd_drc_lock); |
| 345 | dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem); | 345 | dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem); |
| 346 | } | 346 | } |
| 347 | 347 | ||
| 348 | static int nfsd_get_default_max_blksize(void) | 348 | static int nfsd_get_default_max_blksize(void) |
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 80e4645f7990..1efaaa19c4f3 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
| @@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
| 40 | * sysctl_overcommit_ratio / 100) + total_swap_pages; | 40 | * sysctl_overcommit_ratio / 100) + total_swap_pages; |
| 41 | 41 | ||
| 42 | cached = global_page_state(NR_FILE_PAGES) - | 42 | cached = global_page_state(NR_FILE_PAGES) - |
| 43 | total_swapcache_pages - i.bufferram; | 43 | total_swapcache_pages() - i.bufferram; |
| 44 | if (cached < 0) | 44 | if (cached < 0) |
| 45 | cached = 0; | 45 | cached = 0; |
| 46 | 46 | ||
| @@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
| 109 | K(i.freeram), | 109 | K(i.freeram), |
| 110 | K(i.bufferram), | 110 | K(i.bufferram), |
| 111 | K(cached), | 111 | K(cached), |
| 112 | K(total_swapcache_pages), | 112 | K(total_swapcache_pages()), |
| 113 | K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), | 113 | K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), |
| 114 | K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), | 114 | K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), |
| 115 | K(pages[LRU_ACTIVE_ANON]), | 115 | K(pages[LRU_ACTIVE_ANON]), |
| @@ -158,7 +158,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
| 158 | vmi.used >> 10, | 158 | vmi.used >> 10, |
| 159 | vmi.largest_chunk >> 10 | 159 | vmi.largest_chunk >> 10 |
| 160 | #ifdef CONFIG_MEMORY_FAILURE | 160 | #ifdef CONFIG_MEMORY_FAILURE |
| 161 | ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) | 161 | ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) |
| 162 | #endif | 162 | #endif |
| 163 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 163 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| 164 | ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * | 164 | ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * |
diff --git a/include/linux/acpi.h b/include/linux/acpi.h index bcbdd7484e58..f46cfd73a553 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h | |||
| @@ -485,6 +485,14 @@ static inline bool acpi_driver_match_device(struct device *dev, | |||
| 485 | 485 | ||
| 486 | #endif /* !CONFIG_ACPI */ | 486 | #endif /* !CONFIG_ACPI */ |
| 487 | 487 | ||
| 488 | #ifdef CONFIG_ACPI_NUMA | ||
| 489 | void __init early_parse_srat(void); | ||
| 490 | #else | ||
| 491 | static inline void early_parse_srat(void) | ||
| 492 | { | ||
| 493 | } | ||
| 494 | #endif | ||
| 495 | |||
| 488 | #ifdef CONFIG_ACPI | 496 | #ifdef CONFIG_ACPI |
| 489 | void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, | 497 | void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, |
| 490 | u32 pm1a_ctrl, u32 pm1b_ctrl)); | 498 | u32 pm1a_ctrl, u32 pm1b_ctrl)); |
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 3cd16ba82f15..cdc3bab01832 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h | |||
| @@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat, | |||
| 53 | unsigned long size); | 53 | unsigned long size); |
| 54 | extern void free_bootmem(unsigned long physaddr, unsigned long size); | 54 | extern void free_bootmem(unsigned long physaddr, unsigned long size); |
| 55 | extern void free_bootmem_late(unsigned long physaddr, unsigned long size); | 55 | extern void free_bootmem_late(unsigned long physaddr, unsigned long size); |
| 56 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | ||
| 56 | 57 | ||
| 57 | /* | 58 | /* |
| 58 | * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, | 59 | * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, |
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index cc7bddeaf553..091d72e70d8a 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
| @@ -23,7 +23,7 @@ extern int fragmentation_index(struct zone *zone, unsigned int order); | |||
| 23 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, | 23 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, |
| 24 | int order, gfp_t gfp_mask, nodemask_t *mask, | 24 | int order, gfp_t gfp_mask, nodemask_t *mask, |
| 25 | bool sync, bool *contended); | 25 | bool sync, bool *contended); |
| 26 | extern int compact_pgdat(pg_data_t *pgdat, int order); | 26 | extern void compact_pgdat(pg_data_t *pgdat, int order); |
| 27 | extern void reset_isolation_suitable(pg_data_t *pgdat); | 27 | extern void reset_isolation_suitable(pg_data_t *pgdat); |
| 28 | extern unsigned long compaction_suitable(struct zone *zone, int order); | 28 | extern unsigned long compaction_suitable(struct zone *zone, int order); |
| 29 | 29 | ||
| @@ -80,9 +80,8 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
| 80 | return COMPACT_CONTINUE; | 80 | return COMPACT_CONTINUE; |
| 81 | } | 81 | } |
| 82 | 82 | ||
| 83 | static inline int compact_pgdat(pg_data_t *pgdat, int order) | 83 | static inline void compact_pgdat(pg_data_t *pgdat, int order) |
| 84 | { | 84 | { |
| 85 | return COMPACT_CONTINUE; | ||
| 86 | } | 85 | } |
| 87 | 86 | ||
| 88 | static inline void reset_isolation_suitable(pg_data_t *pgdat) | 87 | static inline void reset_isolation_suitable(pg_data_t *pgdat) |
diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h index 43fe52fcef0f..71d4fa721db9 100644 --- a/include/linux/firmware-map.h +++ b/include/linux/firmware-map.h | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | 25 | ||
| 26 | int firmware_map_add_early(u64 start, u64 end, const char *type); | 26 | int firmware_map_add_early(u64 start, u64 end, const char *type); |
| 27 | int firmware_map_add_hotplug(u64 start, u64 end, const char *type); | 27 | int firmware_map_add_hotplug(u64 start, u64 end, const char *type); |
| 28 | int firmware_map_remove(u64 start, u64 end, const char *type); | ||
| 28 | 29 | ||
| 29 | #else /* CONFIG_FIRMWARE_MEMMAP */ | 30 | #else /* CONFIG_FIRMWARE_MEMMAP */ |
| 30 | 31 | ||
| @@ -38,6 +39,11 @@ static inline int firmware_map_add_hotplug(u64 start, u64 end, const char *type) | |||
| 38 | return 0; | 39 | return 0; |
| 39 | } | 40 | } |
| 40 | 41 | ||
| 42 | static inline int firmware_map_remove(u64 start, u64 end, const char *type) | ||
| 43 | { | ||
| 44 | return 0; | ||
| 45 | } | ||
| 46 | |||
| 41 | #endif /* CONFIG_FIRMWARE_MEMMAP */ | 47 | #endif /* CONFIG_FIRMWARE_MEMMAP */ |
| 42 | 48 | ||
| 43 | #endif /* _LINUX_FIRMWARE_MAP_H */ | 49 | #endif /* _LINUX_FIRMWARE_MAP_H */ |
diff --git a/include/linux/highmem.h b/include/linux/highmem.h index ef788b5b4a35..7fb31da45d03 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h | |||
| @@ -219,12 +219,6 @@ static inline void zero_user(struct page *page, | |||
| 219 | zero_user_segments(page, start, start + size, 0, 0); | 219 | zero_user_segments(page, start, start + size, 0, 0); |
| 220 | } | 220 | } |
| 221 | 221 | ||
| 222 | static inline void __deprecated memclear_highpage_flush(struct page *page, | ||
| 223 | unsigned int offset, unsigned int size) | ||
| 224 | { | ||
| 225 | zero_user(page, offset, size); | ||
| 226 | } | ||
| 227 | |||
| 228 | #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE | 222 | #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE |
| 229 | 223 | ||
| 230 | static inline void copy_user_highpage(struct page *to, struct page *from, | 224 | static inline void copy_user_highpage(struct page *to, struct page *from, |
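With the deprecated wrapper removed, callers clear a partial page directly with zero_user(); a tiny sketch of the replacement (the helper function and the offset choice are illustrative):

    #include <linux/highmem.h>
    #include <linux/mm.h>

    /* old: memclear_highpage_flush(page, from, PAGE_SIZE - from); */
    static void example_clear_tail(struct page *page, unsigned int from)
    {
    	zero_user(page, from, PAGE_SIZE - from);
    }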
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1d76f8ca90f0..ee1c244a62a1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
| @@ -113,7 +113,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma, | |||
| 113 | do { \ | 113 | do { \ |
| 114 | pmd_t *____pmd = (__pmd); \ | 114 | pmd_t *____pmd = (__pmd); \ |
| 115 | anon_vma_lock_write(__anon_vma); \ | 115 | anon_vma_lock_write(__anon_vma); \ |
| 116 | anon_vma_unlock(__anon_vma); \ | 116 | anon_vma_unlock_write(__anon_vma); \ |
| 117 | BUG_ON(pmd_trans_splitting(*____pmd) || \ | 117 | BUG_ON(pmd_trans_splitting(*____pmd) || \ |
| 118 | pmd_trans_huge(*____pmd)); \ | 118 | pmd_trans_huge(*____pmd)); \ |
| 119 | } while (0) | 119 | } while (0) |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0c80d3f57a5b..eedc334fb6f5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
| @@ -43,9 +43,9 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, | |||
| 43 | #endif | 43 | #endif |
| 44 | 44 | ||
| 45 | int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); | 45 | int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); |
| 46 | int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, | 46 | long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, |
| 47 | struct page **, struct vm_area_struct **, | 47 | struct page **, struct vm_area_struct **, |
| 48 | unsigned long *, int *, int, unsigned int flags); | 48 | unsigned long *, unsigned long *, long, unsigned int); |
| 49 | void unmap_hugepage_range(struct vm_area_struct *, | 49 | void unmap_hugepage_range(struct vm_area_struct *, |
| 50 | unsigned long, unsigned long, struct page *); | 50 | unsigned long, unsigned long, struct page *); |
| 51 | void __unmap_hugepage_range_final(struct mmu_gather *tlb, | 51 | void __unmap_hugepage_range_final(struct mmu_gather *tlb, |
diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 3319a6967626..45c9b6a17bcb 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h | |||
| @@ -16,9 +16,6 @@ | |||
| 16 | struct stable_node; | 16 | struct stable_node; |
| 17 | struct mem_cgroup; | 17 | struct mem_cgroup; |
| 18 | 18 | ||
| 19 | struct page *ksm_does_need_to_copy(struct page *page, | ||
| 20 | struct vm_area_struct *vma, unsigned long address); | ||
| 21 | |||
| 22 | #ifdef CONFIG_KSM | 19 | #ifdef CONFIG_KSM |
| 23 | int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | 20 | int ksm_madvise(struct vm_area_struct *vma, unsigned long start, |
| 24 | unsigned long end, int advice, unsigned long *vm_flags); | 21 | unsigned long end, int advice, unsigned long *vm_flags); |
| @@ -73,15 +70,8 @@ static inline void set_page_stable_node(struct page *page, | |||
| 73 | * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, | 70 | * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, |
| 74 | * but what if the vma was unmerged while the page was swapped out? | 71 | * but what if the vma was unmerged while the page was swapped out? |
| 75 | */ | 72 | */ |
| 76 | static inline int ksm_might_need_to_copy(struct page *page, | 73 | struct page *ksm_might_need_to_copy(struct page *page, |
| 77 | struct vm_area_struct *vma, unsigned long address) | 74 | struct vm_area_struct *vma, unsigned long address); |
| 78 | { | ||
| 79 | struct anon_vma *anon_vma = page_anon_vma(page); | ||
| 80 | |||
| 81 | return anon_vma && | ||
| 82 | (anon_vma->root != vma->anon_vma->root || | ||
| 83 | page->index != linear_page_index(vma, address)); | ||
| 84 | } | ||
| 85 | 75 | ||
| 86 | int page_referenced_ksm(struct page *page, | 76 | int page_referenced_ksm(struct page *page, |
| 87 | struct mem_cgroup *memcg, unsigned long *vm_flags); | 77 | struct mem_cgroup *memcg, unsigned long *vm_flags); |
| @@ -113,10 +103,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
| 113 | return 0; | 103 | return 0; |
| 114 | } | 104 | } |
| 115 | 105 | ||
| 116 | static inline int ksm_might_need_to_copy(struct page *page, | 106 | static inline struct page *ksm_might_need_to_copy(struct page *page, |
| 117 | struct vm_area_struct *vma, unsigned long address) | 107 | struct vm_area_struct *vma, unsigned long address) |
| 118 | { | 108 | { |
| 119 | return 0; | 109 | return page; |
| 120 | } | 110 | } |
| 121 | 111 | ||
| 122 | static inline int page_referenced_ksm(struct page *page, | 112 | static inline int page_referenced_ksm(struct page *page, |
diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f388203db7e8..3e5ecb2d790e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h | |||
| @@ -42,6 +42,7 @@ struct memblock { | |||
| 42 | 42 | ||
| 43 | extern struct memblock memblock; | 43 | extern struct memblock memblock; |
| 44 | extern int memblock_debug; | 44 | extern int memblock_debug; |
| 45 | extern struct movablemem_map movablemem_map; | ||
| 45 | 46 | ||
| 46 | #define memblock_dbg(fmt, ...) \ | 47 | #define memblock_dbg(fmt, ...) \ |
| 47 | if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) | 48 | if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) |
| @@ -60,6 +61,7 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); | |||
| 60 | void memblock_trim_memory(phys_addr_t align); | 61 | void memblock_trim_memory(phys_addr_t align); |
| 61 | 62 | ||
| 62 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 63 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
| 64 | |||
| 63 | void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, | 65 | void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, |
| 64 | unsigned long *out_end_pfn, int *out_nid); | 66 | unsigned long *out_end_pfn, int *out_nid); |
| 65 | 67 | ||
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 28bd5fa2ff2e..d6183f06d8c1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
| @@ -116,7 +116,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); | |||
| 116 | * For memory reclaim. | 116 | * For memory reclaim. |
| 117 | */ | 117 | */ |
| 118 | int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); | 118 | int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); |
| 119 | int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec); | ||
| 120 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); | 119 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); |
| 121 | unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); | 120 | unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); |
| 122 | void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); | 121 | void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); |
| @@ -321,12 +320,6 @@ mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
| 321 | return 1; | 320 | return 1; |
| 322 | } | 321 | } |
| 323 | 322 | ||
| 324 | static inline int | ||
| 325 | mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) | ||
| 326 | { | ||
| 327 | return 1; | ||
| 328 | } | ||
| 329 | |||
| 330 | static inline unsigned long | 323 | static inline unsigned long |
| 331 | mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) | 324 | mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) |
| 332 | { | 325 | { |
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4a45c4e50025..b6a3be7d47bf 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h | |||
| @@ -96,6 +96,7 @@ extern void __online_page_free(struct page *page); | |||
| 96 | 96 | ||
| 97 | #ifdef CONFIG_MEMORY_HOTREMOVE | 97 | #ifdef CONFIG_MEMORY_HOTREMOVE |
| 98 | extern bool is_pageblock_removable_nolock(struct page *page); | 98 | extern bool is_pageblock_removable_nolock(struct page *page); |
| 99 | extern int arch_remove_memory(u64 start, u64 size); | ||
| 99 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 100 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
| 100 | 101 | ||
| 101 | /* reasonably generic interface to expand the physical pages in a zone */ | 102 | /* reasonably generic interface to expand the physical pages in a zone */ |
| @@ -173,17 +174,16 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) | |||
| 173 | #endif /* CONFIG_NUMA */ | 174 | #endif /* CONFIG_NUMA */ |
| 174 | #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ | 175 | #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ |
| 175 | 176 | ||
| 176 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 177 | #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE |
| 178 | extern void register_page_bootmem_info_node(struct pglist_data *pgdat); | ||
| 179 | #else | ||
| 177 | static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) | 180 | static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) |
| 178 | { | 181 | { |
| 179 | } | 182 | } |
| 180 | static inline void put_page_bootmem(struct page *page) | ||
| 181 | { | ||
| 182 | } | ||
| 183 | #else | ||
| 184 | extern void register_page_bootmem_info_node(struct pglist_data *pgdat); | ||
| 185 | extern void put_page_bootmem(struct page *page); | ||
| 186 | #endif | 183 | #endif |
| 184 | extern void put_page_bootmem(struct page *page); | ||
| 185 | extern void get_page_bootmem(unsigned long info, struct page *page, | ||
| 186 | unsigned long type); | ||
| 187 | 187 | ||
| 188 | /* | 188 | /* |
| 189 | * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug | 189 | * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug |
| @@ -233,6 +233,7 @@ static inline void unlock_memory_hotplug(void) {} | |||
| 233 | #ifdef CONFIG_MEMORY_HOTREMOVE | 233 | #ifdef CONFIG_MEMORY_HOTREMOVE |
| 234 | 234 | ||
| 235 | extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); | 235 | extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); |
| 236 | extern void try_offline_node(int nid); | ||
| 236 | 237 | ||
| 237 | #else | 238 | #else |
| 238 | static inline int is_mem_section_removable(unsigned long pfn, | 239 | static inline int is_mem_section_removable(unsigned long pfn, |
| @@ -240,6 +241,8 @@ static inline int is_mem_section_removable(unsigned long pfn, | |||
| 240 | { | 241 | { |
| 241 | return 0; | 242 | return 0; |
| 242 | } | 243 | } |
| 244 | |||
| 245 | static inline void try_offline_node(int nid) {} | ||
| 243 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 246 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
| 244 | 247 | ||
| 245 | extern int mem_online_node(int nid); | 248 | extern int mem_online_node(int nid); |
| @@ -247,7 +250,8 @@ extern int add_memory(int nid, u64 start, u64 size); | |||
| 247 | extern int arch_add_memory(int nid, u64 start, u64 size); | 250 | extern int arch_add_memory(int nid, u64 start, u64 size); |
| 248 | extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); | 251 | extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); |
| 249 | extern int offline_memory_block(struct memory_block *mem); | 252 | extern int offline_memory_block(struct memory_block *mem); |
| 250 | extern int remove_memory(u64 start, u64 size); | 253 | extern bool is_memblock_offlined(struct memory_block *mem); |
| 254 | extern int remove_memory(int nid, u64 start, u64 size); | ||
| 251 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | 255 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, |
| 252 | int nr_pages); | 256 | int nr_pages); |
| 253 | extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms); | 257 | extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms); |
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 1e9f627967a3..a405d3dc0f61 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
| @@ -40,11 +40,9 @@ extern void putback_movable_pages(struct list_head *l); | |||
| 40 | extern int migrate_page(struct address_space *, | 40 | extern int migrate_page(struct address_space *, |
| 41 | struct page *, struct page *, enum migrate_mode); | 41 | struct page *, struct page *, enum migrate_mode); |
| 42 | extern int migrate_pages(struct list_head *l, new_page_t x, | 42 | extern int migrate_pages(struct list_head *l, new_page_t x, |
| 43 | unsigned long private, bool offlining, | 43 | unsigned long private, enum migrate_mode mode, int reason); |
| 44 | enum migrate_mode mode, int reason); | ||
| 45 | extern int migrate_huge_page(struct page *, new_page_t x, | 44 | extern int migrate_huge_page(struct page *, new_page_t x, |
| 46 | unsigned long private, bool offlining, | 45 | unsigned long private, enum migrate_mode mode); |
| 47 | enum migrate_mode mode); | ||
| 48 | 46 | ||
| 49 | extern int fail_migrate_page(struct address_space *, | 47 | extern int fail_migrate_page(struct address_space *, |
| 50 | struct page *, struct page *); | 48 | struct page *, struct page *); |
| @@ -62,11 +60,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
| 62 | static inline void putback_lru_pages(struct list_head *l) {} | 60 | static inline void putback_lru_pages(struct list_head *l) {} |
| 63 | static inline void putback_movable_pages(struct list_head *l) {} | 61 | static inline void putback_movable_pages(struct list_head *l) {} |
| 64 | static inline int migrate_pages(struct list_head *l, new_page_t x, | 62 | static inline int migrate_pages(struct list_head *l, new_page_t x, |
| 65 | unsigned long private, bool offlining, | 63 | unsigned long private, enum migrate_mode mode, int reason) |
| 66 | enum migrate_mode mode, int reason) { return -ENOSYS; } | 64 | { return -ENOSYS; } |
| 67 | static inline int migrate_huge_page(struct page *page, new_page_t x, | 65 | static inline int migrate_huge_page(struct page *page, new_page_t x, |
| 68 | unsigned long private, bool offlining, | 66 | unsigned long private, enum migrate_mode mode) |
| 69 | enum migrate_mode mode) { return -ENOSYS; } | 67 | { return -ENOSYS; } |
| 70 | 68 | ||
| 71 | static inline int migrate_prep(void) { return -ENOSYS; } | 69 | static inline int migrate_prep(void) { return -ENOSYS; } |
| 72 | static inline int migrate_prep_local(void) { return -ENOSYS; } | 70 | static inline int migrate_prep_local(void) { return -ENOSYS; } |
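With the "offlining" boolean gone, callers pass a migrate_mode plus a reason code. A sketch of a call with the trimmed signature; the wrapper is assumed, and MR_MEMORY_HOTPLUG is assumed to be the appropriate reason for a hot-remove style caller:

    #include <linux/list.h>
    #include <linux/migrate.h>

    /* Sketch: migrate everything on a private list, putting back leftovers. */
    static int example_migrate_away(struct list_head *pagelist,
    				new_page_t get_new_page)
    {
    	int ret = migrate_pages(pagelist, get_new_page, 0,
    				MIGRATE_SYNC, MR_MEMORY_HOTPLUG);

    	if (ret)
    		putback_movable_pages(pagelist);	/* put back pages that were not migrated */
    	return ret;
    }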
diff --git a/include/linux/mm.h b/include/linux/mm.h index 9d9dcc35d6a1..e7c3f9a0111a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
| @@ -87,6 +87,7 @@ extern unsigned int kobjsize(const void *objp); | |||
| 87 | #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ | 87 | #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ |
| 88 | #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ | 88 | #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ |
| 89 | 89 | ||
| 90 | #define VM_POPULATE 0x00001000 | ||
| 90 | #define VM_LOCKED 0x00002000 | 91 | #define VM_LOCKED 0x00002000 |
| 91 | #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ | 92 | #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ |
| 92 | 93 | ||
| @@ -366,7 +367,7 @@ static inline struct page *compound_head(struct page *page) | |||
| 366 | * both from it and to it can be tracked, using atomic_inc_and_test | 367 | * both from it and to it can be tracked, using atomic_inc_and_test |
| 367 | * and atomic_add_negative(-1). | 368 | * and atomic_add_negative(-1). |
| 368 | */ | 369 | */ |
| 369 | static inline void reset_page_mapcount(struct page *page) | 370 | static inline void page_mapcount_reset(struct page *page) |
| 370 | { | 371 | { |
| 371 | atomic_set(&(page)->_mapcount, -1); | 372 | atomic_set(&(page)->_mapcount, -1); |
| 372 | } | 373 | } |
| @@ -580,50 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
| 580 | * sets it, so none of the operations on it need to be atomic. | 581 | * sets it, so none of the operations on it need to be atomic. |
| 581 | */ | 582 | */ |
| 582 | 583 | ||
| 583 | 584 | /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */ | |
| 584 | /* | ||
| 585 | * page->flags layout: | ||
| 586 | * | ||
| 587 | * There are three possibilities for how page->flags get | ||
| 588 | * laid out. The first is for the normal case, without | ||
| 589 | * sparsemem. The second is for sparsemem when there is | ||
| 590 | * plenty of space for node and section. The last is when | ||
| 591 | * we have run out of space and have to fall back to an | ||
| 592 | * alternate (slower) way of determining the node. | ||
| 593 | * | ||
| 594 | * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | | ||
| 595 | * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | | ||
| 596 | * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | | ||
| 597 | */ | ||
| 598 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | ||
| 599 | #define SECTIONS_WIDTH SECTIONS_SHIFT | ||
| 600 | #else | ||
| 601 | #define SECTIONS_WIDTH 0 | ||
| 602 | #endif | ||
| 603 | |||
| 604 | #define ZONES_WIDTH ZONES_SHIFT | ||
| 605 | |||
| 606 | #if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS | ||
| 607 | #define NODES_WIDTH NODES_SHIFT | ||
| 608 | #else | ||
| 609 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | ||
| 610 | #error "Vmemmap: No space for nodes field in page flags" | ||
| 611 | #endif | ||
| 612 | #define NODES_WIDTH 0 | ||
| 613 | #endif | ||
| 614 | |||
| 615 | /* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */ | ||
| 616 | #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) | 585 | #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) |
| 617 | #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) | 586 | #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) |
| 618 | #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) | 587 | #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) |
| 619 | 588 | #define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH) | |
| 620 | /* | ||
| 621 | * We are going to use the flags for the page to node mapping if its in | ||
| 622 | * there. This includes the case where there is no node, so it is implicit. | ||
| 623 | */ | ||
| 624 | #if !(NODES_WIDTH > 0 || NODES_SHIFT == 0) | ||
| 625 | #define NODE_NOT_IN_PAGE_FLAGS | ||
| 626 | #endif | ||
| 627 | 589 | ||
| 628 | /* | 590 | /* |
| 629 | * Define the bit shifts to access each section. For non-existent | 591 | * Define the bit shifts to access each section. For non-existent |
| @@ -633,6 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
| 633 | #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) | 595 | #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) |
| 634 | #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) | 596 | #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) |
| 635 | #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) | 597 | #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) |
| 598 | #define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0)) | ||
| 636 | 599 | ||
| 637 | /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ | 600 | /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ |
| 638 | #ifdef NODE_NOT_IN_PAGE_FLAGS | 601 | #ifdef NODE_NOT_IN_PAGE_FLAGS |
| @@ -654,6 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
| 654 | #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) | 617 | #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) |
| 655 | #define NODES_MASK ((1UL << NODES_WIDTH) - 1) | 618 | #define NODES_MASK ((1UL << NODES_WIDTH) - 1) |
| 656 | #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) | 619 | #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) |
| 620 | #define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1) | ||
| 657 | #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) | 621 | #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) |
| 658 | 622 | ||
| 659 | static inline enum zone_type page_zonenum(const struct page *page) | 623 | static inline enum zone_type page_zonenum(const struct page *page) |
| @@ -661,6 +625,10 @@ static inline enum zone_type page_zonenum(const struct page *page) | |||
| 661 | return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; | 625 | return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; |
| 662 | } | 626 | } |
| 663 | 627 | ||
| 628 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | ||
| 629 | #define SECTION_IN_PAGE_FLAGS | ||
| 630 | #endif | ||
| 631 | |||
| 664 | /* | 632 | /* |
| 665 | * The identification function is only used by the buddy allocator for | 633 | * The identification function is only used by the buddy allocator for |
| 666 | * determining if two pages could be buddies. We are not really | 634 | * determining if two pages could be buddies. We are not really |
| @@ -693,31 +661,48 @@ static inline int page_to_nid(const struct page *page) | |||
| 693 | #endif | 661 | #endif |
| 694 | 662 | ||
| 695 | #ifdef CONFIG_NUMA_BALANCING | 663 | #ifdef CONFIG_NUMA_BALANCING |
| 696 | static inline int page_xchg_last_nid(struct page *page, int nid) | 664 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS |
| 665 | static inline int page_nid_xchg_last(struct page *page, int nid) | ||
| 697 | { | 666 | { |
| 698 | return xchg(&page->_last_nid, nid); | 667 | return xchg(&page->_last_nid, nid); |
| 699 | } | 668 | } |
| 700 | 669 | ||
| 701 | static inline int page_last_nid(struct page *page) | 670 | static inline int page_nid_last(struct page *page) |
| 702 | { | 671 | { |
| 703 | return page->_last_nid; | 672 | return page->_last_nid; |
| 704 | } | 673 | } |
| 705 | static inline void reset_page_last_nid(struct page *page) | 674 | static inline void page_nid_reset_last(struct page *page) |
| 706 | { | 675 | { |
| 707 | page->_last_nid = -1; | 676 | page->_last_nid = -1; |
| 708 | } | 677 | } |
| 709 | #else | 678 | #else |
| 710 | static inline int page_xchg_last_nid(struct page *page, int nid) | 679 | static inline int page_nid_last(struct page *page) |
| 680 | { | ||
| 681 | return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; | ||
| 682 | } | ||
| 683 | |||
| 684 | extern int page_nid_xchg_last(struct page *page, int nid); | ||
| 685 | |||
| 686 | static inline void page_nid_reset_last(struct page *page) | ||
| 687 | { | ||
| 688 | int nid = (1 << LAST_NID_SHIFT) - 1; | ||
| 689 | |||
| 690 | page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); | ||
| 691 | page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; | ||
| 692 | } | ||
| 693 | #endif /* LAST_NID_NOT_IN_PAGE_FLAGS */ | ||
| 694 | #else | ||
| 695 | static inline int page_nid_xchg_last(struct page *page, int nid) | ||
| 711 | { | 696 | { |
| 712 | return page_to_nid(page); | 697 | return page_to_nid(page); |
| 713 | } | 698 | } |
| 714 | 699 | ||
| 715 | static inline int page_last_nid(struct page *page) | 700 | static inline int page_nid_last(struct page *page) |
| 716 | { | 701 | { |
| 717 | return page_to_nid(page); | 702 | return page_to_nid(page); |
| 718 | } | 703 | } |
| 719 | 704 | ||
| 720 | static inline void reset_page_last_nid(struct page *page) | 705 | static inline void page_nid_reset_last(struct page *page) |
| 721 | { | 706 | { |
| 722 | } | 707 | } |
| 723 | #endif | 708 | #endif |
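The branch just above packs the last NUMA node id into page->flags when there is room; the shift-and-mask arithmetic is easier to see in isolation. This is a stand-alone userspace rendition with made-up field offsets, not the values any particular .config produces.

#include <stdio.h>

#define LAST_NID_SHIFT   6	/* assumed NODES_SHIFT */
#define LAST_NID_MASK    ((1UL << LAST_NID_SHIFT) - 1)
#define LAST_NID_PGSHIFT 48	/* assumed bit offset inside page->flags */

static unsigned long nid_last(unsigned long flags)
{
	return (flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK;
}

static unsigned long nid_store(unsigned long flags, int nid)
{
	flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
	flags |= ((unsigned long)nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
	return flags;
}

int main(void)
{
	unsigned long flags = 0;

	flags = nid_store(flags, 3);
	printf("last nid = %lu\n", nid_last(flags));		/* 3 */

	/* page_nid_reset_last() stores the all-ones value as "unset" */
	flags = nid_store(flags, (1 << LAST_NID_SHIFT) - 1);
	printf("last nid = %lu\n", nid_last(flags));		/* 63 */
	return 0;
}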
| @@ -727,7 +712,7 @@ static inline struct zone *page_zone(const struct page *page) | |||
| 727 | return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; | 712 | return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; |
| 728 | } | 713 | } |
| 729 | 714 | ||
| 730 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | 715 | #ifdef SECTION_IN_PAGE_FLAGS |
| 731 | static inline void set_page_section(struct page *page, unsigned long section) | 716 | static inline void set_page_section(struct page *page, unsigned long section) |
| 732 | { | 717 | { |
| 733 | page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); | 718 | page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); |
| @@ -757,7 +742,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone, | |||
| 757 | { | 742 | { |
| 758 | set_page_zone(page, zone); | 743 | set_page_zone(page, zone); |
| 759 | set_page_node(page, node); | 744 | set_page_node(page, node); |
| 760 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | 745 | #ifdef SECTION_IN_PAGE_FLAGS |
| 761 | set_page_section(page, pfn_to_section_nr(pfn)); | 746 | set_page_section(page, pfn_to_section_nr(pfn)); |
| 762 | #endif | 747 | #endif |
| 763 | } | 748 | } |
| @@ -817,18 +802,7 @@ void page_address_init(void); | |||
| 817 | #define PAGE_MAPPING_KSM 2 | 802 | #define PAGE_MAPPING_KSM 2 |
| 818 | #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) | 803 | #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) |
| 819 | 804 | ||
| 820 | extern struct address_space swapper_space; | 805 | extern struct address_space *page_mapping(struct page *page); |
| 821 | static inline struct address_space *page_mapping(struct page *page) | ||
| 822 | { | ||
| 823 | struct address_space *mapping = page->mapping; | ||
| 824 | |||
| 825 | VM_BUG_ON(PageSlab(page)); | ||
| 826 | if (unlikely(PageSwapCache(page))) | ||
| 827 | mapping = &swapper_space; | ||
| 828 | else if ((unsigned long)mapping & PAGE_MAPPING_ANON) | ||
| 829 | mapping = NULL; | ||
| 830 | return mapping; | ||
| 831 | } | ||
| 832 | 806 | ||
| 833 | /* Neutral page->mapping pointer to address_space or anon_vma or other */ | 807 | /* Neutral page->mapping pointer to address_space or anon_vma or other */ |
| 834 | static inline void *page_rmapping(struct page *page) | 808 | static inline void *page_rmapping(struct page *page) |
| @@ -1035,18 +1009,18 @@ static inline int fixup_user_fault(struct task_struct *tsk, | |||
| 1035 | } | 1009 | } |
| 1036 | #endif | 1010 | #endif |
| 1037 | 1011 | ||
| 1038 | extern int make_pages_present(unsigned long addr, unsigned long end); | ||
| 1039 | extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); | 1012 | extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); |
| 1040 | extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, | 1013 | extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, |
| 1041 | void *buf, int len, int write); | 1014 | void *buf, int len, int write); |
| 1042 | 1015 | ||
| 1043 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1016 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 1044 | unsigned long start, int len, unsigned int foll_flags, | 1017 | unsigned long start, unsigned long nr_pages, |
| 1045 | struct page **pages, struct vm_area_struct **vmas, | 1018 | unsigned int foll_flags, struct page **pages, |
| 1046 | int *nonblocking); | 1019 | struct vm_area_struct **vmas, int *nonblocking); |
| 1047 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1020 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 1048 | unsigned long start, int nr_pages, int write, int force, | 1021 | unsigned long start, unsigned long nr_pages, |
| 1049 | struct page **pages, struct vm_area_struct **vmas); | 1022 | int write, int force, struct page **pages, |
| 1023 | struct vm_area_struct **vmas); | ||
| 1050 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | 1024 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, |
| 1051 | struct page **pages); | 1025 | struct page **pages); |
| 1052 | struct kvec; | 1026 | struct kvec; |
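A hedged caller sketch for the widened get_user_pages() prototype above: nr_pages and the return value are now long-sized rather than int, and nothing else about the calling convention changes. The wrapper name and flag choices are illustrative.

#include <linux/mm.h>
#include <linux/sched.h>

static long pin_user_buffer(unsigned long start, unsigned long nr_pages,
			    struct page **pages)
{
	long pinned;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages(current, current->mm, start, nr_pages,
				1 /* write */, 0 /* force */, pages, NULL);
	up_read(&current->mm->mmap_sem);

	return pinned;		/* pages actually pinned, or -errno */
}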
| @@ -1359,6 +1333,24 @@ extern void free_bootmem_with_active_regions(int nid, | |||
| 1359 | unsigned long max_low_pfn); | 1333 | unsigned long max_low_pfn); |
| 1360 | extern void sparse_memory_present_with_active_regions(int nid); | 1334 | extern void sparse_memory_present_with_active_regions(int nid); |
| 1361 | 1335 | ||
| 1336 | #define MOVABLEMEM_MAP_MAX MAX_NUMNODES | ||
| 1337 | struct movablemem_entry { | ||
| 1338 | unsigned long start_pfn; /* start pfn of memory segment */ | ||
| 1339 | unsigned long end_pfn; /* end pfn of memory segment (exclusive) */ | ||
| 1340 | }; | ||
| 1341 | |||
| 1342 | struct movablemem_map { | ||
| 1343 | bool acpi; /* true if using SRAT info */ | ||
| 1344 | int nr_map; | ||
| 1345 | struct movablemem_entry map[MOVABLEMEM_MAP_MAX]; | ||
| 1346 | nodemask_t numa_nodes_hotplug; /* on which nodes we specify memory */ | ||
| 1347 | nodemask_t numa_nodes_kernel; /* on which nodes kernel resides in */ | ||
| 1348 | }; | ||
| 1349 | |||
| 1350 | extern void __init insert_movablemem_map(unsigned long start_pfn, | ||
| 1351 | unsigned long end_pfn); | ||
| 1352 | extern int __init movablemem_map_overlap(unsigned long start_pfn, | ||
| 1353 | unsigned long end_pfn); | ||
| 1362 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 1354 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
| 1363 | 1355 | ||
| 1364 | #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ | 1356 | #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ |
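A rough sketch of how the movablemem_map helpers declared above might be driven at boot time. The PFN values are invented, and the return convention of movablemem_map_overlap() (index of an overlapping entry, negative otherwise) is inferred from its int return type, so treat both as assumptions.

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/mm.h>

static void __init note_hotpluggable_range(void)
{
	unsigned long start_pfn = 0x100000;	/* example: 4 GB */
	unsigned long end_pfn   = 0x140000;	/* example: 5 GB, exclusive */

	insert_movablemem_map(start_pfn, end_pfn);

	/* Assumed semantics: index of an overlapping entry, or negative
	 * when the range touches nothing that was inserted. */
	if (movablemem_map_overlap(0x120000, 0x130000) >= 0)
		pr_info("range overlaps a planned ZONE_MOVABLE region\n");
}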
| @@ -1395,6 +1387,9 @@ extern void setup_per_cpu_pageset(void); | |||
| 1395 | extern void zone_pcp_update(struct zone *zone); | 1387 | extern void zone_pcp_update(struct zone *zone); |
| 1396 | extern void zone_pcp_reset(struct zone *zone); | 1388 | extern void zone_pcp_reset(struct zone *zone); |
| 1397 | 1389 | ||
| 1390 | /* page_alloc.c */ | ||
| 1391 | extern int min_free_kbytes; | ||
| 1392 | |||
| 1398 | /* nommu.c */ | 1393 | /* nommu.c */ |
| 1399 | extern atomic_long_t mmap_pages_allocated; | 1394 | extern atomic_long_t mmap_pages_allocated; |
| 1400 | extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); | 1395 | extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); |
| @@ -1472,13 +1467,24 @@ extern int install_special_mapping(struct mm_struct *mm, | |||
| 1472 | extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); | 1467 | extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); |
| 1473 | 1468 | ||
| 1474 | extern unsigned long mmap_region(struct file *file, unsigned long addr, | 1469 | extern unsigned long mmap_region(struct file *file, unsigned long addr, |
| 1475 | unsigned long len, unsigned long flags, | 1470 | unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); |
| 1476 | vm_flags_t vm_flags, unsigned long pgoff); | 1471 | extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
| 1477 | extern unsigned long do_mmap_pgoff(struct file *, unsigned long, | 1472 | unsigned long len, unsigned long prot, unsigned long flags, |
| 1478 | unsigned long, unsigned long, | 1473 | unsigned long pgoff, unsigned long *populate); |
| 1479 | unsigned long, unsigned long); | ||
| 1480 | extern int do_munmap(struct mm_struct *, unsigned long, size_t); | 1474 | extern int do_munmap(struct mm_struct *, unsigned long, size_t); |
| 1481 | 1475 | ||
| 1476 | #ifdef CONFIG_MMU | ||
| 1477 | extern int __mm_populate(unsigned long addr, unsigned long len, | ||
| 1478 | int ignore_errors); | ||
| 1479 | static inline void mm_populate(unsigned long addr, unsigned long len) | ||
| 1480 | { | ||
| 1481 | /* Ignore errors */ | ||
| 1482 | (void) __mm_populate(addr, len, 1); | ||
| 1483 | } | ||
| 1484 | #else | ||
| 1485 | static inline void mm_populate(unsigned long addr, unsigned long len) {} | ||
| 1486 | #endif | ||
| 1487 | |||
| 1482 | /* These take the mm semaphore themselves */ | 1488 | /* These take the mm semaphore themselves */ |
| 1483 | extern unsigned long vm_brk(unsigned long, unsigned long); | 1489 | extern unsigned long vm_brk(unsigned long, unsigned long); |
| 1484 | extern int vm_munmap(unsigned long, size_t); | 1490 | extern int vm_munmap(unsigned long, size_t); |
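The new split is visible in the prototypes above: do_mmap_pgoff() only reports, via *populate, how much the caller should fault in once mmap_sem is dropped, and mm_populate() does the faulting. A minimal caller sketch, mirroring what the ipc/shm.c hunk later in this series does; the wrapper name is illustrative.

#include <linux/err.h>
#include <linux/mm.h>
#include <linux/sched.h>

static unsigned long map_and_populate(struct file *file, unsigned long len,
				      unsigned long prot, unsigned long flags)
{
	unsigned long addr, populate = 0;

	down_write(&current->mm->mmap_sem);
	addr = do_mmap_pgoff(file, 0, len, prot, flags, 0, &populate);
	up_write(&current->mm->mmap_sem);

	/* Fault the pages in only after mmap_sem has been dropped. */
	if (!IS_ERR_VALUE(addr) && populate)
		mm_populate(addr, populate);

	return addr;
}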
| @@ -1623,8 +1629,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
| 1623 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | 1629 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, |
| 1624 | unsigned long pfn); | 1630 | unsigned long pfn); |
| 1625 | 1631 | ||
| 1626 | struct page *follow_page(struct vm_area_struct *, unsigned long address, | 1632 | struct page *follow_page_mask(struct vm_area_struct *vma, |
| 1627 | unsigned int foll_flags); | 1633 | unsigned long address, unsigned int foll_flags, |
| 1634 | unsigned int *page_mask); | ||
| 1635 | |||
| 1636 | static inline struct page *follow_page(struct vm_area_struct *vma, | ||
| 1637 | unsigned long address, unsigned int foll_flags) | ||
| 1638 | { | ||
| 1639 | unsigned int unused_page_mask; | ||
| 1640 | return follow_page_mask(vma, address, foll_flags, &unused_page_mask); | ||
| 1641 | } | ||
| 1642 | |||
| 1628 | #define FOLL_WRITE 0x01 /* check pte is writable */ | 1643 | #define FOLL_WRITE 0x01 /* check pte is writable */ |
| 1629 | #define FOLL_TOUCH 0x02 /* mark page accessed */ | 1644 | #define FOLL_TOUCH 0x02 /* mark page accessed */ |
| 1630 | #define FOLL_GET 0x04 /* do get_page on page */ | 1645 | #define FOLL_GET 0x04 /* do get_page on page */ |
| @@ -1636,6 +1651,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, | |||
| 1636 | #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ | 1651 | #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ |
| 1637 | #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ | 1652 | #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ |
| 1638 | #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ | 1653 | #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ |
| 1654 | #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ | ||
| 1639 | 1655 | ||
| 1640 | typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, | 1656 | typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, |
| 1641 | void *data); | 1657 | void *data); |
| @@ -1707,7 +1723,11 @@ int vmemmap_populate_basepages(struct page *start_page, | |||
| 1707 | unsigned long pages, int node); | 1723 | unsigned long pages, int node); |
| 1708 | int vmemmap_populate(struct page *start_page, unsigned long pages, int node); | 1724 | int vmemmap_populate(struct page *start_page, unsigned long pages, int node); |
| 1709 | void vmemmap_populate_print_last(void); | 1725 | void vmemmap_populate_print_last(void); |
| 1710 | 1726 | #ifdef CONFIG_MEMORY_HOTPLUG | |
| 1727 | void vmemmap_free(struct page *memmap, unsigned long nr_pages); | ||
| 1728 | #endif | ||
| 1729 | void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, | ||
| 1730 | unsigned long size); | ||
| 1711 | 1731 | ||
| 1712 | enum mf_flags { | 1732 | enum mf_flags { |
| 1713 | MF_COUNT_INCREASED = 1 << 0, | 1733 | MF_COUNT_INCREASED = 1 << 0, |
| @@ -1720,7 +1740,7 @@ extern int unpoison_memory(unsigned long pfn); | |||
| 1720 | extern int sysctl_memory_failure_early_kill; | 1740 | extern int sysctl_memory_failure_early_kill; |
| 1721 | extern int sysctl_memory_failure_recovery; | 1741 | extern int sysctl_memory_failure_recovery; |
| 1722 | extern void shake_page(struct page *p, int access); | 1742 | extern void shake_page(struct page *p, int access); |
| 1723 | extern atomic_long_t mce_bad_pages; | 1743 | extern atomic_long_t num_poisoned_pages; |
| 1724 | extern int soft_offline_page(struct page *page, int flags); | 1744 | extern int soft_offline_page(struct page *page, int flags); |
| 1725 | 1745 | ||
| 1726 | extern void dump_page(struct page *page); | 1746 | extern void dump_page(struct page *page); |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f8f5162a3571..ace9a5f01c64 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | #include <linux/cpumask.h> | 12 | #include <linux/cpumask.h> |
| 13 | #include <linux/page-debug-flags.h> | 13 | #include <linux/page-debug-flags.h> |
| 14 | #include <linux/uprobes.h> | 14 | #include <linux/uprobes.h> |
| 15 | #include <linux/page-flags-layout.h> | ||
| 15 | #include <asm/page.h> | 16 | #include <asm/page.h> |
| 16 | #include <asm/mmu.h> | 17 | #include <asm/mmu.h> |
| 17 | 18 | ||
| @@ -173,7 +174,7 @@ struct page { | |||
| 173 | void *shadow; | 174 | void *shadow; |
| 174 | #endif | 175 | #endif |
| 175 | 176 | ||
| 176 | #ifdef CONFIG_NUMA_BALANCING | 177 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS |
| 177 | int _last_nid; | 178 | int _last_nid; |
| 178 | #endif | 179 | #endif |
| 179 | } | 180 | } |
| @@ -414,9 +415,9 @@ struct mm_struct { | |||
| 414 | #endif | 415 | #endif |
| 415 | #ifdef CONFIG_NUMA_BALANCING | 416 | #ifdef CONFIG_NUMA_BALANCING |
| 416 | /* | 417 | /* |
| 417 | * numa_next_scan is the next time when the PTEs will me marked | 418 | * numa_next_scan is the next time that the PTEs will be marked |
| 418 | * pte_numa to gather statistics and migrate pages to new nodes | 419 | * pte_numa. NUMA hinting faults will gather statistics and migrate |
| 419 | * if necessary | 420 | * pages to new nodes if necessary. |
| 420 | */ | 421 | */ |
| 421 | unsigned long numa_next_scan; | 422 | unsigned long numa_next_scan; |
| 422 | 423 | ||
diff --git a/include/linux/mman.h b/include/linux/mman.h index 9aa863da287f..61c7a87e5d2b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h | |||
| @@ -79,6 +79,8 @@ calc_vm_flag_bits(unsigned long flags) | |||
| 79 | { | 79 | { |
| 80 | return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | | 80 | return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | |
| 81 | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | | 81 | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | |
| 82 | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); | 82 | ((flags & MAP_LOCKED) ? (VM_LOCKED | VM_POPULATE) : 0) | |
| 83 | (((flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) ? | ||
| 84 | VM_POPULATE : 0); | ||
| 83 | } | 85 | } |
| 84 | #endif /* _LINUX_MMAN_H */ | 86 | #endif /* _LINUX_MMAN_H */ |
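The reworked calc_vm_flag_bits() above makes MAP_LOCKED imply VM_POPULATE and honours MAP_POPULATE only when MAP_NONBLOCK is absent. A stand-alone rendition of just that boolean logic, with stand-in flag values, covers the three interesting cases:

#include <stdio.h>

#define MAP_LOCKED	0x02000
#define MAP_POPULATE	0x08000
#define MAP_NONBLOCK	0x10000
#define VM_POPULATE	0x00001000UL
#define VM_LOCKED	0x00002000UL

static unsigned long vm_flag_bits(unsigned long flags)
{
	return ((flags & MAP_LOCKED) ? (VM_LOCKED | VM_POPULATE) : 0) |
	       (((flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) ?
		VM_POPULATE : 0);
}

int main(void)
{
	printf("%#lx\n", vm_flag_bits(MAP_LOCKED));	/* 0x3000: locked + populate */
	printf("%#lx\n", vm_flag_bits(MAP_POPULATE));	/* 0x1000: populate only */
	printf("%#lx\n", vm_flag_bits(MAP_POPULATE | MAP_NONBLOCK));	/* 0 */
	return 0;
}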
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 73b64a38b984..ede274957e05 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
| @@ -15,7 +15,7 @@ | |||
| 15 | #include <linux/seqlock.h> | 15 | #include <linux/seqlock.h> |
| 16 | #include <linux/nodemask.h> | 16 | #include <linux/nodemask.h> |
| 17 | #include <linux/pageblock-flags.h> | 17 | #include <linux/pageblock-flags.h> |
| 18 | #include <generated/bounds.h> | 18 | #include <linux/page-flags-layout.h> |
| 19 | #include <linux/atomic.h> | 19 | #include <linux/atomic.h> |
| 20 | #include <asm/page.h> | 20 | #include <asm/page.h> |
| 21 | 21 | ||
| @@ -57,7 +57,9 @@ enum { | |||
| 57 | */ | 57 | */ |
| 58 | MIGRATE_CMA, | 58 | MIGRATE_CMA, |
| 59 | #endif | 59 | #endif |
| 60 | #ifdef CONFIG_MEMORY_ISOLATION | ||
| 60 | MIGRATE_ISOLATE, /* can't allocate from here */ | 61 | MIGRATE_ISOLATE, /* can't allocate from here */ |
| 62 | #endif | ||
| 61 | MIGRATE_TYPES | 63 | MIGRATE_TYPES |
| 62 | }; | 64 | }; |
| 63 | 65 | ||
| @@ -308,24 +310,6 @@ enum zone_type { | |||
| 308 | 310 | ||
| 309 | #ifndef __GENERATING_BOUNDS_H | 311 | #ifndef __GENERATING_BOUNDS_H |
| 310 | 312 | ||
| 311 | /* | ||
| 312 | * When a memory allocation must conform to specific limitations (such | ||
| 313 | * as being suitable for DMA) the caller will pass in hints to the | ||
| 314 | * allocator in the gfp_mask, in the zone modifier bits. These bits | ||
| 315 | * are used to select a priority ordered list of memory zones which | ||
| 316 | * match the requested limits. See gfp_zone() in include/linux/gfp.h | ||
| 317 | */ | ||
| 318 | |||
| 319 | #if MAX_NR_ZONES < 2 | ||
| 320 | #define ZONES_SHIFT 0 | ||
| 321 | #elif MAX_NR_ZONES <= 2 | ||
| 322 | #define ZONES_SHIFT 1 | ||
| 323 | #elif MAX_NR_ZONES <= 4 | ||
| 324 | #define ZONES_SHIFT 2 | ||
| 325 | #else | ||
| 326 | #error ZONES_SHIFT -- too many zones configured adjust calculation | ||
| 327 | #endif | ||
| 328 | |||
| 329 | struct zone { | 313 | struct zone { |
| 330 | /* Fields commonly accessed by the page allocator */ | 314 | /* Fields commonly accessed by the page allocator */ |
| 331 | 315 | ||
| @@ -543,6 +527,26 @@ static inline int zone_is_oom_locked(const struct zone *zone) | |||
| 543 | return test_bit(ZONE_OOM_LOCKED, &zone->flags); | 527 | return test_bit(ZONE_OOM_LOCKED, &zone->flags); |
| 544 | } | 528 | } |
| 545 | 529 | ||
| 530 | static inline unsigned zone_end_pfn(const struct zone *zone) | ||
| 531 | { | ||
| 532 | return zone->zone_start_pfn + zone->spanned_pages; | ||
| 533 | } | ||
| 534 | |||
| 535 | static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) | ||
| 536 | { | ||
| 537 | return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); | ||
| 538 | } | ||
| 539 | |||
| 540 | static inline bool zone_is_initialized(struct zone *zone) | ||
| 541 | { | ||
| 542 | return !!zone->wait_table; | ||
| 543 | } | ||
| 544 | |||
| 545 | static inline bool zone_is_empty(struct zone *zone) | ||
| 546 | { | ||
| 547 | return zone->spanned_pages == 0; | ||
| 548 | } | ||
| 549 | |||
| 546 | /* | 550 | /* |
| 547 | * The "priority" of VM scanning is how much of the queues we will scan in one | 551 | * The "priority" of VM scanning is how much of the queues we will scan in one |
| 548 | * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the | 552 | * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the |
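A small sketch of the intended use of the zone helpers introduced above: the open-coded zone_start_pfn + spanned_pages arithmetic (as removed from mm/compaction.c further down) gives way to zone_end_pfn()/zone_spans_pfn(). The walker itself is illustrative.

#include <linux/mm.h>
#include <linux/mmzone.h>

static void walk_zone_pfns(struct zone *zone)
{
	unsigned long pfn;

	if (!zone_is_initialized(zone) || zone_is_empty(zone))
		return;

	/* was: for (pfn = zone->zone_start_pfn;
	 *	     pfn < zone->zone_start_pfn + zone->spanned_pages; ...) */
	for (pfn = zone->zone_start_pfn; pfn < zone_end_pfn(zone); pfn++) {
		if (!pfn_valid(pfn))
			continue;
		/* zone_spans_pfn(zone, pfn) holds for every pfn reached here */
	}
}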
| @@ -752,11 +756,17 @@ typedef struct pglist_data { | |||
| 752 | #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) | 756 | #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) |
| 753 | 757 | ||
| 754 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) | 758 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) |
| 759 | #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) | ||
| 755 | 760 | ||
| 756 | #define node_end_pfn(nid) ({\ | 761 | static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) |
| 757 | pg_data_t *__pgdat = NODE_DATA(nid);\ | 762 | { |
| 758 | __pgdat->node_start_pfn + __pgdat->node_spanned_pages;\ | 763 | return pgdat->node_start_pfn + pgdat->node_spanned_pages; |
| 759 | }) | 764 | } |
| 765 | |||
| 766 | static inline bool pgdat_is_empty(pg_data_t *pgdat) | ||
| 767 | { | ||
| 768 | return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; | ||
| 769 | } | ||
| 760 | 770 | ||
| 761 | #include <linux/memory_hotplug.h> | 771 | #include <linux/memory_hotplug.h> |
| 762 | 772 | ||
| @@ -1053,8 +1063,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) | |||
| 1053 | * PA_SECTION_SHIFT physical address to/from section number | 1063 | * PA_SECTION_SHIFT physical address to/from section number |
| 1054 | * PFN_SECTION_SHIFT pfn to/from section number | 1064 | * PFN_SECTION_SHIFT pfn to/from section number |
| 1055 | */ | 1065 | */ |
| 1056 | #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) | ||
| 1057 | |||
| 1058 | #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) | 1066 | #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) |
| 1059 | #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) | 1067 | #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) |
| 1060 | 1068 | ||
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h new file mode 100644 index 000000000000..93506a114034 --- /dev/null +++ b/include/linux/page-flags-layout.h | |||
| @@ -0,0 +1,88 @@ | |||
| 1 | #ifndef PAGE_FLAGS_LAYOUT_H | ||
| 2 | #define PAGE_FLAGS_LAYOUT_H | ||
| 3 | |||
| 4 | #include <linux/numa.h> | ||
| 5 | #include <generated/bounds.h> | ||
| 6 | |||
| 7 | /* | ||
| 8 | * When a memory allocation must conform to specific limitations (such | ||
| 9 | * as being suitable for DMA) the caller will pass in hints to the | ||
| 10 | * allocator in the gfp_mask, in the zone modifier bits. These bits | ||
| 11 | * are used to select a priority ordered list of memory zones which | ||
| 12 | * match the requested limits. See gfp_zone() in include/linux/gfp.h | ||
| 13 | */ | ||
| 14 | #if MAX_NR_ZONES < 2 | ||
| 15 | #define ZONES_SHIFT 0 | ||
| 16 | #elif MAX_NR_ZONES <= 2 | ||
| 17 | #define ZONES_SHIFT 1 | ||
| 18 | #elif MAX_NR_ZONES <= 4 | ||
| 19 | #define ZONES_SHIFT 2 | ||
| 20 | #else | ||
| 21 | #error ZONES_SHIFT -- too many zones configured adjust calculation | ||
| 22 | #endif | ||
| 23 | |||
| 24 | #ifdef CONFIG_SPARSEMEM | ||
| 25 | #include <asm/sparsemem.h> | ||
| 26 | |||
| 27 | /* SECTION_SHIFT #bits space required to store a section # */ | ||
| 28 | #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) | ||
| 29 | |||
| 30 | #endif /* CONFIG_SPARSEMEM */ | ||
| 31 | |||
| 32 | /* | ||
| 33 | * page->flags layout: | ||
| 34 | * | ||
| 35 | * There are five possibilities for how page->flags get laid out. The first | ||
| 36 | * pair is for the normal case without sparsemem. The second pair is for | ||
| 37 | * sparsemem when there is plenty of space for node and section information. | ||
| 38 | * The last is when there is insufficient space in page->flags and a separate | ||
| 39 | * lookup is necessary. | ||
| 40 | * | ||
| 41 | * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | | ||
| 42 | * " plus space for last_nid: | NODE | ZONE | LAST_NID ... | FLAGS | | ||
| 43 | * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | | ||
| 44 | * " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS | | ||
| 45 | * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | | ||
| 46 | */ | ||
| 47 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | ||
| 48 | #define SECTIONS_WIDTH SECTIONS_SHIFT | ||
| 49 | #else | ||
| 50 | #define SECTIONS_WIDTH 0 | ||
| 51 | #endif | ||
| 52 | |||
| 53 | #define ZONES_WIDTH ZONES_SHIFT | ||
| 54 | |||
| 55 | #if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS | ||
| 56 | #define NODES_WIDTH NODES_SHIFT | ||
| 57 | #else | ||
| 58 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | ||
| 59 | #error "Vmemmap: No space for nodes field in page flags" | ||
| 60 | #endif | ||
| 61 | #define NODES_WIDTH 0 | ||
| 62 | #endif | ||
| 63 | |||
| 64 | #ifdef CONFIG_NUMA_BALANCING | ||
| 65 | #define LAST_NID_SHIFT NODES_SHIFT | ||
| 66 | #else | ||
| 67 | #define LAST_NID_SHIFT 0 | ||
| 68 | #endif | ||
| 69 | |||
| 70 | #if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS | ||
| 71 | #define LAST_NID_WIDTH LAST_NID_SHIFT | ||
| 72 | #else | ||
| 73 | #define LAST_NID_WIDTH 0 | ||
| 74 | #endif | ||
| 75 | |||
| 76 | /* | ||
| 77 | * We are going to use the flags for the page to node mapping if its in | ||
| 78 | * there. This includes the case where there is no node, so it is implicit. | ||
| 79 | */ | ||
| 80 | #if !(NODES_WIDTH > 0 || NODES_SHIFT == 0) | ||
| 81 | #define NODE_NOT_IN_PAGE_FLAGS | ||
| 82 | #endif | ||
| 83 | |||
| 84 | #if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0 | ||
| 85 | #define LAST_NID_NOT_IN_PAGE_FLAGS | ||
| 86 | #endif | ||
| 87 | |||
| 88 | #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ | ||
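A worked example of the bit budget this new header enforces. The numbers below assume a 64-bit sparsemem-vmemmap configuration with NODES_SHIFT=6 and about 22 page flags; the real values come from <generated/bounds.h> and the architecture, so this is only a plausibility check, runnable in userspace.

#include <stdio.h>

#define BITS_PER_LONG	64
#define NR_PAGEFLAGS	22	/* assumed; really from <generated/bounds.h> */
#define SECTIONS_WIDTH	0	/* sparsemem-vmemmap: no section in page->flags */
#define ZONES_WIDTH	2	/* MAX_NR_ZONES <= 4 */
#define NODES_SHIFT	6	/* assumed CONFIG_NODES_SHIFT */
#define LAST_NID_SHIFT	NODES_SHIFT	/* CONFIG_NUMA_BALANCING=y */

int main(void)
{
	int used = SECTIONS_WIDTH + ZONES_WIDTH + NODES_SHIFT + LAST_NID_SHIFT;
	int room = BITS_PER_LONG - NR_PAGEFLAGS;

	/* If used <= room, LAST_NID_WIDTH == LAST_NID_SHIFT and the last
	 * NUMA node id is packed into page->flags; otherwise the header
	 * defines LAST_NID_NOT_IN_PAGE_FLAGS and struct page keeps the
	 * separate _last_nid field. */
	printf("fields need %d bits, %d available: %s\n", used, room,
	       used <= room ? "packed into page->flags"
			    : "LAST_NID_NOT_IN_PAGE_FLAGS");
	return 0;
}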
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index a92061e08d48..3fff8e774067 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h | |||
| @@ -1,6 +1,25 @@ | |||
| 1 | #ifndef __LINUX_PAGEISOLATION_H | 1 | #ifndef __LINUX_PAGEISOLATION_H |
| 2 | #define __LINUX_PAGEISOLATION_H | 2 | #define __LINUX_PAGEISOLATION_H |
| 3 | 3 | ||
| 4 | #ifdef CONFIG_MEMORY_ISOLATION | ||
| 5 | static inline bool is_migrate_isolate_page(struct page *page) | ||
| 6 | { | ||
| 7 | return get_pageblock_migratetype(page) == MIGRATE_ISOLATE; | ||
| 8 | } | ||
| 9 | static inline bool is_migrate_isolate(int migratetype) | ||
| 10 | { | ||
| 11 | return migratetype == MIGRATE_ISOLATE; | ||
| 12 | } | ||
| 13 | #else | ||
| 14 | static inline bool is_migrate_isolate_page(struct page *page) | ||
| 15 | { | ||
| 16 | return false; | ||
| 17 | } | ||
| 18 | static inline bool is_migrate_isolate(int migratetype) | ||
| 19 | { | ||
| 20 | return false; | ||
| 21 | } | ||
| 22 | #endif | ||
| 4 | 23 | ||
| 5 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | 24 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, |
| 6 | bool skip_hwpoisoned_pages); | 25 | bool skip_hwpoisoned_pages); |
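Intended usage sketch for the helpers above: migratetype tests go through is_migrate_isolate*() so callers still compile when CONFIG_MEMORY_ISOLATION is off and MIGRATE_ISOLATE is not defined (the mm/compaction.c hunk near the end of this section makes exactly this substitution). The wrapper below is illustrative.

#include <linux/mmzone.h>
#include <linux/page-isolation.h>

static bool pageblock_is_isolated(struct page *page)
{
	/* was: get_pageblock_migratetype(page) == MIGRATE_ISOLATE,
	 * which no longer builds when CONFIG_MEMORY_ISOLATION=n */
	return is_migrate_isolate_page(page);
}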
diff --git a/include/linux/pm.h b/include/linux/pm.h index 97bcf23e045a..e5d7230332a4 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h | |||
| @@ -537,6 +537,7 @@ struct dev_pm_info { | |||
| 537 | unsigned int irq_safe:1; | 537 | unsigned int irq_safe:1; |
| 538 | unsigned int use_autosuspend:1; | 538 | unsigned int use_autosuspend:1; |
| 539 | unsigned int timer_autosuspends:1; | 539 | unsigned int timer_autosuspends:1; |
| 540 | unsigned int memalloc_noio:1; | ||
| 540 | enum rpm_request request; | 541 | enum rpm_request request; |
| 541 | enum rpm_status runtime_status; | 542 | enum rpm_status runtime_status; |
| 542 | int runtime_error; | 543 | int runtime_error; |
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index c785c215abfc..7d7e09efff9b 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h | |||
| @@ -47,6 +47,7 @@ extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay); | |||
| 47 | extern unsigned long pm_runtime_autosuspend_expiration(struct device *dev); | 47 | extern unsigned long pm_runtime_autosuspend_expiration(struct device *dev); |
| 48 | extern void pm_runtime_update_max_time_suspended(struct device *dev, | 48 | extern void pm_runtime_update_max_time_suspended(struct device *dev, |
| 49 | s64 delta_ns); | 49 | s64 delta_ns); |
| 50 | extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable); | ||
| 50 | 51 | ||
| 51 | static inline bool pm_children_suspended(struct device *dev) | 52 | static inline bool pm_children_suspended(struct device *dev) |
| 52 | { | 53 | { |
| @@ -156,6 +157,8 @@ static inline void pm_runtime_set_autosuspend_delay(struct device *dev, | |||
| 156 | int delay) {} | 157 | int delay) {} |
| 157 | static inline unsigned long pm_runtime_autosuspend_expiration( | 158 | static inline unsigned long pm_runtime_autosuspend_expiration( |
| 158 | struct device *dev) { return 0; } | 159 | struct device *dev) { return 0; } |
| 160 | static inline void pm_runtime_set_memalloc_noio(struct device *dev, | ||
| 161 | bool enable){} | ||
| 159 | 162 | ||
| 160 | #endif /* !CONFIG_PM_RUNTIME */ | 163 | #endif /* !CONFIG_PM_RUNTIME */ |
| 161 | 164 | ||
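A hedged sketch of how a driver whose runtime-resume path sits underneath the block I/O path might use the new dev_pm_info flag via pm_runtime_set_memalloc_noio(); calling it from probe, and the probe function itself, are assumptions made for illustration.

#include <linux/device.h>
#include <linux/pm_runtime.h>

static int my_probe(struct device *dev)
{
	/* Allocations made while runtime-resuming this device (and, per the
	 * flag's intent, its ancestors) should avoid __GFP_IO, since the I/O
	 * they would wait on may depend on this very device coming back. */
	pm_runtime_set_memalloc_noio(dev, true);
	return 0;
}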
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c20635c527a9..6dacb93a6d94 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h | |||
| @@ -123,7 +123,7 @@ static inline void anon_vma_lock_write(struct anon_vma *anon_vma) | |||
| 123 | down_write(&anon_vma->root->rwsem); | 123 | down_write(&anon_vma->root->rwsem); |
| 124 | } | 124 | } |
| 125 | 125 | ||
| 126 | static inline void anon_vma_unlock(struct anon_vma *anon_vma) | 126 | static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) |
| 127 | { | 127 | { |
| 128 | up_write(&anon_vma->root->rwsem); | 128 | up_write(&anon_vma->root->rwsem); |
| 129 | } | 129 | } |
diff --git a/include/linux/sched.h b/include/linux/sched.h index e4112aad2964..c2182b53dace 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -51,6 +51,7 @@ struct sched_param { | |||
| 51 | #include <linux/cred.h> | 51 | #include <linux/cred.h> |
| 52 | #include <linux/llist.h> | 52 | #include <linux/llist.h> |
| 53 | #include <linux/uidgid.h> | 53 | #include <linux/uidgid.h> |
| 54 | #include <linux/gfp.h> | ||
| 54 | 55 | ||
| 55 | #include <asm/processor.h> | 56 | #include <asm/processor.h> |
| 56 | 57 | ||
| @@ -1791,6 +1792,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, | |||
| 1791 | #define PF_FROZEN 0x00010000 /* frozen for system suspend */ | 1792 | #define PF_FROZEN 0x00010000 /* frozen for system suspend */ |
| 1792 | #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ | 1793 | #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ |
| 1793 | #define PF_KSWAPD 0x00040000 /* I am kswapd */ | 1794 | #define PF_KSWAPD 0x00040000 /* I am kswapd */ |
| 1795 | #define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */ | ||
| 1794 | #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ | 1796 | #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ |
| 1795 | #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ | 1797 | #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ |
| 1796 | #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ | 1798 | #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ |
| @@ -1828,6 +1830,26 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, | |||
| 1828 | #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) | 1830 | #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) |
| 1829 | #define used_math() tsk_used_math(current) | 1831 | #define used_math() tsk_used_math(current) |
| 1830 | 1832 | ||
| 1833 | /* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ | ||
| 1834 | static inline gfp_t memalloc_noio_flags(gfp_t flags) | ||
| 1835 | { | ||
| 1836 | if (unlikely(current->flags & PF_MEMALLOC_NOIO)) | ||
| 1837 | flags &= ~__GFP_IO; | ||
| 1838 | return flags; | ||
| 1839 | } | ||
| 1840 | |||
| 1841 | static inline unsigned int memalloc_noio_save(void) | ||
| 1842 | { | ||
| 1843 | unsigned int flags = current->flags & PF_MEMALLOC_NOIO; | ||
| 1844 | current->flags |= PF_MEMALLOC_NOIO; | ||
| 1845 | return flags; | ||
| 1846 | } | ||
| 1847 | |||
| 1848 | static inline void memalloc_noio_restore(unsigned int flags) | ||
| 1849 | { | ||
| 1850 | current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; | ||
| 1851 | } | ||
| 1852 | |||
| 1831 | /* | 1853 | /* |
| 1832 | * task->jobctl flags | 1854 | * task->jobctl flags |
| 1833 | */ | 1855 | */ |
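The three helpers above are meant to bracket code that must not recurse into I/O, typically a runtime-resume path. A minimal sketch of the save/restore pattern; the resume callback is illustrative, and only allocations that reach the page allocator while PF_MEMALLOC_NOIO is set have __GFP_IO stripped.

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/slab.h>

static int my_runtime_resume(void)
{
	unsigned int noio_flags;
	void *buf;
	int ret = 0;

	noio_flags = memalloc_noio_save();	/* sets PF_MEMALLOC_NOIO */

	/* Allocations reaching the page allocator here behave as GFP_NOIO. */
	buf = kmalloc(4096, GFP_KERNEL);
	if (!buf)
		ret = -ENOMEM;
	kfree(buf);

	memalloc_noio_restore(noio_flags);	/* restores the previous state */
	return ret;
}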
diff --git a/include/linux/swap.h b/include/linux/swap.h index 68df9c17fbbb..2818a123f3ea 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
| @@ -8,7 +8,7 @@ | |||
| 8 | #include <linux/memcontrol.h> | 8 | #include <linux/memcontrol.h> |
| 9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
| 10 | #include <linux/node.h> | 10 | #include <linux/node.h> |
| 11 | 11 | #include <linux/fs.h> | |
| 12 | #include <linux/atomic.h> | 12 | #include <linux/atomic.h> |
| 13 | #include <asm/page.h> | 13 | #include <asm/page.h> |
| 14 | 14 | ||
| @@ -156,7 +156,7 @@ enum { | |||
| 156 | SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ | 156 | SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ |
| 157 | }; | 157 | }; |
| 158 | 158 | ||
| 159 | #define SWAP_CLUSTER_MAX 32 | 159 | #define SWAP_CLUSTER_MAX 32UL |
| 160 | #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX | 160 | #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX |
| 161 | 161 | ||
| 162 | /* | 162 | /* |
| @@ -202,6 +202,18 @@ struct swap_info_struct { | |||
| 202 | unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ | 202 | unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ |
| 203 | atomic_t frontswap_pages; /* frontswap pages in-use counter */ | 203 | atomic_t frontswap_pages; /* frontswap pages in-use counter */ |
| 204 | #endif | 204 | #endif |
| 205 | spinlock_t lock; /* | ||
| 206 | * protect map scan related fields like | ||
| 207 | * swap_map, lowest_bit, highest_bit, | ||
| 208 | * inuse_pages, cluster_next, | ||
| 209 | * cluster_nr, lowest_alloc and | ||
| 210 | * highest_alloc. other fields are only | ||
| 211 | * changed at swapon/swapoff, so are | ||
| 212 | * protected by swap_lock. changing | ||
| 213 | * flags need hold this lock and | ||
| 214 | * swap_lock. If both locks need hold, | ||
| 215 | * hold swap_lock first. | ||
| 216 | */ | ||
| 205 | }; | 217 | }; |
| 206 | 218 | ||
| 207 | struct swap_list_t { | 219 | struct swap_list_t { |
| @@ -209,15 +221,12 @@ struct swap_list_t { | |||
| 209 | int next; /* swapfile to be used next */ | 221 | int next; /* swapfile to be used next */ |
| 210 | }; | 222 | }; |
| 211 | 223 | ||
| 212 | /* Swap 50% full? Release swapcache more aggressively.. */ | ||
| 213 | #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) | ||
| 214 | |||
| 215 | /* linux/mm/page_alloc.c */ | 224 | /* linux/mm/page_alloc.c */ |
| 216 | extern unsigned long totalram_pages; | 225 | extern unsigned long totalram_pages; |
| 217 | extern unsigned long totalreserve_pages; | 226 | extern unsigned long totalreserve_pages; |
| 218 | extern unsigned long dirty_balance_reserve; | 227 | extern unsigned long dirty_balance_reserve; |
| 219 | extern unsigned int nr_free_buffer_pages(void); | 228 | extern unsigned long nr_free_buffer_pages(void); |
| 220 | extern unsigned int nr_free_pagecache_pages(void); | 229 | extern unsigned long nr_free_pagecache_pages(void); |
| 221 | 230 | ||
| 222 | /* Definition of global_page_state not available yet */ | 231 | /* Definition of global_page_state not available yet */ |
| 223 | #define nr_free_pages() global_page_state(NR_FREE_PAGES) | 232 | #define nr_free_pages() global_page_state(NR_FREE_PAGES) |
| @@ -266,7 +275,7 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
| 266 | extern unsigned long shrink_all_memory(unsigned long nr_pages); | 275 | extern unsigned long shrink_all_memory(unsigned long nr_pages); |
| 267 | extern int vm_swappiness; | 276 | extern int vm_swappiness; |
| 268 | extern int remove_mapping(struct address_space *mapping, struct page *page); | 277 | extern int remove_mapping(struct address_space *mapping, struct page *page); |
| 269 | extern long vm_total_pages; | 278 | extern unsigned long vm_total_pages; |
| 270 | 279 | ||
| 271 | #ifdef CONFIG_NUMA | 280 | #ifdef CONFIG_NUMA |
| 272 | extern int zone_reclaim_mode; | 281 | extern int zone_reclaim_mode; |
| @@ -330,8 +339,9 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *, | |||
| 330 | sector_t *); | 339 | sector_t *); |
| 331 | 340 | ||
| 332 | /* linux/mm/swap_state.c */ | 341 | /* linux/mm/swap_state.c */ |
| 333 | extern struct address_space swapper_space; | 342 | extern struct address_space swapper_spaces[]; |
| 334 | #define total_swapcache_pages swapper_space.nrpages | 343 | #define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) |
| 344 | extern unsigned long total_swapcache_pages(void); | ||
| 335 | extern void show_swap_cache_info(void); | 345 | extern void show_swap_cache_info(void); |
| 336 | extern int add_to_swap(struct page *); | 346 | extern int add_to_swap(struct page *); |
| 337 | extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); | 347 | extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); |
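With the single swapper_space split into per-type swapper_spaces[], the address_space backing a swap-cache page is now looked up through swap_address_space(). A small sketch of that lookup; the helper wrapper is illustrative.

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>

static struct address_space *swap_cache_space_of(struct page *page)
{
	swp_entry_t entry = { .val = page_private(page) };

	/* was: &swapper_space, the one shared instance */
	return swap_address_space(entry);
}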
| @@ -346,8 +356,20 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, | |||
| 346 | struct vm_area_struct *vma, unsigned long addr); | 356 | struct vm_area_struct *vma, unsigned long addr); |
| 347 | 357 | ||
| 348 | /* linux/mm/swapfile.c */ | 358 | /* linux/mm/swapfile.c */ |
| 349 | extern long nr_swap_pages; | 359 | extern atomic_long_t nr_swap_pages; |
| 350 | extern long total_swap_pages; | 360 | extern long total_swap_pages; |
| 361 | |||
| 362 | /* Swap 50% full? Release swapcache more aggressively.. */ | ||
| 363 | static inline bool vm_swap_full(void) | ||
| 364 | { | ||
| 365 | return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages; | ||
| 366 | } | ||
| 367 | |||
| 368 | static inline long get_nr_swap_pages(void) | ||
| 369 | { | ||
| 370 | return atomic_long_read(&nr_swap_pages); | ||
| 371 | } | ||
| 372 | |||
| 351 | extern void si_swapinfo(struct sysinfo *); | 373 | extern void si_swapinfo(struct sysinfo *); |
| 352 | extern swp_entry_t get_swap_page(void); | 374 | extern swp_entry_t get_swap_page(void); |
| 353 | extern swp_entry_t get_swap_page_of_type(int); | 375 | extern swp_entry_t get_swap_page_of_type(int); |
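nr_swap_pages is now an atomic_long_t, so readers go through get_nr_swap_pages() and the 50%-full heuristic through the new vm_swap_full() inline. A hedged reader sketch; the check itself is illustrative.

#include <linux/swap.h>

static bool swap_nearly_exhausted(unsigned long needed_pages)
{
	/* was: nr_swap_pages < needed_pages, reading the plain long */
	if (get_nr_swap_pages() < (long)needed_pages)
		return true;

	return vm_swap_full();	/* more than half of swap already in use */
}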
| @@ -380,9 +402,10 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
| 380 | 402 | ||
| 381 | #else /* CONFIG_SWAP */ | 403 | #else /* CONFIG_SWAP */ |
| 382 | 404 | ||
| 383 | #define nr_swap_pages 0L | 405 | #define get_nr_swap_pages() 0L |
| 384 | #define total_swap_pages 0L | 406 | #define total_swap_pages 0L |
| 385 | #define total_swapcache_pages 0UL | 407 | #define total_swapcache_pages() 0UL |
| 408 | #define vm_swap_full() 0 | ||
| 386 | 409 | ||
| 387 | #define si_swapinfo(val) \ | 410 | #define si_swapinfo(val) \ |
| 388 | do { (val)->freeswap = (val)->totalswap = 0; } while (0) | 411 | do { (val)->freeswap = (val)->totalswap = 0; } while (0) |
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index fce0a2799d43..bd6cf61142be 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
| @@ -36,7 +36,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
| 36 | #endif | 36 | #endif |
| 37 | PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, | 37 | PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, |
| 38 | KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, | 38 | KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, |
| 39 | KSWAPD_SKIP_CONGESTION_WAIT, | ||
| 40 | PAGEOUTRUN, ALLOCSTALL, PGROTATED, | 39 | PAGEOUTRUN, ALLOCSTALL, PGROTATED, |
| 41 | #ifdef CONFIG_NUMA_BALANCING | 40 | #ifdef CONFIG_NUMA_BALANCING |
| 42 | NUMA_PTE_UPDATES, | 41 | NUMA_PTE_UPDATES, |
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index a13291f7da88..5fd71a7d0dfd 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h | |||
| @@ -85,7 +85,7 @@ static inline void vm_events_fold_cpu(int cpu) | |||
| 85 | #define count_vm_numa_events(x, y) count_vm_events(x, y) | 85 | #define count_vm_numa_events(x, y) count_vm_events(x, y) |
| 86 | #else | 86 | #else |
| 87 | #define count_vm_numa_event(x) do {} while (0) | 87 | #define count_vm_numa_event(x) do {} while (0) |
| 88 | #define count_vm_numa_events(x, y) do {} while (0) | 88 | #define count_vm_numa_events(x, y) do { (void)(y); } while (0) |
| 89 | #endif /* CONFIG_NUMA_BALANCING */ | 89 | #endif /* CONFIG_NUMA_BALANCING */ |
| 90 | 90 | ||
| 91 | #define __count_zone_vm_events(item, zone, delta) \ | 91 | #define __count_zone_vm_events(item, zone, delta) \ |
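The !CONFIG_NUMA_BALANCING stub above now evaluates its second argument as (void)(y), so side effects in the count expression still happen and a local used only for the count no longer looks unused. A tiny stand-alone demonstration with a stand-in macro:

#include <stdio.h>

#define count_vm_numa_events(x, y) do { (void)(y); } while (0)	/* stand-in */

int main(void)
{
	int migrated = 0;

	count_vm_numa_events(0, migrated++);	/* the increment still runs */
	printf("%d\n", migrated);		/* prints 1 */
	return 0;
}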
diff --git a/ipc/shm.c b/ipc/shm.c --- a/ipc/shm.c +++ b/ipc/shm.c | |||
| @@ -967,11 +967,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, | 967 | unsigned long flags; | 967 | unsigned long flags; |
| 967 | unsigned long flags; | 967 | unsigned long flags; |
| 968 | unsigned long prot; | 968 | unsigned long prot; |
| 969 | int acc_mode; | 969 | int acc_mode; |
| 970 | unsigned long user_addr; | ||
| 971 | struct ipc_namespace *ns; | 970 | struct ipc_namespace *ns; |
| 972 | struct shm_file_data *sfd; | 971 | struct shm_file_data *sfd; |
| 973 | struct path path; | 972 | struct path path; |
| 974 | fmode_t f_mode; | 973 | fmode_t f_mode; |
| 974 | unsigned long populate = 0; | ||
| 975 | 975 | ||
| 976 | err = -EINVAL; | 976 | err = -EINVAL; |
| 977 | if (shmid < 0) | 977 | if (shmid < 0) |
| @@ -1070,13 +1070,15 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, | |||
| 1070 | goto invalid; | 1070 | goto invalid; |
| 1071 | } | 1071 | } |
| 1072 | 1072 | ||
| 1073 | user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0); | 1073 | addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); |
| 1074 | *raddr = user_addr; | 1074 | *raddr = addr; |
| 1075 | err = 0; | 1075 | err = 0; |
| 1076 | if (IS_ERR_VALUE(user_addr)) | 1076 | if (IS_ERR_VALUE(addr)) |
| 1077 | err = (long)user_addr; | 1077 | err = (long)addr; |
| 1078 | invalid: | 1078 | invalid: |
| 1079 | up_write(¤t->mm->mmap_sem); | 1079 | up_write(¤t->mm->mmap_sem); |
| 1080 | if (populate) | ||
| 1081 | mm_populate(addr, populate); | ||
| 1080 | 1082 | ||
| 1081 | out_fput: | 1083 | out_fput: |
| 1082 | fput(file); | 1084 | fput(file); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a673a3b0c6b..053dfd7692d1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
| 1132 | */ | 1132 | */ |
| 1133 | static int select_fallback_rq(int cpu, struct task_struct *p) | 1133 | static int select_fallback_rq(int cpu, struct task_struct *p) |
| 1134 | { | 1134 | { |
| 1135 | const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); | 1135 | int nid = cpu_to_node(cpu); |
| 1136 | const struct cpumask *nodemask = NULL; | ||
| 1136 | enum { cpuset, possible, fail } state = cpuset; | 1137 | enum { cpuset, possible, fail } state = cpuset; |
| 1137 | int dest_cpu; | 1138 | int dest_cpu; |
| 1138 | 1139 | ||
| 1139 | /* Look for allowed, online CPU in same node. */ | 1140 | /* |
| 1140 | for_each_cpu(dest_cpu, nodemask) { | 1141 | * If the node that the cpu is on has been offlined, cpu_to_node() |
| 1141 | if (!cpu_online(dest_cpu)) | 1142 | * will return -1. There is no cpu on the node, and we should |
| 1142 | continue; | 1143 | * select the cpu on the other node. |
| 1143 | if (!cpu_active(dest_cpu)) | 1144 | */ |
| 1144 | continue; | 1145 | if (nid != -1) { |
| 1145 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) | 1146 | nodemask = cpumask_of_node(nid); |
| 1146 | return dest_cpu; | 1147 | |
| 1148 | /* Look for allowed, online CPU in same node. */ | ||
| 1149 | for_each_cpu(dest_cpu, nodemask) { | ||
| 1150 | if (!cpu_online(dest_cpu)) | ||
| 1151 | continue; | ||
| 1152 | if (!cpu_active(dest_cpu)) | ||
| 1153 | continue; | ||
| 1154 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) | ||
| 1155 | return dest_cpu; | ||
| 1156 | } | ||
| 1147 | } | 1157 | } |
| 1148 | 1158 | ||
| 1149 | for (;;) { | 1159 | for (;;) { |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 467d8b923fcd..95e9e55602a8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -105,7 +105,6 @@ extern char core_pattern[]; | |||
| 105 | extern unsigned int core_pipe_limit; | 105 | extern unsigned int core_pipe_limit; |
| 106 | #endif | 106 | #endif |
| 107 | extern int pid_max; | 107 | extern int pid_max; |
| 108 | extern int min_free_kbytes; | ||
| 109 | extern int pid_max_min, pid_max_max; | 108 | extern int pid_max_min, pid_max_max; |
| 110 | extern int sysctl_drop_caches; | 109 | extern int sysctl_drop_caches; |
| 111 | extern int percpu_pagelist_fraction; | 110 | extern int percpu_pagelist_fraction; |
diff --git a/mm/Kconfig b/mm/Kconfig index 0b23db9a8791..2c7aea7106f9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
| @@ -162,10 +162,16 @@ config MOVABLE_NODE | |||
| 162 | Say Y here if you want to hotplug a whole node. | 162 | Say Y here if you want to hotplug a whole node. |
| 163 | Say N here if you want kernel to use memory on all nodes evenly. | 163 | Say N here if you want kernel to use memory on all nodes evenly. |
| 164 | 164 | ||
| 165 | # | ||
| 166 | # Only be set on architectures that have completely implemented memory hotplug | ||
| 167 | # feature. If you are not sure, don't touch it. | ||
| 168 | # | ||
| 169 | config HAVE_BOOTMEM_INFO_NODE | ||
| 170 | def_bool n | ||
| 171 | |||
| 165 | # eventually, we can have this option just 'select SPARSEMEM' | 172 | # eventually, we can have this option just 'select SPARSEMEM' |
| 166 | config MEMORY_HOTPLUG | 173 | config MEMORY_HOTPLUG |
| 167 | bool "Allow for memory hot-add" | 174 | bool "Allow for memory hot-add" |
| 168 | select MEMORY_ISOLATION | ||
| 169 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 175 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
| 170 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG | 176 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG |
| 171 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) | 177 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) |
| @@ -176,6 +182,8 @@ config MEMORY_HOTPLUG_SPARSE | |||
| 176 | 182 | ||
| 177 | config MEMORY_HOTREMOVE | 183 | config MEMORY_HOTREMOVE |
| 178 | bool "Allow for memory hot remove" | 184 | bool "Allow for memory hot remove" |
| 185 | select MEMORY_ISOLATION | ||
| 186 | select HAVE_BOOTMEM_INFO_NODE if X86_64 | ||
| 179 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE | 187 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE |
| 180 | depends on MIGRATION | 188 | depends on MIGRATION |
| 181 | 189 | ||
diff --git a/mm/compaction.c b/mm/compaction.c index c62bd063d766..05ccb4cc0bdb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/sysctl.h> | 15 | #include <linux/sysctl.h> |
| 16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
| 17 | #include <linux/balloon_compaction.h> | 17 | #include <linux/balloon_compaction.h> |
| 18 | #include <linux/page-isolation.h> | ||
| 18 | #include "internal.h" | 19 | #include "internal.h" |
| 19 | 20 | ||
| 20 | #ifdef CONFIG_COMPACTION | 21 | #ifdef CONFIG_COMPACTION |
| @@ -85,7 +86,7 @@ static inline bool isolation_suitable(struct compact_control *cc, | |||
| 85 | static void __reset_isolation_suitable(struct zone *zone) | 86 | static void __reset_isolation_suitable(struct zone *zone) |
| 86 | { | 87 | { |
| 87 | unsigned long start_pfn = zone->zone_start_pfn; | 88 | unsigned long start_pfn = zone->zone_start_pfn; |
| 88 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 89 | unsigned long end_pfn = zone_end_pfn(zone); |
| 89 | unsigned long pfn; | 90 | unsigned long pfn; |
| 90 | 91 | ||
| 91 | zone->compact_cached_migrate_pfn = start_pfn; | 92 | zone->compact_cached_migrate_pfn = start_pfn; |
| @@ -215,7 +216,10 @@ static bool suitable_migration_target(struct page *page) | |||
| 215 | int migratetype = get_pageblock_migratetype(page); | 216 | int migratetype = get_pageblock_migratetype(page); |
| 216 | 217 | ||
| 217 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | 218 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ |
| 218 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | 219 | if (migratetype == MIGRATE_RESERVE) |
| 220 | return false; | ||
| 221 | |||
| 222 | if (is_migrate_isolate(migratetype)) | ||
| 219 | return false; | 223 | return false; |
| 220 | 224 | ||
| 221 | /* If the page is a large free page, then allow migration */ | 225 | /* If the page is a large free page, then allow migration */ |
| @@ -611,8 +615,7 @@ check_compact_cluster: | |||
| 611 | continue; | 615 | continue; |
| 612 | 616 | ||
| 613 | next_pageblock: | 617 | next_pageblock: |
| 614 | low_pfn += pageblock_nr_pages; | 618 | low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; |
| 615 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | ||
| 616 | last_pageblock_nr = pageblock_nr; | 619 | last_pageblock_nr = pageblock_nr; |
| 617 | } | 620 | } |
| 618 | 621 | ||
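The next_pageblock fix above is easiest to see with numbers: the old two-step arithmetic could overshoot by a whole pageblock when low_pfn was not block-aligned, while ALIGN(low_pfn + 1, ...) - 1 lands on the last pfn of the current block so the loop's increment starts the next one. A stand-alone check with 512 pages per block as an example size:

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long low_pfn = 100;	/* inside pageblock [0, 511] */

	/* old: low_pfn += 512; low_pfn = ALIGN(low_pfn, 512) - 1; */
	unsigned long old_target = ALIGN(low_pfn + PAGEBLOCK_NR_PAGES,
					 PAGEBLOCK_NR_PAGES) - 1;
	/* new: low_pfn = ALIGN(low_pfn + 1, 512) - 1; */
	unsigned long new_target = ALIGN(low_pfn + 1, PAGEBLOCK_NR_PAGES) - 1;

	/* old jumps to 1023 (next iteration starts at 1024, skipping the
	 * whole block [512, 1023]); new jumps to 511 (next iteration 512). */
	printf("old %lu, new %lu\n", old_target, new_target);
	return 0;
}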
| @@ -644,7 +647,7 @@ static void isolate_freepages(struct zone *zone, | |||
| 644 | struct compact_control *cc) | 647 | struct compact_control *cc) |
| 645 | { | 648 | { |
| 646 | struct page *page; | 649 | struct page *page; |
| 647 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; | 650 | unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn; |
| 648 | int nr_freepages = cc->nr_freepages; | 651 | int nr_freepages = cc->nr_freepages; |
| 649 | struct list_head *freelist = &cc->freepages; | 652 | struct list_head *freelist = &cc->freepages; |
| 650 | 653 | ||
| @@ -663,7 +666,7 @@ static void isolate_freepages(struct zone *zone, | |||
| 663 | */ | 666 | */ |
| 664 | high_pfn = min(low_pfn, pfn); | 667 | high_pfn = min(low_pfn, pfn); |
| 665 | 668 | ||
| 666 | zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 669 | z_end_pfn = zone_end_pfn(zone); |
| 667 | 670 | ||
| 668 | /* | 671 | /* |
| 669 | * Isolate free pages until enough are available to migrate the | 672 | * Isolate free pages until enough are available to migrate the |
| @@ -706,7 +709,7 @@ static void isolate_freepages(struct zone *zone, | |||
| 706 | * only scans within a pageblock | 709 | * only scans within a pageblock |
| 707 | */ | 710 | */ |
| 708 | end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | 711 | end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
| 709 | end_pfn = min(end_pfn, zone_end_pfn); | 712 | end_pfn = min(end_pfn, z_end_pfn); |
| 710 | isolated = isolate_freepages_block(cc, pfn, end_pfn, | 713 | isolated = isolate_freepages_block(cc, pfn, end_pfn, |
| 711 | freelist, false); | 714 | freelist, false); |
| 712 | nr_freepages += isolated; | 715 | nr_freepages += isolated; |
| @@ -795,7 +798,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
| 795 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | 798 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); |
| 796 | 799 | ||
| 797 | /* Only scan within a pageblock boundary */ | 800 | /* Only scan within a pageblock boundary */ |
| 798 | end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); | 801 | end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); |
| 799 | 802 | ||
| 800 | /* Do not cross the free scanner or scan within a memory hole */ | 803 | /* Do not cross the free scanner or scan within a memory hole */ |
| 801 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { | 804 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { |
| @@ -920,7 +923,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
| 920 | { | 923 | { |
| 921 | int ret; | 924 | int ret; |
| 922 | unsigned long start_pfn = zone->zone_start_pfn; | 925 | unsigned long start_pfn = zone->zone_start_pfn; |
| 923 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 926 | unsigned long end_pfn = zone_end_pfn(zone); |
| 924 | 927 | ||
| 925 | ret = compaction_suitable(zone, cc->order); | 928 | ret = compaction_suitable(zone, cc->order); |
| 926 | switch (ret) { | 929 | switch (ret) { |
| @@ -977,7 +980,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
| 977 | 980 | ||
| 978 | nr_migrate = cc->nr_migratepages; | 981 | nr_migrate = cc->nr_migratepages; |
| 979 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 982 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
| 980 | (unsigned long)cc, false, | 983 | (unsigned long)cc, |
| 981 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, | 984 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, |
| 982 | MR_COMPACTION); | 985 | MR_COMPACTION); |
| 983 | update_nr_listpages(cc); | 986 | update_nr_listpages(cc); |
| @@ -1086,7 +1089,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
| 1086 | 1089 | ||
| 1087 | 1090 | ||
| 1088 | /* Compact all zones within a node */ | 1091 | /* Compact all zones within a node */ |
| 1089 | static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | 1092 | static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) |
| 1090 | { | 1093 | { |
| 1091 | int zoneid; | 1094 | int zoneid; |
| 1092 | struct zone *zone; | 1095 | struct zone *zone; |
| @@ -1119,28 +1122,26 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
| 1119 | VM_BUG_ON(!list_empty(&cc->freepages)); | 1122 | VM_BUG_ON(!list_empty(&cc->freepages)); |
| 1120 | VM_BUG_ON(!list_empty(&cc->migratepages)); | 1123 | VM_BUG_ON(!list_empty(&cc->migratepages)); |
| 1121 | } | 1124 | } |
| 1122 | |||
| 1123 | return 0; | ||
| 1124 | } | 1125 | } |
| 1125 | 1126 | ||
| 1126 | int compact_pgdat(pg_data_t *pgdat, int order) | 1127 | void compact_pgdat(pg_data_t *pgdat, int order) |
| 1127 | { | 1128 | { |
| 1128 | struct compact_control cc = { | 1129 | struct compact_control cc = { |
| 1129 | .order = order, | 1130 | .order = order, |
| 1130 | .sync = false, | 1131 | .sync = false, |
| 1131 | }; | 1132 | }; |
| 1132 | 1133 | ||
| 1133 | return __compact_pgdat(pgdat, &cc); | 1134 | __compact_pgdat(pgdat, &cc); |
| 1134 | } | 1135 | } |
| 1135 | 1136 | ||
| 1136 | static int compact_node(int nid) | 1137 | static void compact_node(int nid) |
| 1137 | { | 1138 | { |
| 1138 | struct compact_control cc = { | 1139 | struct compact_control cc = { |
| 1139 | .order = -1, | 1140 | .order = -1, |
| 1140 | .sync = true, | 1141 | .sync = true, |
| 1141 | }; | 1142 | }; |
| 1142 | 1143 | ||
| 1143 | return __compact_pgdat(NODE_DATA(nid), &cc); | 1144 | __compact_pgdat(NODE_DATA(nid), &cc); |
| 1144 | } | 1145 | } |
| 1145 | 1146 | ||
| 1146 | /* Compact all nodes in the system */ | 1147 | /* Compact all nodes in the system */ |
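
Annotation: the compaction hunks above swap the open-coded "zone->zone_start_pfn + zone->spanned_pages" for the zone_end_pfn() accessor and tighten the pageblock-skip arithmetic at next_pageblock and in isolate_migratepages. The short userspace sketch below illustrates both points; struct zone_span, the pfn values and the 512-page pageblock size are invented stand-ins, and ALIGN() is reproduced here on the assumption that the usual power-of-two kernel definition applies.

#include <stdio.h>

#define ALIGN(x, a)  (((x) + (a) - 1) & ~((a) - 1))  /* power-of-two 'a', as for pageblocks */

struct zone_span {                /* illustrative stand-in for struct zone's span fields */
    unsigned long zone_start_pfn;
    unsigned long spanned_pages;
};

/* Same arithmetic as the zone_end_pfn() helper the patch switches to. */
static unsigned long zone_end_pfn(const struct zone_span *zone)
{
    return zone->zone_start_pfn + zone->spanned_pages;
}

int main(void)
{
    struct zone_span z = { .zone_start_pfn = 0x10000, .spanned_pages = 0x8000 };
    unsigned long pageblock_nr_pages = 512;   /* hypothetical pageblock size */
    unsigned long low_pfn = 1000;             /* a pfn in the middle of a pageblock */

    printf("zone end pfn: %#lx\n", zone_end_pfn(&z));

    /* Old skip formula: lands at the end of the *next* pageblock (1535 here). */
    printf("old skip: %lu\n", ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages) - 1);
    /* New skip formula: stops at the end of the current pageblock (1023 here). */
    printf("new skip: %lu\n", ALIGN(low_pfn + 1, pageblock_nr_pages) - 1);
    return 0;
}

Compiling and running this prints the two skip targets side by side, which is the easiest way to see why the one-line change matters for a mid-pageblock pfn.
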
diff --git a/mm/fadvise.c b/mm/fadvise.c index a47f0f50c89f..909ec558625c 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
| @@ -17,6 +17,7 @@ | |||
| 17 | #include <linux/fadvise.h> | 17 | #include <linux/fadvise.h> |
| 18 | #include <linux/writeback.h> | 18 | #include <linux/writeback.h> |
| 19 | #include <linux/syscalls.h> | 19 | #include <linux/syscalls.h> |
| 20 | #include <linux/swap.h> | ||
| 20 | 21 | ||
| 21 | #include <asm/unistd.h> | 22 | #include <asm/unistd.h> |
| 22 | 23 | ||
| @@ -120,9 +121,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
| 120 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; | 121 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; |
| 121 | end_index = (endbyte >> PAGE_CACHE_SHIFT); | 122 | end_index = (endbyte >> PAGE_CACHE_SHIFT); |
| 122 | 123 | ||
| 123 | if (end_index >= start_index) | 124 | if (end_index >= start_index) { |
| 124 | invalidate_mapping_pages(mapping, start_index, | 125 | unsigned long count = invalidate_mapping_pages(mapping, |
| 126 | start_index, end_index); | ||
| 127 | |||
| 128 | /* | ||
| 129 | * If fewer pages were invalidated than expected then | ||
| 130 | * it is possible that some of the pages were on | ||
| 131 | * a per-cpu pagevec for a remote CPU. Drain all | ||
| 132 | * pagevecs and try again. | ||
| 133 | */ | ||
| 134 | if (count < (end_index - start_index + 1)) { | ||
| 135 | lru_add_drain_all(); | ||
| 136 | invalidate_mapping_pages(mapping, start_index, | ||
| 125 | end_index); | 137 | end_index); |
| 138 | } | ||
| 139 | } | ||
| 126 | break; | 140 | break; |
| 127 | default: | 141 | default: |
| 128 | ret = -EINVAL; | 142 | ret = -EINVAL; |
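
Annotation: the fadvise change above makes the DONTNEED path retry invalidate_mapping_pages() after draining per-cpu pagevecs when the first pass falls short. A minimal userspace sketch of the call that reaches this code follows; nothing in it is specific to the patch, it only assumes the standard posix_fadvise() interface, and the file name comes from argv.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    int fd, err;

    if (argc != 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }
    fd = open(argv[1], O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* Ask the kernel to drop this file's page-cache pages.  With the
     * patch above, pages parked on remote per-cpu pagevecs are drained
     * and the invalidation is retried instead of being silently missed. */
    err = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
    if (err)
        fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
    close(fd);
    return err ? 1 : 0;
}
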
diff --git a/mm/fremap.c b/mm/fremap.c index a0aaf0e56800..0cd4c11488ed 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
| @@ -129,6 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
| 129 | struct vm_area_struct *vma; | 129 | struct vm_area_struct *vma; |
| 130 | int err = -EINVAL; | 130 | int err = -EINVAL; |
| 131 | int has_write_lock = 0; | 131 | int has_write_lock = 0; |
| 132 | vm_flags_t vm_flags; | ||
| 132 | 133 | ||
| 133 | if (prot) | 134 | if (prot) |
| 134 | return err; | 135 | return err; |
| @@ -160,15 +161,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
| 160 | /* | 161 | /* |
| 161 | * Make sure the vma is shared, that it supports prefaulting, | 162 | * Make sure the vma is shared, that it supports prefaulting, |
| 162 | * and that the remapped range is valid and fully within | 163 | * and that the remapped range is valid and fully within |
| 163 | * the single existing vma. vm_private_data is used as a | 164 | * the single existing vma. |
| 164 | * swapout cursor in a VM_NONLINEAR vma. | ||
| 165 | */ | 165 | */ |
| 166 | if (!vma || !(vma->vm_flags & VM_SHARED)) | 166 | if (!vma || !(vma->vm_flags & VM_SHARED)) |
| 167 | goto out; | 167 | goto out; |
| 168 | 168 | ||
| 169 | if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) | ||
| 170 | goto out; | ||
| 171 | |||
| 172 | if (!vma->vm_ops || !vma->vm_ops->remap_pages) | 169 | if (!vma->vm_ops || !vma->vm_ops->remap_pages) |
| 173 | goto out; | 170 | goto out; |
| 174 | 171 | ||
| @@ -177,6 +174,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
| 177 | 174 | ||
| 178 | /* Must set VM_NONLINEAR before any pages are populated. */ | 175 | /* Must set VM_NONLINEAR before any pages are populated. */ |
| 179 | if (!(vma->vm_flags & VM_NONLINEAR)) { | 176 | if (!(vma->vm_flags & VM_NONLINEAR)) { |
| 177 | /* | ||
| 178 | * vm_private_data is used as a swapout cursor | ||
| 179 | * in a VM_NONLINEAR vma. | ||
| 180 | */ | ||
| 181 | if (vma->vm_private_data) | ||
| 182 | goto out; | ||
| 183 | |||
| 180 | /* Don't need a nonlinear mapping, exit success */ | 184 | /* Don't need a nonlinear mapping, exit success */ |
| 181 | if (pgoff == linear_page_index(vma, start)) { | 185 | if (pgoff == linear_page_index(vma, start)) { |
| 182 | err = 0; | 186 | err = 0; |
| @@ -184,6 +188,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
| 184 | } | 188 | } |
| 185 | 189 | ||
| 186 | if (!has_write_lock) { | 190 | if (!has_write_lock) { |
| 191 | get_write_lock: | ||
| 187 | up_read(&mm->mmap_sem); | 192 | up_read(&mm->mmap_sem); |
| 188 | down_write(&mm->mmap_sem); | 193 | down_write(&mm->mmap_sem); |
| 189 | has_write_lock = 1; | 194 | has_write_lock = 1; |
| @@ -199,9 +204,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
| 199 | unsigned long addr; | 204 | unsigned long addr; |
| 200 | struct file *file = get_file(vma->vm_file); | 205 | struct file *file = get_file(vma->vm_file); |
| 201 | 206 | ||
| 202 | flags &= MAP_NONBLOCK; | 207 | vm_flags = vma->vm_flags; |
| 203 | addr = mmap_region(file, start, size, | 208 | if (!(flags & MAP_NONBLOCK)) |
| 204 | flags, vma->vm_flags, pgoff); | 209 | vm_flags |= VM_POPULATE; |
| 210 | addr = mmap_region(file, start, size, vm_flags, pgoff); | ||
| 205 | fput(file); | 211 | fput(file); |
| 206 | if (IS_ERR_VALUE(addr)) { | 212 | if (IS_ERR_VALUE(addr)) { |
| 207 | err = addr; | 213 | err = addr; |
| @@ -220,32 +226,26 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
| 220 | mutex_unlock(&mapping->i_mmap_mutex); | 226 | mutex_unlock(&mapping->i_mmap_mutex); |
| 221 | } | 227 | } |
| 222 | 228 | ||
| 229 | if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) { | ||
| 230 | if (!has_write_lock) | ||
| 231 | goto get_write_lock; | ||
| 232 | vma->vm_flags |= VM_POPULATE; | ||
| 233 | } | ||
| 234 | |||
| 223 | if (vma->vm_flags & VM_LOCKED) { | 235 | if (vma->vm_flags & VM_LOCKED) { |
| 224 | /* | 236 | /* |
| 225 | * drop PG_Mlocked flag for over-mapped range | 237 | * drop PG_Mlocked flag for over-mapped range |
| 226 | */ | 238 | */ |
| 227 | vm_flags_t saved_flags = vma->vm_flags; | 239 | if (!has_write_lock) |
| 240 | goto get_write_lock; | ||
| 241 | vm_flags = vma->vm_flags; | ||
| 228 | munlock_vma_pages_range(vma, start, start + size); | 242 | munlock_vma_pages_range(vma, start, start + size); |
| 229 | vma->vm_flags = saved_flags; | 243 | vma->vm_flags = vm_flags; |
| 230 | } | 244 | } |
| 231 | 245 | ||
| 232 | mmu_notifier_invalidate_range_start(mm, start, start + size); | 246 | mmu_notifier_invalidate_range_start(mm, start, start + size); |
| 233 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); | 247 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); |
| 234 | mmu_notifier_invalidate_range_end(mm, start, start + size); | 248 | mmu_notifier_invalidate_range_end(mm, start, start + size); |
| 235 | if (!err && !(flags & MAP_NONBLOCK)) { | ||
| 236 | if (vma->vm_flags & VM_LOCKED) { | ||
| 237 | /* | ||
| 238 | * might be mapping previously unmapped range of file | ||
| 239 | */ | ||
| 240 | mlock_vma_pages_range(vma, start, start + size); | ||
| 241 | } else { | ||
| 242 | if (unlikely(has_write_lock)) { | ||
| 243 | downgrade_write(&mm->mmap_sem); | ||
| 244 | has_write_lock = 0; | ||
| 245 | } | ||
| 246 | make_pages_present(start, start+size); | ||
| 247 | } | ||
| 248 | } | ||
| 249 | 249 | ||
| 250 | /* | 250 | /* |
| 251 | * We can't clear VM_NONLINEAR because we'd have to do | 251 | * We can't clear VM_NONLINEAR because we'd have to do |
| @@ -254,10 +254,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
| 254 | */ | 254 | */ |
| 255 | 255 | ||
| 256 | out: | 256 | out: |
| 257 | vm_flags = vma->vm_flags; | ||
| 257 | if (likely(!has_write_lock)) | 258 | if (likely(!has_write_lock)) |
| 258 | up_read(&mm->mmap_sem); | 259 | up_read(&mm->mmap_sem); |
| 259 | else | 260 | else |
| 260 | up_write(&mm->mmap_sem); | 261 | up_write(&mm->mmap_sem); |
| 262 | if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) | ||
| 263 | mm_populate(start, size); | ||
| 261 | 264 | ||
| 262 | return err; | 265 | return err; |
| 263 | } | 266 | } |
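
Annotation: the remap_file_pages() rework above funnels pre-faulting of the remapped range through mm_populate() once the locks are dropped. As context, here is a small self-contained userspace example of the syscall this code serves; the temporary file path is arbitrary, and the example only relies on the documented remap_file_pages(addr, size, prot = 0, pgoff, flags) semantics.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);
    int fd = open("/tmp/remap-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

    if (fd < 0 || ftruncate(fd, 2 * page) != 0) {
        perror("setup");
        return 1;
    }

    char *map = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (map == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    memcpy(map, "first", 6);
    memcpy(map + page, "second", 7);

    /* Rebind the first mapped page to file page 1: the VMA becomes
     * VM_NONLINEAR, which is exactly the path reworked above. */
    if (remap_file_pages(map, page, 0, 1, 0) != 0) {
        perror("remap_file_pages");
        return 1;
    }

    printf("%s\n", map);    /* now reads "second" */
    munmap(map, 2 * page);
    close(fd);
    return 0;
}
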
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b5783d81eda9..bfa142e67b1c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/mman.h> | 20 | #include <linux/mman.h> |
| 21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
| 22 | #include <linux/migrate.h> | 22 | #include <linux/migrate.h> |
| 23 | #include <linux/hashtable.h> | ||
| 23 | 24 | ||
| 24 | #include <asm/tlb.h> | 25 | #include <asm/tlb.h> |
| 25 | #include <asm/pgalloc.h> | 26 | #include <asm/pgalloc.h> |
| @@ -62,12 +63,11 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); | |||
| 62 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; | 63 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; |
| 63 | 64 | ||
| 64 | static int khugepaged(void *none); | 65 | static int khugepaged(void *none); |
| 65 | static int mm_slots_hash_init(void); | ||
| 66 | static int khugepaged_slab_init(void); | 66 | static int khugepaged_slab_init(void); |
| 67 | static void khugepaged_slab_free(void); | ||
| 68 | 67 | ||
| 69 | #define MM_SLOTS_HASH_HEADS 1024 | 68 | #define MM_SLOTS_HASH_BITS 10 |
| 70 | static struct hlist_head *mm_slots_hash __read_mostly; | 69 | static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); |
| 70 | |||
| 71 | static struct kmem_cache *mm_slot_cache __read_mostly; | 71 | static struct kmem_cache *mm_slot_cache __read_mostly; |
| 72 | 72 | ||
| 73 | /** | 73 | /** |
| @@ -105,7 +105,6 @@ static int set_recommended_min_free_kbytes(void) | |||
| 105 | struct zone *zone; | 105 | struct zone *zone; |
| 106 | int nr_zones = 0; | 106 | int nr_zones = 0; |
| 107 | unsigned long recommended_min; | 107 | unsigned long recommended_min; |
| 108 | extern int min_free_kbytes; | ||
| 109 | 108 | ||
| 110 | if (!khugepaged_enabled()) | 109 | if (!khugepaged_enabled()) |
| 111 | return 0; | 110 | return 0; |
| @@ -634,12 +633,6 @@ static int __init hugepage_init(void) | |||
| 634 | if (err) | 633 | if (err) |
| 635 | goto out; | 634 | goto out; |
| 636 | 635 | ||
| 637 | err = mm_slots_hash_init(); | ||
| 638 | if (err) { | ||
| 639 | khugepaged_slab_free(); | ||
| 640 | goto out; | ||
| 641 | } | ||
| 642 | |||
| 643 | register_shrinker(&huge_zero_page_shrinker); | 636 | register_shrinker(&huge_zero_page_shrinker); |
| 644 | 637 | ||
| 645 | /* | 638 | /* |
| @@ -1302,7 +1295,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1302 | int target_nid; | 1295 | int target_nid; |
| 1303 | int current_nid = -1; | 1296 | int current_nid = -1; |
| 1304 | bool migrated; | 1297 | bool migrated; |
| 1305 | bool page_locked = false; | ||
| 1306 | 1298 | ||
| 1307 | spin_lock(&mm->page_table_lock); | 1299 | spin_lock(&mm->page_table_lock); |
| 1308 | if (unlikely(!pmd_same(pmd, *pmdp))) | 1300 | if (unlikely(!pmd_same(pmd, *pmdp))) |
| @@ -1324,7 +1316,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1324 | /* Acquire the page lock to serialise THP migrations */ | 1316 | /* Acquire the page lock to serialise THP migrations */ |
| 1325 | spin_unlock(&mm->page_table_lock); | 1317 | spin_unlock(&mm->page_table_lock); |
| 1326 | lock_page(page); | 1318 | lock_page(page); |
| 1327 | page_locked = true; | ||
| 1328 | 1319 | ||
| 1329 | /* Confirm the PTE did not change while locked */ | 1320 | /* Confirm the PTE did not change while locked */ |
| 1330 | spin_lock(&mm->page_table_lock); | 1321 | spin_lock(&mm->page_table_lock); |
| @@ -1337,34 +1328,26 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1337 | 1328 | ||
| 1338 | /* Migrate the THP to the requested node */ | 1329 | /* Migrate the THP to the requested node */ |
| 1339 | migrated = migrate_misplaced_transhuge_page(mm, vma, | 1330 | migrated = migrate_misplaced_transhuge_page(mm, vma, |
| 1340 | pmdp, pmd, addr, | 1331 | pmdp, pmd, addr, page, target_nid); |
| 1341 | page, target_nid); | 1332 | if (!migrated) |
| 1342 | if (migrated) | 1333 | goto check_same; |
| 1343 | current_nid = target_nid; | ||
| 1344 | else { | ||
| 1345 | spin_lock(&mm->page_table_lock); | ||
| 1346 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
| 1347 | unlock_page(page); | ||
| 1348 | goto out_unlock; | ||
| 1349 | } | ||
| 1350 | goto clear_pmdnuma; | ||
| 1351 | } | ||
| 1352 | 1334 | ||
| 1353 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | 1335 | task_numa_fault(target_nid, HPAGE_PMD_NR, true); |
| 1354 | return 0; | 1336 | return 0; |
| 1355 | 1337 | ||
| 1338 | check_same: | ||
| 1339 | spin_lock(&mm->page_table_lock); | ||
| 1340 | if (unlikely(!pmd_same(pmd, *pmdp))) | ||
| 1341 | goto out_unlock; | ||
| 1356 | clear_pmdnuma: | 1342 | clear_pmdnuma: |
| 1357 | pmd = pmd_mknonnuma(pmd); | 1343 | pmd = pmd_mknonnuma(pmd); |
| 1358 | set_pmd_at(mm, haddr, pmdp, pmd); | 1344 | set_pmd_at(mm, haddr, pmdp, pmd); |
| 1359 | VM_BUG_ON(pmd_numa(*pmdp)); | 1345 | VM_BUG_ON(pmd_numa(*pmdp)); |
| 1360 | update_mmu_cache_pmd(vma, addr, pmdp); | 1346 | update_mmu_cache_pmd(vma, addr, pmdp); |
| 1361 | if (page_locked) | ||
| 1362 | unlock_page(page); | ||
| 1363 | |||
| 1364 | out_unlock: | 1347 | out_unlock: |
| 1365 | spin_unlock(&mm->page_table_lock); | 1348 | spin_unlock(&mm->page_table_lock); |
| 1366 | if (current_nid != -1) | 1349 | if (current_nid != -1) |
| 1367 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | 1350 | task_numa_fault(current_nid, HPAGE_PMD_NR, false); |
| 1368 | return 0; | 1351 | return 0; |
| 1369 | } | 1352 | } |
| 1370 | 1353 | ||
| @@ -1656,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
| 1656 | page_tail->mapping = page->mapping; | 1639 | page_tail->mapping = page->mapping; |
| 1657 | 1640 | ||
| 1658 | page_tail->index = page->index + i; | 1641 | page_tail->index = page->index + i; |
| 1659 | page_xchg_last_nid(page_tail, page_last_nid(page)); | 1642 | page_nid_xchg_last(page_tail, page_nid_last(page)); |
| 1660 | 1643 | ||
| 1661 | BUG_ON(!PageAnon(page_tail)); | 1644 | BUG_ON(!PageAnon(page_tail)); |
| 1662 | BUG_ON(!PageUptodate(page_tail)); | 1645 | BUG_ON(!PageUptodate(page_tail)); |
| @@ -1846,7 +1829,7 @@ int split_huge_page(struct page *page) | |||
| 1846 | 1829 | ||
| 1847 | BUG_ON(PageCompound(page)); | 1830 | BUG_ON(PageCompound(page)); |
| 1848 | out_unlock: | 1831 | out_unlock: |
| 1849 | anon_vma_unlock(anon_vma); | 1832 | anon_vma_unlock_write(anon_vma); |
| 1850 | put_anon_vma(anon_vma); | 1833 | put_anon_vma(anon_vma); |
| 1851 | out: | 1834 | out: |
| 1852 | return ret; | 1835 | return ret; |
| @@ -1908,12 +1891,6 @@ static int __init khugepaged_slab_init(void) | |||
| 1908 | return 0; | 1891 | return 0; |
| 1909 | } | 1892 | } |
| 1910 | 1893 | ||
| 1911 | static void __init khugepaged_slab_free(void) | ||
| 1912 | { | ||
| 1913 | kmem_cache_destroy(mm_slot_cache); | ||
| 1914 | mm_slot_cache = NULL; | ||
| 1915 | } | ||
| 1916 | |||
| 1917 | static inline struct mm_slot *alloc_mm_slot(void) | 1894 | static inline struct mm_slot *alloc_mm_slot(void) |
| 1918 | { | 1895 | { |
| 1919 | if (!mm_slot_cache) /* initialization failed */ | 1896 | if (!mm_slot_cache) /* initialization failed */ |
| @@ -1926,47 +1903,23 @@ static inline void free_mm_slot(struct mm_slot *mm_slot) | |||
| 1926 | kmem_cache_free(mm_slot_cache, mm_slot); | 1903 | kmem_cache_free(mm_slot_cache, mm_slot); |
| 1927 | } | 1904 | } |
| 1928 | 1905 | ||
| 1929 | static int __init mm_slots_hash_init(void) | ||
| 1930 | { | ||
| 1931 | mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), | ||
| 1932 | GFP_KERNEL); | ||
| 1933 | if (!mm_slots_hash) | ||
| 1934 | return -ENOMEM; | ||
| 1935 | return 0; | ||
| 1936 | } | ||
| 1937 | |||
| 1938 | #if 0 | ||
| 1939 | static void __init mm_slots_hash_free(void) | ||
| 1940 | { | ||
| 1941 | kfree(mm_slots_hash); | ||
| 1942 | mm_slots_hash = NULL; | ||
| 1943 | } | ||
| 1944 | #endif | ||
| 1945 | |||
| 1946 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | 1906 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) |
| 1947 | { | 1907 | { |
| 1948 | struct mm_slot *mm_slot; | 1908 | struct mm_slot *mm_slot; |
| 1949 | struct hlist_head *bucket; | ||
| 1950 | struct hlist_node *node; | 1909 | struct hlist_node *node; |
| 1951 | 1910 | ||
| 1952 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | 1911 | hash_for_each_possible(mm_slots_hash, mm_slot, node, hash, (unsigned long)mm) |
| 1953 | % MM_SLOTS_HASH_HEADS]; | ||
| 1954 | hlist_for_each_entry(mm_slot, node, bucket, hash) { | ||
| 1955 | if (mm == mm_slot->mm) | 1912 | if (mm == mm_slot->mm) |
| 1956 | return mm_slot; | 1913 | return mm_slot; |
| 1957 | } | 1914 | |
| 1958 | return NULL; | 1915 | return NULL; |
| 1959 | } | 1916 | } |
| 1960 | 1917 | ||
| 1961 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | 1918 | static void insert_to_mm_slots_hash(struct mm_struct *mm, |
| 1962 | struct mm_slot *mm_slot) | 1919 | struct mm_slot *mm_slot) |
| 1963 | { | 1920 | { |
| 1964 | struct hlist_head *bucket; | ||
| 1965 | |||
| 1966 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | ||
| 1967 | % MM_SLOTS_HASH_HEADS]; | ||
| 1968 | mm_slot->mm = mm; | 1921 | mm_slot->mm = mm; |
| 1969 | hlist_add_head(&mm_slot->hash, bucket); | 1922 | hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); |
| 1970 | } | 1923 | } |
| 1971 | 1924 | ||
| 1972 | static inline int khugepaged_test_exit(struct mm_struct *mm) | 1925 | static inline int khugepaged_test_exit(struct mm_struct *mm) |
| @@ -2035,7 +1988,7 @@ void __khugepaged_exit(struct mm_struct *mm) | |||
| 2035 | spin_lock(&khugepaged_mm_lock); | 1988 | spin_lock(&khugepaged_mm_lock); |
| 2036 | mm_slot = get_mm_slot(mm); | 1989 | mm_slot = get_mm_slot(mm); |
| 2037 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { | 1990 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { |
| 2038 | hlist_del(&mm_slot->hash); | 1991 | hash_del(&mm_slot->hash); |
| 2039 | list_del(&mm_slot->mm_node); | 1992 | list_del(&mm_slot->mm_node); |
| 2040 | free = 1; | 1993 | free = 1; |
| 2041 | } | 1994 | } |
| @@ -2368,7 +2321,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
| 2368 | BUG_ON(!pmd_none(*pmd)); | 2321 | BUG_ON(!pmd_none(*pmd)); |
| 2369 | set_pmd_at(mm, address, pmd, _pmd); | 2322 | set_pmd_at(mm, address, pmd, _pmd); |
| 2370 | spin_unlock(&mm->page_table_lock); | 2323 | spin_unlock(&mm->page_table_lock); |
| 2371 | anon_vma_unlock(vma->anon_vma); | 2324 | anon_vma_unlock_write(vma->anon_vma); |
| 2372 | goto out; | 2325 | goto out; |
| 2373 | } | 2326 | } |
| 2374 | 2327 | ||
| @@ -2376,7 +2329,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
| 2376 | * All pages are isolated and locked so anon_vma rmap | 2329 | * All pages are isolated and locked so anon_vma rmap |
| 2377 | * can't run anymore. | 2330 | * can't run anymore. |
| 2378 | */ | 2331 | */ |
| 2379 | anon_vma_unlock(vma->anon_vma); | 2332 | anon_vma_unlock_write(vma->anon_vma); |
| 2380 | 2333 | ||
| 2381 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); | 2334 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); |
| 2382 | pte_unmap(pte); | 2335 | pte_unmap(pte); |
| @@ -2423,7 +2376,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
| 2423 | struct page *page; | 2376 | struct page *page; |
| 2424 | unsigned long _address; | 2377 | unsigned long _address; |
| 2425 | spinlock_t *ptl; | 2378 | spinlock_t *ptl; |
| 2426 | int node = -1; | 2379 | int node = NUMA_NO_NODE; |
| 2427 | 2380 | ||
| 2428 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2381 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
| 2429 | 2382 | ||
| @@ -2453,7 +2406,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
| 2453 | * be more sophisticated and look at more pages, | 2406 | * be more sophisticated and look at more pages, |
| 2454 | * but isn't for now. | 2407 | * but isn't for now. |
| 2455 | */ | 2408 | */ |
| 2456 | if (node == -1) | 2409 | if (node == NUMA_NO_NODE) |
| 2457 | node = page_to_nid(page); | 2410 | node = page_to_nid(page); |
| 2458 | VM_BUG_ON(PageCompound(page)); | 2411 | VM_BUG_ON(PageCompound(page)); |
| 2459 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2412 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
| @@ -2484,7 +2437,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) | |||
| 2484 | 2437 | ||
| 2485 | if (khugepaged_test_exit(mm)) { | 2438 | if (khugepaged_test_exit(mm)) { |
| 2486 | /* free mm_slot */ | 2439 | /* free mm_slot */ |
| 2487 | hlist_del(&mm_slot->hash); | 2440 | hash_del(&mm_slot->hash); |
| 2488 | list_del(&mm_slot->mm_node); | 2441 | list_del(&mm_slot->mm_node); |
| 2489 | 2442 | ||
| 2490 | /* | 2443 | /* |
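
Annotation: khugepaged's open-coded mm_slots_hash bucket array is replaced above by the generic <linux/hashtable.h> helpers (DEFINE_HASHTABLE, hash_add, hash_for_each_possible, hash_del). The sketch below is only a rough userspace analog of that pattern, not the kernel API itself: the bucket count mirrors MM_SLOTS_HASH_BITS, while the hash function and the fake mm pointer are purely illustrative.

#include <stdio.h>
#include <stdlib.h>

#define MM_SLOTS_HASH_BITS 10
#define MM_SLOTS_HASH_SIZE (1U << MM_SLOTS_HASH_BITS)

struct mm_struct;                       /* opaque key, as in the kernel */

struct mm_slot {
    struct mm_slot *next;               /* stands in for the hlist linkage */
    struct mm_struct *mm;
};

static struct mm_slot *mm_slots_hash[MM_SLOTS_HASH_SIZE];

/* Crude stand-in for hashing the mm pointer down to a bucket index. */
static unsigned int hash_mm(struct mm_struct *mm)
{
    return ((unsigned long)mm >> 4) & (MM_SLOTS_HASH_SIZE - 1);
}

static void insert_to_mm_slots_hash(struct mm_struct *mm, struct mm_slot *slot)
{
    unsigned int bucket = hash_mm(mm);

    slot->mm = mm;
    slot->next = mm_slots_hash[bucket];
    mm_slots_hash[bucket] = slot;
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
    struct mm_slot *slot;

    /* Walk only the bucket the key hashes to, like hash_for_each_possible(). */
    for (slot = mm_slots_hash[hash_mm(mm)]; slot; slot = slot->next)
        if (slot->mm == mm)
            return slot;
    return NULL;
}

int main(void)
{
    struct mm_struct *fake_mm = (struct mm_struct *)0x1000;  /* hypothetical key */
    struct mm_slot *slot = calloc(1, sizeof(*slot));

    insert_to_mm_slots_hash(fake_mm, slot);
    printf("lookup: %s\n", get_mm_slot(fake_mm) ? "hit" : "miss");
    free(slot);
    return 0;
}
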
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 546db81820e4..cdb64e4d238a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -1293,8 +1293,7 @@ static void __init report_hugepages(void) | |||
| 1293 | 1293 | ||
| 1294 | for_each_hstate(h) { | 1294 | for_each_hstate(h) { |
| 1295 | char buf[32]; | 1295 | char buf[32]; |
| 1296 | printk(KERN_INFO "HugeTLB registered %s page size, " | 1296 | pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", |
| 1297 | "pre-allocated %ld pages\n", | ||
| 1298 | memfmt(buf, huge_page_size(h)), | 1297 | memfmt(buf, huge_page_size(h)), |
| 1299 | h->free_huge_pages); | 1298 | h->free_huge_pages); |
| 1300 | } | 1299 | } |
| @@ -1702,8 +1701,7 @@ static void __init hugetlb_sysfs_init(void) | |||
| 1702 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, | 1701 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, |
| 1703 | hstate_kobjs, &hstate_attr_group); | 1702 | hstate_kobjs, &hstate_attr_group); |
| 1704 | if (err) | 1703 | if (err) |
| 1705 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", | 1704 | pr_err("Hugetlb: Unable to add hstate %s", h->name); |
| 1706 | h->name); | ||
| 1707 | } | 1705 | } |
| 1708 | } | 1706 | } |
| 1709 | 1707 | ||
| @@ -1826,9 +1824,8 @@ void hugetlb_register_node(struct node *node) | |||
| 1826 | nhs->hstate_kobjs, | 1824 | nhs->hstate_kobjs, |
| 1827 | &per_node_hstate_attr_group); | 1825 | &per_node_hstate_attr_group); |
| 1828 | if (err) { | 1826 | if (err) { |
| 1829 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s" | 1827 | pr_err("Hugetlb: Unable to add hstate %s for node %d\n", |
| 1830 | " for node %d\n", | 1828 | h->name, node->dev.id); |
| 1831 | h->name, node->dev.id); | ||
| 1832 | hugetlb_unregister_node(node); | 1829 | hugetlb_unregister_node(node); |
| 1833 | break; | 1830 | break; |
| 1834 | } | 1831 | } |
| @@ -1924,7 +1921,7 @@ void __init hugetlb_add_hstate(unsigned order) | |||
| 1924 | unsigned long i; | 1921 | unsigned long i; |
| 1925 | 1922 | ||
| 1926 | if (size_to_hstate(PAGE_SIZE << order)) { | 1923 | if (size_to_hstate(PAGE_SIZE << order)) { |
| 1927 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); | 1924 | pr_warning("hugepagesz= specified twice, ignoring\n"); |
| 1928 | return; | 1925 | return; |
| 1929 | } | 1926 | } |
| 1930 | BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); | 1927 | BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); |
| @@ -1960,8 +1957,8 @@ static int __init hugetlb_nrpages_setup(char *s) | |||
| 1960 | mhp = &parsed_hstate->max_huge_pages; | 1957 | mhp = &parsed_hstate->max_huge_pages; |
| 1961 | 1958 | ||
| 1962 | if (mhp == last_mhp) { | 1959 | if (mhp == last_mhp) { |
| 1963 | printk(KERN_WARNING "hugepages= specified twice without " | 1960 | pr_warning("hugepages= specified twice without " |
| 1964 | "interleaving hugepagesz=, ignoring\n"); | 1961 | "interleaving hugepagesz=, ignoring\n"); |
| 1965 | return 1; | 1962 | return 1; |
| 1966 | } | 1963 | } |
| 1967 | 1964 | ||
| @@ -2692,9 +2689,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2692 | * COW. Warn that such a situation has occurred as it may not be obvious | 2689 | * COW. Warn that such a situation has occurred as it may not be obvious |
| 2693 | */ | 2690 | */ |
| 2694 | if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { | 2691 | if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { |
| 2695 | printk(KERN_WARNING | 2692 | pr_warning("PID %d killed due to inadequate hugepage pool\n", |
| 2696 | "PID %d killed due to inadequate hugepage pool\n", | 2693 | current->pid); |
| 2697 | current->pid); | ||
| 2698 | return ret; | 2694 | return ret; |
| 2699 | } | 2695 | } |
| 2700 | 2696 | ||
| @@ -2924,14 +2920,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, | |||
| 2924 | return NULL; | 2920 | return NULL; |
| 2925 | } | 2921 | } |
| 2926 | 2922 | ||
| 2927 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2923 | long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 2928 | struct page **pages, struct vm_area_struct **vmas, | 2924 | struct page **pages, struct vm_area_struct **vmas, |
| 2929 | unsigned long *position, int *length, int i, | 2925 | unsigned long *position, unsigned long *nr_pages, |
| 2930 | unsigned int flags) | 2926 | long i, unsigned int flags) |
| 2931 | { | 2927 | { |
| 2932 | unsigned long pfn_offset; | 2928 | unsigned long pfn_offset; |
| 2933 | unsigned long vaddr = *position; | 2929 | unsigned long vaddr = *position; |
| 2934 | int remainder = *length; | 2930 | unsigned long remainder = *nr_pages; |
| 2935 | struct hstate *h = hstate_vma(vma); | 2931 | struct hstate *h = hstate_vma(vma); |
| 2936 | 2932 | ||
| 2937 | spin_lock(&mm->page_table_lock); | 2933 | spin_lock(&mm->page_table_lock); |
| @@ -3001,7 +2997,7 @@ same_page: | |||
| 3001 | } | 2997 | } |
| 3002 | } | 2998 | } |
| 3003 | spin_unlock(&mm->page_table_lock); | 2999 | spin_unlock(&mm->page_table_lock); |
| 3004 | *length = remainder; | 3000 | *nr_pages = remainder; |
| 3005 | *position = vaddr; | 3001 | *position = vaddr; |
| 3006 | 3002 | ||
| 3007 | return i ? i : -EFAULT; | 3003 | return i ? i : -EFAULT; |
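
Annotation: follow_hugetlb_page() above moves from int to long / unsigned long page counts, presumably so that get_user_pages()-style walks over very large ranges cannot overflow a 32-bit count. A quick arithmetic check of where an int would give out, assuming 4 KiB base pages:

#include <stdio.h>

int main(void)
{
    /* With 4 KiB base pages, a 32-bit page count overflows past 8 TiB:
     * 2^31 pages * 4 KiB = 8 TiB.  Passing counts as long/unsigned long
     * keeps such ranges representable. */
    unsigned long pages_per_tib = (1UL << 40) / 4096;    /* 268435456 */
    unsigned long eight_tib_pages = 8 * pages_per_tib;   /* 2147483648 */

    printf("pages in 8 TiB of 4 KiB pages: %lu\n", eight_tib_pages);
    printf("fits in a signed int? %s\n",
           eight_tib_pages > 0x7fffffffUL ? "no" : "yes");
    return 0;
}
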
diff --git a/mm/internal.h b/mm/internal.h index 9ba21100ebf3..1c0c4cc0fcf7 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -162,8 +162,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 162 | struct vm_area_struct *prev, struct rb_node *rb_parent); | 162 | struct vm_area_struct *prev, struct rb_node *rb_parent); |
| 163 | 163 | ||
| 164 | #ifdef CONFIG_MMU | 164 | #ifdef CONFIG_MMU |
| 165 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, | 165 | extern long __mlock_vma_pages_range(struct vm_area_struct *vma, |
| 166 | unsigned long start, unsigned long end); | 166 | unsigned long start, unsigned long end, int *nonblocking); |
| 167 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, | 167 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, |
| 168 | unsigned long start, unsigned long end); | 168 | unsigned long start, unsigned long end); |
| 169 | static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | 169 | static inline void munlock_vma_pages_all(struct vm_area_struct *vma) |
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 752a705c77c2..83dd5fbf5e60 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
| @@ -1300,9 +1300,8 @@ static void kmemleak_scan(void) | |||
| 1300 | */ | 1300 | */ |
| 1301 | lock_memory_hotplug(); | 1301 | lock_memory_hotplug(); |
| 1302 | for_each_online_node(i) { | 1302 | for_each_online_node(i) { |
| 1303 | pg_data_t *pgdat = NODE_DATA(i); | 1303 | unsigned long start_pfn = node_start_pfn(i); |
| 1304 | unsigned long start_pfn = pgdat->node_start_pfn; | 1304 | unsigned long end_pfn = node_end_pfn(i); |
| 1305 | unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; | ||
| 1306 | unsigned long pfn; | 1305 | unsigned long pfn; |
| 1307 | 1306 | ||
| 1308 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 1307 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
| @@ -33,13 +33,22 @@ | |||
| 33 | #include <linux/mmu_notifier.h> | 33 | #include <linux/mmu_notifier.h> |
| 34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
| 35 | #include <linux/ksm.h> | 35 | #include <linux/ksm.h> |
| 36 | #include <linux/hash.h> | 36 | #include <linux/hashtable.h> |
| 37 | #include <linux/freezer.h> | 37 | #include <linux/freezer.h> |
| 38 | #include <linux/oom.h> | 38 | #include <linux/oom.h> |
| 39 | #include <linux/numa.h> | ||
| 39 | 40 | ||
| 40 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
| 41 | #include "internal.h" | 42 | #include "internal.h" |
| 42 | 43 | ||
| 44 | #ifdef CONFIG_NUMA | ||
| 45 | #define NUMA(x) (x) | ||
| 46 | #define DO_NUMA(x) do { (x); } while (0) | ||
| 47 | #else | ||
| 48 | #define NUMA(x) (0) | ||
| 49 | #define DO_NUMA(x) do { } while (0) | ||
| 50 | #endif | ||
| 51 | |||
| 43 | /* | 52 | /* |
| 44 | * A few notes about the KSM scanning process, | 53 | * A few notes about the KSM scanning process, |
| 45 | * to make it easier to understand the data structures below: | 54 | * to make it easier to understand the data structures below: |
| @@ -78,6 +87,9 @@ | |||
| 78 | * take 10 attempts to find a page in the unstable tree, once it is found, | 87 | * take 10 attempts to find a page in the unstable tree, once it is found, |
| 79 | * it is secured in the stable tree. (When we scan a new page, we first | 88 | * it is secured in the stable tree. (When we scan a new page, we first |
| 80 | * compare it against the stable tree, and then against the unstable tree.) | 89 | * compare it against the stable tree, and then against the unstable tree.) |
| 90 | * | ||
| 91 | * If the merge_across_nodes tunable is unset, then KSM maintains multiple | ||
| 92 | * stable trees and multiple unstable trees: one of each for each NUMA node. | ||
| 81 | */ | 93 | */ |
| 82 | 94 | ||
| 83 | /** | 95 | /** |
| @@ -113,19 +125,32 @@ struct ksm_scan { | |||
| 113 | /** | 125 | /** |
| 114 | * struct stable_node - node of the stable rbtree | 126 | * struct stable_node - node of the stable rbtree |
| 115 | * @node: rb node of this ksm page in the stable tree | 127 | * @node: rb node of this ksm page in the stable tree |
| 128 | * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list | ||
| 129 | * @list: linked into migrate_nodes, pending placement in the proper node tree | ||
| 116 | * @hlist: hlist head of rmap_items using this ksm page | 130 | * @hlist: hlist head of rmap_items using this ksm page |
| 117 | * @kpfn: page frame number of this ksm page | 131 | * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) |
| 132 | * @nid: NUMA node id of stable tree in which linked (may not match kpfn) | ||
| 118 | */ | 133 | */ |
| 119 | struct stable_node { | 134 | struct stable_node { |
| 120 | struct rb_node node; | 135 | union { |
| 136 | struct rb_node node; /* when node of stable tree */ | ||
| 137 | struct { /* when listed for migration */ | ||
| 138 | struct list_head *head; | ||
| 139 | struct list_head list; | ||
| 140 | }; | ||
| 141 | }; | ||
| 121 | struct hlist_head hlist; | 142 | struct hlist_head hlist; |
| 122 | unsigned long kpfn; | 143 | unsigned long kpfn; |
| 144 | #ifdef CONFIG_NUMA | ||
| 145 | int nid; | ||
| 146 | #endif | ||
| 123 | }; | 147 | }; |
| 124 | 148 | ||
| 125 | /** | 149 | /** |
| 126 | * struct rmap_item - reverse mapping item for virtual addresses | 150 | * struct rmap_item - reverse mapping item for virtual addresses |
| 127 | * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list | 151 | * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list |
| 128 | * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree | 152 | * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree |
| 153 | * @nid: NUMA node id of unstable tree in which linked (may not match page) | ||
| 129 | * @mm: the memory structure this rmap_item is pointing into | 154 | * @mm: the memory structure this rmap_item is pointing into |
| 130 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) | 155 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) |
| 131 | * @oldchecksum: previous checksum of the page at that virtual address | 156 | * @oldchecksum: previous checksum of the page at that virtual address |
| @@ -135,7 +160,12 @@ struct stable_node { | |||
| 135 | */ | 160 | */ |
| 136 | struct rmap_item { | 161 | struct rmap_item { |
| 137 | struct rmap_item *rmap_list; | 162 | struct rmap_item *rmap_list; |
| 138 | struct anon_vma *anon_vma; /* when stable */ | 163 | union { |
| 164 | struct anon_vma *anon_vma; /* when stable */ | ||
| 165 | #ifdef CONFIG_NUMA | ||
| 166 | int nid; /* when node of unstable tree */ | ||
| 167 | #endif | ||
| 168 | }; | ||
| 139 | struct mm_struct *mm; | 169 | struct mm_struct *mm; |
| 140 | unsigned long address; /* + low bits used for flags below */ | 170 | unsigned long address; /* + low bits used for flags below */ |
| 141 | unsigned int oldchecksum; /* when unstable */ | 171 | unsigned int oldchecksum; /* when unstable */ |
| @@ -153,12 +183,16 @@ struct rmap_item { | |||
| 153 | #define STABLE_FLAG 0x200 /* is listed from the stable tree */ | 183 | #define STABLE_FLAG 0x200 /* is listed from the stable tree */ |
| 154 | 184 | ||
| 155 | /* The stable and unstable tree heads */ | 185 | /* The stable and unstable tree heads */ |
| 156 | static struct rb_root root_stable_tree = RB_ROOT; | 186 | static struct rb_root one_stable_tree[1] = { RB_ROOT }; |
| 157 | static struct rb_root root_unstable_tree = RB_ROOT; | 187 | static struct rb_root one_unstable_tree[1] = { RB_ROOT }; |
| 188 | static struct rb_root *root_stable_tree = one_stable_tree; | ||
| 189 | static struct rb_root *root_unstable_tree = one_unstable_tree; | ||
| 158 | 190 | ||
| 159 | #define MM_SLOTS_HASH_SHIFT 10 | 191 | /* Recently migrated nodes of stable tree, pending proper placement */ |
| 160 | #define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) | 192 | static LIST_HEAD(migrate_nodes); |
| 161 | static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; | 193 | |
| 194 | #define MM_SLOTS_HASH_BITS 10 | ||
| 195 | static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); | ||
| 162 | 196 | ||
| 163 | static struct mm_slot ksm_mm_head = { | 197 | static struct mm_slot ksm_mm_head = { |
| 164 | .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), | 198 | .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), |
| @@ -189,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100; | |||
| 189 | /* Milliseconds ksmd should sleep between batches */ | 223 | /* Milliseconds ksmd should sleep between batches */ |
| 190 | static unsigned int ksm_thread_sleep_millisecs = 20; | 224 | static unsigned int ksm_thread_sleep_millisecs = 20; |
| 191 | 225 | ||
| 226 | #ifdef CONFIG_NUMA | ||
| 227 | /* Zeroed when merging across nodes is not allowed */ | ||
| 228 | static unsigned int ksm_merge_across_nodes = 1; | ||
| 229 | static int ksm_nr_node_ids = 1; | ||
| 230 | #else | ||
| 231 | #define ksm_merge_across_nodes 1U | ||
| 232 | #define ksm_nr_node_ids 1 | ||
| 233 | #endif | ||
| 234 | |||
| 192 | #define KSM_RUN_STOP 0 | 235 | #define KSM_RUN_STOP 0 |
| 193 | #define KSM_RUN_MERGE 1 | 236 | #define KSM_RUN_MERGE 1 |
| 194 | #define KSM_RUN_UNMERGE 2 | 237 | #define KSM_RUN_UNMERGE 2 |
| 195 | static unsigned int ksm_run = KSM_RUN_STOP; | 238 | #define KSM_RUN_OFFLINE 4 |
| 239 | static unsigned long ksm_run = KSM_RUN_STOP; | ||
| 240 | static void wait_while_offlining(void); | ||
| 196 | 241 | ||
| 197 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); | 242 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); |
| 198 | static DEFINE_MUTEX(ksm_thread_mutex); | 243 | static DEFINE_MUTEX(ksm_thread_mutex); |
| @@ -275,31 +320,21 @@ static inline void free_mm_slot(struct mm_slot *mm_slot) | |||
| 275 | 320 | ||
| 276 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | 321 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) |
| 277 | { | 322 | { |
| 278 | struct mm_slot *mm_slot; | ||
| 279 | struct hlist_head *bucket; | ||
| 280 | struct hlist_node *node; | 323 | struct hlist_node *node; |
| 324 | struct mm_slot *slot; | ||
| 325 | |||
| 326 | hash_for_each_possible(mm_slots_hash, slot, node, link, (unsigned long)mm) | ||
| 327 | if (slot->mm == mm) | ||
| 328 | return slot; | ||
| 281 | 329 | ||
| 282 | bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; | ||
| 283 | hlist_for_each_entry(mm_slot, node, bucket, link) { | ||
| 284 | if (mm == mm_slot->mm) | ||
| 285 | return mm_slot; | ||
| 286 | } | ||
| 287 | return NULL; | 330 | return NULL; |
| 288 | } | 331 | } |
| 289 | 332 | ||
| 290 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | 333 | static void insert_to_mm_slots_hash(struct mm_struct *mm, |
| 291 | struct mm_slot *mm_slot) | 334 | struct mm_slot *mm_slot) |
| 292 | { | 335 | { |
| 293 | struct hlist_head *bucket; | ||
| 294 | |||
| 295 | bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; | ||
| 296 | mm_slot->mm = mm; | 336 | mm_slot->mm = mm; |
| 297 | hlist_add_head(&mm_slot->link, bucket); | 337 | hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); |
| 298 | } | ||
| 299 | |||
| 300 | static inline int in_stable_tree(struct rmap_item *rmap_item) | ||
| 301 | { | ||
| 302 | return rmap_item->address & STABLE_FLAG; | ||
| 303 | } | 338 | } |
| 304 | 339 | ||
| 305 | /* | 340 | /* |
| @@ -333,7 +368,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
| 333 | 368 | ||
| 334 | do { | 369 | do { |
| 335 | cond_resched(); | 370 | cond_resched(); |
| 336 | page = follow_page(vma, addr, FOLL_GET); | 371 | page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION); |
| 337 | if (IS_ERR_OR_NULL(page)) | 372 | if (IS_ERR_OR_NULL(page)) |
| 338 | break; | 373 | break; |
| 339 | if (PageKsm(page)) | 374 | if (PageKsm(page)) |
| @@ -447,6 +482,17 @@ out: page = NULL; | |||
| 447 | return page; | 482 | return page; |
| 448 | } | 483 | } |
| 449 | 484 | ||
| 485 | /* | ||
| 486 | * This helper is used for getting right index into array of tree roots. | ||
| 487 | * When merge_across_nodes knob is set to 1, there are only two rb-trees for | ||
| 488 | * stable and unstable pages from all nodes with roots in index 0. Otherwise, | ||
| 489 | * every node has its own stable and unstable tree. | ||
| 490 | */ | ||
| 491 | static inline int get_kpfn_nid(unsigned long kpfn) | ||
| 492 | { | ||
| 493 | return ksm_merge_across_nodes ? 0 : pfn_to_nid(kpfn); | ||
| 494 | } | ||
| 495 | |||
| 450 | static void remove_node_from_stable_tree(struct stable_node *stable_node) | 496 | static void remove_node_from_stable_tree(struct stable_node *stable_node) |
| 451 | { | 497 | { |
| 452 | struct rmap_item *rmap_item; | 498 | struct rmap_item *rmap_item; |
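
Annotation: get_kpfn_nid(), added just above, selects which of the per-node stable/unstable trees a pfn belongs to, collapsing everything onto index 0 while merge_across_nodes stays enabled. The following is a small userspace mock of that selection; the node count and the striped pfn_to_nid() stand-in are invented for illustration only.

#include <stdio.h>

#define NR_NODE_IDS 4                       /* hypothetical node count */

static unsigned int merge_across_nodes = 1; /* mirrors the new sysfs knob */

/* Stand-in for pfn_to_nid(): pretend pfns are striped across nodes. */
static int pfn_to_nid(unsigned long kpfn)
{
    return (kpfn >> 20) % NR_NODE_IDS;
}

/* Mirrors get_kpfn_nid(): one shared pair of trees when merging across
 * nodes, otherwise one stable and one unstable tree per NUMA node. */
static int get_kpfn_nid(unsigned long kpfn)
{
    return merge_across_nodes ? 0 : pfn_to_nid(kpfn);
}

int main(void)
{
    unsigned long kpfn = 0x340000;

    printf("merge_across_nodes=1 -> tree index %d\n", get_kpfn_nid(kpfn));
    merge_across_nodes = 0;
    printf("merge_across_nodes=0 -> tree index %d\n", get_kpfn_nid(kpfn));
    return 0;
}
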
| @@ -462,7 +508,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
| 462 | cond_resched(); | 508 | cond_resched(); |
| 463 | } | 509 | } |
| 464 | 510 | ||
| 465 | rb_erase(&stable_node->node, &root_stable_tree); | 511 | if (stable_node->head == &migrate_nodes) |
| 512 | list_del(&stable_node->list); | ||
| 513 | else | ||
| 514 | rb_erase(&stable_node->node, | ||
| 515 | root_stable_tree + NUMA(stable_node->nid)); | ||
| 466 | free_stable_node(stable_node); | 516 | free_stable_node(stable_node); |
| 467 | } | 517 | } |
| 468 | 518 | ||
| @@ -472,6 +522,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
| 472 | * In which case we can trust the content of the page, and it | 522 | * In which case we can trust the content of the page, and it |
| 473 | * returns the gotten page; but if the page has now been zapped, | 523 | * returns the gotten page; but if the page has now been zapped, |
| 474 | * remove the stale node from the stable tree and return NULL. | 524 | * remove the stale node from the stable tree and return NULL. |
| 525 | * But beware, the stable node's page might be being migrated. | ||
| 475 | * | 526 | * |
| 476 | * You would expect the stable_node to hold a reference to the ksm page. | 527 | * You would expect the stable_node to hold a reference to the ksm page. |
| 477 | * But if it increments the page's count, swapping out has to wait for | 528 | * But if it increments the page's count, swapping out has to wait for |
| @@ -482,40 +533,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
| 482 | * pointing back to this stable node. This relies on freeing a PageAnon | 533 | * pointing back to this stable node. This relies on freeing a PageAnon |
| 483 | * page to reset its page->mapping to NULL, and relies on no other use of | 534 | * page to reset its page->mapping to NULL, and relies on no other use of |
| 484 | * a page to put something that might look like our key in page->mapping. | 535 | * a page to put something that might look like our key in page->mapping. |
| 485 | * | ||
| 486 | * include/linux/pagemap.h page_cache_get_speculative() is a good reference, | ||
| 487 | * but this is different - made simpler by ksm_thread_mutex being held, but | ||
| 488 | * interesting for assuming that no other use of the struct page could ever | ||
| 489 | * put our expected_mapping into page->mapping (or a field of the union which | ||
| 490 | * coincides with page->mapping). The RCU calls are not for KSM at all, but | ||
| 491 | * to keep the page_count protocol described with page_cache_get_speculative. | ||
| 492 | * | ||
| 493 | * Note: it is possible that get_ksm_page() will return NULL one moment, | ||
| 494 | * then page the next, if the page is in between page_freeze_refs() and | ||
| 495 | * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page | ||
| 496 | * is on its way to being freed; but it is an anomaly to bear in mind. | 536 | * is on its way to being freed; but it is an anomaly to bear in mind. |
| 497 | */ | 537 | */ |
| 498 | static struct page *get_ksm_page(struct stable_node *stable_node) | 538 | static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) |
| 499 | { | 539 | { |
| 500 | struct page *page; | 540 | struct page *page; |
| 501 | void *expected_mapping; | 541 | void *expected_mapping; |
| 542 | unsigned long kpfn; | ||
| 502 | 543 | ||
| 503 | page = pfn_to_page(stable_node->kpfn); | ||
| 504 | expected_mapping = (void *)stable_node + | 544 | expected_mapping = (void *)stable_node + |
| 505 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | 545 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); |
| 506 | rcu_read_lock(); | 546 | again: |
| 507 | if (page->mapping != expected_mapping) | 547 | kpfn = ACCESS_ONCE(stable_node->kpfn); |
| 508 | goto stale; | 548 | page = pfn_to_page(kpfn); |
| 509 | if (!get_page_unless_zero(page)) | 549 | |
| 550 | /* | ||
| 551 | * page is computed from kpfn, so on most architectures reading | ||
| 552 | * page->mapping is naturally ordered after reading node->kpfn, | ||
| 553 | * but on Alpha we need to be more careful. | ||
| 554 | */ | ||
| 555 | smp_read_barrier_depends(); | ||
| 556 | if (ACCESS_ONCE(page->mapping) != expected_mapping) | ||
| 510 | goto stale; | 557 | goto stale; |
| 511 | if (page->mapping != expected_mapping) { | 558 | |
| 559 | /* | ||
| 560 | * We cannot do anything with the page while its refcount is 0. | ||
| 561 | * Usually 0 means free, or tail of a higher-order page: in which | ||
| 562 | * case this node is no longer referenced, and should be freed; | ||
| 563 | * however, it might mean that the page is under page_freeze_refs(). | ||
| 564 | * The __remove_mapping() case is easy, again the node is now stale; | ||
| 565 | * but if page is swapcache in migrate_page_move_mapping(), it might | ||
| 566 | * still be our page, in which case it's essential to keep the node. | ||
| 567 | */ | ||
| 568 | while (!get_page_unless_zero(page)) { | ||
| 569 | /* | ||
| 570 | * Another check for page->mapping != expected_mapping would | ||
| 571 | * work here too. We have chosen the !PageSwapCache test to | ||
| 572 | * optimize the common case, when the page is or is about to | ||
| 573 | * be freed: PageSwapCache is cleared (under spin_lock_irq) | ||
| 574 | * in the freeze_refs section of __remove_mapping(); but Anon | ||
| 575 | * page->mapping reset to NULL later, in free_pages_prepare(). | ||
| 576 | */ | ||
| 577 | if (!PageSwapCache(page)) | ||
| 578 | goto stale; | ||
| 579 | cpu_relax(); | ||
| 580 | } | ||
| 581 | |||
| 582 | if (ACCESS_ONCE(page->mapping) != expected_mapping) { | ||
| 512 | put_page(page); | 583 | put_page(page); |
| 513 | goto stale; | 584 | goto stale; |
| 514 | } | 585 | } |
| 515 | rcu_read_unlock(); | 586 | |
| 587 | if (lock_it) { | ||
| 588 | lock_page(page); | ||
| 589 | if (ACCESS_ONCE(page->mapping) != expected_mapping) { | ||
| 590 | unlock_page(page); | ||
| 591 | put_page(page); | ||
| 592 | goto stale; | ||
| 593 | } | ||
| 594 | } | ||
| 516 | return page; | 595 | return page; |
| 596 | |||
| 517 | stale: | 597 | stale: |
| 518 | rcu_read_unlock(); | 598 | /* |
| 599 | * We come here from above when page->mapping or !PageSwapCache | ||
| 600 | * suggests that the node is stale; but it might be under migration. | ||
| 601 | * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), | ||
| 602 | * before checking whether node->kpfn has been changed. | ||
| 603 | */ | ||
| 604 | smp_rmb(); | ||
| 605 | if (ACCESS_ONCE(stable_node->kpfn) != kpfn) | ||
| 606 | goto again; | ||
| 519 | remove_node_from_stable_tree(stable_node); | 607 | remove_node_from_stable_tree(stable_node); |
| 520 | return NULL; | 608 | return NULL; |
| 521 | } | 609 | } |
| @@ -531,11 +619,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
| 531 | struct page *page; | 619 | struct page *page; |
| 532 | 620 | ||
| 533 | stable_node = rmap_item->head; | 621 | stable_node = rmap_item->head; |
| 534 | page = get_ksm_page(stable_node); | 622 | page = get_ksm_page(stable_node, true); |
| 535 | if (!page) | 623 | if (!page) |
| 536 | goto out; | 624 | goto out; |
| 537 | 625 | ||
| 538 | lock_page(page); | ||
| 539 | hlist_del(&rmap_item->hlist); | 626 | hlist_del(&rmap_item->hlist); |
| 540 | unlock_page(page); | 627 | unlock_page(page); |
| 541 | put_page(page); | 628 | put_page(page); |
| @@ -560,8 +647,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
| 560 | age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); | 647 | age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); |
| 561 | BUG_ON(age > 1); | 648 | BUG_ON(age > 1); |
| 562 | if (!age) | 649 | if (!age) |
| 563 | rb_erase(&rmap_item->node, &root_unstable_tree); | 650 | rb_erase(&rmap_item->node, |
| 564 | 651 | root_unstable_tree + NUMA(rmap_item->nid)); | |
| 565 | ksm_pages_unshared--; | 652 | ksm_pages_unshared--; |
| 566 | rmap_item->address &= PAGE_MASK; | 653 | rmap_item->address &= PAGE_MASK; |
| 567 | } | 654 | } |
| @@ -581,7 +668,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot, | |||
| 581 | } | 668 | } |
| 582 | 669 | ||
| 583 | /* | 670 | /* |
| 584 | * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather | 671 | * Though it's very tempting to unmerge rmap_items from stable tree rather |
| 585 | * than check every pte of a given vma, the locking doesn't quite work for | 672 | * than check every pte of a given vma, the locking doesn't quite work for |
| 586 | * that - an rmap_item is assigned to the stable tree after inserting ksm | 673 | * that - an rmap_item is assigned to the stable tree after inserting ksm |
| 587 | * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing | 674 | * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing |
| @@ -614,6 +701,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, | |||
| 614 | /* | 701 | /* |
| 615 | * Only called through the sysfs control interface: | 702 | * Only called through the sysfs control interface: |
| 616 | */ | 703 | */ |
| 704 | static int remove_stable_node(struct stable_node *stable_node) | ||
| 705 | { | ||
| 706 | struct page *page; | ||
| 707 | int err; | ||
| 708 | |||
| 709 | page = get_ksm_page(stable_node, true); | ||
| 710 | if (!page) { | ||
| 711 | /* | ||
| 712 | * get_ksm_page did remove_node_from_stable_tree itself. | ||
| 713 | */ | ||
| 714 | return 0; | ||
| 715 | } | ||
| 716 | |||
| 717 | if (WARN_ON_ONCE(page_mapped(page))) { | ||
| 718 | /* | ||
| 719 | * This should not happen: but if it does, just refuse to let | ||
| 720 | * merge_across_nodes be switched - there is no need to panic. | ||
| 721 | */ | ||
| 722 | err = -EBUSY; | ||
| 723 | } else { | ||
| 724 | /* | ||
| 725 | * The stable node did not yet appear stale to get_ksm_page(), | ||
| 726 | * since that allows for an unmapped ksm page to be recognized | ||
| 727 | * right up until it is freed; but the node is safe to remove. | ||
| 728 | * This page might be in a pagevec waiting to be freed, | ||
| 729 | * or it might be PageSwapCache (perhaps under writeback), | ||
| 730 | * or it might have been removed from swapcache a moment ago. | ||
| 731 | */ | ||
| 732 | set_page_stable_node(page, NULL); | ||
| 733 | remove_node_from_stable_tree(stable_node); | ||
| 734 | err = 0; | ||
| 735 | } | ||
| 736 | |||
| 737 | unlock_page(page); | ||
| 738 | put_page(page); | ||
| 739 | return err; | ||
| 740 | } | ||
| 741 | |||
| 742 | static int remove_all_stable_nodes(void) | ||
| 743 | { | ||
| 744 | struct stable_node *stable_node; | ||
| 745 | struct list_head *this, *next; | ||
| 746 | int nid; | ||
| 747 | int err = 0; | ||
| 748 | |||
| 749 | for (nid = 0; nid < ksm_nr_node_ids; nid++) { | ||
| 750 | while (root_stable_tree[nid].rb_node) { | ||
| 751 | stable_node = rb_entry(root_stable_tree[nid].rb_node, | ||
| 752 | struct stable_node, node); | ||
| 753 | if (remove_stable_node(stable_node)) { | ||
| 754 | err = -EBUSY; | ||
| 755 | break; /* proceed to next nid */ | ||
| 756 | } | ||
| 757 | cond_resched(); | ||
| 758 | } | ||
| 759 | } | ||
| 760 | list_for_each_safe(this, next, &migrate_nodes) { | ||
| 761 | stable_node = list_entry(this, struct stable_node, list); | ||
| 762 | if (remove_stable_node(stable_node)) | ||
| 763 | err = -EBUSY; | ||
| 764 | cond_resched(); | ||
| 765 | } | ||
| 766 | return err; | ||
| 767 | } | ||
| 768 | |||
| 617 | static int unmerge_and_remove_all_rmap_items(void) | 769 | static int unmerge_and_remove_all_rmap_items(void) |
| 618 | { | 770 | { |
| 619 | struct mm_slot *mm_slot; | 771 | struct mm_slot *mm_slot; |
| @@ -647,7 +799,7 @@ static int unmerge_and_remove_all_rmap_items(void) | |||
| 647 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, | 799 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, |
| 648 | struct mm_slot, mm_list); | 800 | struct mm_slot, mm_list); |
| 649 | if (ksm_test_exit(mm)) { | 801 | if (ksm_test_exit(mm)) { |
| 650 | hlist_del(&mm_slot->link); | 802 | hash_del(&mm_slot->link); |
| 651 | list_del(&mm_slot->mm_list); | 803 | list_del(&mm_slot->mm_list); |
| 652 | spin_unlock(&ksm_mmlist_lock); | 804 | spin_unlock(&ksm_mmlist_lock); |
| 653 | 805 | ||
| @@ -661,6 +813,8 @@ static int unmerge_and_remove_all_rmap_items(void) | |||
| 661 | } | 813 | } |
| 662 | } | 814 | } |
| 663 | 815 | ||
| 816 | /* Clean up stable nodes, but don't worry if some are still busy */ | ||
| 817 | remove_all_stable_nodes(); | ||
| 664 | ksm_scan.seqnr = 0; | 818 | ksm_scan.seqnr = 0; |
| 665 | return 0; | 819 | return 0; |
| 666 | 820 | ||
| @@ -946,6 +1100,9 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, | |||
| 946 | if (err) | 1100 | if (err) |
| 947 | goto out; | 1101 | goto out; |
| 948 | 1102 | ||
| 1103 | /* Unstable nid is in union with stable anon_vma: remove first */ | ||
| 1104 | remove_rmap_item_from_tree(rmap_item); | ||
| 1105 | |||
| 949 | /* Must get reference to anon_vma while still holding mmap_sem */ | 1106 | /* Must get reference to anon_vma while still holding mmap_sem */ |
| 950 | rmap_item->anon_vma = vma->anon_vma; | 1107 | rmap_item->anon_vma = vma->anon_vma; |
| 951 | get_anon_vma(vma->anon_vma); | 1108 | get_anon_vma(vma->anon_vma); |
| @@ -996,42 +1153,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, | |||
| 996 | */ | 1153 | */ |
| 997 | static struct page *stable_tree_search(struct page *page) | 1154 | static struct page *stable_tree_search(struct page *page) |
| 998 | { | 1155 | { |
| 999 | struct rb_node *node = root_stable_tree.rb_node; | 1156 | int nid; |
| 1157 | struct rb_root *root; | ||
| 1158 | struct rb_node **new; | ||
| 1159 | struct rb_node *parent; | ||
| 1000 | struct stable_node *stable_node; | 1160 | struct stable_node *stable_node; |
| 1161 | struct stable_node *page_node; | ||
| 1001 | 1162 | ||
| 1002 | stable_node = page_stable_node(page); | 1163 | page_node = page_stable_node(page); |
| 1003 | if (stable_node) { /* ksm page forked */ | 1164 | if (page_node && page_node->head != &migrate_nodes) { |
| 1165 | /* ksm page forked */ | ||
| 1004 | get_page(page); | 1166 | get_page(page); |
| 1005 | return page; | 1167 | return page; |
| 1006 | } | 1168 | } |
| 1007 | 1169 | ||
| 1008 | while (node) { | 1170 | nid = get_kpfn_nid(page_to_pfn(page)); |
| 1171 | root = root_stable_tree + nid; | ||
| 1172 | again: | ||
| 1173 | new = &root->rb_node; | ||
| 1174 | parent = NULL; | ||
| 1175 | |||
| 1176 | while (*new) { | ||
| 1009 | struct page *tree_page; | 1177 | struct page *tree_page; |
| 1010 | int ret; | 1178 | int ret; |
| 1011 | 1179 | ||
| 1012 | cond_resched(); | 1180 | cond_resched(); |
| 1013 | stable_node = rb_entry(node, struct stable_node, node); | 1181 | stable_node = rb_entry(*new, struct stable_node, node); |
| 1014 | tree_page = get_ksm_page(stable_node); | 1182 | tree_page = get_ksm_page(stable_node, false); |
| 1015 | if (!tree_page) | 1183 | if (!tree_page) |
| 1016 | return NULL; | 1184 | return NULL; |
| 1017 | 1185 | ||
| 1018 | ret = memcmp_pages(page, tree_page); | 1186 | ret = memcmp_pages(page, tree_page); |
| 1187 | put_page(tree_page); | ||
| 1019 | 1188 | ||
| 1020 | if (ret < 0) { | 1189 | parent = *new; |
| 1021 | put_page(tree_page); | 1190 | if (ret < 0) |
| 1022 | node = node->rb_left; | 1191 | new = &parent->rb_left; |
| 1023 | } else if (ret > 0) { | 1192 | else if (ret > 0) |
| 1024 | put_page(tree_page); | 1193 | new = &parent->rb_right; |
| 1025 | node = node->rb_right; | 1194 | else { |
| 1026 | } else | 1195 | /* |
| 1027 | return tree_page; | 1196 | * Lock and unlock the stable_node's page (which |
| 1197 | * might already have been migrated) so that page | ||
| 1198 | * migration is sure to notice its raised count. | ||
| 1199 | * It would be more elegant to return stable_node | ||
| 1200 | * than kpage, but that involves more changes. | ||
| 1201 | */ | ||
| 1202 | tree_page = get_ksm_page(stable_node, true); | ||
| 1203 | if (tree_page) { | ||
| 1204 | unlock_page(tree_page); | ||
| 1205 | if (get_kpfn_nid(stable_node->kpfn) != | ||
| 1206 | NUMA(stable_node->nid)) { | ||
| 1207 | put_page(tree_page); | ||
| 1208 | goto replace; | ||
| 1209 | } | ||
| 1210 | return tree_page; | ||
| 1211 | } | ||
| 1212 | /* | ||
| 1213 | * There is now a place for page_node, but the tree may | ||
| 1214 | * have been rebalanced, so re-evaluate parent and new. | ||
| 1215 | */ | ||
| 1216 | if (page_node) | ||
| 1217 | goto again; | ||
| 1218 | return NULL; | ||
| 1219 | } | ||
| 1028 | } | 1220 | } |
| 1029 | 1221 | ||
| 1030 | return NULL; | 1222 | if (!page_node) |
| 1223 | return NULL; | ||
| 1224 | |||
| 1225 | list_del(&page_node->list); | ||
| 1226 | DO_NUMA(page_node->nid = nid); | ||
| 1227 | rb_link_node(&page_node->node, parent, new); | ||
| 1228 | rb_insert_color(&page_node->node, root); | ||
| 1229 | get_page(page); | ||
| 1230 | return page; | ||
| 1231 | |||
| 1232 | replace: | ||
| 1233 | if (page_node) { | ||
| 1234 | list_del(&page_node->list); | ||
| 1235 | DO_NUMA(page_node->nid = nid); | ||
| 1236 | rb_replace_node(&stable_node->node, &page_node->node, root); | ||
| 1237 | get_page(page); | ||
| 1238 | } else { | ||
| 1239 | rb_erase(&stable_node->node, root); | ||
| 1240 | page = NULL; | ||
| 1241 | } | ||
| 1242 | stable_node->head = &migrate_nodes; | ||
| 1243 | list_add(&stable_node->list, stable_node->head); | ||
| 1244 | return page; | ||
| 1031 | } | 1245 | } |
| 1032 | 1246 | ||
| 1033 | /* | 1247 | /* |
| 1034 | * stable_tree_insert - insert rmap_item pointing to new ksm page | 1248 | * stable_tree_insert - insert stable tree node pointing to new ksm page |
| 1035 | * into the stable tree. | 1249 | * into the stable tree. |
| 1036 | * | 1250 | * |
| 1037 | * This function returns the stable tree node just allocated on success, | 1251 | * This function returns the stable tree node just allocated on success, |
| @@ -1039,17 +1253,25 @@ static struct page *stable_tree_search(struct page *page) | |||
| 1039 | */ | 1253 | */ |
| 1040 | static struct stable_node *stable_tree_insert(struct page *kpage) | 1254 | static struct stable_node *stable_tree_insert(struct page *kpage) |
| 1041 | { | 1255 | { |
| 1042 | struct rb_node **new = &root_stable_tree.rb_node; | 1256 | int nid; |
| 1257 | unsigned long kpfn; | ||
| 1258 | struct rb_root *root; | ||
| 1259 | struct rb_node **new; | ||
| 1043 | struct rb_node *parent = NULL; | 1260 | struct rb_node *parent = NULL; |
| 1044 | struct stable_node *stable_node; | 1261 | struct stable_node *stable_node; |
| 1045 | 1262 | ||
| 1263 | kpfn = page_to_pfn(kpage); | ||
| 1264 | nid = get_kpfn_nid(kpfn); | ||
| 1265 | root = root_stable_tree + nid; | ||
| 1266 | new = &root->rb_node; | ||
| 1267 | |||
| 1046 | while (*new) { | 1268 | while (*new) { |
| 1047 | struct page *tree_page; | 1269 | struct page *tree_page; |
| 1048 | int ret; | 1270 | int ret; |
| 1049 | 1271 | ||
| 1050 | cond_resched(); | 1272 | cond_resched(); |
| 1051 | stable_node = rb_entry(*new, struct stable_node, node); | 1273 | stable_node = rb_entry(*new, struct stable_node, node); |
| 1052 | tree_page = get_ksm_page(stable_node); | 1274 | tree_page = get_ksm_page(stable_node, false); |
| 1053 | if (!tree_page) | 1275 | if (!tree_page) |
| 1054 | return NULL; | 1276 | return NULL; |
| 1055 | 1277 | ||
| @@ -1075,13 +1297,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage) | |||
| 1075 | if (!stable_node) | 1297 | if (!stable_node) |
| 1076 | return NULL; | 1298 | return NULL; |
| 1077 | 1299 | ||
| 1078 | rb_link_node(&stable_node->node, parent, new); | ||
| 1079 | rb_insert_color(&stable_node->node, &root_stable_tree); | ||
| 1080 | |||
| 1081 | INIT_HLIST_HEAD(&stable_node->hlist); | 1300 | INIT_HLIST_HEAD(&stable_node->hlist); |
| 1082 | 1301 | stable_node->kpfn = kpfn; | |
| 1083 | stable_node->kpfn = page_to_pfn(kpage); | ||
| 1084 | set_page_stable_node(kpage, stable_node); | 1302 | set_page_stable_node(kpage, stable_node); |
| 1303 | DO_NUMA(stable_node->nid = nid); | ||
| 1304 | rb_link_node(&stable_node->node, parent, new); | ||
| 1305 | rb_insert_color(&stable_node->node, root); | ||
| 1085 | 1306 | ||
| 1086 | return stable_node; | 1307 | return stable_node; |
| 1087 | } | 1308 | } |
| @@ -1104,10 +1325,15 @@ static | |||
| 1104 | struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | 1325 | struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, |
| 1105 | struct page *page, | 1326 | struct page *page, |
| 1106 | struct page **tree_pagep) | 1327 | struct page **tree_pagep) |
| 1107 | |||
| 1108 | { | 1328 | { |
| 1109 | struct rb_node **new = &root_unstable_tree.rb_node; | 1329 | struct rb_node **new; |
| 1330 | struct rb_root *root; | ||
| 1110 | struct rb_node *parent = NULL; | 1331 | struct rb_node *parent = NULL; |
| 1332 | int nid; | ||
| 1333 | |||
| 1334 | nid = get_kpfn_nid(page_to_pfn(page)); | ||
| 1335 | root = root_unstable_tree + nid; | ||
| 1336 | new = &root->rb_node; | ||
| 1111 | 1337 | ||
| 1112 | while (*new) { | 1338 | while (*new) { |
| 1113 | struct rmap_item *tree_rmap_item; | 1339 | struct rmap_item *tree_rmap_item; |
| @@ -1137,6 +1363,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | |||
| 1137 | } else if (ret > 0) { | 1363 | } else if (ret > 0) { |
| 1138 | put_page(tree_page); | 1364 | put_page(tree_page); |
| 1139 | new = &parent->rb_right; | 1365 | new = &parent->rb_right; |
| 1366 | } else if (!ksm_merge_across_nodes && | ||
| 1367 | page_to_nid(tree_page) != nid) { | ||
| 1368 | /* | ||
| 1369 | * If tree_page has been migrated to another NUMA node, | ||
| 1370 | * it will be flushed out and put in the right unstable | ||
| 1371 | * tree next time: only merge with it when across_nodes. | ||
| 1372 | */ | ||
| 1373 | put_page(tree_page); | ||
| 1374 | return NULL; | ||
| 1140 | } else { | 1375 | } else { |
| 1141 | *tree_pagep = tree_page; | 1376 | *tree_pagep = tree_page; |
| 1142 | return tree_rmap_item; | 1377 | return tree_rmap_item; |
| @@ -1145,8 +1380,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | |||
| 1145 | 1380 | ||
| 1146 | rmap_item->address |= UNSTABLE_FLAG; | 1381 | rmap_item->address |= UNSTABLE_FLAG; |
| 1147 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); | 1382 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); |
| 1383 | DO_NUMA(rmap_item->nid = nid); | ||
| 1148 | rb_link_node(&rmap_item->node, parent, new); | 1384 | rb_link_node(&rmap_item->node, parent, new); |
| 1149 | rb_insert_color(&rmap_item->node, &root_unstable_tree); | 1385 | rb_insert_color(&rmap_item->node, root); |
| 1150 | 1386 | ||
| 1151 | ksm_pages_unshared++; | 1387 | ksm_pages_unshared++; |
| 1152 | return NULL; | 1388 | return NULL; |
| @@ -1188,10 +1424,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
| 1188 | unsigned int checksum; | 1424 | unsigned int checksum; |
| 1189 | int err; | 1425 | int err; |
| 1190 | 1426 | ||
| 1191 | remove_rmap_item_from_tree(rmap_item); | 1427 | stable_node = page_stable_node(page); |
| 1428 | if (stable_node) { | ||
| 1429 | if (stable_node->head != &migrate_nodes && | ||
| 1430 | get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { | ||
| 1431 | rb_erase(&stable_node->node, | ||
| 1432 | root_stable_tree + NUMA(stable_node->nid)); | ||
| 1433 | stable_node->head = &migrate_nodes; | ||
| 1434 | list_add(&stable_node->list, stable_node->head); | ||
| 1435 | } | ||
| 1436 | if (stable_node->head != &migrate_nodes && | ||
| 1437 | rmap_item->head == stable_node) | ||
| 1438 | return; | ||
| 1439 | } | ||
| 1192 | 1440 | ||
| 1193 | /* We first start with searching the page inside the stable tree */ | 1441 | /* We first start with searching the page inside the stable tree */ |
| 1194 | kpage = stable_tree_search(page); | 1442 | kpage = stable_tree_search(page); |
| 1443 | if (kpage == page && rmap_item->head == stable_node) { | ||
| 1444 | put_page(kpage); | ||
| 1445 | return; | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | remove_rmap_item_from_tree(rmap_item); | ||
| 1449 | |||
| 1195 | if (kpage) { | 1450 | if (kpage) { |
| 1196 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); | 1451 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
| 1197 | if (!err) { | 1452 | if (!err) { |
| @@ -1225,14 +1480,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
| 1225 | kpage = try_to_merge_two_pages(rmap_item, page, | 1480 | kpage = try_to_merge_two_pages(rmap_item, page, |
| 1226 | tree_rmap_item, tree_page); | 1481 | tree_rmap_item, tree_page); |
| 1227 | put_page(tree_page); | 1482 | put_page(tree_page); |
| 1228 | /* | ||
| 1229 | * As soon as we merge this page, we want to remove the | ||
| 1230 | * rmap_item of the page we have merged with from the unstable | ||
| 1231 | * tree, and insert it instead as new node in the stable tree. | ||
| 1232 | */ | ||
| 1233 | if (kpage) { | 1483 | if (kpage) { |
| 1234 | remove_rmap_item_from_tree(tree_rmap_item); | 1484 | /* |
| 1235 | 1485 | * The pages were successfully merged: insert new | |
| 1486 | * node in the stable tree and add both rmap_items. | ||
| 1487 | */ | ||
| 1236 | lock_page(kpage); | 1488 | lock_page(kpage); |
| 1237 | stable_node = stable_tree_insert(kpage); | 1489 | stable_node = stable_tree_insert(kpage); |
| 1238 | if (stable_node) { | 1490 | if (stable_node) { |
| @@ -1289,6 +1541,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
| 1289 | struct mm_slot *slot; | 1541 | struct mm_slot *slot; |
| 1290 | struct vm_area_struct *vma; | 1542 | struct vm_area_struct *vma; |
| 1291 | struct rmap_item *rmap_item; | 1543 | struct rmap_item *rmap_item; |
| 1544 | int nid; | ||
| 1292 | 1545 | ||
| 1293 | if (list_empty(&ksm_mm_head.mm_list)) | 1546 | if (list_empty(&ksm_mm_head.mm_list)) |
| 1294 | return NULL; | 1547 | return NULL; |
| @@ -1307,7 +1560,29 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
| 1307 | */ | 1560 | */ |
| 1308 | lru_add_drain_all(); | 1561 | lru_add_drain_all(); |
| 1309 | 1562 | ||
| 1310 | root_unstable_tree = RB_ROOT; | 1563 | /* |
| 1564 | * Whereas stale stable_nodes on the stable_tree itself | ||
| 1565 | * get pruned in the regular course of stable_tree_search(), | ||
| 1566 | * those moved out to the migrate_nodes list can accumulate: | ||
| 1567 | * so prune them once before each full scan. | ||
| 1568 | */ | ||
| 1569 | if (!ksm_merge_across_nodes) { | ||
| 1570 | struct stable_node *stable_node; | ||
| 1571 | struct list_head *this, *next; | ||
| 1572 | struct page *page; | ||
| 1573 | |||
| 1574 | list_for_each_safe(this, next, &migrate_nodes) { | ||
| 1575 | stable_node = list_entry(this, | ||
| 1576 | struct stable_node, list); | ||
| 1577 | page = get_ksm_page(stable_node, false); | ||
| 1578 | if (page) | ||
| 1579 | put_page(page); | ||
| 1580 | cond_resched(); | ||
| 1581 | } | ||
| 1582 | } | ||
| 1583 | |||
| 1584 | for (nid = 0; nid < ksm_nr_node_ids; nid++) | ||
| 1585 | root_unstable_tree[nid] = RB_ROOT; | ||
| 1311 | 1586 | ||
| 1312 | spin_lock(&ksm_mmlist_lock); | 1587 | spin_lock(&ksm_mmlist_lock); |
| 1313 | slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); | 1588 | slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); |
| @@ -1392,7 +1667,7 @@ next_mm: | |||
| 1392 | * or when all VM_MERGEABLE areas have been unmapped (and | 1667 | * or when all VM_MERGEABLE areas have been unmapped (and |
| 1393 | * mmap_sem then protects against race with MADV_MERGEABLE). | 1668 | * mmap_sem then protects against race with MADV_MERGEABLE). |
| 1394 | */ | 1669 | */ |
| 1395 | hlist_del(&slot->link); | 1670 | hash_del(&slot->link); |
| 1396 | list_del(&slot->mm_list); | 1671 | list_del(&slot->mm_list); |
| 1397 | spin_unlock(&ksm_mmlist_lock); | 1672 | spin_unlock(&ksm_mmlist_lock); |
| 1398 | 1673 | ||
| @@ -1428,8 +1703,7 @@ static void ksm_do_scan(unsigned int scan_npages) | |||
| 1428 | rmap_item = scan_get_next_rmap_item(&page); | 1703 | rmap_item = scan_get_next_rmap_item(&page); |
| 1429 | if (!rmap_item) | 1704 | if (!rmap_item) |
| 1430 | return; | 1705 | return; |
| 1431 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) | 1706 | cmp_and_merge_page(page, rmap_item); |
| 1432 | cmp_and_merge_page(page, rmap_item); | ||
| 1433 | put_page(page); | 1707 | put_page(page); |
| 1434 | } | 1708 | } |
| 1435 | } | 1709 | } |
| @@ -1446,6 +1720,7 @@ static int ksm_scan_thread(void *nothing) | |||
| 1446 | 1720 | ||
| 1447 | while (!kthread_should_stop()) { | 1721 | while (!kthread_should_stop()) { |
| 1448 | mutex_lock(&ksm_thread_mutex); | 1722 | mutex_lock(&ksm_thread_mutex); |
| 1723 | wait_while_offlining(); | ||
| 1449 | if (ksmd_should_run()) | 1724 | if (ksmd_should_run()) |
| 1450 | ksm_do_scan(ksm_thread_pages_to_scan); | 1725 | ksm_do_scan(ksm_thread_pages_to_scan); |
| 1451 | mutex_unlock(&ksm_thread_mutex); | 1726 | mutex_unlock(&ksm_thread_mutex); |
| @@ -1525,11 +1800,19 @@ int __ksm_enter(struct mm_struct *mm) | |||
| 1525 | spin_lock(&ksm_mmlist_lock); | 1800 | spin_lock(&ksm_mmlist_lock); |
| 1526 | insert_to_mm_slots_hash(mm, mm_slot); | 1801 | insert_to_mm_slots_hash(mm, mm_slot); |
| 1527 | /* | 1802 | /* |
| 1528 | * Insert just behind the scanning cursor, to let the area settle | 1803 | * When KSM_RUN_MERGE (or KSM_RUN_STOP), |
| 1804 | * insert just behind the scanning cursor, to let the area settle | ||
| 1529 | * down a little; when fork is followed by immediate exec, we don't | 1805 | * down a little; when fork is followed by immediate exec, we don't |
| 1530 | * want ksmd to waste time setting up and tearing down an rmap_list. | 1806 | * want ksmd to waste time setting up and tearing down an rmap_list. |
| 1807 | * | ||
| 1808 | * But when KSM_RUN_UNMERGE, it's important to insert ahead of its | ||
| 1809 | * scanning cursor, otherwise KSM pages in newly forked mms will be | ||
| 1810 | * missed: then we might as well insert at the end of the list. | ||
| 1531 | */ | 1811 | */ |
| 1532 | list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); | 1812 | if (ksm_run & KSM_RUN_UNMERGE) |
| 1813 | list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); | ||
| 1814 | else | ||
| 1815 | list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); | ||
| 1533 | spin_unlock(&ksm_mmlist_lock); | 1816 | spin_unlock(&ksm_mmlist_lock); |
| 1534 | 1817 | ||
| 1535 | set_bit(MMF_VM_MERGEABLE, &mm->flags); | 1818 | set_bit(MMF_VM_MERGEABLE, &mm->flags); |
| @@ -1559,7 +1842,7 @@ void __ksm_exit(struct mm_struct *mm) | |||
| 1559 | mm_slot = get_mm_slot(mm); | 1842 | mm_slot = get_mm_slot(mm); |
| 1560 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { | 1843 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { |
| 1561 | if (!mm_slot->rmap_list) { | 1844 | if (!mm_slot->rmap_list) { |
| 1562 | hlist_del(&mm_slot->link); | 1845 | hash_del(&mm_slot->link); |
| 1563 | list_del(&mm_slot->mm_list); | 1846 | list_del(&mm_slot->mm_list); |
| 1564 | easy_to_free = 1; | 1847 | easy_to_free = 1; |
| 1565 | } else { | 1848 | } else { |
| @@ -1579,24 +1862,32 @@ void __ksm_exit(struct mm_struct *mm) | |||
| 1579 | } | 1862 | } |
| 1580 | } | 1863 | } |
| 1581 | 1864 | ||
| 1582 | struct page *ksm_does_need_to_copy(struct page *page, | 1865 | struct page *ksm_might_need_to_copy(struct page *page, |
| 1583 | struct vm_area_struct *vma, unsigned long address) | 1866 | struct vm_area_struct *vma, unsigned long address) |
| 1584 | { | 1867 | { |
| 1868 | struct anon_vma *anon_vma = page_anon_vma(page); | ||
| 1585 | struct page *new_page; | 1869 | struct page *new_page; |
| 1586 | 1870 | ||
| 1871 | if (PageKsm(page)) { | ||
| 1872 | if (page_stable_node(page) && | ||
| 1873 | !(ksm_run & KSM_RUN_UNMERGE)) | ||
| 1874 | return page; /* no need to copy it */ | ||
| 1875 | } else if (!anon_vma) { | ||
| 1876 | return page; /* no need to copy it */ | ||
| 1877 | } else if (anon_vma->root == vma->anon_vma->root && | ||
| 1878 | page->index == linear_page_index(vma, address)) { | ||
| 1879 | return page; /* still no need to copy it */ | ||
| 1880 | } | ||
| 1881 | if (!PageUptodate(page)) | ||
| 1882 | return page; /* let do_swap_page report the error */ | ||
| 1883 | |||
| 1587 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 1884 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
| 1588 | if (new_page) { | 1885 | if (new_page) { |
| 1589 | copy_user_highpage(new_page, page, address, vma); | 1886 | copy_user_highpage(new_page, page, address, vma); |
| 1590 | 1887 | ||
| 1591 | SetPageDirty(new_page); | 1888 | SetPageDirty(new_page); |
| 1592 | __SetPageUptodate(new_page); | 1889 | __SetPageUptodate(new_page); |
| 1593 | SetPageSwapBacked(new_page); | ||
| 1594 | __set_page_locked(new_page); | 1890 | __set_page_locked(new_page); |
| 1595 | |||
| 1596 | if (!mlocked_vma_newpage(vma, new_page)) | ||
| 1597 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); | ||
| 1598 | else | ||
| 1599 | add_page_to_unevictable_list(new_page); | ||
| 1600 | } | 1891 | } |
| 1601 | 1892 | ||
| 1602 | return new_page; | 1893 | return new_page; |
| @@ -1773,64 +2064,115 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage) | |||
| 1773 | if (stable_node) { | 2064 | if (stable_node) { |
| 1774 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); | 2065 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); |
| 1775 | stable_node->kpfn = page_to_pfn(newpage); | 2066 | stable_node->kpfn = page_to_pfn(newpage); |
| 2067 | /* | ||
| 2068 | * newpage->mapping was set in advance; now we need smp_wmb() | ||
| 2069 | * to make sure that the new stable_node->kpfn is visible | ||
| 2070 | * to get_ksm_page() before it can see that oldpage->mapping | ||
| 2071 | * has gone stale (or that PageSwapCache has been cleared). | ||
| 2072 | */ | ||
| 2073 | smp_wmb(); | ||
| 2074 | set_page_stable_node(oldpage, NULL); | ||
| 1776 | } | 2075 | } |
| 1777 | } | 2076 | } |
| 1778 | #endif /* CONFIG_MIGRATION */ | 2077 | #endif /* CONFIG_MIGRATION */ |
| 1779 | 2078 | ||
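The barrier added to ksm_migrate_page() above enforces a publish order: the new stable_node->kpfn must become visible before oldpage's mapping is seen as stale, so a racing lookup that notices the stale old page cannot miss the updated pfn. The following standalone sketch shows the same two-store ordering with C11 release/acquire standing in for smp_wmb() and its read-side pairing; all names are invented for the illustration and nothing here is a kernel API.

    /*
     * Minimal userspace sketch of the publication ordering above: store the
     * new location (kpfn) first, then release the "old copy is stale" flag.
     * A reader that observes the flag with acquire semantics is guaranteed
     * to see the new kpfn as well.
     */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static _Atomic unsigned long kpfn;        /* where the page lives now */
    static atomic_bool old_mapping_stale;     /* stands in for oldpage->mapping going stale */

    static void migrate(unsigned long new_pfn)
    {
            atomic_store_explicit(&kpfn, new_pfn, memory_order_relaxed);
            /* release pairs with the reader's acquire, like smp_wmb() with smp_rmb() */
            atomic_store_explicit(&old_mapping_stale, true, memory_order_release);
    }

    static unsigned long lookup(void)
    {
            if (atomic_load_explicit(&old_mapping_stale, memory_order_acquire))
                    return atomic_load_explicit(&kpfn, memory_order_relaxed);
            return 0;       /* old copy still valid */
    }

    int main(void)
    {
            migrate(12345);
            printf("kpfn after migration: %lu\n", lookup());
            return 0;
    }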
| 1780 | #ifdef CONFIG_MEMORY_HOTREMOVE | 2079 | #ifdef CONFIG_MEMORY_HOTREMOVE |
| 1781 | static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, | 2080 | static int just_wait(void *word) |
| 1782 | unsigned long end_pfn) | ||
| 1783 | { | 2081 | { |
| 1784 | struct rb_node *node; | 2082 | schedule(); |
| 2083 | return 0; | ||
| 2084 | } | ||
| 1785 | 2085 | ||
| 1786 | for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { | 2086 | static void wait_while_offlining(void) |
| 1787 | struct stable_node *stable_node; | 2087 | { |
| 2088 | while (ksm_run & KSM_RUN_OFFLINE) { | ||
| 2089 | mutex_unlock(&ksm_thread_mutex); | ||
| 2090 | wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), | ||
| 2091 | just_wait, TASK_UNINTERRUPTIBLE); | ||
| 2092 | mutex_lock(&ksm_thread_mutex); | ||
| 2093 | } | ||
| 2094 | } | ||
| 1788 | 2095 | ||
| 1789 | stable_node = rb_entry(node, struct stable_node, node); | 2096 | static void ksm_check_stable_tree(unsigned long start_pfn, |
| 2097 | unsigned long end_pfn) | ||
| 2098 | { | ||
| 2099 | struct stable_node *stable_node; | ||
| 2100 | struct list_head *this, *next; | ||
| 2101 | struct rb_node *node; | ||
| 2102 | int nid; | ||
| 2103 | |||
| 2104 | for (nid = 0; nid < ksm_nr_node_ids; nid++) { | ||
| 2105 | node = rb_first(root_stable_tree + nid); | ||
| 2106 | while (node) { | ||
| 2107 | stable_node = rb_entry(node, struct stable_node, node); | ||
| 2108 | if (stable_node->kpfn >= start_pfn && | ||
| 2109 | stable_node->kpfn < end_pfn) { | ||
| 2110 | /* | ||
| 2111 | * Don't get_ksm_page, page has already gone: | ||
| 2112 | * which is why we keep kpfn instead of page* | ||
| 2113 | */ | ||
| 2114 | remove_node_from_stable_tree(stable_node); | ||
| 2115 | node = rb_first(root_stable_tree + nid); | ||
| 2116 | } else | ||
| 2117 | node = rb_next(node); | ||
| 2118 | cond_resched(); | ||
| 2119 | } | ||
| 2120 | } | ||
| 2121 | list_for_each_safe(this, next, &migrate_nodes) { | ||
| 2122 | stable_node = list_entry(this, struct stable_node, list); | ||
| 1790 | if (stable_node->kpfn >= start_pfn && | 2123 | if (stable_node->kpfn >= start_pfn && |
| 1791 | stable_node->kpfn < end_pfn) | 2124 | stable_node->kpfn < end_pfn) |
| 1792 | return stable_node; | 2125 | remove_node_from_stable_tree(stable_node); |
| 2126 | cond_resched(); | ||
| 1793 | } | 2127 | } |
| 1794 | return NULL; | ||
| 1795 | } | 2128 | } |
| 1796 | 2129 | ||
| 1797 | static int ksm_memory_callback(struct notifier_block *self, | 2130 | static int ksm_memory_callback(struct notifier_block *self, |
| 1798 | unsigned long action, void *arg) | 2131 | unsigned long action, void *arg) |
| 1799 | { | 2132 | { |
| 1800 | struct memory_notify *mn = arg; | 2133 | struct memory_notify *mn = arg; |
| 1801 | struct stable_node *stable_node; | ||
| 1802 | 2134 | ||
| 1803 | switch (action) { | 2135 | switch (action) { |
| 1804 | case MEM_GOING_OFFLINE: | 2136 | case MEM_GOING_OFFLINE: |
| 1805 | /* | 2137 | /* |
| 1806 | * Keep it very simple for now: just lock out ksmd and | 2138 | * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() |
| 1807 | * MADV_UNMERGEABLE while any memory is going offline. | 2139 | * and remove_all_stable_nodes() while memory is going offline: |
| 1808 | * mutex_lock_nested() is necessary because lockdep was alarmed | 2140 | * it is unsafe for them to touch the stable tree at this time. |
| 1809 | * that here we take ksm_thread_mutex inside notifier chain | 2141 | * But unmerge_ksm_pages(), rmap lookups and other entry points |
| 1810 | * mutex, and later take notifier chain mutex inside | 2142 | * which do not need the ksm_thread_mutex are all safe. |
| 1811 | * ksm_thread_mutex to unlock it. But that's safe because both | ||
| 1812 | * are inside mem_hotplug_mutex. | ||
| 1813 | */ | 2143 | */ |
| 1814 | mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); | 2144 | mutex_lock(&ksm_thread_mutex); |
| 2145 | ksm_run |= KSM_RUN_OFFLINE; | ||
| 2146 | mutex_unlock(&ksm_thread_mutex); | ||
| 1815 | break; | 2147 | break; |
| 1816 | 2148 | ||
| 1817 | case MEM_OFFLINE: | 2149 | case MEM_OFFLINE: |
| 1818 | /* | 2150 | /* |
| 1819 | * Most of the work is done by page migration; but there might | 2151 | * Most of the work is done by page migration; but there might |
| 1820 | * be a few stable_nodes left over, still pointing to struct | 2152 | * be a few stable_nodes left over, still pointing to struct |
| 1821 | * pages which have been offlined: prune those from the tree. | 2153 | * pages which have been offlined: prune those from the tree, |
| 2154 | * otherwise get_ksm_page() might later try to access a | ||
| 2155 | * non-existent struct page. | ||
| 1822 | */ | 2156 | */ |
| 1823 | while ((stable_node = ksm_check_stable_tree(mn->start_pfn, | 2157 | ksm_check_stable_tree(mn->start_pfn, |
| 1824 | mn->start_pfn + mn->nr_pages)) != NULL) | 2158 | mn->start_pfn + mn->nr_pages); |
| 1825 | remove_node_from_stable_tree(stable_node); | ||
| 1826 | /* fallthrough */ | 2159 | /* fallthrough */ |
| 1827 | 2160 | ||
| 1828 | case MEM_CANCEL_OFFLINE: | 2161 | case MEM_CANCEL_OFFLINE: |
| 2162 | mutex_lock(&ksm_thread_mutex); | ||
| 2163 | ksm_run &= ~KSM_RUN_OFFLINE; | ||
| 1829 | mutex_unlock(&ksm_thread_mutex); | 2164 | mutex_unlock(&ksm_thread_mutex); |
| 2165 | |||
| 2166 | smp_mb(); /* wake_up_bit advises this */ | ||
| 2167 | wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); | ||
| 1830 | break; | 2168 | break; |
| 1831 | } | 2169 | } |
| 1832 | return NOTIFY_OK; | 2170 | return NOTIFY_OK; |
| 1833 | } | 2171 | } |
| 2172 | #else | ||
| 2173 | static void wait_while_offlining(void) | ||
| 2174 | { | ||
| 2175 | } | ||
| 1834 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 2176 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
| 1835 | 2177 | ||
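wait_while_offlining() lets ksmd drop ksm_thread_mutex for as long as KSM_RUN_OFFLINE is set and re-take it only after the MEM_OFFLINE/MEM_CANCEL_OFFLINE path clears the bit and calls wake_up_bit(). A rough userspace analogue of that shape, using a pthread condition variable in place of wait_on_bit()/wake_up_bit(); every name below is invented for the sketch.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t thread_mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t offline_cleared = PTHREAD_COND_INITIALIZER;
    static bool offline = true;               /* plays the role of KSM_RUN_OFFLINE */

    static void wait_while_offline(void)
    {
            /* called with thread_mutex held; the wait releases and re-takes it */
            while (offline)
                    pthread_cond_wait(&offline_cleared, &thread_mutex);
    }

    static void *scanner(void *arg)
    {
            (void)arg;
            pthread_mutex_lock(&thread_mutex);
            wait_while_offline();
            printf("scanner: offlining finished, scanning may resume\n");
            pthread_mutex_unlock(&thread_mutex);
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            pthread_create(&t, NULL, scanner, NULL);
            sleep(1);                         /* pretend a memory section is going offline */
            pthread_mutex_lock(&thread_mutex);
            offline = false;
            pthread_cond_broadcast(&offline_cleared);   /* like wake_up_bit() */
            pthread_mutex_unlock(&thread_mutex);
            pthread_join(t, NULL);
            return 0;
    }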
| 1836 | #ifdef CONFIG_SYSFS | 2178 | #ifdef CONFIG_SYSFS |
| @@ -1893,7 +2235,7 @@ KSM_ATTR(pages_to_scan); | |||
| 1893 | static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, | 2235 | static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, |
| 1894 | char *buf) | 2236 | char *buf) |
| 1895 | { | 2237 | { |
| 1896 | return sprintf(buf, "%u\n", ksm_run); | 2238 | return sprintf(buf, "%lu\n", ksm_run); |
| 1897 | } | 2239 | } |
| 1898 | 2240 | ||
| 1899 | static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | 2241 | static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, |
| @@ -1916,6 +2258,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
| 1916 | */ | 2258 | */ |
| 1917 | 2259 | ||
| 1918 | mutex_lock(&ksm_thread_mutex); | 2260 | mutex_lock(&ksm_thread_mutex); |
| 2261 | wait_while_offlining(); | ||
| 1919 | if (ksm_run != flags) { | 2262 | if (ksm_run != flags) { |
| 1920 | ksm_run = flags; | 2263 | ksm_run = flags; |
| 1921 | if (flags & KSM_RUN_UNMERGE) { | 2264 | if (flags & KSM_RUN_UNMERGE) { |
| @@ -1937,6 +2280,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
| 1937 | } | 2280 | } |
| 1938 | KSM_ATTR(run); | 2281 | KSM_ATTR(run); |
| 1939 | 2282 | ||
| 2283 | #ifdef CONFIG_NUMA | ||
| 2284 | static ssize_t merge_across_nodes_show(struct kobject *kobj, | ||
| 2285 | struct kobj_attribute *attr, char *buf) | ||
| 2286 | { | ||
| 2287 | return sprintf(buf, "%u\n", ksm_merge_across_nodes); | ||
| 2288 | } | ||
| 2289 | |||
| 2290 | static ssize_t merge_across_nodes_store(struct kobject *kobj, | ||
| 2291 | struct kobj_attribute *attr, | ||
| 2292 | const char *buf, size_t count) | ||
| 2293 | { | ||
| 2294 | int err; | ||
| 2295 | unsigned long knob; | ||
| 2296 | |||
| 2297 | err = kstrtoul(buf, 10, &knob); | ||
| 2298 | if (err) | ||
| 2299 | return err; | ||
| 2300 | if (knob > 1) | ||
| 2301 | return -EINVAL; | ||
| 2302 | |||
| 2303 | mutex_lock(&ksm_thread_mutex); | ||
| 2304 | wait_while_offlining(); | ||
| 2305 | if (ksm_merge_across_nodes != knob) { | ||
| 2306 | if (ksm_pages_shared || remove_all_stable_nodes()) | ||
| 2307 | err = -EBUSY; | ||
| 2308 | else if (root_stable_tree == one_stable_tree) { | ||
| 2309 | struct rb_root *buf; | ||
| 2310 | /* | ||
| 2311 | * This is the first time that we switch away from the | ||
| 2312 | * default of merging across nodes: must now allocate | ||
| 2313 | * a buffer to hold as many roots as may be needed. | ||
| 2314 | * Allocate stable and unstable together: | ||
| 2315 | * MAXSMP NODES_SHIFT 10 will use 16kB. | ||
| 2316 | */ | ||
| 2317 | buf = kcalloc(nr_node_ids + nr_node_ids, | ||
| 2318 | sizeof(*buf), GFP_KERNEL | __GFP_ZERO); | ||
| 2319 | /* Let us assume that RB_ROOT, i.e. a NULL rb_node pointer, is all zeroes */ | ||
| 2320 | if (!buf) | ||
| 2321 | err = -ENOMEM; | ||
| 2322 | else { | ||
| 2323 | root_stable_tree = buf; | ||
| 2324 | root_unstable_tree = buf + nr_node_ids; | ||
| 2325 | /* Stable tree is empty but not the unstable */ | ||
| 2326 | root_unstable_tree[0] = one_unstable_tree[0]; | ||
| 2327 | } | ||
| 2328 | } | ||
| 2329 | if (!err) { | ||
| 2330 | ksm_merge_across_nodes = knob; | ||
| 2331 | ksm_nr_node_ids = knob ? 1 : nr_node_ids; | ||
| 2332 | } | ||
| 2333 | } | ||
| 2334 | mutex_unlock(&ksm_thread_mutex); | ||
| 2335 | |||
| 2336 | return err ? err : count; | ||
| 2337 | } | ||
| 2338 | KSM_ATTR(merge_across_nodes); | ||
| 2339 | #endif | ||
| 2340 | |||
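merge_across_nodes_store() above refuses the switch with -EBUSY while any pages are still shared or a stable node cannot be removed, so userspace has to unmerge first. A hedged sketch of that sequence, assuming the usual /sys/kernel/mm/ksm location and the conventional run values (2 = unmerge, 1 = run); error handling is minimal.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static int write_knob(const char *path, const char *val)
    {
            int fd = open(path, O_WRONLY);

            if (fd < 0 || write(fd, val, strlen(val)) < 0) {
                    perror(path);
                    if (fd >= 0)
                            close(fd);
                    return -1;
            }
            close(fd);
            return 0;
    }

    int main(void)
    {
            /* stop ksmd and unmerge everything, so no stable nodes remain */
            if (write_knob("/sys/kernel/mm/ksm/run", "2"))
                    return 1;
            /* now the switch is allowed (it would return -EBUSY otherwise) */
            if (write_knob("/sys/kernel/mm/ksm/merge_across_nodes", "0"))
                    return 1;
            /* restart merging, now with one stable/unstable tree per NUMA node */
            return write_knob("/sys/kernel/mm/ksm/run", "1");
    }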
| 1940 | static ssize_t pages_shared_show(struct kobject *kobj, | 2341 | static ssize_t pages_shared_show(struct kobject *kobj, |
| 1941 | struct kobj_attribute *attr, char *buf) | 2342 | struct kobj_attribute *attr, char *buf) |
| 1942 | { | 2343 | { |
| @@ -1991,6 +2392,9 @@ static struct attribute *ksm_attrs[] = { | |||
| 1991 | &pages_unshared_attr.attr, | 2392 | &pages_unshared_attr.attr, |
| 1992 | &pages_volatile_attr.attr, | 2393 | &pages_volatile_attr.attr, |
| 1993 | &full_scans_attr.attr, | 2394 | &full_scans_attr.attr, |
| 2395 | #ifdef CONFIG_NUMA | ||
| 2396 | &merge_across_nodes_attr.attr, | ||
| 2397 | #endif | ||
| 1994 | NULL, | 2398 | NULL, |
| 1995 | }; | 2399 | }; |
| 1996 | 2400 | ||
| @@ -2029,10 +2433,7 @@ static int __init ksm_init(void) | |||
| 2029 | #endif /* CONFIG_SYSFS */ | 2433 | #endif /* CONFIG_SYSFS */ |
| 2030 | 2434 | ||
| 2031 | #ifdef CONFIG_MEMORY_HOTREMOVE | 2435 | #ifdef CONFIG_MEMORY_HOTREMOVE |
| 2032 | /* | 2436 | /* There is no significance to this priority 100 */ |
| 2033 | * Choose a high priority since the callback takes ksm_thread_mutex: | ||
| 2034 | * later callbacks could only be taking locks which nest within that. | ||
| 2035 | */ | ||
| 2036 | hotplug_memory_notifier(ksm_memory_callback, 100); | 2437 | hotplug_memory_notifier(ksm_memory_callback, 100); |
| 2037 | #endif | 2438 | #endif |
| 2038 | return 0; | 2439 | return 0; |
diff --git a/mm/madvise.c b/mm/madvise.c index 03dfa5c7adb3..c58c94b56c3d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
| @@ -16,6 +16,9 @@ | |||
| 16 | #include <linux/ksm.h> | 16 | #include <linux/ksm.h> |
| 17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
| 18 | #include <linux/file.h> | 18 | #include <linux/file.h> |
| 19 | #include <linux/blkdev.h> | ||
| 20 | #include <linux/swap.h> | ||
| 21 | #include <linux/swapops.h> | ||
| 19 | 22 | ||
| 20 | /* | 23 | /* |
| 21 | * Any behaviour which results in changes to the vma->vm_flags needs to | 24 | * Any behaviour which results in changes to the vma->vm_flags needs to |
| @@ -131,6 +134,84 @@ out: | |||
| 131 | return error; | 134 | return error; |
| 132 | } | 135 | } |
| 133 | 136 | ||
| 137 | #ifdef CONFIG_SWAP | ||
| 138 | static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, | ||
| 139 | unsigned long end, struct mm_walk *walk) | ||
| 140 | { | ||
| 141 | pte_t *orig_pte; | ||
| 142 | struct vm_area_struct *vma = walk->private; | ||
| 143 | unsigned long index; | ||
| 144 | |||
| 145 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
| 146 | return 0; | ||
| 147 | |||
| 148 | for (index = start; index != end; index += PAGE_SIZE) { | ||
| 149 | pte_t pte; | ||
| 150 | swp_entry_t entry; | ||
| 151 | struct page *page; | ||
| 152 | spinlock_t *ptl; | ||
| 153 | |||
| 154 | orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); | ||
| 155 | pte = *(orig_pte + ((index - start) / PAGE_SIZE)); | ||
| 156 | pte_unmap_unlock(orig_pte, ptl); | ||
| 157 | |||
| 158 | if (pte_present(pte) || pte_none(pte) || pte_file(pte)) | ||
| 159 | continue; | ||
| 160 | entry = pte_to_swp_entry(pte); | ||
| 161 | if (unlikely(non_swap_entry(entry))) | ||
| 162 | continue; | ||
| 163 | |||
| 164 | page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, | ||
| 165 | vma, index); | ||
| 166 | if (page) | ||
| 167 | page_cache_release(page); | ||
| 168 | } | ||
| 169 | |||
| 170 | return 0; | ||
| 171 | } | ||
| 172 | |||
| 173 | static void force_swapin_readahead(struct vm_area_struct *vma, | ||
| 174 | unsigned long start, unsigned long end) | ||
| 175 | { | ||
| 176 | struct mm_walk walk = { | ||
| 177 | .mm = vma->vm_mm, | ||
| 178 | .pmd_entry = swapin_walk_pmd_entry, | ||
| 179 | .private = vma, | ||
| 180 | }; | ||
| 181 | |||
| 182 | walk_page_range(start, end, &walk); | ||
| 183 | |||
| 184 | lru_add_drain(); /* Push any new pages onto the LRU now */ | ||
| 185 | } | ||
| 186 | |||
| 187 | static void force_shm_swapin_readahead(struct vm_area_struct *vma, | ||
| 188 | unsigned long start, unsigned long end, | ||
| 189 | struct address_space *mapping) | ||
| 190 | { | ||
| 191 | pgoff_t index; | ||
| 192 | struct page *page; | ||
| 193 | swp_entry_t swap; | ||
| 194 | |||
| 195 | for (; start < end; start += PAGE_SIZE) { | ||
| 196 | index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
| 197 | |||
| 198 | page = find_get_page(mapping, index); | ||
| 199 | if (!radix_tree_exceptional_entry(page)) { | ||
| 200 | if (page) | ||
| 201 | page_cache_release(page); | ||
| 202 | continue; | ||
| 203 | } | ||
| 204 | swap = radix_to_swp_entry(page); | ||
| 205 | page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, | ||
| 206 | NULL, 0); | ||
| 207 | if (page) | ||
| 208 | page_cache_release(page); | ||
| 209 | } | ||
| 210 | |||
| 211 | lru_add_drain(); /* Push any new pages onto the LRU now */ | ||
| 212 | } | ||
| 213 | #endif /* CONFIG_SWAP */ | ||
| 214 | |||
| 134 | /* | 215 | /* |
| 135 | * Schedule all required I/O operations. Do not wait for completion. | 216 | * Schedule all required I/O operations. Do not wait for completion. |
| 136 | */ | 217 | */ |
| @@ -140,6 +221,18 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
| 140 | { | 221 | { |
| 141 | struct file *file = vma->vm_file; | 222 | struct file *file = vma->vm_file; |
| 142 | 223 | ||
| 224 | #ifdef CONFIG_SWAP | ||
| 225 | if (!file || mapping_cap_swap_backed(file->f_mapping)) { | ||
| 226 | *prev = vma; | ||
| 227 | if (!file) | ||
| 228 | force_swapin_readahead(vma, start, end); | ||
| 229 | else | ||
| 230 | force_shm_swapin_readahead(vma, start, end, | ||
| 231 | file->f_mapping); | ||
| 232 | return 0; | ||
| 233 | } | ||
| 234 | #endif | ||
| 235 | |||
| 143 | if (!file) | 236 | if (!file) |
| 144 | return -EBADF; | 237 | return -EBADF; |
| 145 | 238 | ||
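With the hunk above, madvise(MADV_WILLNEED) on an anonymous (or shmem-backed) mapping now starts asynchronous swap-in readahead instead of failing with -EBADF for mappings without a file. A small illustrative program; the 64MB size and the access pattern are arbitrary.

    #define _DEFAULT_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
            size_t len = 64UL << 20;            /* 64MB, a candidate for swap-out under pressure */
            char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (buf == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            memset(buf, 0x5a, len);             /* dirty it so it can be swapped out */

            /* ... time passes, parts of the region may get swapped out ... */

            if (madvise(buf, len, MADV_WILLNEED))   /* only schedules swap-in I/O */
                    perror("madvise");

            /* later accesses are now more likely to hit the swap cache */
            printf("first byte: %#x\n", buf[0]);
            return 0;
    }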
| @@ -371,6 +464,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
| 371 | int error = -EINVAL; | 464 | int error = -EINVAL; |
| 372 | int write; | 465 | int write; |
| 373 | size_t len; | 466 | size_t len; |
| 467 | struct blk_plug plug; | ||
| 374 | 468 | ||
| 375 | #ifdef CONFIG_MEMORY_FAILURE | 469 | #ifdef CONFIG_MEMORY_FAILURE |
| 376 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) | 470 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) |
| @@ -410,18 +504,19 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
| 410 | if (vma && start > vma->vm_start) | 504 | if (vma && start > vma->vm_start) |
| 411 | prev = vma; | 505 | prev = vma; |
| 412 | 506 | ||
| 507 | blk_start_plug(&plug); | ||
| 413 | for (;;) { | 508 | for (;;) { |
| 414 | /* Still start < end. */ | 509 | /* Still start < end. */ |
| 415 | error = -ENOMEM; | 510 | error = -ENOMEM; |
| 416 | if (!vma) | 511 | if (!vma) |
| 417 | goto out; | 512 | goto out_plug; |
| 418 | 513 | ||
| 419 | /* Here start < (end|vma->vm_end). */ | 514 | /* Here start < (end|vma->vm_end). */ |
| 420 | if (start < vma->vm_start) { | 515 | if (start < vma->vm_start) { |
| 421 | unmapped_error = -ENOMEM; | 516 | unmapped_error = -ENOMEM; |
| 422 | start = vma->vm_start; | 517 | start = vma->vm_start; |
| 423 | if (start >= end) | 518 | if (start >= end) |
| 424 | goto out; | 519 | goto out_plug; |
| 425 | } | 520 | } |
| 426 | 521 | ||
| 427 | /* Here vma->vm_start <= start < (end|vma->vm_end) */ | 522 | /* Here vma->vm_start <= start < (end|vma->vm_end) */ |
| @@ -432,18 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
| 432 | /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ | 527 | /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ |
| 433 | error = madvise_vma(vma, &prev, start, tmp, behavior); | 528 | error = madvise_vma(vma, &prev, start, tmp, behavior); |
| 434 | if (error) | 529 | if (error) |
| 435 | goto out; | 530 | goto out_plug; |
| 436 | start = tmp; | 531 | start = tmp; |
| 437 | if (prev && start < prev->vm_end) | 532 | if (prev && start < prev->vm_end) |
| 438 | start = prev->vm_end; | 533 | start = prev->vm_end; |
| 439 | error = unmapped_error; | 534 | error = unmapped_error; |
| 440 | if (start >= end) | 535 | if (start >= end) |
| 441 | goto out; | 536 | goto out_plug; |
| 442 | if (prev) | 537 | if (prev) |
| 443 | vma = prev->vm_next; | 538 | vma = prev->vm_next; |
| 444 | else /* madvise_remove dropped mmap_sem */ | 539 | else /* madvise_remove dropped mmap_sem */ |
| 445 | vma = find_vma(current->mm, start); | 540 | vma = find_vma(current->mm, start); |
| 446 | } | 541 | } |
| 542 | out_plug: | ||
| 543 | blk_finish_plug(&plug); | ||
| 447 | out: | 544 | out: |
| 448 | if (write) | 545 | if (write) |
| 449 | up_write(¤t->mm->mmap_sem); | 546 | up_write(¤t->mm->mmap_sem); |
diff --git a/mm/memblock.c b/mm/memblock.c index b8d9147e5c08..1bcd9b970564 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
| @@ -92,9 +92,58 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
| 92 | * | 92 | * |
| 93 | * Find @size free area aligned to @align in the specified range and node. | 93 | * Find @size free area aligned to @align in the specified range and node. |
| 94 | * | 94 | * |
| 95 | * If we have CONFIG_HAVE_MEMBLOCK_NODE_MAP defined, we need to check that the | ||
| 96 | * memory we found is not in hotpluggable ranges. | ||
| 97 | * | ||
| 95 | * RETURNS: | 98 | * RETURNS: |
| 96 | * Found address on success, %0 on failure. | 99 | * Found address on success, %0 on failure. |
| 97 | */ | 100 | */ |
| 101 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
| 102 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | ||
| 103 | phys_addr_t end, phys_addr_t size, | ||
| 104 | phys_addr_t align, int nid) | ||
| 105 | { | ||
| 106 | phys_addr_t this_start, this_end, cand; | ||
| 107 | u64 i; | ||
| 108 | int curr = movablemem_map.nr_map - 1; | ||
| 109 | |||
| 110 | /* pump up @end */ | ||
| 111 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) | ||
| 112 | end = memblock.current_limit; | ||
| 113 | |||
| 114 | /* avoid allocating the first page */ | ||
| 115 | start = max_t(phys_addr_t, start, PAGE_SIZE); | ||
| 116 | end = max(start, end); | ||
| 117 | |||
| 118 | for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { | ||
| 119 | this_start = clamp(this_start, start, end); | ||
| 120 | this_end = clamp(this_end, start, end); | ||
| 121 | |||
| 122 | restart: | ||
| 123 | if (this_end <= this_start || this_end < size) | ||
| 124 | continue; | ||
| 125 | |||
| 126 | for (; curr >= 0; curr--) { | ||
| 127 | if ((movablemem_map.map[curr].start_pfn << PAGE_SHIFT) | ||
| 128 | < this_end) | ||
| 129 | break; | ||
| 130 | } | ||
| 131 | |||
| 132 | cand = round_down(this_end - size, align); | ||
| 133 | if (curr >= 0 && | ||
| 134 | cand < movablemem_map.map[curr].end_pfn << PAGE_SHIFT) { | ||
| 135 | this_end = movablemem_map.map[curr].start_pfn | ||
| 136 | << PAGE_SHIFT; | ||
| 137 | goto restart; | ||
| 138 | } | ||
| 139 | |||
| 140 | if (cand >= this_start) | ||
| 141 | return cand; | ||
| 142 | } | ||
| 143 | |||
| 144 | return 0; | ||
| 145 | } | ||
| 146 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | ||
| 98 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | 147 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, |
| 99 | phys_addr_t end, phys_addr_t size, | 148 | phys_addr_t end, phys_addr_t size, |
| 100 | phys_addr_t align, int nid) | 149 | phys_addr_t align, int nid) |
| @@ -123,6 +172,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | |||
| 123 | } | 172 | } |
| 124 | return 0; | 173 | return 0; |
| 125 | } | 174 | } |
| 175 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | ||
| 126 | 176 | ||
| 127 | /** | 177 | /** |
| 128 | * memblock_find_in_range - find free area in given range | 178 | * memblock_find_in_range - find free area in given range |
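The CONFIG_HAVE_MEMBLOCK_NODE_MAP variant above walks free ranges from the top down and, whenever the aligned candidate would land inside a hotpluggable movablemem_map window, retries below that window's start. Below is a simplified, self-contained sketch of that retry loop over made-up ranges; it checks full overlap rather than mirroring the movablemem_map bookkeeping, and none of the names are memblock APIs.

    #include <stdio.h>

    struct range { unsigned long start, end; };     /* [start, end) */

    static unsigned long round_down_ul(unsigned long x, unsigned long a)
    {
            return x & ~(a - 1);                    /* assumes power-of-two align */
    }

    static unsigned long find_top_down(unsigned long start, unsigned long end,
                                       unsigned long size, unsigned long align,
                                       const struct range *excl, int nr_excl)
    {
            unsigned long cand;
            int i;

    restart:
            if (end < start + size)
                    return 0;                       /* no room left */
            cand = round_down_ul(end - size, align);
            for (i = nr_excl - 1; i >= 0; i--) {
                    if (cand < excl[i].end && cand + size > excl[i].start) {
                            end = excl[i].start;    /* drop below the excluded window */
                            goto restart;
                    }
            }
            return cand >= start ? cand : 0;
    }

    int main(void)
    {
            /* pretend the top 16KB of the range is hotpluggable and must be skipped */
            struct range movable[] = { { 0xc000, 0x10000 } };
            unsigned long addr = find_top_down(0x1000, 0x10000, 0x2000, 0x1000,
                                               movable, 1);

            printf("allocated at %#lx\n", addr);    /* expect 0xa000 */
            return 0;
    }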
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fbb60b103e64..53b8201b31eb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -120,6 +120,14 @@ static const char * const mem_cgroup_events_names[] = { | |||
| 120 | "pgmajfault", | 120 | "pgmajfault", |
| 121 | }; | 121 | }; |
| 122 | 122 | ||
| 123 | static const char * const mem_cgroup_lru_names[] = { | ||
| 124 | "inactive_anon", | ||
| 125 | "active_anon", | ||
| 126 | "inactive_file", | ||
| 127 | "active_file", | ||
| 128 | "unevictable", | ||
| 129 | }; | ||
| 130 | |||
| 123 | /* | 131 | /* |
| 124 | * Per memcg event counter is incremented at every pagein/pageout. With THP, | 132 | * Per memcg event counter is incremented at every pagein/pageout. With THP, |
| 125 | * it will be incremented by the number of pages. This counter is used for | 133 |
| @@ -172,7 +180,7 @@ struct mem_cgroup_per_node { | |||
| 172 | }; | 180 | }; |
| 173 | 181 | ||
| 174 | struct mem_cgroup_lru_info { | 182 | struct mem_cgroup_lru_info { |
| 175 | struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; | 183 | struct mem_cgroup_per_node *nodeinfo[0]; |
| 176 | }; | 184 | }; |
| 177 | 185 | ||
| 178 | /* | 186 | /* |
| @@ -276,17 +284,6 @@ struct mem_cgroup { | |||
| 276 | */ | 284 | */ |
| 277 | struct res_counter kmem; | 285 | struct res_counter kmem; |
| 278 | /* | 286 | /* |
| 279 | * Per cgroup active and inactive list, similar to the | ||
| 280 | * per zone LRU lists. | ||
| 281 | */ | ||
| 282 | struct mem_cgroup_lru_info info; | ||
| 283 | int last_scanned_node; | ||
| 284 | #if MAX_NUMNODES > 1 | ||
| 285 | nodemask_t scan_nodes; | ||
| 286 | atomic_t numainfo_events; | ||
| 287 | atomic_t numainfo_updating; | ||
| 288 | #endif | ||
| 289 | /* | ||
| 290 | * Should the accounting and control be hierarchical, per subtree? | 287 | * Should the accounting and control be hierarchical, per subtree? |
| 291 | */ | 288 | */ |
| 292 | bool use_hierarchy; | 289 | bool use_hierarchy; |
| @@ -349,8 +346,29 @@ struct mem_cgroup { | |||
| 349 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | 346 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ |
| 350 | int kmemcg_id; | 347 | int kmemcg_id; |
| 351 | #endif | 348 | #endif |
| 349 | |||
| 350 | int last_scanned_node; | ||
| 351 | #if MAX_NUMNODES > 1 | ||
| 352 | nodemask_t scan_nodes; | ||
| 353 | atomic_t numainfo_events; | ||
| 354 | atomic_t numainfo_updating; | ||
| 355 | #endif | ||
| 356 | /* | ||
| 357 | * Per cgroup active and inactive list, similar to the | ||
| 358 | * per zone LRU lists. | ||
| 359 | * | ||
| 360 | * WARNING: This has to be the last element of the struct. Don't | ||
| 361 | * add new fields after this point. | ||
| 362 | */ | ||
| 363 | struct mem_cgroup_lru_info info; | ||
| 352 | }; | 364 | }; |
| 353 | 365 | ||
| 366 | static size_t memcg_size(void) | ||
| 367 | { | ||
| 368 | return sizeof(struct mem_cgroup) + | ||
| 369 | nr_node_ids * sizeof(struct mem_cgroup_per_node); | ||
| 370 | } | ||
| 371 | |||
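struct mem_cgroup_lru_info now ends in a zero-length nodeinfo array, info becomes the last member of struct mem_cgroup, and the whole object is allocated with memcg_size(), so it is sized for the actual node count (nr_node_ids) rather than a fixed MAX_NUMNODES table; that is also why the struct comment warns against adding fields after info. A small userspace illustration of the same trailing-array sizing pattern, with invented types and names.

    #include <stdio.h>
    #include <stdlib.h>

    struct per_node_info {
            long stats[4];
    };

    struct group {
            long charge;
            int nr_nodes;
            /* must stay last: the per-node pointer slots live right behind it */
            struct per_node_info *nodeinfo[];
    };

    static size_t group_size(int nr_nodes)
    {
            return sizeof(struct group) + nr_nodes * sizeof(struct per_node_info *);
    }

    int main(void)
    {
            int nr_nodes = 2;                       /* stand-in for nr_node_ids */
            struct group *g = calloc(1, group_size(nr_nodes));
            int nid;

            if (!g)
                    return 1;
            g->nr_nodes = nr_nodes;
            for (nid = 0; nid < nr_nodes; nid++)
                    g->nodeinfo[nid] = calloc(1, sizeof(struct per_node_info));

            printf("bare struct: %zu bytes, sized for %d nodes: %zu bytes\n",
                   sizeof(struct group), nr_nodes, group_size(nr_nodes));

            for (nid = 0; nid < nr_nodes; nid++)
                    free(g->nodeinfo[nid]);
            free(g);
            return 0;
    }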
| 354 | /* internal only representation about the status of kmem accounting. */ | 372 | /* internal only representation about the status of kmem accounting. */ |
| 355 | enum { | 373 | enum { |
| 356 | KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ | 374 | KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ |
| @@ -398,8 +416,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) | |||
| 398 | 416 | ||
| 399 | /* Stuffs for move charges at task migration. */ | 417 | /* Stuffs for move charges at task migration. */ |
| 400 | /* | 418 | /* |
| 401 | * Types of charges to be moved. "move_charge_at_immigrate" is treated as a | 419 | * Types of charges to be moved. "move_charge_at_immigrate" and |
| 402 | * left-shifted bitmap of these types. | 420 | * "immigrate_flags" are treated as a left-shifted bitmap of these types. |
| 403 | */ | 421 | */ |
| 404 | enum move_type { | 422 | enum move_type { |
| 405 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | 423 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ |
| @@ -412,6 +430,7 @@ static struct move_charge_struct { | |||
| 412 | spinlock_t lock; /* for from, to */ | 430 | spinlock_t lock; /* for from, to */ |
| 413 | struct mem_cgroup *from; | 431 | struct mem_cgroup *from; |
| 414 | struct mem_cgroup *to; | 432 | struct mem_cgroup *to; |
| 433 | unsigned long immigrate_flags; | ||
| 415 | unsigned long precharge; | 434 | unsigned long precharge; |
| 416 | unsigned long moved_charge; | 435 | unsigned long moved_charge; |
| 417 | unsigned long moved_swap; | 436 | unsigned long moved_swap; |
| @@ -424,14 +443,12 @@ static struct move_charge_struct { | |||
| 424 | 443 | ||
| 425 | static bool move_anon(void) | 444 | static bool move_anon(void) |
| 426 | { | 445 | { |
| 427 | return test_bit(MOVE_CHARGE_TYPE_ANON, | 446 | return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); |
| 428 | &mc.to->move_charge_at_immigrate); | ||
| 429 | } | 447 | } |
| 430 | 448 | ||
| 431 | static bool move_file(void) | 449 | static bool move_file(void) |
| 432 | { | 450 | { |
| 433 | return test_bit(MOVE_CHARGE_TYPE_FILE, | 451 | return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); |
| 434 | &mc.to->move_charge_at_immigrate); | ||
| 435 | } | 452 | } |
| 436 | 453 | ||
| 437 | /* | 454 | /* |
| @@ -471,6 +488,13 @@ enum res_type { | |||
| 471 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 | 488 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 |
| 472 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) | 489 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) |
| 473 | 490 | ||
| 491 | /* | ||
| 492 | * The memcg_create_mutex will be held whenever a new cgroup is created. | ||
| 493 | * As a consequence, any change that needs to protect against new child cgroups | ||
| 494 | * appearing has to hold it as well. | ||
| 495 | */ | ||
| 496 | static DEFINE_MUTEX(memcg_create_mutex); | ||
| 497 | |||
| 474 | static void mem_cgroup_get(struct mem_cgroup *memcg); | 498 | static void mem_cgroup_get(struct mem_cgroup *memcg); |
| 475 | static void mem_cgroup_put(struct mem_cgroup *memcg); | 499 | static void mem_cgroup_put(struct mem_cgroup *memcg); |
| 476 | 500 | ||
| @@ -627,6 +651,7 @@ static void drain_all_stock_async(struct mem_cgroup *memcg); | |||
| 627 | static struct mem_cgroup_per_zone * | 651 | static struct mem_cgroup_per_zone * |
| 628 | mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) | 652 | mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) |
| 629 | { | 653 | { |
| 654 | VM_BUG_ON((unsigned)nid >= nr_node_ids); | ||
| 630 | return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; | 655 | return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; |
| 631 | } | 656 | } |
| 632 | 657 | ||
| @@ -1371,17 +1396,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
| 1371 | return inactive * inactive_ratio < active; | 1396 | return inactive * inactive_ratio < active; |
| 1372 | } | 1397 | } |
| 1373 | 1398 | ||
| 1374 | int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) | ||
| 1375 | { | ||
| 1376 | unsigned long active; | ||
| 1377 | unsigned long inactive; | ||
| 1378 | |||
| 1379 | inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE); | ||
| 1380 | active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE); | ||
| 1381 | |||
| 1382 | return (active > inactive); | ||
| 1383 | } | ||
| 1384 | |||
| 1385 | #define mem_cgroup_from_res_counter(counter, member) \ | 1399 | #define mem_cgroup_from_res_counter(counter, member) \ |
| 1386 | container_of(counter, struct mem_cgroup, member) | 1400 | container_of(counter, struct mem_cgroup, member) |
| 1387 | 1401 | ||
| @@ -1524,8 +1538,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | |||
| 1524 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | 1538 | spin_unlock_irqrestore(&memcg->move_lock, *flags); |
| 1525 | } | 1539 | } |
| 1526 | 1540 | ||
| 1541 | #define K(x) ((x) << (PAGE_SHIFT-10)) | ||
| 1527 | /** | 1542 | /** |
| 1528 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. | 1543 | * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. |
| 1529 | * @memcg: The memory cgroup that went over limit | 1544 | * @memcg: The memory cgroup that went over limit |
| 1530 | * @p: Task that is going to be killed | 1545 | * @p: Task that is going to be killed |
| 1531 | * | 1546 | * |
| @@ -1543,8 +1558,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
| 1543 | */ | 1558 | */ |
| 1544 | static char memcg_name[PATH_MAX]; | 1559 | static char memcg_name[PATH_MAX]; |
| 1545 | int ret; | 1560 | int ret; |
| 1561 | struct mem_cgroup *iter; | ||
| 1562 | unsigned int i; | ||
| 1546 | 1563 | ||
| 1547 | if (!memcg || !p) | 1564 | if (!p) |
| 1548 | return; | 1565 | return; |
| 1549 | 1566 | ||
| 1550 | rcu_read_lock(); | 1567 | rcu_read_lock(); |
| @@ -1563,7 +1580,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
| 1563 | } | 1580 | } |
| 1564 | rcu_read_unlock(); | 1581 | rcu_read_unlock(); |
| 1565 | 1582 | ||
| 1566 | printk(KERN_INFO "Task in %s killed", memcg_name); | 1583 | pr_info("Task in %s killed", memcg_name); |
| 1567 | 1584 | ||
| 1568 | rcu_read_lock(); | 1585 | rcu_read_lock(); |
| 1569 | ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); | 1586 | ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); |
| @@ -1576,22 +1593,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
| 1576 | /* | 1593 | /* |
| 1577 | * Continues from above, so we don't need an KERN_ level | 1594 | * Continues from above, so we don't need an KERN_ level |
| 1578 | */ | 1595 | */ |
| 1579 | printk(KERN_CONT " as a result of limit of %s\n", memcg_name); | 1596 | pr_cont(" as a result of limit of %s\n", memcg_name); |
| 1580 | done: | 1597 | done: |
| 1581 | 1598 | ||
| 1582 | printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", | 1599 | pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", |
| 1583 | res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, | 1600 | res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, |
| 1584 | res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, | 1601 | res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, |
| 1585 | res_counter_read_u64(&memcg->res, RES_FAILCNT)); | 1602 | res_counter_read_u64(&memcg->res, RES_FAILCNT)); |
| 1586 | printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " | 1603 | pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", |
| 1587 | "failcnt %llu\n", | ||
| 1588 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, | 1604 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, |
| 1589 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, | 1605 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, |
| 1590 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); | 1606 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); |
| 1591 | printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", | 1607 | pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", |
| 1592 | res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, | 1608 | res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, |
| 1593 | res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, | 1609 | res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, |
| 1594 | res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); | 1610 | res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); |
| 1611 | |||
| 1612 | for_each_mem_cgroup_tree(iter, memcg) { | ||
| 1613 | pr_info("Memory cgroup stats"); | ||
| 1614 | |||
| 1615 | rcu_read_lock(); | ||
| 1616 | ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX); | ||
| 1617 | if (!ret) | ||
| 1618 | pr_cont(" for %s", memcg_name); | ||
| 1619 | rcu_read_unlock(); | ||
| 1620 | pr_cont(":"); | ||
| 1621 | |||
| 1622 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | ||
| 1623 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) | ||
| 1624 | continue; | ||
| 1625 | pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], | ||
| 1626 | K(mem_cgroup_read_stat(iter, i))); | ||
| 1627 | } | ||
| 1628 | |||
| 1629 | for (i = 0; i < NR_LRU_LISTS; i++) | ||
| 1630 | pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], | ||
| 1631 | K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); | ||
| 1632 | |||
| 1633 | pr_cont("\n"); | ||
| 1634 | } | ||
| 1595 | } | 1635 | } |
| 1596 | 1636 | ||
| 1597 | /* | 1637 | /* |
| @@ -2256,6 +2296,17 @@ static void drain_local_stock(struct work_struct *dummy) | |||
| 2256 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | 2296 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); |
| 2257 | } | 2297 | } |
| 2258 | 2298 | ||
| 2299 | static void __init memcg_stock_init(void) | ||
| 2300 | { | ||
| 2301 | int cpu; | ||
| 2302 | |||
| 2303 | for_each_possible_cpu(cpu) { | ||
| 2304 | struct memcg_stock_pcp *stock = | ||
| 2305 | &per_cpu(memcg_stock, cpu); | ||
| 2306 | INIT_WORK(&stock->work, drain_local_stock); | ||
| 2307 | } | ||
| 2308 | } | ||
| 2309 | |||
| 2259 | /* | 2310 | /* |
| 2260 | * Cache charges(val) which is from res_counter, to local per_cpu area. | 2311 | * Cache charges(val) which is from res_counter, to local per_cpu area. |
| 2261 | * This will be consumed by consume_stock() function, later. | 2312 | * This will be consumed by consume_stock() function, later. |
| @@ -4391,8 +4442,8 @@ void mem_cgroup_print_bad_page(struct page *page) | |||
| 4391 | 4442 | ||
| 4392 | pc = lookup_page_cgroup_used(page); | 4443 | pc = lookup_page_cgroup_used(page); |
| 4393 | if (pc) { | 4444 | if (pc) { |
| 4394 | printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", | 4445 | pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", |
| 4395 | pc, pc->flags, pc->mem_cgroup); | 4446 | pc, pc->flags, pc->mem_cgroup); |
| 4396 | } | 4447 | } |
| 4397 | } | 4448 | } |
| 4398 | #endif | 4449 | #endif |
| @@ -4719,6 +4770,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) | |||
| 4719 | } | 4770 | } |
| 4720 | 4771 | ||
| 4721 | /* | 4772 | /* |
| 4773 | * This mainly exists for tests during the setting of use_hierarchy. | ||
| 4774 | * Since this is the very setting we are changing, the current hierarchy value | ||
| 4775 | * is meaningless. | ||
| 4776 | */ | ||
| 4777 | static inline bool __memcg_has_children(struct mem_cgroup *memcg) | ||
| 4778 | { | ||
| 4779 | struct cgroup *pos; | ||
| 4780 | |||
| 4781 | /* bounce at first found */ | ||
| 4782 | cgroup_for_each_child(pos, memcg->css.cgroup) | ||
| 4783 | return true; | ||
| 4784 | return false; | ||
| 4785 | } | ||
| 4786 | |||
| 4787 | /* | ||
| 4788 | * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed | ||
| 4789 | * to be already dead (as in mem_cgroup_force_empty, for instance). This is | ||
| 4790 | * different from mem_cgroup_count_children(), in the sense that we don't care how | ||
| 4791 | * many children we have; we only need to know if we have any. It also counts | ||
| 4792 | * any memcg without hierarchy as infertile. | ||
| 4793 | */ | ||
| 4794 | static inline bool memcg_has_children(struct mem_cgroup *memcg) | ||
| 4795 | { | ||
| 4796 | return memcg->use_hierarchy && __memcg_has_children(memcg); | ||
| 4797 | } | ||
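
The child test above only needs to know whether at least one child exists, so it returns on the first iteration instead of walking the whole list. A minimal userspace sketch of the same early-return pattern (not part of the patch; the list type and names are invented for illustration):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct node {
        struct node *next;
    };

    /* Return true as soon as one child is seen; there is no need to count them. */
    static bool has_children(const struct node *first_child)
    {
        const struct node *pos;

        for (pos = first_child; pos; pos = pos->next)
            return true;            /* bounce at first found */
        return false;
    }

    int main(void)
    {
        struct node child = { .next = NULL };

        printf("%d %d\n", has_children(NULL), has_children(&child));   /* 0 1 */
        return 0;
    }
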
| 4798 | |||
| 4799 | /* | ||
| 4722 | * Reclaims as many pages from the given memcg as possible and moves | 4800 | * Reclaims as many pages from the given memcg as possible and moves |
| 4723 | * the rest to the parent. | 4801 | * the rest to the parent. |
| 4724 | * | 4802 | * |
| @@ -4788,7 +4866,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
| 4788 | if (parent) | 4866 | if (parent) |
| 4789 | parent_memcg = mem_cgroup_from_cont(parent); | 4867 | parent_memcg = mem_cgroup_from_cont(parent); |
| 4790 | 4868 | ||
| 4791 | cgroup_lock(); | 4869 | mutex_lock(&memcg_create_mutex); |
| 4792 | 4870 | ||
| 4793 | if (memcg->use_hierarchy == val) | 4871 | if (memcg->use_hierarchy == val) |
| 4794 | goto out; | 4872 | goto out; |
| @@ -4803,7 +4881,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
| 4803 | */ | 4881 | */ |
| 4804 | if ((!parent_memcg || !parent_memcg->use_hierarchy) && | 4882 | if ((!parent_memcg || !parent_memcg->use_hierarchy) && |
| 4805 | (val == 1 || val == 0)) { | 4883 | (val == 1 || val == 0)) { |
| 4806 | if (list_empty(&cont->children)) | 4884 | if (!__memcg_has_children(memcg)) |
| 4807 | memcg->use_hierarchy = val; | 4885 | memcg->use_hierarchy = val; |
| 4808 | else | 4886 | else |
| 4809 | retval = -EBUSY; | 4887 | retval = -EBUSY; |
| @@ -4811,7 +4889,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
| 4811 | retval = -EINVAL; | 4889 | retval = -EINVAL; |
| 4812 | 4890 | ||
| 4813 | out: | 4891 | out: |
| 4814 | cgroup_unlock(); | 4892 | mutex_unlock(&memcg_create_mutex); |
| 4815 | 4893 | ||
| 4816 | return retval; | 4894 | return retval; |
| 4817 | } | 4895 | } |
| @@ -4896,8 +4974,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | |||
| 4896 | { | 4974 | { |
| 4897 | int ret = -EINVAL; | 4975 | int ret = -EINVAL; |
| 4898 | #ifdef CONFIG_MEMCG_KMEM | 4976 | #ifdef CONFIG_MEMCG_KMEM |
| 4899 | bool must_inc_static_branch = false; | ||
| 4900 | |||
| 4901 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4977 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
| 4902 | /* | 4978 | /* |
| 4903 | * For simplicity, we won't allow this to be disabled. It also can't | 4979 | * For simplicity, we won't allow this to be disabled. It also can't |
| @@ -4910,18 +4986,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | |||
| 4910 | * | 4986 | * |
| 4911 | * After it first became limited, changes in the value of the limit are | 4987 | * After it first became limited, changes in the value of the limit are |
| 4912 | * of course permitted. | 4988 | * of course permitted. |
| 4913 | * | ||
| 4914 | * Taking the cgroup_lock is really offensive, but it is so far the only | ||
| 4915 | * way to guarantee that no children will appear. There are plenty of | ||
| 4916 | * other offenders, and they should all go away. Fine grained locking | ||
| 4917 | * is probably the way to go here. When we are fully hierarchical, we | ||
| 4918 | * can also get rid of the use_hierarchy check. | ||
| 4919 | */ | 4989 | */ |
| 4920 | cgroup_lock(); | 4990 | mutex_lock(&memcg_create_mutex); |
| 4921 | mutex_lock(&set_limit_mutex); | 4991 | mutex_lock(&set_limit_mutex); |
| 4922 | if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { | 4992 | if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { |
| 4923 | if (cgroup_task_count(cont) || (memcg->use_hierarchy && | 4993 | if (cgroup_task_count(cont) || memcg_has_children(memcg)) { |
| 4924 | !list_empty(&cont->children))) { | ||
| 4925 | ret = -EBUSY; | 4994 | ret = -EBUSY; |
| 4926 | goto out; | 4995 | goto out; |
| 4927 | } | 4996 | } |
| @@ -4933,7 +5002,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | |||
| 4933 | res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); | 5002 | res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); |
| 4934 | goto out; | 5003 | goto out; |
| 4935 | } | 5004 | } |
| 4936 | must_inc_static_branch = true; | 5005 | static_key_slow_inc(&memcg_kmem_enabled_key); |
| 5006 | /* | ||
| 5007 | * setting the active bit after the inc will guarantee no one | ||
| 5008 | * starts accounting before all call sites are patched | ||
| 5009 | */ | ||
| 5010 | memcg_kmem_set_active(memcg); | ||
| 5011 | |||
| 4937 | /* | 5012 | /* |
| 4938 | * kmem charges can outlive the cgroup. In the case of slab | 5013 | * kmem charges can outlive the cgroup. In the case of slab |
| 4939 | * pages, for instance, a page contain objects from various | 5014 | * pages, for instance, a page contain objects from various |
| @@ -4945,32 +5020,12 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | |||
| 4945 | ret = res_counter_set_limit(&memcg->kmem, val); | 5020 | ret = res_counter_set_limit(&memcg->kmem, val); |
| 4946 | out: | 5021 | out: |
| 4947 | mutex_unlock(&set_limit_mutex); | 5022 | mutex_unlock(&set_limit_mutex); |
| 4948 | cgroup_unlock(); | 5023 | mutex_unlock(&memcg_create_mutex); |
| 4949 | |||
| 4950 | /* | ||
| 4951 | * We are by now familiar with the fact that we can't inc the static | ||
| 4952 | * branch inside cgroup_lock. See disarm functions for details. A | ||
| 4953 | * worker here is overkill, but also wrong: After the limit is set, we | ||
| 4954 | * must start accounting right away. Since this operation can't fail, | ||
| 4955 | * we can safely defer it to here - no rollback will be needed. | ||
| 4956 | * | ||
| 4957 | * The boolean used to control this is also safe, because | ||
| 4958 | * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be | ||
| 4959 | * able to set it to true; | ||
| 4960 | */ | ||
| 4961 | if (must_inc_static_branch) { | ||
| 4962 | static_key_slow_inc(&memcg_kmem_enabled_key); | ||
| 4963 | /* | ||
| 4964 | * setting the active bit after the inc will guarantee no one | ||
| 4965 | * starts accounting before all call sites are patched | ||
| 4966 | */ | ||
| 4967 | memcg_kmem_set_active(memcg); | ||
| 4968 | } | ||
| 4969 | |||
| 4970 | #endif | 5024 | #endif |
| 4971 | return ret; | 5025 | return ret; |
| 4972 | } | 5026 | } |
| 4973 | 5027 | ||
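
The reordering above depends on publishing state in a safe order: the global static key is bumped first, and only then is the per-memcg active bit set, so no charge path can observe an active group while the key is still off. A rough userspace analogue using C11 atomics (an illustration of the ordering, not the kernel primitives):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int accounting_key;          /* stands in for the static key */

    struct group {
        atomic_bool kmem_active;               /* per-group activation flag */
    };

    static void activate_kmem(struct group *g)
    {
        /* Enable the global mechanism first... */
        atomic_fetch_add(&accounting_key, 1);
        /* ...then publish the per-group active bit. */
        atomic_store_explicit(&g->kmem_active, true, memory_order_release);
    }

    static bool should_account(struct group *g)
    {
        /* A reader that sees the active bit also sees the key already enabled. */
        return atomic_load_explicit(&g->kmem_active, memory_order_acquire) &&
               atomic_load(&accounting_key) > 0;
    }

    int main(void)
    {
        struct group g = { .kmem_active = false };

        activate_kmem(&g);
        printf("%d\n", should_account(&g));    /* 1 */
        return 0;
    }
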
| 5028 | #ifdef CONFIG_MEMCG_KMEM | ||
| 4974 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) | 5029 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) |
| 4975 | { | 5030 | { |
| 4976 | int ret = 0; | 5031 | int ret = 0; |
| @@ -4979,7 +5034,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) | |||
| 4979 | goto out; | 5034 | goto out; |
| 4980 | 5035 | ||
| 4981 | memcg->kmem_account_flags = parent->kmem_account_flags; | 5036 | memcg->kmem_account_flags = parent->kmem_account_flags; |
| 4982 | #ifdef CONFIG_MEMCG_KMEM | ||
| 4983 | /* | 5037 | /* |
| 4984 | * When that happen, we need to disable the static branch only on those | 5038 | * When that happen, we need to disable the static branch only on those |
| 4985 | * memcgs that enabled it. To achieve this, we would be forced to | 5039 | * memcgs that enabled it. To achieve this, we would be forced to |
| @@ -5005,10 +5059,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) | |||
| 5005 | mutex_lock(&set_limit_mutex); | 5059 | mutex_lock(&set_limit_mutex); |
| 5006 | ret = memcg_update_cache_sizes(memcg); | 5060 | ret = memcg_update_cache_sizes(memcg); |
| 5007 | mutex_unlock(&set_limit_mutex); | 5061 | mutex_unlock(&set_limit_mutex); |
| 5008 | #endif | ||
| 5009 | out: | 5062 | out: |
| 5010 | return ret; | 5063 | return ret; |
| 5011 | } | 5064 | } |
| 5065 | #endif /* CONFIG_MEMCG_KMEM */ | ||
| 5012 | 5066 | ||
| 5013 | /* | 5067 | /* |
| 5014 | * The user of this function is... | 5068 | * The user of this function is... |
| @@ -5148,15 +5202,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | |||
| 5148 | 5202 | ||
| 5149 | if (val >= (1 << NR_MOVE_TYPE)) | 5203 | if (val >= (1 << NR_MOVE_TYPE)) |
| 5150 | return -EINVAL; | 5204 | return -EINVAL; |
| 5205 | |||
| 5151 | /* | 5206 | /* |
| 5152 | * We check this value several times in both can_attach() and | 5207 | * No kind of locking is needed here, because ->can_attach() will |
| 5153 | * attach(), so we need cgroup lock to prevent this value from being | 5208 | * check this value once at the beginning of the process, and then carry |
| 5154 | * inconsistent. | 5209 | * on with stale data. This means that changes to this value will only |
| 5210 | * affect task migrations starting after the change. | ||
| 5155 | */ | 5211 | */ |
| 5156 | cgroup_lock(); | ||
| 5157 | memcg->move_charge_at_immigrate = val; | 5212 | memcg->move_charge_at_immigrate = val; |
| 5158 | cgroup_unlock(); | ||
| 5159 | |||
| 5160 | return 0; | 5213 | return 0; |
| 5161 | } | 5214 | } |
| 5162 | #else | 5215 | #else |
| @@ -5214,14 +5267,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
| 5214 | } | 5267 | } |
| 5215 | #endif /* CONFIG_NUMA */ | 5268 | #endif /* CONFIG_NUMA */ |
| 5216 | 5269 | ||
| 5217 | static const char * const mem_cgroup_lru_names[] = { | ||
| 5218 | "inactive_anon", | ||
| 5219 | "active_anon", | ||
| 5220 | "inactive_file", | ||
| 5221 | "active_file", | ||
| 5222 | "unevictable", | ||
| 5223 | }; | ||
| 5224 | |||
| 5225 | static inline void mem_cgroup_lru_names_not_uptodate(void) | 5270 | static inline void mem_cgroup_lru_names_not_uptodate(void) |
| 5226 | { | 5271 | { |
| 5227 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 5272 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
| @@ -5335,18 +5380,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | |||
| 5335 | 5380 | ||
| 5336 | parent = mem_cgroup_from_cont(cgrp->parent); | 5381 | parent = mem_cgroup_from_cont(cgrp->parent); |
| 5337 | 5382 | ||
| 5338 | cgroup_lock(); | 5383 | mutex_lock(&memcg_create_mutex); |
| 5339 | 5384 | ||
| 5340 | /* If under hierarchy, only empty-root can set this value */ | 5385 | /* If under hierarchy, only empty-root can set this value */ |
| 5341 | if ((parent->use_hierarchy) || | 5386 | if ((parent->use_hierarchy) || memcg_has_children(memcg)) { |
| 5342 | (memcg->use_hierarchy && !list_empty(&cgrp->children))) { | 5387 | mutex_unlock(&memcg_create_mutex); |
| 5343 | cgroup_unlock(); | ||
| 5344 | return -EINVAL; | 5388 | return -EINVAL; |
| 5345 | } | 5389 | } |
| 5346 | 5390 | ||
| 5347 | memcg->swappiness = val; | 5391 | memcg->swappiness = val; |
| 5348 | 5392 | ||
| 5349 | cgroup_unlock(); | 5393 | mutex_unlock(&memcg_create_mutex); |
| 5350 | 5394 | ||
| 5351 | return 0; | 5395 | return 0; |
| 5352 | } | 5396 | } |
| @@ -5672,17 +5716,16 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
| 5672 | 5716 | ||
| 5673 | parent = mem_cgroup_from_cont(cgrp->parent); | 5717 | parent = mem_cgroup_from_cont(cgrp->parent); |
| 5674 | 5718 | ||
| 5675 | cgroup_lock(); | 5719 | mutex_lock(&memcg_create_mutex); |
| 5676 | /* oom-kill-disable is a flag for subhierarchy. */ | 5720 | /* oom-kill-disable is a flag for subhierarchy. */ |
| 5677 | if ((parent->use_hierarchy) || | 5721 | if ((parent->use_hierarchy) || memcg_has_children(memcg)) { |
| 5678 | (memcg->use_hierarchy && !list_empty(&cgrp->children))) { | 5722 | mutex_unlock(&memcg_create_mutex); |
| 5679 | cgroup_unlock(); | ||
| 5680 | return -EINVAL; | 5723 | return -EINVAL; |
| 5681 | } | 5724 | } |
| 5682 | memcg->oom_kill_disable = val; | 5725 | memcg->oom_kill_disable = val; |
| 5683 | if (!val) | 5726 | if (!val) |
| 5684 | memcg_oom_recover(memcg); | 5727 | memcg_oom_recover(memcg); |
| 5685 | cgroup_unlock(); | 5728 | mutex_unlock(&memcg_create_mutex); |
| 5686 | return 0; | 5729 | return 0; |
| 5687 | } | 5730 | } |
| 5688 | 5731 | ||
| @@ -5797,33 +5840,6 @@ static struct cftype mem_cgroup_files[] = { | |||
| 5797 | .read_seq_string = memcg_numa_stat_show, | 5840 | .read_seq_string = memcg_numa_stat_show, |
| 5798 | }, | 5841 | }, |
| 5799 | #endif | 5842 | #endif |
| 5800 | #ifdef CONFIG_MEMCG_SWAP | ||
| 5801 | { | ||
| 5802 | .name = "memsw.usage_in_bytes", | ||
| 5803 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
| 5804 | .read = mem_cgroup_read, | ||
| 5805 | .register_event = mem_cgroup_usage_register_event, | ||
| 5806 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
| 5807 | }, | ||
| 5808 | { | ||
| 5809 | .name = "memsw.max_usage_in_bytes", | ||
| 5810 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
| 5811 | .trigger = mem_cgroup_reset, | ||
| 5812 | .read = mem_cgroup_read, | ||
| 5813 | }, | ||
| 5814 | { | ||
| 5815 | .name = "memsw.limit_in_bytes", | ||
| 5816 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
| 5817 | .write_string = mem_cgroup_write, | ||
| 5818 | .read = mem_cgroup_read, | ||
| 5819 | }, | ||
| 5820 | { | ||
| 5821 | .name = "memsw.failcnt", | ||
| 5822 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
| 5823 | .trigger = mem_cgroup_reset, | ||
| 5824 | .read = mem_cgroup_read, | ||
| 5825 | }, | ||
| 5826 | #endif | ||
| 5827 | #ifdef CONFIG_MEMCG_KMEM | 5843 | #ifdef CONFIG_MEMCG_KMEM |
| 5828 | { | 5844 | { |
| 5829 | .name = "kmem.limit_in_bytes", | 5845 | .name = "kmem.limit_in_bytes", |
| @@ -5858,6 +5874,36 @@ static struct cftype mem_cgroup_files[] = { | |||
| 5858 | { }, /* terminate */ | 5874 | { }, /* terminate */ |
| 5859 | }; | 5875 | }; |
| 5860 | 5876 | ||
| 5877 | #ifdef CONFIG_MEMCG_SWAP | ||
| 5878 | static struct cftype memsw_cgroup_files[] = { | ||
| 5879 | { | ||
| 5880 | .name = "memsw.usage_in_bytes", | ||
| 5881 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
| 5882 | .read = mem_cgroup_read, | ||
| 5883 | .register_event = mem_cgroup_usage_register_event, | ||
| 5884 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
| 5885 | }, | ||
| 5886 | { | ||
| 5887 | .name = "memsw.max_usage_in_bytes", | ||
| 5888 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
| 5889 | .trigger = mem_cgroup_reset, | ||
| 5890 | .read = mem_cgroup_read, | ||
| 5891 | }, | ||
| 5892 | { | ||
| 5893 | .name = "memsw.limit_in_bytes", | ||
| 5894 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
| 5895 | .write_string = mem_cgroup_write, | ||
| 5896 | .read = mem_cgroup_read, | ||
| 5897 | }, | ||
| 5898 | { | ||
| 5899 | .name = "memsw.failcnt", | ||
| 5900 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
| 5901 | .trigger = mem_cgroup_reset, | ||
| 5902 | .read = mem_cgroup_read, | ||
| 5903 | }, | ||
| 5904 | { }, /* terminate */ | ||
| 5905 | }; | ||
| 5906 | #endif | ||
| 5861 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 5907 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
| 5862 | { | 5908 | { |
| 5863 | struct mem_cgroup_per_node *pn; | 5909 | struct mem_cgroup_per_node *pn; |
| @@ -5896,9 +5942,9 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
| 5896 | static struct mem_cgroup *mem_cgroup_alloc(void) | 5942 | static struct mem_cgroup *mem_cgroup_alloc(void) |
| 5897 | { | 5943 | { |
| 5898 | struct mem_cgroup *memcg; | 5944 | struct mem_cgroup *memcg; |
| 5899 | int size = sizeof(struct mem_cgroup); | 5945 | size_t size = memcg_size(); |
| 5900 | 5946 | ||
| 5901 | /* Can be very big if MAX_NUMNODES is very big */ | 5947 | /* Can be very big if nr_node_ids is very big */ |
| 5902 | if (size < PAGE_SIZE) | 5948 | if (size < PAGE_SIZE) |
| 5903 | memcg = kzalloc(size, GFP_KERNEL); | 5949 | memcg = kzalloc(size, GFP_KERNEL); |
| 5904 | else | 5950 | else |
| @@ -5935,7 +5981,7 @@ out_free: | |||
| 5935 | static void __mem_cgroup_free(struct mem_cgroup *memcg) | 5981 | static void __mem_cgroup_free(struct mem_cgroup *memcg) |
| 5936 | { | 5982 | { |
| 5937 | int node; | 5983 | int node; |
| 5938 | int size = sizeof(struct mem_cgroup); | 5984 | size_t size = memcg_size(); |
| 5939 | 5985 | ||
| 5940 | mem_cgroup_remove_from_trees(memcg); | 5986 | mem_cgroup_remove_from_trees(memcg); |
| 5941 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 5987 | free_css_id(&mem_cgroup_subsys, &memcg->css); |
| @@ -6017,19 +6063,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
| 6017 | } | 6063 | } |
| 6018 | EXPORT_SYMBOL(parent_mem_cgroup); | 6064 | EXPORT_SYMBOL(parent_mem_cgroup); |
| 6019 | 6065 | ||
| 6020 | #ifdef CONFIG_MEMCG_SWAP | 6066 | static void __init mem_cgroup_soft_limit_tree_init(void) |
| 6021 | static void __init enable_swap_cgroup(void) | ||
| 6022 | { | ||
| 6023 | if (!mem_cgroup_disabled() && really_do_swap_account) | ||
| 6024 | do_swap_account = 1; | ||
| 6025 | } | ||
| 6026 | #else | ||
| 6027 | static void __init enable_swap_cgroup(void) | ||
| 6028 | { | ||
| 6029 | } | ||
| 6030 | #endif | ||
| 6031 | |||
| 6032 | static int mem_cgroup_soft_limit_tree_init(void) | ||
| 6033 | { | 6067 | { |
| 6034 | struct mem_cgroup_tree_per_node *rtpn; | 6068 | struct mem_cgroup_tree_per_node *rtpn; |
| 6035 | struct mem_cgroup_tree_per_zone *rtpz; | 6069 | struct mem_cgroup_tree_per_zone *rtpz; |
| @@ -6040,8 +6074,7 @@ static int mem_cgroup_soft_limit_tree_init(void) | |||
| 6040 | if (!node_state(node, N_NORMAL_MEMORY)) | 6074 | if (!node_state(node, N_NORMAL_MEMORY)) |
| 6041 | tmp = -1; | 6075 | tmp = -1; |
| 6042 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | 6076 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); |
| 6043 | if (!rtpn) | 6077 | BUG_ON(!rtpn); |
| 6044 | goto err_cleanup; | ||
| 6045 | 6078 | ||
| 6046 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | 6079 | soft_limit_tree.rb_tree_per_node[node] = rtpn; |
| 6047 | 6080 | ||
| @@ -6051,23 +6084,12 @@ static int mem_cgroup_soft_limit_tree_init(void) | |||
| 6051 | spin_lock_init(&rtpz->lock); | 6084 | spin_lock_init(&rtpz->lock); |
| 6052 | } | 6085 | } |
| 6053 | } | 6086 | } |
| 6054 | return 0; | ||
| 6055 | |||
| 6056 | err_cleanup: | ||
| 6057 | for_each_node(node) { | ||
| 6058 | if (!soft_limit_tree.rb_tree_per_node[node]) | ||
| 6059 | break; | ||
| 6060 | kfree(soft_limit_tree.rb_tree_per_node[node]); | ||
| 6061 | soft_limit_tree.rb_tree_per_node[node] = NULL; | ||
| 6062 | } | ||
| 6063 | return 1; | ||
| 6064 | |||
| 6065 | } | 6087 | } |
| 6066 | 6088 | ||
| 6067 | static struct cgroup_subsys_state * __ref | 6089 | static struct cgroup_subsys_state * __ref |
| 6068 | mem_cgroup_css_alloc(struct cgroup *cont) | 6090 | mem_cgroup_css_alloc(struct cgroup *cont) |
| 6069 | { | 6091 | { |
| 6070 | struct mem_cgroup *memcg, *parent; | 6092 | struct mem_cgroup *memcg; |
| 6071 | long error = -ENOMEM; | 6093 | long error = -ENOMEM; |
| 6072 | int node; | 6094 | int node; |
| 6073 | 6095 | ||
| @@ -6081,24 +6103,44 @@ mem_cgroup_css_alloc(struct cgroup *cont) | |||
| 6081 | 6103 | ||
| 6082 | /* root ? */ | 6104 | /* root ? */ |
| 6083 | if (cont->parent == NULL) { | 6105 | if (cont->parent == NULL) { |
| 6084 | int cpu; | ||
| 6085 | enable_swap_cgroup(); | ||
| 6086 | parent = NULL; | ||
| 6087 | if (mem_cgroup_soft_limit_tree_init()) | ||
| 6088 | goto free_out; | ||
| 6089 | root_mem_cgroup = memcg; | 6106 | root_mem_cgroup = memcg; |
| 6090 | for_each_possible_cpu(cpu) { | 6107 | res_counter_init(&memcg->res, NULL); |
| 6091 | struct memcg_stock_pcp *stock = | 6108 | res_counter_init(&memcg->memsw, NULL); |
| 6092 | &per_cpu(memcg_stock, cpu); | 6109 | res_counter_init(&memcg->kmem, NULL); |
| 6093 | INIT_WORK(&stock->work, drain_local_stock); | ||
| 6094 | } | ||
| 6095 | } else { | ||
| 6096 | parent = mem_cgroup_from_cont(cont->parent); | ||
| 6097 | memcg->use_hierarchy = parent->use_hierarchy; | ||
| 6098 | memcg->oom_kill_disable = parent->oom_kill_disable; | ||
| 6099 | } | 6110 | } |
| 6100 | 6111 | ||
| 6101 | if (parent && parent->use_hierarchy) { | 6112 | memcg->last_scanned_node = MAX_NUMNODES; |
| 6113 | INIT_LIST_HEAD(&memcg->oom_notify); | ||
| 6114 | atomic_set(&memcg->refcnt, 1); | ||
| 6115 | memcg->move_charge_at_immigrate = 0; | ||
| 6116 | mutex_init(&memcg->thresholds_lock); | ||
| 6117 | spin_lock_init(&memcg->move_lock); | ||
| 6118 | |||
| 6119 | return &memcg->css; | ||
| 6120 | |||
| 6121 | free_out: | ||
| 6122 | __mem_cgroup_free(memcg); | ||
| 6123 | return ERR_PTR(error); | ||
| 6124 | } | ||
| 6125 | |||
| 6126 | static int | ||
| 6127 | mem_cgroup_css_online(struct cgroup *cont) | ||
| 6128 | { | ||
| 6129 | struct mem_cgroup *memcg, *parent; | ||
| 6130 | int error = 0; | ||
| 6131 | |||
| 6132 | if (!cont->parent) | ||
| 6133 | return 0; | ||
| 6134 | |||
| 6135 | mutex_lock(&memcg_create_mutex); | ||
| 6136 | memcg = mem_cgroup_from_cont(cont); | ||
| 6137 | parent = mem_cgroup_from_cont(cont->parent); | ||
| 6138 | |||
| 6139 | memcg->use_hierarchy = parent->use_hierarchy; | ||
| 6140 | memcg->oom_kill_disable = parent->oom_kill_disable; | ||
| 6141 | memcg->swappiness = mem_cgroup_swappiness(parent); | ||
| 6142 | |||
| 6143 | if (parent->use_hierarchy) { | ||
| 6102 | res_counter_init(&memcg->res, &parent->res); | 6144 | res_counter_init(&memcg->res, &parent->res); |
| 6103 | res_counter_init(&memcg->memsw, &parent->memsw); | 6145 | res_counter_init(&memcg->memsw, &parent->memsw); |
| 6104 | res_counter_init(&memcg->kmem, &parent->kmem); | 6146 | res_counter_init(&memcg->kmem, &parent->kmem); |
| @@ -6119,20 +6161,12 @@ mem_cgroup_css_alloc(struct cgroup *cont) | |||
| 6119 | * much sense so let cgroup subsystem know about this | 6161 | * much sense so let cgroup subsystem know about this |
| 6120 | * unfortunate state in our controller. | 6162 | * unfortunate state in our controller. |
| 6121 | */ | 6163 | */ |
| 6122 | if (parent && parent != root_mem_cgroup) | 6164 | if (parent != root_mem_cgroup) |
| 6123 | mem_cgroup_subsys.broken_hierarchy = true; | 6165 | mem_cgroup_subsys.broken_hierarchy = true; |
| 6124 | } | 6166 | } |
| 6125 | memcg->last_scanned_node = MAX_NUMNODES; | ||
| 6126 | INIT_LIST_HEAD(&memcg->oom_notify); | ||
| 6127 | |||
| 6128 | if (parent) | ||
| 6129 | memcg->swappiness = mem_cgroup_swappiness(parent); | ||
| 6130 | atomic_set(&memcg->refcnt, 1); | ||
| 6131 | memcg->move_charge_at_immigrate = 0; | ||
| 6132 | mutex_init(&memcg->thresholds_lock); | ||
| 6133 | spin_lock_init(&memcg->move_lock); | ||
| 6134 | 6167 | ||
| 6135 | error = memcg_init_kmem(memcg, &mem_cgroup_subsys); | 6168 | error = memcg_init_kmem(memcg, &mem_cgroup_subsys); |
| 6169 | mutex_unlock(&memcg_create_mutex); | ||
| 6136 | if (error) { | 6170 | if (error) { |
| 6137 | /* | 6171 | /* |
| 6138 | * We call put now because our (and parent's) refcnts | 6172 | * We call put now because our (and parent's) refcnts |
| @@ -6140,12 +6174,10 @@ mem_cgroup_css_alloc(struct cgroup *cont) | |||
| 6140 | * call __mem_cgroup_free, so return directly | 6174 | * call __mem_cgroup_free, so return directly |
| 6141 | */ | 6175 | */ |
| 6142 | mem_cgroup_put(memcg); | 6176 | mem_cgroup_put(memcg); |
| 6143 | return ERR_PTR(error); | 6177 | if (parent->use_hierarchy) |
| 6178 | mem_cgroup_put(parent); | ||
| 6144 | } | 6179 | } |
| 6145 | return &memcg->css; | 6180 | return error; |
| 6146 | free_out: | ||
| 6147 | __mem_cgroup_free(memcg); | ||
| 6148 | return ERR_PTR(error); | ||
| 6149 | } | 6181 | } |
| 6150 | 6182 | ||
| 6151 | static void mem_cgroup_css_offline(struct cgroup *cont) | 6183 | static void mem_cgroup_css_offline(struct cgroup *cont) |
| @@ -6281,7 +6313,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | |||
| 6281 | * Because lookup_swap_cache() updates some statistics counter, | 6313 | * Because lookup_swap_cache() updates some statistics counter, |
| 6282 | * we call find_get_page() with swapper_space directly. | 6314 | * we call find_get_page() with swapper_space directly. |
| 6283 | */ | 6315 | */ |
| 6284 | page = find_get_page(&swapper_space, ent.val); | 6316 | page = find_get_page(swap_address_space(ent), ent.val); |
| 6285 | if (do_swap_account) | 6317 | if (do_swap_account) |
| 6286 | entry->val = ent.val; | 6318 | entry->val = ent.val; |
| 6287 | 6319 | ||
| @@ -6322,7 +6354,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
| 6322 | swp_entry_t swap = radix_to_swp_entry(page); | 6354 | swp_entry_t swap = radix_to_swp_entry(page); |
| 6323 | if (do_swap_account) | 6355 | if (do_swap_account) |
| 6324 | *entry = swap; | 6356 | *entry = swap; |
| 6325 | page = find_get_page(&swapper_space, swap.val); | 6357 | page = find_get_page(swap_address_space(swap), swap.val); |
| 6326 | } | 6358 | } |
| 6327 | #endif | 6359 | #endif |
| 6328 | return page; | 6360 | return page; |
| @@ -6532,8 +6564,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, | |||
| 6532 | struct task_struct *p = cgroup_taskset_first(tset); | 6564 | struct task_struct *p = cgroup_taskset_first(tset); |
| 6533 | int ret = 0; | 6565 | int ret = 0; |
| 6534 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); | 6566 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); |
| 6567 | unsigned long move_charge_at_immigrate; | ||
| 6535 | 6568 | ||
| 6536 | if (memcg->move_charge_at_immigrate) { | 6569 | /* |
| 6570 | * We are now commited to this value whatever it is. Changes in this | ||
| 6571 | * tunable will only affect upcoming migrations, not the current one. | ||
| 6572 | * So we need to save it, and keep it going. | ||
| 6573 | */ | ||
| 6574 | move_charge_at_immigrate = memcg->move_charge_at_immigrate; | ||
| 6575 | if (move_charge_at_immigrate) { | ||
| 6537 | struct mm_struct *mm; | 6576 | struct mm_struct *mm; |
| 6538 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 6577 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
| 6539 | 6578 | ||
| @@ -6553,6 +6592,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, | |||
| 6553 | spin_lock(&mc.lock); | 6592 | spin_lock(&mc.lock); |
| 6554 | mc.from = from; | 6593 | mc.from = from; |
| 6555 | mc.to = memcg; | 6594 | mc.to = memcg; |
| 6595 | mc.immigrate_flags = move_charge_at_immigrate; | ||
| 6556 | spin_unlock(&mc.lock); | 6596 | spin_unlock(&mc.lock); |
| 6557 | /* We set mc.moving_task later */ | 6597 | /* We set mc.moving_task later */ |
| 6558 | 6598 | ||
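
With the cgroup lock gone, ->can_attach() copes by snapshotting move_charge_at_immigrate once and carrying that copy (mc.immigrate_flags) through the whole migration, so a concurrent write only affects migrations that start later. A small sketch of that snapshot-once pattern (userspace illustration, hypothetical names):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_ulong move_flags;            /* tunable, writable at any time */

    /* A migration snapshots the tunable once and uses that copy throughout. */
    static void migrate_task(void)
    {
        unsigned long flags = atomic_load(&move_flags);   /* single snapshot */

        if (flags & 1UL)
            printf("moving anonymous page charges\n");
        if (flags & 2UL)
            printf("moving file page charges\n");
    }

    int main(void)
    {
        atomic_store(&move_flags, 3UL);
        migrate_task();    /* a later write to move_flags cannot affect this call */
        return 0;
    }
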
| @@ -6747,6 +6787,7 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
| 6747 | .name = "memory", | 6787 | .name = "memory", |
| 6748 | .subsys_id = mem_cgroup_subsys_id, | 6788 | .subsys_id = mem_cgroup_subsys_id, |
| 6749 | .css_alloc = mem_cgroup_css_alloc, | 6789 | .css_alloc = mem_cgroup_css_alloc, |
| 6790 | .css_online = mem_cgroup_css_online, | ||
| 6750 | .css_offline = mem_cgroup_css_offline, | 6791 | .css_offline = mem_cgroup_css_offline, |
| 6751 | .css_free = mem_cgroup_css_free, | 6792 | .css_free = mem_cgroup_css_free, |
| 6752 | .can_attach = mem_cgroup_can_attach, | 6793 | .can_attach = mem_cgroup_can_attach, |
| @@ -6757,19 +6798,6 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
| 6757 | .use_id = 1, | 6798 | .use_id = 1, |
| 6758 | }; | 6799 | }; |
| 6759 | 6800 | ||
| 6760 | /* | ||
| 6761 | * The rest of init is performed during ->css_alloc() for root css which | ||
| 6762 | * happens before initcalls. hotcpu_notifier() can't be done together as | ||
| 6763 | * it would introduce circular locking by adding cgroup_lock -> cpu hotplug | ||
| 6764 | * dependency. Do it from a subsys_initcall(). | ||
| 6765 | */ | ||
| 6766 | static int __init mem_cgroup_init(void) | ||
| 6767 | { | ||
| 6768 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | ||
| 6769 | return 0; | ||
| 6770 | } | ||
| 6771 | subsys_initcall(mem_cgroup_init); | ||
| 6772 | |||
| 6773 | #ifdef CONFIG_MEMCG_SWAP | 6801 | #ifdef CONFIG_MEMCG_SWAP |
| 6774 | static int __init enable_swap_account(char *s) | 6802 | static int __init enable_swap_account(char *s) |
| 6775 | { | 6803 | { |
| @@ -6782,4 +6810,39 @@ static int __init enable_swap_account(char *s) | |||
| 6782 | } | 6810 | } |
| 6783 | __setup("swapaccount=", enable_swap_account); | 6811 | __setup("swapaccount=", enable_swap_account); |
| 6784 | 6812 | ||
| 6813 | static void __init memsw_file_init(void) | ||
| 6814 | { | ||
| 6815 | WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); | ||
| 6816 | } | ||
| 6817 | |||
| 6818 | static void __init enable_swap_cgroup(void) | ||
| 6819 | { | ||
| 6820 | if (!mem_cgroup_disabled() && really_do_swap_account) { | ||
| 6821 | do_swap_account = 1; | ||
| 6822 | memsw_file_init(); | ||
| 6823 | } | ||
| 6824 | } | ||
| 6825 | |||
| 6826 | #else | ||
| 6827 | static void __init enable_swap_cgroup(void) | ||
| 6828 | { | ||
| 6829 | } | ||
| 6785 | #endif | 6830 | #endif |
| 6831 | |||
| 6832 | /* | ||
| 6833 | * subsys_initcall() for memory controller. | ||
| 6834 | * | ||
| 6835 | * Some parts like hotcpu_notifier() have to be initialized from this context | ||
| 6836 | * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically | ||
| 6837 | * everything that doesn't depend on a specific mem_cgroup structure should | ||
| 6838 | * be initialized from here. | ||
| 6839 | */ | ||
| 6840 | static int __init mem_cgroup_init(void) | ||
| 6841 | { | ||
| 6842 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | ||
| 6843 | enable_swap_cgroup(); | ||
| 6844 | mem_cgroup_soft_limit_tree_init(); | ||
| 6845 | memcg_stock_init(); | ||
| 6846 | return 0; | ||
| 6847 | } | ||
| 6848 | subsys_initcall(mem_cgroup_init); | ||
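
The memsw.* files are now registered by cgroup_add_cftypes() from the initcall, and only when swap accounting is actually enabled, instead of sitting unconditionally in the main file table. A hedged sketch of that gate, with userspace stand-ins for the boot-parameter handler and the registration call:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    static bool really_do_swap_account = true;     /* "swapaccount=0" clears it */

    /* Stand-in for the __setup("swapaccount=", ...) handler. */
    static void parse_swapaccount(const char *s)
    {
        if (s && !strcmp(s, "0"))
            really_do_swap_account = false;
    }

    /* Stand-in for cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files). */
    static void memsw_file_init(void)
    {
        printf("memsw.* control files registered\n");
    }

    static void controller_init(void)
    {
        if (really_do_swap_account)
            memsw_file_init();
    }

    int main(void)
    {
        parse_swapaccount("1");
        controller_init();     /* with "swapaccount=0" the files would stay hidden */
        return 0;
    }
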
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c6e4dd3e1c08..df0694c6adef 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
| @@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0; | |||
| 61 | 61 | ||
| 62 | int sysctl_memory_failure_recovery __read_mostly = 1; | 62 | int sysctl_memory_failure_recovery __read_mostly = 1; |
| 63 | 63 | ||
| 64 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); | 64 | atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); |
| 65 | 65 | ||
| 66 | #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) | 66 | #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) |
| 67 | 67 | ||
| @@ -784,12 +784,12 @@ static struct page_state { | |||
| 784 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, | 784 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, |
| 785 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, | 785 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, |
| 786 | 786 | ||
| 787 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, | ||
| 788 | { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, | ||
| 789 | |||
| 790 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, | 787 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, |
| 791 | { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, | 788 | { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, |
| 792 | 789 | ||
| 790 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, | ||
| 791 | { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, | ||
| 792 | |||
| 793 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, | 793 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, |
| 794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
| 795 | 795 | ||
| @@ -1021,6 +1021,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1021 | struct page *hpage; | 1021 | struct page *hpage; |
| 1022 | int res; | 1022 | int res; |
| 1023 | unsigned int nr_pages; | 1023 | unsigned int nr_pages; |
| 1024 | unsigned long page_flags; | ||
| 1024 | 1025 | ||
| 1025 | if (!sysctl_memory_failure_recovery) | 1026 | if (!sysctl_memory_failure_recovery) |
| 1026 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | 1027 | panic("Memory failure from trap %d on page %lx", trapno, pfn); |
| @@ -1039,8 +1040,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1039 | return 0; | 1040 | return 0; |
| 1040 | } | 1041 | } |
| 1041 | 1042 | ||
| 1042 | nr_pages = 1 << compound_trans_order(hpage); | 1043 | /* |
| 1043 | atomic_long_add(nr_pages, &mce_bad_pages); | 1044 | * Currently errors on hugetlbfs pages are measured in hugepage units, |
| 1045 | * so nr_pages should be 1 << compound_order. OTOH when errors are on | ||
| 1046 | * transparent hugepages, they are supposed to be split and error | ||
| 1047 | * measurement is done in normal page units. So nr_pages should be one | ||
| 1048 | * in this case. | ||
| 1049 | */ | ||
| 1050 | if (PageHuge(p)) | ||
| 1051 | nr_pages = 1 << compound_order(hpage); | ||
| 1052 | else /* normal page or thp */ | ||
| 1053 | nr_pages = 1; | ||
| 1054 | atomic_long_add(nr_pages, &num_poisoned_pages); | ||
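
With this accounting change, a hugetlbfs error bumps the poisoned-page counter by a whole hugepage worth of base pages, while THP (split before handling) and normal pages count as one. Worked out for a 2 MB hugepage of order 9, as a standalone illustration:

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical helper mirroring the accounting decision: hugetlbfs errors
     * are counted per hugepage, THP and normal pages per base page. */
    static unsigned int poisoned_pages_delta(bool is_hugetlb, unsigned int order)
    {
        return is_hugetlb ? 1u << order : 1u;
    }

    int main(void)
    {
        printf("hugetlb, order 9: %u\n", poisoned_pages_delta(true, 9));    /* 512 */
        printf("thp/normal page:  %u\n", poisoned_pages_delta(false, 9));   /* 1 */
        return 0;
    }
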
| 1044 | 1055 | ||
| 1045 | /* | 1056 | /* |
| 1046 | * We need/can do nothing about count=0 pages. | 1057 | * We need/can do nothing about count=0 pages. |
| @@ -1070,7 +1081,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1070 | if (!PageHWPoison(hpage) | 1081 | if (!PageHWPoison(hpage) |
| 1071 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) | 1082 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) |
| 1072 | || (p != hpage && TestSetPageHWPoison(hpage))) { | 1083 | || (p != hpage && TestSetPageHWPoison(hpage))) { |
| 1073 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1084 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
| 1074 | return 0; | 1085 | return 0; |
| 1075 | } | 1086 | } |
| 1076 | set_page_hwpoison_huge_page(hpage); | 1087 | set_page_hwpoison_huge_page(hpage); |
| @@ -1119,6 +1130,15 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1119 | lock_page(hpage); | 1130 | lock_page(hpage); |
| 1120 | 1131 | ||
| 1121 | /* | 1132 | /* |
| 1133 | * We use page flags to determine what action should be taken, but | ||
| 1134 | * the flags can be modified by the error containment action. One | ||
| 1135 | * example is an mlocked page, where PG_mlocked is cleared by | ||
| 1136 | * page_remove_rmap() in try_to_unmap_one(). So to determine page status | ||
| 1137 | * correctly, we save a copy of the page flags at this time. | ||
| 1138 | */ | ||
| 1139 | page_flags = p->flags; | ||
| 1140 | |||
| 1141 | /* | ||
| 1122 | * unpoison always clear PG_hwpoison inside page lock | 1142 | * unpoison always clear PG_hwpoison inside page lock |
| 1123 | */ | 1143 | */ |
| 1124 | if (!PageHWPoison(p)) { | 1144 | if (!PageHWPoison(p)) { |
| @@ -1128,7 +1148,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1128 | } | 1148 | } |
| 1129 | if (hwpoison_filter(p)) { | 1149 | if (hwpoison_filter(p)) { |
| 1130 | if (TestClearPageHWPoison(p)) | 1150 | if (TestClearPageHWPoison(p)) |
| 1131 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1151 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
| 1132 | unlock_page(hpage); | 1152 | unlock_page(hpage); |
| 1133 | put_page(hpage); | 1153 | put_page(hpage); |
| 1134 | return 0; | 1154 | return 0; |
| @@ -1176,12 +1196,19 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1176 | } | 1196 | } |
| 1177 | 1197 | ||
| 1178 | res = -EBUSY; | 1198 | res = -EBUSY; |
| 1179 | for (ps = error_states;; ps++) { | 1199 | /* |
| 1180 | if ((p->flags & ps->mask) == ps->res) { | 1200 | * The first check uses the current page flags which may not have any |
| 1181 | res = page_action(ps, p, pfn); | 1201 | * relevant information. The second check with the saved page flags is |
| 1202 | * carried out only if the first check can't determine the page status. | ||
| 1203 | */ | ||
| 1204 | for (ps = error_states;; ps++) | ||
| 1205 | if ((p->flags & ps->mask) == ps->res) | ||
| 1182 | break; | 1206 | break; |
| 1183 | } | 1207 | if (!ps->mask) |
| 1184 | } | 1208 | for (ps = error_states;; ps++) |
| 1209 | if ((page_flags & ps->mask) == ps->res) | ||
| 1210 | break; | ||
| 1211 | res = page_action(ps, p, pfn); | ||
| 1185 | out: | 1212 | out: |
| 1186 | unlock_page(hpage); | 1213 | unlock_page(hpage); |
| 1187 | return res; | 1214 | return res; |
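
The two-pass lookup works because the error_states table ends in a catch-all entry whose mask is zero: if the live flags only match that sentinel, the lookup is retried with the flags saved before the containment action touched them. A compact model of that fallback (illustration only):

    #include <stdio.h>

    struct state {
        unsigned long mask, res;
        const char *name;
    };

    /* Table terminated by a catch-all entry with mask == 0 (matches anything). */
    static const struct state states[] = {
        { 0x3, 0x3, "dirty LRU" },
        { 0x1, 0x1, "clean LRU" },
        { 0x0, 0x0, "unknown"   },
    };

    static const struct state *lookup(unsigned long flags)
    {
        const struct state *ps;

        for (ps = states; ; ps++)
            if ((flags & ps->mask) == ps->res)
                return ps;
    }

    int main(void)
    {
        unsigned long current_flags = 0x0;  /* flag already cleared by containment */
        unsigned long saved_flags = 0x3;    /* snapshot taken before handling */
        const struct state *ps = lookup(current_flags);

        if (!ps->mask)                      /* first pass hit the catch-all */
            ps = lookup(saved_flags);       /* retry with the saved snapshot */
        printf("%s\n", ps->name);           /* prints "dirty LRU" */
        return 0;
    }
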
| @@ -1323,7 +1350,7 @@ int unpoison_memory(unsigned long pfn) | |||
| 1323 | return 0; | 1350 | return 0; |
| 1324 | } | 1351 | } |
| 1325 | if (TestClearPageHWPoison(p)) | 1352 | if (TestClearPageHWPoison(p)) |
| 1326 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1353 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
| 1327 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); | 1354 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); |
| 1328 | return 0; | 1355 | return 0; |
| 1329 | } | 1356 | } |
| @@ -1337,7 +1364,7 @@ int unpoison_memory(unsigned long pfn) | |||
| 1337 | */ | 1364 | */ |
| 1338 | if (TestClearPageHWPoison(page)) { | 1365 | if (TestClearPageHWPoison(page)) { |
| 1339 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); | 1366 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); |
| 1340 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1367 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
| 1341 | freeit = 1; | 1368 | freeit = 1; |
| 1342 | if (PageHuge(page)) | 1369 | if (PageHuge(page)) |
| 1343 | clear_page_hwpoison_huge_page(page); | 1370 | clear_page_hwpoison_huge_page(page); |
| @@ -1368,7 +1395,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x) | |||
| 1368 | * that is not free, and 1 for any other page type. | 1395 | * that is not free, and 1 for any other page type. |
| 1369 | * For 1 the page is returned with increased page count, otherwise not. | 1396 | * For 1 the page is returned with increased page count, otherwise not. |
| 1370 | */ | 1397 | */ |
| 1371 | static int get_any_page(struct page *p, unsigned long pfn, int flags) | 1398 | static int __get_any_page(struct page *p, unsigned long pfn, int flags) |
| 1372 | { | 1399 | { |
| 1373 | int ret; | 1400 | int ret; |
| 1374 | 1401 | ||
| @@ -1393,11 +1420,9 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
| 1393 | if (!get_page_unless_zero(compound_head(p))) { | 1420 | if (!get_page_unless_zero(compound_head(p))) { |
| 1394 | if (PageHuge(p)) { | 1421 | if (PageHuge(p)) { |
| 1395 | pr_info("%s: %#lx free huge page\n", __func__, pfn); | 1422 | pr_info("%s: %#lx free huge page\n", __func__, pfn); |
| 1396 | ret = dequeue_hwpoisoned_huge_page(compound_head(p)); | 1423 | ret = 0; |
| 1397 | } else if (is_free_buddy_page(p)) { | 1424 | } else if (is_free_buddy_page(p)) { |
| 1398 | pr_info("%s: %#lx free buddy page\n", __func__, pfn); | 1425 | pr_info("%s: %#lx free buddy page\n", __func__, pfn); |
| 1399 | /* Set hwpoison bit while page is still isolated */ | ||
| 1400 | SetPageHWPoison(p); | ||
| 1401 | ret = 0; | 1426 | ret = 0; |
| 1402 | } else { | 1427 | } else { |
| 1403 | pr_info("%s: %#lx: unknown zero refcount page type %lx\n", | 1428 | pr_info("%s: %#lx: unknown zero refcount page type %lx\n", |
| @@ -1413,43 +1438,68 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
| 1413 | return ret; | 1438 | return ret; |
| 1414 | } | 1439 | } |
| 1415 | 1440 | ||
| 1441 | static int get_any_page(struct page *page, unsigned long pfn, int flags) | ||
| 1442 | { | ||
| 1443 | int ret = __get_any_page(page, pfn, flags); | ||
| 1444 | |||
| 1445 | if (ret == 1 && !PageHuge(page) && !PageLRU(page)) { | ||
| 1446 | /* | ||
| 1447 | * Try to free it. | ||
| 1448 | */ | ||
| 1449 | put_page(page); | ||
| 1450 | shake_page(page, 1); | ||
| 1451 | |||
| 1452 | /* | ||
| 1453 | * Did it turn free? | ||
| 1454 | */ | ||
| 1455 | ret = __get_any_page(page, pfn, 0); | ||
| 1456 | if (!PageLRU(page)) { | ||
| 1457 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", | ||
| 1458 | pfn, page->flags); | ||
| 1459 | return -EIO; | ||
| 1460 | } | ||
| 1461 | } | ||
| 1462 | return ret; | ||
| 1463 | } | ||
| 1464 | |||
| 1416 | static int soft_offline_huge_page(struct page *page, int flags) | 1465 | static int soft_offline_huge_page(struct page *page, int flags) |
| 1417 | { | 1466 | { |
| 1418 | int ret; | 1467 | int ret; |
| 1419 | unsigned long pfn = page_to_pfn(page); | 1468 | unsigned long pfn = page_to_pfn(page); |
| 1420 | struct page *hpage = compound_head(page); | 1469 | struct page *hpage = compound_head(page); |
| 1421 | 1470 | ||
| 1422 | ret = get_any_page(page, pfn, flags); | 1471 | /* |
| 1423 | if (ret < 0) | 1472 | * This double-check of PageHWPoison is to avoid the race with |
| 1424 | return ret; | 1473 | * memory_failure(). See also comment in __soft_offline_page(). |
| 1425 | if (ret == 0) | 1474 | */ |
| 1426 | goto done; | 1475 | lock_page(hpage); |
| 1427 | |||
| 1428 | if (PageHWPoison(hpage)) { | 1476 | if (PageHWPoison(hpage)) { |
| 1477 | unlock_page(hpage); | ||
| 1429 | put_page(hpage); | 1478 | put_page(hpage); |
| 1430 | pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); | 1479 | pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); |
| 1431 | return -EBUSY; | 1480 | return -EBUSY; |
| 1432 | } | 1481 | } |
| 1482 | unlock_page(hpage); | ||
| 1433 | 1483 | ||
| 1434 | /* Keep page count to indicate a given hugepage is isolated. */ | 1484 | /* Keep page count to indicate a given hugepage is isolated. */ |
| 1435 | ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, | 1485 | ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, |
| 1436 | MIGRATE_SYNC); | 1486 | MIGRATE_SYNC); |
| 1437 | put_page(hpage); | 1487 | put_page(hpage); |
| 1438 | if (ret) { | 1488 | if (ret) { |
| 1439 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1489 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
| 1440 | pfn, ret, page->flags); | 1490 | pfn, ret, page->flags); |
| 1441 | return ret; | 1491 | } else { |
| 1442 | } | 1492 | set_page_hwpoison_huge_page(hpage); |
| 1443 | done: | 1493 | dequeue_hwpoisoned_huge_page(hpage); |
| 1444 | if (!PageHWPoison(hpage)) | ||
| 1445 | atomic_long_add(1 << compound_trans_order(hpage), | 1494 | atomic_long_add(1 << compound_trans_order(hpage), |
| 1446 | &mce_bad_pages); | 1495 | &num_poisoned_pages); |
| 1447 | set_page_hwpoison_huge_page(hpage); | 1496 | } |
| 1448 | dequeue_hwpoisoned_huge_page(hpage); | ||
| 1449 | /* keep elevated page count for bad page */ | 1497 | /* keep elevated page count for bad page */ |
| 1450 | return ret; | 1498 | return ret; |
| 1451 | } | 1499 | } |
| 1452 | 1500 | ||
| 1501 | static int __soft_offline_page(struct page *page, int flags); | ||
| 1502 | |||
| 1453 | /** | 1503 | /** |
| 1454 | * soft_offline_page - Soft offline a page. | 1504 | * soft_offline_page - Soft offline a page. |
| 1455 | * @page: page to offline | 1505 | * @page: page to offline |
| @@ -1478,9 +1528,11 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1478 | unsigned long pfn = page_to_pfn(page); | 1528 | unsigned long pfn = page_to_pfn(page); |
| 1479 | struct page *hpage = compound_trans_head(page); | 1529 | struct page *hpage = compound_trans_head(page); |
| 1480 | 1530 | ||
| 1481 | if (PageHuge(page)) | 1531 | if (PageHWPoison(page)) { |
| 1482 | return soft_offline_huge_page(page, flags); | 1532 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
| 1483 | if (PageTransHuge(hpage)) { | 1533 | return -EBUSY; |
| 1534 | } | ||
| 1535 | if (!PageHuge(page) && PageTransHuge(hpage)) { | ||
| 1484 | if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { | 1536 | if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { |
| 1485 | pr_info("soft offline: %#lx: failed to split THP\n", | 1537 | pr_info("soft offline: %#lx: failed to split THP\n", |
| 1486 | pfn); | 1538 | pfn); |
| @@ -1491,47 +1543,45 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1491 | ret = get_any_page(page, pfn, flags); | 1543 | ret = get_any_page(page, pfn, flags); |
| 1492 | if (ret < 0) | 1544 | if (ret < 0) |
| 1493 | return ret; | 1545 | return ret; |
| 1494 | if (ret == 0) | 1546 | if (ret) { /* for in-use pages */ |
| 1495 | goto done; | 1547 | if (PageHuge(page)) |
| 1496 | 1548 | ret = soft_offline_huge_page(page, flags); | |
| 1497 | /* | 1549 | else |
| 1498 | * Page cache page we can handle? | 1550 | ret = __soft_offline_page(page, flags); |
| 1499 | */ | 1551 | } else { /* for free pages */ |
| 1500 | if (!PageLRU(page)) { | 1552 | if (PageHuge(page)) { |
| 1501 | /* | 1553 | set_page_hwpoison_huge_page(hpage); |
| 1502 | * Try to free it. | 1554 | dequeue_hwpoisoned_huge_page(hpage); |
| 1503 | */ | 1555 | atomic_long_add(1 << compound_trans_order(hpage), |
| 1504 | put_page(page); | 1556 | &num_poisoned_pages); |
| 1505 | shake_page(page, 1); | 1557 | } else { |
| 1506 | 1558 | SetPageHWPoison(page); | |
| 1507 | /* | 1559 | atomic_long_inc(&num_poisoned_pages); |
| 1508 | * Did it turn free? | 1560 | } |
| 1509 | */ | ||
| 1510 | ret = get_any_page(page, pfn, 0); | ||
| 1511 | if (ret < 0) | ||
| 1512 | return ret; | ||
| 1513 | if (ret == 0) | ||
| 1514 | goto done; | ||
| 1515 | } | ||
| 1516 | if (!PageLRU(page)) { | ||
| 1517 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", | ||
| 1518 | pfn, page->flags); | ||
| 1519 | return -EIO; | ||
| 1520 | } | 1561 | } |
| 1562 | /* keep elevated page count for bad page */ | ||
| 1563 | return ret; | ||
| 1564 | } | ||
| 1521 | 1565 | ||
| 1522 | lock_page(page); | 1566 | static int __soft_offline_page(struct page *page, int flags) |
| 1523 | wait_on_page_writeback(page); | 1567 | { |
| 1568 | int ret; | ||
| 1569 | unsigned long pfn = page_to_pfn(page); | ||
| 1524 | 1570 | ||
| 1525 | /* | 1571 | /* |
| 1526 | * Synchronized using the page lock with memory_failure() | 1572 | * Check PageHWPoison again inside page lock because PageHWPoison |
| 1573 | * is set by memory_failure() outside page lock. Note that | ||
| 1574 | * memory_failure() also double-checks PageHWPoison inside page lock, | ||
| 1575 | * so there's no race between soft_offline_page() and memory_failure(). | ||
| 1527 | */ | 1576 | */ |
| 1577 | lock_page(page); | ||
| 1578 | wait_on_page_writeback(page); | ||
| 1528 | if (PageHWPoison(page)) { | 1579 | if (PageHWPoison(page)) { |
| 1529 | unlock_page(page); | 1580 | unlock_page(page); |
| 1530 | put_page(page); | 1581 | put_page(page); |
| 1531 | pr_info("soft offline: %#lx page already poisoned\n", pfn); | 1582 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
| 1532 | return -EBUSY; | 1583 | return -EBUSY; |
| 1533 | } | 1584 | } |
| 1534 | |||
| 1535 | /* | 1585 | /* |
| 1536 | * Try to invalidate first. This should work for | 1586 | * Try to invalidate first. This should work for |
| 1537 | * non dirty unmapped page cache pages. | 1587 | * non dirty unmapped page cache pages. |
| @@ -1544,9 +1594,10 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1544 | */ | 1594 | */ |
| 1545 | if (ret == 1) { | 1595 | if (ret == 1) { |
| 1546 | put_page(page); | 1596 | put_page(page); |
| 1547 | ret = 0; | ||
| 1548 | pr_info("soft_offline: %#lx: invalidated\n", pfn); | 1597 | pr_info("soft_offline: %#lx: invalidated\n", pfn); |
| 1549 | goto done; | 1598 | SetPageHWPoison(page); |
| 1599 | atomic_long_inc(&num_poisoned_pages); | ||
| 1600 | return 0; | ||
| 1550 | } | 1601 | } |
| 1551 | 1602 | ||
| 1552 | /* | 1603 | /* |
| @@ -1563,28 +1614,23 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1563 | if (!ret) { | 1614 | if (!ret) { |
| 1564 | LIST_HEAD(pagelist); | 1615 | LIST_HEAD(pagelist); |
| 1565 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 1616 | inc_zone_page_state(page, NR_ISOLATED_ANON + |
| 1566 | page_is_file_cache(page)); | 1617 | page_is_file_cache(page)); |
| 1567 | list_add(&page->lru, &pagelist); | 1618 | list_add(&page->lru, &pagelist); |
| 1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1619 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
| 1569 | false, MIGRATE_SYNC, | 1620 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
| 1570 | MR_MEMORY_FAILURE); | ||
| 1571 | if (ret) { | 1621 | if (ret) { |
| 1572 | putback_lru_pages(&pagelist); | 1622 | putback_lru_pages(&pagelist); |
| 1573 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1623 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
| 1574 | pfn, ret, page->flags); | 1624 | pfn, ret, page->flags); |
| 1575 | if (ret > 0) | 1625 | if (ret > 0) |
| 1576 | ret = -EIO; | 1626 | ret = -EIO; |
| 1627 | } else { | ||
| 1628 | SetPageHWPoison(page); | ||
| 1629 | atomic_long_inc(&num_poisoned_pages); | ||
| 1577 | } | 1630 | } |
| 1578 | } else { | 1631 | } else { |
| 1579 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | 1632 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", |
| 1580 | pfn, ret, page_count(page), page->flags); | 1633 | pfn, ret, page_count(page), page->flags); |
| 1581 | } | 1634 | } |
| 1582 | if (ret) | ||
| 1583 | return ret; | ||
| 1584 | |||
| 1585 | done: | ||
| 1586 | atomic_long_add(1, &mce_bad_pages); | ||
| 1587 | SetPageHWPoison(page); | ||
| 1588 | /* keep elevated page count for bad page */ | ||
| 1589 | return ret; | 1635 | return ret; |
| 1590 | } | 1636 | } |
diff --git a/mm/memory.c b/mm/memory.c index bb1369f7b9b4..705473afc1f4 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -69,6 +69,10 @@ | |||
| 69 | 69 | ||
| 70 | #include "internal.h" | 70 | #include "internal.h" |
| 71 | 71 | ||
| 72 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS | ||
| 73 | #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. | ||
| 74 | #endif | ||
| 75 | |||
| 72 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 76 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
| 73 | /* use the per-pgdat data instead for discontigmem - mbligh */ | 77 | /* use the per-pgdat data instead for discontigmem - mbligh */ |
| 74 | unsigned long max_mapnr; | 78 | unsigned long max_mapnr; |
| @@ -1458,10 +1462,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, | |||
| 1458 | EXPORT_SYMBOL_GPL(zap_vma_ptes); | 1462 | EXPORT_SYMBOL_GPL(zap_vma_ptes); |
| 1459 | 1463 | ||
| 1460 | /** | 1464 | /** |
| 1461 | * follow_page - look up a page descriptor from a user-virtual address | 1465 | * follow_page_mask - look up a page descriptor from a user-virtual address |
| 1462 | * @vma: vm_area_struct mapping @address | 1466 | * @vma: vm_area_struct mapping @address |
| 1463 | * @address: virtual address to look up | 1467 | * @address: virtual address to look up |
| 1464 | * @flags: flags modifying lookup behaviour | 1468 | * @flags: flags modifying lookup behaviour |
| 1469 | * @page_mask: on output, *page_mask is set according to the size of the page | ||
| 1465 | * | 1470 | * |
| 1466 | * @flags can have FOLL_ flags set, defined in <linux/mm.h> | 1471 | * @flags can have FOLL_ flags set, defined in <linux/mm.h> |
| 1467 | * | 1472 | * |
| @@ -1469,8 +1474,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes); | |||
| 1469 | * an error pointer if there is a mapping to something not represented | 1474 | * an error pointer if there is a mapping to something not represented |
| 1470 | * by a page descriptor (see also vm_normal_page()). | 1475 | * by a page descriptor (see also vm_normal_page()). |
| 1471 | */ | 1476 | */ |
| 1472 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | 1477 | struct page *follow_page_mask(struct vm_area_struct *vma, |
| 1473 | unsigned int flags) | 1478 | unsigned long address, unsigned int flags, |
| 1479 | unsigned int *page_mask) | ||
| 1474 | { | 1480 | { |
| 1475 | pgd_t *pgd; | 1481 | pgd_t *pgd; |
| 1476 | pud_t *pud; | 1482 | pud_t *pud; |
| @@ -1480,6 +1486,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
| 1480 | struct page *page; | 1486 | struct page *page; |
| 1481 | struct mm_struct *mm = vma->vm_mm; | 1487 | struct mm_struct *mm = vma->vm_mm; |
| 1482 | 1488 | ||
| 1489 | *page_mask = 0; | ||
| 1490 | |||
| 1483 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); | 1491 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); |
| 1484 | if (!IS_ERR(page)) { | 1492 | if (!IS_ERR(page)) { |
| 1485 | BUG_ON(flags & FOLL_GET); | 1493 | BUG_ON(flags & FOLL_GET); |
| @@ -1526,6 +1534,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
| 1526 | page = follow_trans_huge_pmd(vma, address, | 1534 | page = follow_trans_huge_pmd(vma, address, |
| 1527 | pmd, flags); | 1535 | pmd, flags); |
| 1528 | spin_unlock(&mm->page_table_lock); | 1536 | spin_unlock(&mm->page_table_lock); |
| 1537 | *page_mask = HPAGE_PMD_NR - 1; | ||
| 1529 | goto out; | 1538 | goto out; |
| 1530 | } | 1539 | } |
| 1531 | } else | 1540 | } else |
| @@ -1539,8 +1548,24 @@ split_fallthrough: | |||
| 1539 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | 1548 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 1540 | 1549 | ||
| 1541 | pte = *ptep; | 1550 | pte = *ptep; |
| 1542 | if (!pte_present(pte)) | 1551 | if (!pte_present(pte)) { |
| 1543 | goto no_page; | 1552 | swp_entry_t entry; |
| 1553 | /* | ||
| 1554 | * KSM's break_ksm() relies upon recognizing a ksm page | ||
| 1555 | * even while it is being migrated, so for that case we | ||
| 1556 | * need migration_entry_wait(). | ||
| 1557 | */ | ||
| 1558 | if (likely(!(flags & FOLL_MIGRATION))) | ||
| 1559 | goto no_page; | ||
| 1560 | if (pte_none(pte) || pte_file(pte)) | ||
| 1561 | goto no_page; | ||
| 1562 | entry = pte_to_swp_entry(pte); | ||
| 1563 | if (!is_migration_entry(entry)) | ||
| 1564 | goto no_page; | ||
| 1565 | pte_unmap_unlock(ptep, ptl); | ||
| 1566 | migration_entry_wait(mm, pmd, address); | ||
| 1567 | goto split_fallthrough; | ||
| 1568 | } | ||
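
With FOLL_MIGRATION set (as break_ksm() needs), a walker that hits a migration entry waits for the migration to finish and retries from split_fallthrough instead of reporting no page. The control flow reduced to a toy model, with the enum and helper invented for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    enum pte_state { PTE_PRESENT, PTE_MIGRATING, PTE_NONE };

    /* Toy page walk: when the entry is a migration entry and the caller passed
     * the FOLL_MIGRATION-like flag, wait for migration and retry instead of
     * failing. Flipping the state stands in for migration_entry_wait(). */
    static const char *follow(enum pte_state *pte, bool follow_migration)
    {
    retry:
        switch (*pte) {
        case PTE_PRESENT:
            return "page";
        case PTE_MIGRATING:
            if (!follow_migration)
                return NULL;        /* old behaviour: report no page */
            *pte = PTE_PRESENT;     /* "wait" until the migration completes */
            goto retry;
        default:
            return NULL;
        }
    }

    int main(void)
    {
        enum pte_state pte = PTE_MIGRATING;

        printf("%s\n", follow(&pte, true) ? "found after waiting" : "no page");
        return 0;
    }
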
| 1544 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | 1569 | if ((flags & FOLL_NUMA) && pte_numa(pte)) |
| 1545 | goto no_page; | 1570 | goto no_page; |
| 1546 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 1571 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
| @@ -1673,15 +1698,16 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add | |||
| 1673 | * instead of __get_user_pages. __get_user_pages should be used only if | 1698 | * instead of __get_user_pages. __get_user_pages should be used only if |
| 1674 | * you need some special @gup_flags. | 1699 | * you need some special @gup_flags. |
| 1675 | */ | 1700 | */ |
| 1676 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1701 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 1677 | unsigned long start, int nr_pages, unsigned int gup_flags, | 1702 | unsigned long start, unsigned long nr_pages, |
| 1678 | struct page **pages, struct vm_area_struct **vmas, | 1703 | unsigned int gup_flags, struct page **pages, |
| 1679 | int *nonblocking) | 1704 | struct vm_area_struct **vmas, int *nonblocking) |
| 1680 | { | 1705 | { |
| 1681 | int i; | 1706 | long i; |
| 1682 | unsigned long vm_flags; | 1707 | unsigned long vm_flags; |
| 1708 | unsigned int page_mask; | ||
| 1683 | 1709 | ||
| 1684 | if (nr_pages <= 0) | 1710 | if (!nr_pages) |
| 1685 | return 0; | 1711 | return 0; |
| 1686 | 1712 | ||
| 1687 | VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); | 1713 | VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); |
| @@ -1757,6 +1783,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1757 | get_page(page); | 1783 | get_page(page); |
| 1758 | } | 1784 | } |
| 1759 | pte_unmap(pte); | 1785 | pte_unmap(pte); |
| 1786 | page_mask = 0; | ||
| 1760 | goto next_page; | 1787 | goto next_page; |
| 1761 | } | 1788 | } |
| 1762 | 1789 | ||
| @@ -1774,6 +1801,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1774 | do { | 1801 | do { |
| 1775 | struct page *page; | 1802 | struct page *page; |
| 1776 | unsigned int foll_flags = gup_flags; | 1803 | unsigned int foll_flags = gup_flags; |
| 1804 | unsigned int page_increm; | ||
| 1777 | 1805 | ||
| 1778 | /* | 1806 | /* |
| 1779 | * If we have a pending SIGKILL, don't keep faulting | 1807 | * If we have a pending SIGKILL, don't keep faulting |
| @@ -1783,7 +1811,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1783 | return i ? i : -ERESTARTSYS; | 1811 | return i ? i : -ERESTARTSYS; |
| 1784 | 1812 | ||
| 1785 | cond_resched(); | 1813 | cond_resched(); |
| 1786 | while (!(page = follow_page(vma, start, foll_flags))) { | 1814 | while (!(page = follow_page_mask(vma, start, |
| 1815 | foll_flags, &page_mask))) { | ||
| 1787 | int ret; | 1816 | int ret; |
| 1788 | unsigned int fault_flags = 0; | 1817 | unsigned int fault_flags = 0; |
| 1789 | 1818 | ||
| @@ -1857,13 +1886,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1857 | 1886 | ||
| 1858 | flush_anon_page(vma, page, start); | 1887 | flush_anon_page(vma, page, start); |
| 1859 | flush_dcache_page(page); | 1888 | flush_dcache_page(page); |
| 1889 | page_mask = 0; | ||
| 1860 | } | 1890 | } |
| 1861 | next_page: | 1891 | next_page: |
| 1862 | if (vmas) | 1892 | if (vmas) { |
| 1863 | vmas[i] = vma; | 1893 | vmas[i] = vma; |
| 1864 | i++; | 1894 | page_mask = 0; |
| 1865 | start += PAGE_SIZE; | 1895 | } |
| 1866 | nr_pages--; | 1896 | page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); |
| 1897 | if (page_increm > nr_pages) | ||
| 1898 | page_increm = nr_pages; | ||
| 1899 | i += page_increm; | ||
| 1900 | start += page_increm * PAGE_SIZE; | ||
| 1901 | nr_pages -= page_increm; | ||
| 1867 | } while (nr_pages && start < vma->vm_end); | 1902 | } while (nr_pages && start < vma->vm_end); |
| 1868 | } while (nr_pages); | 1903 | } while (nr_pages); |
| 1869 | return i; | 1904 | return i; |
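
The page_mask/page_increm arithmetic above lets __get_user_pages() advance over an entire huge page in one step: follow_page_mask() reports HPAGE_PMD_NR - 1 for a PMD-mapped page, and 1 + (~pfn & page_mask) is the number of base pages left before the next huge-page boundary. A standalone sketch of just that arithmetic, with illustrative PAGE_SHIFT and HPAGE_PMD_NR values:

#include <stdio.h>

#define PAGE_SHIFT   12
#define PAGE_SIZE    (1UL << PAGE_SHIFT)
#define HPAGE_PMD_NR 512                 /* 2 MiB huge page / 4 KiB base page */

int main(void)
{
    /* an address 5 base pages into a PMD-mapped huge page */
    unsigned long start    = (2UL << 20) + 5 * PAGE_SIZE;
    unsigned long nr_pages = 1024;
    unsigned int page_mask = HPAGE_PMD_NR - 1;   /* as follow_page_mask() reports */

    /* 1 + (~pfn & mask) == base pages left before the next huge-page boundary */
    unsigned long page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
    if (page_increm > nr_pages)
        page_increm = nr_pages;

    printf("pfn %lu: advance by %lu base pages\n",
           start >> PAGE_SHIFT, page_increm);    /* prints: pfn 517: advance by 507 */
    return 0;
}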
| @@ -1977,9 +2012,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1977 | * | 2012 | * |
| 1978 | * See also get_user_pages_fast, for performance critical applications. | 2013 | * See also get_user_pages_fast, for performance critical applications. |
| 1979 | */ | 2014 | */ |
| 1980 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 2015 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 1981 | unsigned long start, int nr_pages, int write, int force, | 2016 | unsigned long start, unsigned long nr_pages, int write, |
| 1982 | struct page **pages, struct vm_area_struct **vmas) | 2017 | int force, struct page **pages, struct vm_area_struct **vmas) |
| 1983 | { | 2018 | { |
| 1984 | int flags = FOLL_TOUCH; | 2019 | int flags = FOLL_TOUCH; |
| 1985 | 2020 | ||
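
The prototype change above widens nr_pages and the return value from int to long. A tiny userspace demonstration of why: the base-page count of a sufficiently large mapping no longer fits in 32 bits. The exact printed int value is implementation-defined; the comment describes a typical 64-bit system.

#include <stdio.h>

int main(void)
{
    unsigned long bytes    = 8UL << 40;       /* an 8 TiB mapping */
    unsigned long nr_pages = bytes >> 12;     /* 4 KiB base pages: 2^31 of them */

    int  as_int  = (int)nr_pages;             /* no longer fits in 32 bits */
    long as_long = (long)nr_pages;

    /* on a typical 64-bit system this prints a negative int and a sane long */
    printf("pages = %lu, as int = %d, as long = %ld\n",
           nr_pages, as_int, as_long);
    return 0;
}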
| @@ -2919,7 +2954,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2919 | unsigned int flags, pte_t orig_pte) | 2954 | unsigned int flags, pte_t orig_pte) |
| 2920 | { | 2955 | { |
| 2921 | spinlock_t *ptl; | 2956 | spinlock_t *ptl; |
| 2922 | struct page *page, *swapcache = NULL; | 2957 | struct page *page, *swapcache; |
| 2923 | swp_entry_t entry; | 2958 | swp_entry_t entry; |
| 2924 | pte_t pte; | 2959 | pte_t pte; |
| 2925 | int locked; | 2960 | int locked; |
| @@ -2970,9 +3005,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2970 | */ | 3005 | */ |
| 2971 | ret = VM_FAULT_HWPOISON; | 3006 | ret = VM_FAULT_HWPOISON; |
| 2972 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 3007 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
| 3008 | swapcache = page; | ||
| 2973 | goto out_release; | 3009 | goto out_release; |
| 2974 | } | 3010 | } |
| 2975 | 3011 | ||
| 3012 | swapcache = page; | ||
| 2976 | locked = lock_page_or_retry(page, mm, flags); | 3013 | locked = lock_page_or_retry(page, mm, flags); |
| 2977 | 3014 | ||
| 2978 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 3015 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
| @@ -2990,16 +3027,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2990 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) | 3027 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) |
| 2991 | goto out_page; | 3028 | goto out_page; |
| 2992 | 3029 | ||
| 2993 | if (ksm_might_need_to_copy(page, vma, address)) { | 3030 | page = ksm_might_need_to_copy(page, vma, address); |
| 2994 | swapcache = page; | 3031 | if (unlikely(!page)) { |
| 2995 | page = ksm_does_need_to_copy(page, vma, address); | 3032 | ret = VM_FAULT_OOM; |
| 2996 | 3033 | page = swapcache; | |
| 2997 | if (unlikely(!page)) { | 3034 | goto out_page; |
| 2998 | ret = VM_FAULT_OOM; | ||
| 2999 | page = swapcache; | ||
| 3000 | swapcache = NULL; | ||
| 3001 | goto out_page; | ||
| 3002 | } | ||
| 3003 | } | 3035 | } |
| 3004 | 3036 | ||
| 3005 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { | 3037 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { |
| @@ -3044,7 +3076,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3044 | } | 3076 | } |
| 3045 | flush_icache_page(vma, page); | 3077 | flush_icache_page(vma, page); |
| 3046 | set_pte_at(mm, address, page_table, pte); | 3078 | set_pte_at(mm, address, page_table, pte); |
| 3047 | do_page_add_anon_rmap(page, vma, address, exclusive); | 3079 | if (page == swapcache) |
| 3080 | do_page_add_anon_rmap(page, vma, address, exclusive); | ||
| 3081 | else /* ksm created a completely new copy */ | ||
| 3082 | page_add_new_anon_rmap(page, vma, address); | ||
| 3048 | /* It's better to call commit-charge after rmap is established */ | 3083 | /* It's better to call commit-charge after rmap is established */ |
| 3049 | mem_cgroup_commit_charge_swapin(page, ptr); | 3084 | mem_cgroup_commit_charge_swapin(page, ptr); |
| 3050 | 3085 | ||
| @@ -3052,7 +3087,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3052 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) | 3087 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) |
| 3053 | try_to_free_swap(page); | 3088 | try_to_free_swap(page); |
| 3054 | unlock_page(page); | 3089 | unlock_page(page); |
| 3055 | if (swapcache) { | 3090 | if (page != swapcache) { |
| 3056 | /* | 3091 | /* |
| 3057 | * Hold the lock to avoid the swap entry to be reused | 3092 | * Hold the lock to avoid the swap entry to be reused |
| 3058 | * until we take the PT lock for the pte_same() check | 3093 | * until we take the PT lock for the pte_same() check |
| @@ -3085,7 +3120,7 @@ out_page: | |||
| 3085 | unlock_page(page); | 3120 | unlock_page(page); |
| 3086 | out_release: | 3121 | out_release: |
| 3087 | page_cache_release(page); | 3122 | page_cache_release(page); |
| 3088 | if (swapcache) { | 3123 | if (page != swapcache) { |
| 3089 | unlock_page(swapcache); | 3124 | unlock_page(swapcache); |
| 3090 | page_cache_release(swapcache); | 3125 | page_cache_release(swapcache); |
| 3091 | } | 3126 | } |
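
The do_swap_page() rework above drops the NULL-able swapcache flag: swapcache now always points at the page found in the swap cache, and "did KSM hand us a private copy?" is answered by comparing page != swapcache. A standalone sketch of that pointer-comparison idiom; maybe_copy() merely stands in for ksm_might_need_to_copy() and is not a real API.

#include <stdio.h>
#include <stdlib.h>

struct page { int id; };

/* returns the original page, a freshly allocated copy, or NULL on "OOM" */
static struct page *maybe_copy(struct page *page, int need_copy)
{
    if (!need_copy)
        return page;
    struct page *copy = malloc(sizeof(*copy));
    if (copy)
        copy->id = page->id + 100;
    return copy;
}

static void fault(struct page *swapcache, int need_copy)
{
    struct page *page = maybe_copy(swapcache, need_copy);

    if (!page) {                       /* the VM_FAULT_OOM path */
        page = swapcache;              /* fall back to the swap-cache page */
        printf("OOM: releasing original page %d\n", page->id);
        return;
    }
    if (page == swapcache)
        printf("mapped swap-cache page %d directly\n", page->id);
    else {
        printf("mapped private copy %d of page %d\n", page->id, swapcache->id);
        free(page);
    }
}

int main(void)
{
    struct page orig = { .id = 7 };
    fault(&orig, 0);    /* no copy needed  -> page == swapcache */
    fault(&orig, 1);    /* KSM-style copy  -> page != swapcache */
    return 0;
}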
| @@ -3821,30 +3856,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | |||
| 3821 | } | 3856 | } |
| 3822 | #endif /* __PAGETABLE_PMD_FOLDED */ | 3857 | #endif /* __PAGETABLE_PMD_FOLDED */ |
| 3823 | 3858 | ||
| 3824 | int make_pages_present(unsigned long addr, unsigned long end) | ||
| 3825 | { | ||
| 3826 | int ret, len, write; | ||
| 3827 | struct vm_area_struct * vma; | ||
| 3828 | |||
| 3829 | vma = find_vma(current->mm, addr); | ||
| 3830 | if (!vma) | ||
| 3831 | return -ENOMEM; | ||
| 3832 | /* | ||
| 3833 | * We want to touch writable mappings with a write fault in order | ||
| 3834 | * to break COW, except for shared mappings because these don't COW | ||
| 3835 | * and we would not want to dirty them for nothing. | ||
| 3836 | */ | ||
| 3837 | write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; | ||
| 3838 | BUG_ON(addr >= end); | ||
| 3839 | BUG_ON(end > vma->vm_end); | ||
| 3840 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; | ||
| 3841 | ret = get_user_pages(current, current->mm, addr, | ||
| 3842 | len, write, 0, NULL, NULL); | ||
| 3843 | if (ret < 0) | ||
| 3844 | return ret; | ||
| 3845 | return ret == len ? 0 : -EFAULT; | ||
| 3846 | } | ||
| 3847 | |||
| 3848 | #if !defined(__HAVE_ARCH_GATE_AREA) | 3859 | #if !defined(__HAVE_ARCH_GATE_AREA) |
| 3849 | 3860 | ||
| 3850 | #if defined(AT_SYSINFO_EHDR) | 3861 | #if defined(AT_SYSINFO_EHDR) |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d04ed87bfacb..b81a367b9f39 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -29,6 +29,7 @@ | |||
| 29 | #include <linux/suspend.h> | 29 | #include <linux/suspend.h> |
| 30 | #include <linux/mm_inline.h> | 30 | #include <linux/mm_inline.h> |
| 31 | #include <linux/firmware-map.h> | 31 | #include <linux/firmware-map.h> |
| 32 | #include <linux/stop_machine.h> | ||
| 32 | 33 | ||
| 33 | #include <asm/tlbflush.h> | 34 | #include <asm/tlbflush.h> |
| 34 | 35 | ||
| @@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res) | |||
| 91 | } | 92 | } |
| 92 | 93 | ||
| 93 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE | 94 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE |
| 94 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 95 | void get_page_bootmem(unsigned long info, struct page *page, |
| 95 | static void get_page_bootmem(unsigned long info, struct page *page, | 96 | unsigned long type) |
| 96 | unsigned long type) | ||
| 97 | { | 97 | { |
| 98 | page->lru.next = (struct list_head *) type; | 98 | page->lru.next = (struct list_head *) type; |
| 99 | SetPagePrivate(page); | 99 | SetPagePrivate(page); |
| @@ -124,10 +124,13 @@ void __ref put_page_bootmem(struct page *page) | |||
| 124 | mutex_lock(&ppb_lock); | 124 | mutex_lock(&ppb_lock); |
| 125 | __free_pages_bootmem(page, 0); | 125 | __free_pages_bootmem(page, 0); |
| 126 | mutex_unlock(&ppb_lock); | 126 | mutex_unlock(&ppb_lock); |
| 127 | totalram_pages++; | ||
| 127 | } | 128 | } |
| 128 | 129 | ||
| 129 | } | 130 | } |
| 130 | 131 | ||
| 132 | #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE | ||
| 133 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | ||
| 131 | static void register_page_bootmem_info_section(unsigned long start_pfn) | 134 | static void register_page_bootmem_info_section(unsigned long start_pfn) |
| 132 | { | 135 | { |
| 133 | unsigned long *usemap, mapsize, section_nr, i; | 136 | unsigned long *usemap, mapsize, section_nr, i; |
| @@ -161,6 +164,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) | |||
| 161 | get_page_bootmem(section_nr, page, MIX_SECTION_INFO); | 164 | get_page_bootmem(section_nr, page, MIX_SECTION_INFO); |
| 162 | 165 | ||
| 163 | } | 166 | } |
| 167 | #else /* CONFIG_SPARSEMEM_VMEMMAP */ | ||
| 168 | static void register_page_bootmem_info_section(unsigned long start_pfn) | ||
| 169 | { | ||
| 170 | unsigned long *usemap, mapsize, section_nr, i; | ||
| 171 | struct mem_section *ms; | ||
| 172 | struct page *page, *memmap; | ||
| 173 | |||
| 174 | if (!pfn_valid(start_pfn)) | ||
| 175 | return; | ||
| 176 | |||
| 177 | section_nr = pfn_to_section_nr(start_pfn); | ||
| 178 | ms = __nr_to_section(section_nr); | ||
| 179 | |||
| 180 | memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); | ||
| 181 | |||
| 182 | register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); | ||
| 183 | |||
| 184 | usemap = __nr_to_section(section_nr)->pageblock_flags; | ||
| 185 | page = virt_to_page(usemap); | ||
| 186 | |||
| 187 | mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; | ||
| 188 | |||
| 189 | for (i = 0; i < mapsize; i++, page++) | ||
| 190 | get_page_bootmem(section_nr, page, MIX_SECTION_INFO); | ||
| 191 | } | ||
| 192 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | ||
| 164 | 193 | ||
| 165 | void register_page_bootmem_info_node(struct pglist_data *pgdat) | 194 | void register_page_bootmem_info_node(struct pglist_data *pgdat) |
| 166 | { | 195 | { |
| @@ -189,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) | |||
| 189 | } | 218 | } |
| 190 | 219 | ||
| 191 | pfn = pgdat->node_start_pfn; | 220 | pfn = pgdat->node_start_pfn; |
| 192 | end_pfn = pfn + pgdat->node_spanned_pages; | 221 | end_pfn = pgdat_end_pfn(pgdat); |
| 193 | 222 | ||
| 194 | /* register_section info */ | 223 | /* register_section info */ |
| 195 | for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | 224 | for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { |
| @@ -203,7 +232,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) | |||
| 203 | register_page_bootmem_info_section(pfn); | 232 | register_page_bootmem_info_section(pfn); |
| 204 | } | 233 | } |
| 205 | } | 234 | } |
| 206 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 235 | #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ |
| 207 | 236 | ||
| 208 | static void grow_zone_span(struct zone *zone, unsigned long start_pfn, | 237 | static void grow_zone_span(struct zone *zone, unsigned long start_pfn, |
| 209 | unsigned long end_pfn) | 238 | unsigned long end_pfn) |
| @@ -253,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | |||
| 253 | set_page_links(pfn_to_page(pfn), zid, nid, pfn); | 282 | set_page_links(pfn_to_page(pfn), zid, nid, pfn); |
| 254 | } | 283 | } |
| 255 | 284 | ||
| 285 | /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or | ||
| 286 | * alloc_bootmem_node_nopanic() */ | ||
| 287 | static int __ref ensure_zone_is_initialized(struct zone *zone, | ||
| 288 | unsigned long start_pfn, unsigned long num_pages) | ||
| 289 | { | ||
| 290 | if (!zone_is_initialized(zone)) | ||
| 291 | return init_currently_empty_zone(zone, start_pfn, num_pages, | ||
| 292 | MEMMAP_HOTPLUG); | ||
| 293 | return 0; | ||
| 294 | } | ||
| 295 | |||
| 256 | static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | 296 | static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, |
| 257 | unsigned long start_pfn, unsigned long end_pfn) | 297 | unsigned long start_pfn, unsigned long end_pfn) |
| 258 | { | 298 | { |
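
ensure_zone_is_initialized() above factors out the "initialize the zone on first use and propagate any error" check that move_pfn_range_left(), move_pfn_range_right() and __add_zone() previously open-coded. A standalone sketch of that ensure-initialized helper pattern; the zone_like structure is invented for illustration.

#include <stdio.h>
#include <string.h>

struct zone_like {
    int initialized;
    unsigned long start_pfn;
    unsigned long nr_pages;
};

static int init_empty(struct zone_like *z, unsigned long start, unsigned long n)
{
    z->start_pfn = start;
    z->nr_pages = n;
    z->initialized = 1;
    return 0;                  /* the real initializer can fail with -ENOMEM */
}

static int ensure_initialized(struct zone_like *z, unsigned long start,
                              unsigned long n)
{
    if (!z->initialized)
        return init_empty(z, start, n);
    return 0;
}

int main(void)
{
    struct zone_like z;
    memset(&z, 0, sizeof(z));

    if (ensure_initialized(&z, 4096, 512))   /* first call initializes */
        return 1;
    ensure_initialized(&z, 0, 0);            /* later calls are no-ops */
    printf("zone spans pfn %lu..%lu\n", z.start_pfn, z.start_pfn + z.nr_pages);
    return 0;
}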
| @@ -260,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | |||
| 260 | unsigned long flags; | 300 | unsigned long flags; |
| 261 | unsigned long z1_start_pfn; | 301 | unsigned long z1_start_pfn; |
| 262 | 302 | ||
| 263 | if (!z1->wait_table) { | 303 | ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); |
| 264 | ret = init_currently_empty_zone(z1, start_pfn, | 304 | if (ret) |
| 265 | end_pfn - start_pfn, MEMMAP_HOTPLUG); | 305 | return ret; |
| 266 | if (ret) | ||
| 267 | return ret; | ||
| 268 | } | ||
| 269 | 306 | ||
| 270 | pgdat_resize_lock(z1->zone_pgdat, &flags); | 307 | pgdat_resize_lock(z1->zone_pgdat, &flags); |
| 271 | 308 | ||
| 272 | /* can't move pfns which are higher than @z2 */ | 309 | /* can't move pfns which are higher than @z2 */ |
| 273 | if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) | 310 | if (end_pfn > zone_end_pfn(z2)) |
| 274 | goto out_fail; | 311 | goto out_fail; |
| 275 | /* the move out part mast at the left most of @z2 */ | 312 | /* the move out part mast at the left most of @z2 */ |
| 276 | if (start_pfn > z2->zone_start_pfn) | 313 | if (start_pfn > z2->zone_start_pfn) |
| @@ -286,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | |||
| 286 | z1_start_pfn = start_pfn; | 323 | z1_start_pfn = start_pfn; |
| 287 | 324 | ||
| 288 | resize_zone(z1, z1_start_pfn, end_pfn); | 325 | resize_zone(z1, z1_start_pfn, end_pfn); |
| 289 | resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); | 326 | resize_zone(z2, end_pfn, zone_end_pfn(z2)); |
| 290 | 327 | ||
| 291 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | 328 | pgdat_resize_unlock(z1->zone_pgdat, &flags); |
| 292 | 329 | ||
| @@ -305,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, | |||
| 305 | unsigned long flags; | 342 | unsigned long flags; |
| 306 | unsigned long z2_end_pfn; | 343 | unsigned long z2_end_pfn; |
| 307 | 344 | ||
| 308 | if (!z2->wait_table) { | 345 | ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); |
| 309 | ret = init_currently_empty_zone(z2, start_pfn, | 346 | if (ret) |
| 310 | end_pfn - start_pfn, MEMMAP_HOTPLUG); | 347 | return ret; |
| 311 | if (ret) | ||
| 312 | return ret; | ||
| 313 | } | ||
| 314 | 348 | ||
| 315 | pgdat_resize_lock(z1->zone_pgdat, &flags); | 349 | pgdat_resize_lock(z1->zone_pgdat, &flags); |
| 316 | 350 | ||
| @@ -318,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, | |||
| 318 | if (z1->zone_start_pfn > start_pfn) | 352 | if (z1->zone_start_pfn > start_pfn) |
| 319 | goto out_fail; | 353 | goto out_fail; |
| 320 | /* the move out part mast at the right most of @z1 */ | 354 | /* the move out part mast at the right most of @z1 */ |
| 321 | if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) | 355 | if (zone_end_pfn(z1) > end_pfn) |
| 322 | goto out_fail; | 356 | goto out_fail; |
| 323 | /* must included/overlap */ | 357 | /* must included/overlap */ |
| 324 | if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) | 358 | if (start_pfn >= zone_end_pfn(z1)) |
| 325 | goto out_fail; | 359 | goto out_fail; |
| 326 | 360 | ||
| 327 | /* use end_pfn for z2's end_pfn if z2 is empty */ | 361 | /* use end_pfn for z2's end_pfn if z2 is empty */ |
| 328 | if (z2->spanned_pages) | 362 | if (z2->spanned_pages) |
| 329 | z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; | 363 | z2_end_pfn = zone_end_pfn(z2); |
| 330 | else | 364 | else |
| 331 | z2_end_pfn = end_pfn; | 365 | z2_end_pfn = end_pfn; |
| 332 | 366 | ||
| @@ -363,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
| 363 | int nid = pgdat->node_id; | 397 | int nid = pgdat->node_id; |
| 364 | int zone_type; | 398 | int zone_type; |
| 365 | unsigned long flags; | 399 | unsigned long flags; |
| 400 | int ret; | ||
| 366 | 401 | ||
| 367 | zone_type = zone - pgdat->node_zones; | 402 | zone_type = zone - pgdat->node_zones; |
| 368 | if (!zone->wait_table) { | 403 | ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); |
| 369 | int ret; | 404 | if (ret) |
| 405 | return ret; | ||
| 370 | 406 | ||
| 371 | ret = init_currently_empty_zone(zone, phys_start_pfn, | ||
| 372 | nr_pages, MEMMAP_HOTPLUG); | ||
| 373 | if (ret) | ||
| 374 | return ret; | ||
| 375 | } | ||
| 376 | pgdat_resize_lock(zone->zone_pgdat, &flags); | 407 | pgdat_resize_lock(zone->zone_pgdat, &flags); |
| 377 | grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); | 408 | grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); |
| 378 | grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, | 409 | grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, |
| @@ -405,20 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone, | |||
| 405 | return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); | 436 | return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); |
| 406 | } | 437 | } |
| 407 | 438 | ||
| 408 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 439 | /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ |
| 409 | static int __remove_section(struct zone *zone, struct mem_section *ms) | 440 | static int find_smallest_section_pfn(int nid, struct zone *zone, |
| 441 | unsigned long start_pfn, | ||
| 442 | unsigned long end_pfn) | ||
| 443 | { | ||
| 444 | struct mem_section *ms; | ||
| 445 | |||
| 446 | for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { | ||
| 447 | ms = __pfn_to_section(start_pfn); | ||
| 448 | |||
| 449 | if (unlikely(!valid_section(ms))) | ||
| 450 | continue; | ||
| 451 | |||
| 452 | if (unlikely(pfn_to_nid(start_pfn) != nid)) | ||
| 453 | continue; | ||
| 454 | |||
| 455 | if (zone && zone != page_zone(pfn_to_page(start_pfn))) | ||
| 456 | continue; | ||
| 457 | |||
| 458 | return start_pfn; | ||
| 459 | } | ||
| 460 | |||
| 461 | return 0; | ||
| 462 | } | ||
| 463 | |||
| 464 | /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ | ||
| 465 | static int find_biggest_section_pfn(int nid, struct zone *zone, | ||
| 466 | unsigned long start_pfn, | ||
| 467 | unsigned long end_pfn) | ||
| 468 | { | ||
| 469 | struct mem_section *ms; | ||
| 470 | unsigned long pfn; | ||
| 471 | |||
| 472 | /* pfn is the end pfn of a memory section. */ | ||
| 473 | pfn = end_pfn - 1; | ||
| 474 | for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { | ||
| 475 | ms = __pfn_to_section(pfn); | ||
| 476 | |||
| 477 | if (unlikely(!valid_section(ms))) | ||
| 478 | continue; | ||
| 479 | |||
| 480 | if (unlikely(pfn_to_nid(pfn) != nid)) | ||
| 481 | continue; | ||
| 482 | |||
| 483 | if (zone && zone != page_zone(pfn_to_page(pfn))) | ||
| 484 | continue; | ||
| 485 | |||
| 486 | return pfn; | ||
| 487 | } | ||
| 488 | |||
| 489 | return 0; | ||
| 490 | } | ||
| 491 | |||
| 492 | static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, | ||
| 493 | unsigned long end_pfn) | ||
| 410 | { | 494 | { |
| 495 | unsigned long zone_start_pfn = zone->zone_start_pfn; | ||
| 496 | unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
| 497 | unsigned long pfn; | ||
| 498 | struct mem_section *ms; | ||
| 499 | int nid = zone_to_nid(zone); | ||
| 500 | |||
| 501 | zone_span_writelock(zone); | ||
| 502 | if (zone_start_pfn == start_pfn) { | ||
| 503 | /* | ||
| 504 | * If the section is the smallest section in the zone, we need to | ||
| 505 | * shrink zone->zone_start_pfn and zone->spanned_pages. | ||
| 506 | * In this case, find the second smallest valid mem_section | ||
| 507 | * and shrink the zone to start there. | ||
| 508 | */ | ||
| 509 | pfn = find_smallest_section_pfn(nid, zone, end_pfn, | ||
| 510 | zone_end_pfn); | ||
| 511 | if (pfn) { | ||
| 512 | zone->zone_start_pfn = pfn; | ||
| 513 | zone->spanned_pages = zone_end_pfn - pfn; | ||
| 514 | } | ||
| 515 | } else if (zone_end_pfn == end_pfn) { | ||
| 516 | /* | ||
| 517 | * If the section is the biggest section in the zone, we need to | ||
| 518 | * shrink zone->spanned_pages. | ||
| 519 | * In this case, find the second biggest valid mem_section and | ||
| 520 | * shrink the zone to end there. | ||
| 521 | */ | ||
| 522 | pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, | ||
| 523 | start_pfn); | ||
| 524 | if (pfn) | ||
| 525 | zone->spanned_pages = pfn - zone_start_pfn + 1; | ||
| 526 | } | ||
| 527 | |||
| 411 | /* | 528 | /* |
| 412 | * XXX: Freeing memmap with vmemmap is not implement yet. | 529 | * The section is not biggest or smallest mem_section in the zone, it |
| 413 | * This should be removed later. | 530 | * only creates a hole in the zone. So in this case, we need not |
| 531 | * change the zone. But the zone may now consist of nothing but | ||
| 532 | * holes, so check whether it still has any valid section. | ||
| 414 | */ | 533 | */ |
| 415 | return -EBUSY; | 534 | pfn = zone_start_pfn; |
| 535 | for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { | ||
| 536 | ms = __pfn_to_section(pfn); | ||
| 537 | |||
| 538 | if (unlikely(!valid_section(ms))) | ||
| 539 | continue; | ||
| 540 | |||
| 541 | if (page_zone(pfn_to_page(pfn)) != zone) | ||
| 542 | continue; | ||
| 543 | |||
| 544 | /* If the section is current section, it continues the loop */ | ||
| 545 | if (start_pfn == pfn) | ||
| 546 | continue; | ||
| 547 | |||
| 548 | /* If we find valid section, we have nothing to do */ | ||
| 549 | zone_span_writeunlock(zone); | ||
| 550 | return; | ||
| 551 | } | ||
| 552 | |||
| 553 | /* The zone has no valid section */ | ||
| 554 | zone->zone_start_pfn = 0; | ||
| 555 | zone->spanned_pages = 0; | ||
| 556 | zone_span_writeunlock(zone); | ||
| 416 | } | 557 | } |
| 417 | #else | 558 | |
| 418 | static int __remove_section(struct zone *zone, struct mem_section *ms) | 559 | static void shrink_pgdat_span(struct pglist_data *pgdat, |
| 560 | unsigned long start_pfn, unsigned long end_pfn) | ||
| 561 | { | ||
| 562 | unsigned long pgdat_start_pfn = pgdat->node_start_pfn; | ||
| 563 | unsigned long pgdat_end_pfn = | ||
| 564 | pgdat->node_start_pfn + pgdat->node_spanned_pages; | ||
| 565 | unsigned long pfn; | ||
| 566 | struct mem_section *ms; | ||
| 567 | int nid = pgdat->node_id; | ||
| 568 | |||
| 569 | if (pgdat_start_pfn == start_pfn) { | ||
| 570 | /* | ||
| 571 | * If the section is the smallest section in the pgdat, we need to | ||
| 572 | * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. | ||
| 573 | * In this case, find the second smallest valid mem_section | ||
| 574 | * and shrink the pgdat to start there. | ||
| 575 | */ | ||
| 576 | pfn = find_smallest_section_pfn(nid, NULL, end_pfn, | ||
| 577 | pgdat_end_pfn); | ||
| 578 | if (pfn) { | ||
| 579 | pgdat->node_start_pfn = pfn; | ||
| 580 | pgdat->node_spanned_pages = pgdat_end_pfn - pfn; | ||
| 581 | } | ||
| 582 | } else if (pgdat_end_pfn == end_pfn) { | ||
| 583 | /* | ||
| 584 | * If the section is the biggest section in the pgdat, we need to | ||
| 585 | * shrink pgdat->node_spanned_pages. | ||
| 586 | * In this case, find the second biggest valid mem_section and | ||
| 587 | * shrink the pgdat to end there. | ||
| 588 | */ | ||
| 589 | pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, | ||
| 590 | start_pfn); | ||
| 591 | if (pfn) | ||
| 592 | pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; | ||
| 593 | } | ||
| 594 | |||
| 595 | /* | ||
| 596 | * If the section is not biggest or smallest mem_section in the pgdat, | ||
| 597 | * it only creates a hole in the pgdat. So in this case, we need not | ||
| 598 | * change the pgdat. | ||
| 599 | * But the pgdat may now consist of nothing but holes, so check | ||
| 600 | * whether it still has any valid section. | ||
| 601 | */ | ||
| 602 | pfn = pgdat_start_pfn; | ||
| 603 | for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { | ||
| 604 | ms = __pfn_to_section(pfn); | ||
| 605 | |||
| 606 | if (unlikely(!valid_section(ms))) | ||
| 607 | continue; | ||
| 608 | |||
| 609 | if (pfn_to_nid(pfn) != nid) | ||
| 610 | continue; | ||
| 611 | |||
| 612 | /* If the section is current section, it continues the loop */ | ||
| 613 | if (start_pfn == pfn) | ||
| 614 | continue; | ||
| 615 | |||
| 616 | /* If we find valid section, we have nothing to do */ | ||
| 617 | return; | ||
| 618 | } | ||
| 619 | |||
| 620 | /* The pgdat has no valid section */ | ||
| 621 | pgdat->node_start_pfn = 0; | ||
| 622 | pgdat->node_spanned_pages = 0; | ||
| 623 | } | ||
| 624 | |||
| 625 | static void __remove_zone(struct zone *zone, unsigned long start_pfn) | ||
| 419 | { | 626 | { |
| 420 | unsigned long flags; | ||
| 421 | struct pglist_data *pgdat = zone->zone_pgdat; | 627 | struct pglist_data *pgdat = zone->zone_pgdat; |
| 628 | int nr_pages = PAGES_PER_SECTION; | ||
| 629 | int zone_type; | ||
| 630 | unsigned long flags; | ||
| 631 | |||
| 632 | zone_type = zone - pgdat->node_zones; | ||
| 633 | |||
| 634 | pgdat_resize_lock(zone->zone_pgdat, &flags); | ||
| 635 | shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); | ||
| 636 | shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); | ||
| 637 | pgdat_resize_unlock(zone->zone_pgdat, &flags); | ||
| 638 | } | ||
| 639 | |||
| 640 | static int __remove_section(struct zone *zone, struct mem_section *ms) | ||
| 641 | { | ||
| 642 | unsigned long start_pfn; | ||
| 643 | int scn_nr; | ||
| 422 | int ret = -EINVAL; | 644 | int ret = -EINVAL; |
| 423 | 645 | ||
| 424 | if (!valid_section(ms)) | 646 | if (!valid_section(ms)) |
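
shrink_zone_span() and shrink_pgdat_span() above shrink a span by locating the new smallest or biggest valid section once one is removed, and collapse the span to empty when nothing valid remains. A standalone sketch of the same scan over a plain array of "valid section" flags; the section numbers and counts are made up.

#include <stdbool.h>
#include <stdio.h>

#define NSEC 16

static long find_smallest(const bool *valid, long lo, long hi)
{
    for (long s = lo; s < hi; s++)
        if (valid[s])
            return s;
    return -1;
}

static long find_biggest(const bool *valid, long lo, long hi)
{
    for (long s = hi - 1; s >= lo; s--)
        if (valid[s])
            return s;
    return -1;
}

int main(void)
{
    bool valid[NSEC] = { [2] = true, [3] = true, [7] = true };
    long start = 2, end = 10;                 /* current span: [2, 10) */

    /* remove section 2: the span must start at the next valid section */
    valid[2] = false;
    long s = find_smallest(valid, 2, end);
    if (s >= 0)
        start = s;

    /* remove section 7: the span must end after the previous valid section */
    valid[7] = false;
    long b = find_biggest(valid, start, end);
    if (b >= 0)
        end = b + 1;

    printf("span is now [%ld, %ld)\n", start, end);   /* prints [3, 4) */
    return 0;
}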
| @@ -428,12 +650,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) | |||
| 428 | if (ret) | 650 | if (ret) |
| 429 | return ret; | 651 | return ret; |
| 430 | 652 | ||
| 431 | pgdat_resize_lock(pgdat, &flags); | 653 | scn_nr = __section_nr(ms); |
| 654 | start_pfn = section_nr_to_pfn(scn_nr); | ||
| 655 | __remove_zone(zone, start_pfn); | ||
| 656 | |||
| 432 | sparse_remove_one_section(zone, ms); | 657 | sparse_remove_one_section(zone, ms); |
| 433 | pgdat_resize_unlock(pgdat, &flags); | ||
| 434 | return 0; | 658 | return 0; |
| 435 | } | 659 | } |
| 436 | #endif | ||
| 437 | 660 | ||
| 438 | /* | 661 | /* |
| 439 | * Reasonably generic function for adding memory. It is | 662 | * Reasonably generic function for adding memory. It is |
| @@ -797,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
| 797 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; | 1020 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; |
| 798 | unsigned long start_pfn = start >> PAGE_SHIFT; | 1021 | unsigned long start_pfn = start >> PAGE_SHIFT; |
| 799 | 1022 | ||
| 800 | pgdat = arch_alloc_nodedata(nid); | 1023 | pgdat = NODE_DATA(nid); |
| 801 | if (!pgdat) | 1024 | if (!pgdat) { |
| 802 | return NULL; | 1025 | pgdat = arch_alloc_nodedata(nid); |
| 1026 | if (!pgdat) | ||
| 1027 | return NULL; | ||
| 803 | 1028 | ||
| 804 | arch_refresh_nodedata(nid, pgdat); | 1029 | arch_refresh_nodedata(nid, pgdat); |
| 1030 | } | ||
| 805 | 1031 | ||
| 806 | /* we can use NODE_DATA(nid) from here */ | 1032 | /* we can use NODE_DATA(nid) from here */ |
| 807 | 1033 | ||
| @@ -854,7 +1080,8 @@ out: | |||
| 854 | int __ref add_memory(int nid, u64 start, u64 size) | 1080 | int __ref add_memory(int nid, u64 start, u64 size) |
| 855 | { | 1081 | { |
| 856 | pg_data_t *pgdat = NULL; | 1082 | pg_data_t *pgdat = NULL; |
| 857 | int new_pgdat = 0; | 1083 | bool new_pgdat; |
| 1084 | bool new_node; | ||
| 858 | struct resource *res; | 1085 | struct resource *res; |
| 859 | int ret; | 1086 | int ret; |
| 860 | 1087 | ||
| @@ -865,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
| 865 | if (!res) | 1092 | if (!res) |
| 866 | goto out; | 1093 | goto out; |
| 867 | 1094 | ||
| 868 | if (!node_online(nid)) { | 1095 | { /* Stupid hack to suppress address-never-null warning */ |
| 1096 | void *p = NODE_DATA(nid); | ||
| 1097 | new_pgdat = !p; | ||
| 1098 | } | ||
| 1099 | new_node = !node_online(nid); | ||
| 1100 | if (new_node) { | ||
| 869 | pgdat = hotadd_new_pgdat(nid, start); | 1101 | pgdat = hotadd_new_pgdat(nid, start); |
| 870 | ret = -ENOMEM; | 1102 | ret = -ENOMEM; |
| 871 | if (!pgdat) | 1103 | if (!pgdat) |
| 872 | goto error; | 1104 | goto error; |
| 873 | new_pgdat = 1; | ||
| 874 | } | 1105 | } |
| 875 | 1106 | ||
| 876 | /* call arch's memory hotadd */ | 1107 | /* call arch's memory hotadd */ |
| @@ -882,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
| 882 | /* we online node here. we can't roll back from here. */ | 1113 | /* we online node here. we can't roll back from here. */ |
| 883 | node_set_online(nid); | 1114 | node_set_online(nid); |
| 884 | 1115 | ||
| 885 | if (new_pgdat) { | 1116 | if (new_node) { |
| 886 | ret = register_one_node(nid); | 1117 | ret = register_one_node(nid); |
| 887 | /* | 1118 | /* |
| 888 | * If sysfs file of new node can't create, cpu on the node | 1119 | * If sysfs file of new node can't create, cpu on the node |
| @@ -901,8 +1132,7 @@ error: | |||
| 901 | /* rollback pgdat allocation and others */ | 1132 | /* rollback pgdat allocation and others */ |
| 902 | if (new_pgdat) | 1133 | if (new_pgdat) |
| 903 | rollback_node_hotadd(nid, pgdat); | 1134 | rollback_node_hotadd(nid, pgdat); |
| 904 | if (res) | 1135 | release_memory_resource(res); |
| 905 | release_memory_resource(res); | ||
| 906 | 1136 | ||
| 907 | out: | 1137 | out: |
| 908 | unlock_memory_hotplug(); | 1138 | unlock_memory_hotplug(); |
| @@ -1058,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
| 1058 | * migrate_pages returns # of failed pages. | 1288 | * migrate_pages returns # of failed pages. |
| 1059 | */ | 1289 | */ |
| 1060 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1290 | ret = migrate_pages(&source, alloc_migrate_target, 0, |
| 1061 | true, MIGRATE_SYNC, | 1291 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); |
| 1062 | MR_MEMORY_HOTPLUG); | ||
| 1063 | if (ret) | 1292 | if (ret) |
| 1064 | putback_lru_pages(&source); | 1293 | putback_lru_pages(&source); |
| 1065 | } | 1294 | } |
| @@ -1381,17 +1610,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | |||
| 1381 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); | 1610 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); |
| 1382 | } | 1611 | } |
| 1383 | 1612 | ||
| 1384 | int remove_memory(u64 start, u64 size) | 1613 | /** |
| 1614 | * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) | ||
| 1615 | * @start_pfn: start pfn of the memory range | ||
| 1616 | * @end_pfn: end pfn of the memory range | ||
| 1617 | * @arg: argument passed to func | ||
| 1618 | * @func: callback for each memory section walked | ||
| 1619 | * | ||
| 1620 | * This function walks through all present mem sections in range | ||
| 1621 | * [start_pfn, end_pfn) and calls func on each of them. | ||
| 1622 | * | ||
| 1623 | * Returns the return value of func. | ||
| 1624 | */ | ||
| 1625 | static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, | ||
| 1626 | void *arg, int (*func)(struct memory_block *, void *)) | ||
| 1385 | { | 1627 | { |
| 1386 | struct memory_block *mem = NULL; | 1628 | struct memory_block *mem = NULL; |
| 1387 | struct mem_section *section; | 1629 | struct mem_section *section; |
| 1388 | unsigned long start_pfn, end_pfn; | ||
| 1389 | unsigned long pfn, section_nr; | 1630 | unsigned long pfn, section_nr; |
| 1390 | int ret; | 1631 | int ret; |
| 1391 | 1632 | ||
| 1392 | start_pfn = PFN_DOWN(start); | ||
| 1393 | end_pfn = start_pfn + PFN_DOWN(size); | ||
| 1394 | |||
| 1395 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | 1633 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { |
| 1396 | section_nr = pfn_to_section_nr(pfn); | 1634 | section_nr = pfn_to_section_nr(pfn); |
| 1397 | if (!present_section_nr(section_nr)) | 1635 | if (!present_section_nr(section_nr)) |
| @@ -1408,7 +1646,7 @@ int remove_memory(u64 start, u64 size) | |||
| 1408 | if (!mem) | 1646 | if (!mem) |
| 1409 | continue; | 1647 | continue; |
| 1410 | 1648 | ||
| 1411 | ret = offline_memory_block(mem); | 1649 | ret = func(mem, arg); |
| 1412 | if (ret) { | 1650 | if (ret) { |
| 1413 | kobject_put(&mem->dev.kobj); | 1651 | kobject_put(&mem->dev.kobj); |
| 1414 | return ret; | 1652 | return ret; |
| @@ -1420,12 +1658,209 @@ int remove_memory(u64 start, u64 size) | |||
| 1420 | 1658 | ||
| 1421 | return 0; | 1659 | return 0; |
| 1422 | } | 1660 | } |
| 1661 | |||
| 1662 | /** | ||
| 1663 | * offline_memory_block_cb - callback function for offlining memory block | ||
| 1664 | * @mem: the memory block to be offlined | ||
| 1665 | * @arg: pointer to an int that receives the first error code | ||
| 1666 | * | ||
| 1667 | * Always returns 0; any error code is stored in *arg instead. | ||
| 1668 | */ | ||
| 1669 | static int offline_memory_block_cb(struct memory_block *mem, void *arg) | ||
| 1670 | { | ||
| 1671 | int *ret = arg; | ||
| 1672 | int error = offline_memory_block(mem); | ||
| 1673 | |||
| 1674 | if (error != 0 && *ret == 0) | ||
| 1675 | *ret = error; | ||
| 1676 | |||
| 1677 | return 0; | ||
| 1678 | } | ||
| 1679 | |||
| 1680 | static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) | ||
| 1681 | { | ||
| 1682 | int ret = !is_memblock_offlined(mem); | ||
| 1683 | |||
| 1684 | if (unlikely(ret)) | ||
| 1685 | pr_warn("removing memory fails, because memory " | ||
| 1686 | "[%#010llx-%#010llx] is onlined\n", | ||
| 1687 | PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)), | ||
| 1688 | PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1); | ||
| 1689 | |||
| 1690 | return ret; | ||
| 1691 | } | ||
| 1692 | |||
| 1693 | static int check_cpu_on_node(void *data) | ||
| 1694 | { | ||
| 1695 | struct pglist_data *pgdat = data; | ||
| 1696 | int cpu; | ||
| 1697 | |||
| 1698 | for_each_present_cpu(cpu) { | ||
| 1699 | if (cpu_to_node(cpu) == pgdat->node_id) | ||
| 1700 | /* | ||
| 1701 | * the cpu on this node isn't removed, and we can't | ||
| 1702 | * offline this node. | ||
| 1703 | */ | ||
| 1704 | return -EBUSY; | ||
| 1705 | } | ||
| 1706 | |||
| 1707 | return 0; | ||
| 1708 | } | ||
| 1709 | |||
| 1710 | static void unmap_cpu_on_node(void *data) | ||
| 1711 | { | ||
| 1712 | #ifdef CONFIG_ACPI_NUMA | ||
| 1713 | struct pglist_data *pgdat = data; | ||
| 1714 | int cpu; | ||
| 1715 | |||
| 1716 | for_each_possible_cpu(cpu) | ||
| 1717 | if (cpu_to_node(cpu) == pgdat->node_id) | ||
| 1718 | numa_clear_node(cpu); | ||
| 1719 | #endif | ||
| 1720 | } | ||
| 1721 | |||
| 1722 | static int check_and_unmap_cpu_on_node(void *data) | ||
| 1723 | { | ||
| 1724 | int ret = check_cpu_on_node(data); | ||
| 1725 | |||
| 1726 | if (ret) | ||
| 1727 | return ret; | ||
| 1728 | |||
| 1729 | /* | ||
| 1730 | * the node will be offlined when we come here, so we can clear | ||
| 1731 | * the cpu_to_node() now. | ||
| 1732 | */ | ||
| 1733 | |||
| 1734 | unmap_cpu_on_node(data); | ||
| 1735 | return 0; | ||
| 1736 | } | ||
| 1737 | |||
| 1738 | /* offline the node if all memory sections of this node are removed */ | ||
| 1739 | void try_offline_node(int nid) | ||
| 1740 | { | ||
| 1741 | pg_data_t *pgdat = NODE_DATA(nid); | ||
| 1742 | unsigned long start_pfn = pgdat->node_start_pfn; | ||
| 1743 | unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; | ||
| 1744 | unsigned long pfn; | ||
| 1745 | struct page *pgdat_page = virt_to_page(pgdat); | ||
| 1746 | int i; | ||
| 1747 | |||
| 1748 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | ||
| 1749 | unsigned long section_nr = pfn_to_section_nr(pfn); | ||
| 1750 | |||
| 1751 | if (!present_section_nr(section_nr)) | ||
| 1752 | continue; | ||
| 1753 | |||
| 1754 | if (pfn_to_nid(pfn) != nid) | ||
| 1755 | continue; | ||
| 1756 | |||
| 1757 | /* | ||
| 1758 | * some memory sections of this node are not removed, and we | ||
| 1759 | * can't offline node now. | ||
| 1760 | */ | ||
| 1761 | return; | ||
| 1762 | } | ||
| 1763 | |||
| 1764 | if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL)) | ||
| 1765 | return; | ||
| 1766 | |||
| 1767 | /* | ||
| 1768 | * all memory/cpu of this node are removed, we can offline this | ||
| 1769 | * node now. | ||
| 1770 | */ | ||
| 1771 | node_set_offline(nid); | ||
| 1772 | unregister_one_node(nid); | ||
| 1773 | |||
| 1774 | if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page)) | ||
| 1775 | /* node data is allocated from boot memory */ | ||
| 1776 | return; | ||
| 1777 | |||
| 1778 | /* free waittable in each zone */ | ||
| 1779 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
| 1780 | struct zone *zone = pgdat->node_zones + i; | ||
| 1781 | |||
| 1782 | if (zone->wait_table) | ||
| 1783 | vfree(zone->wait_table); | ||
| 1784 | } | ||
| 1785 | |||
| 1786 | /* | ||
| 1787 | * Since there is no way to guarantee the address of pgdat/zone is not | ||
| 1788 | * on the stack of any kernel thread or used by other kernel objects | ||
| 1789 | * without reference counting or another synchronizing method, do not | ||
| 1790 | * reset node_data and free pgdat here. Just reset it to 0 and reuse | ||
| 1791 | * the memory when the node is online again. | ||
| 1792 | */ | ||
| 1793 | memset(pgdat, 0, sizeof(*pgdat)); | ||
| 1794 | } | ||
| 1795 | EXPORT_SYMBOL(try_offline_node); | ||
| 1796 | |||
| 1797 | int __ref remove_memory(int nid, u64 start, u64 size) | ||
| 1798 | { | ||
| 1799 | unsigned long start_pfn, end_pfn; | ||
| 1800 | int ret = 0; | ||
| 1801 | int retry = 1; | ||
| 1802 | |||
| 1803 | start_pfn = PFN_DOWN(start); | ||
| 1804 | end_pfn = start_pfn + PFN_DOWN(size); | ||
| 1805 | |||
| 1806 | /* | ||
| 1807 | * When CONFIG_MEMCG is on, one memory block may be used by other | ||
| 1808 | * blocks to store page cgroup when onlining pages. But we don't know | ||
| 1809 | * in what order pages are onlined. So we iterate twice to offline | ||
| 1810 | * memory: | ||
| 1811 | * 1st iterate: offline every non primary memory block. | ||
| 1812 | * 2nd iterate: offline primary (i.e. first added) memory block. | ||
| 1813 | */ | ||
| 1814 | repeat: | ||
| 1815 | walk_memory_range(start_pfn, end_pfn, &ret, | ||
| 1816 | offline_memory_block_cb); | ||
| 1817 | if (ret) { | ||
| 1818 | if (!retry) | ||
| 1819 | return ret; | ||
| 1820 | |||
| 1821 | retry = 0; | ||
| 1822 | ret = 0; | ||
| 1823 | goto repeat; | ||
| 1824 | } | ||
| 1825 | |||
| 1826 | lock_memory_hotplug(); | ||
| 1827 | |||
| 1828 | /* | ||
| 1829 | * we have offlined all memory blocks like this: | ||
| 1830 | * 1. lock memory hotplug | ||
| 1831 | * 2. offline a memory block | ||
| 1832 | * 3. unlock memory hotplug | ||
| 1833 | * | ||
| 1834 | * repeat steps 1-3 to offline each memory block. All memory blocks | ||
| 1835 | * must be offlined before removing memory, but we do not hold the | ||
| 1836 | * lock across the whole operation, so we must check whether all | ||
| 1837 | * memory blocks are offlined. | ||
| 1838 | */ | ||
| 1839 | |||
| 1840 | ret = walk_memory_range(start_pfn, end_pfn, NULL, | ||
| 1841 | is_memblock_offlined_cb); | ||
| 1842 | if (ret) { | ||
| 1843 | unlock_memory_hotplug(); | ||
| 1844 | return ret; | ||
| 1845 | } | ||
| 1846 | |||
| 1847 | /* remove memmap entry */ | ||
| 1848 | firmware_map_remove(start, start + size, "System RAM"); | ||
| 1849 | |||
| 1850 | arch_remove_memory(start, size); | ||
| 1851 | |||
| 1852 | try_offline_node(nid); | ||
| 1853 | |||
| 1854 | unlock_memory_hotplug(); | ||
| 1855 | |||
| 1856 | return 0; | ||
| 1857 | } | ||
| 1423 | #else | 1858 | #else |
| 1424 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | 1859 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) |
| 1425 | { | 1860 | { |
| 1426 | return -EINVAL; | 1861 | return -EINVAL; |
| 1427 | } | 1862 | } |
| 1428 | int remove_memory(u64 start, u64 size) | 1863 | int remove_memory(int nid, u64 start, u64 size) |
| 1429 | { | 1864 | { |
| 1430 | return -EINVAL; | 1865 | return -EINVAL; |
| 1431 | } | 1866 | } |
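
remove_memory() above is built on walk_memory_range(): one generic walker over memory blocks plus small callbacks, one that offlines a block and one that verifies every block is offline before the final removal. A standalone sketch of that walker-plus-callback shape; struct block and the callbacks are invented stand-ins, not kernel structures.

#include <stdio.h>

struct block { int id; int online; };

static int walk_blocks(struct block *blk, int n, void *arg,
                       int (*func)(struct block *, void *))
{
    for (int i = 0; i < n; i++) {
        int ret = func(&blk[i], arg);
        if (ret)
            return ret;          /* a non-zero return aborts the walk */
    }
    return 0;
}

/* like offline_memory_block_cb(): do the work and keep the walk going;
 * a fuller version would record the first failure through *arg */
static int offline_cb(struct block *b, void *arg)
{
    (void)arg;
    b->online = 0;
    return 0;
}

/* like is_memblock_offlined_cb(): non-zero means "still online", abort */
static int check_cb(struct block *b, void *arg)
{
    (void)arg;
    if (b->online)
        printf("block %d is still online\n", b->id);
    return b->online;
}

int main(void)
{
    struct block blk[4] = { {0, 1}, {1, 1}, {2, 1}, {3, 1} };

    walk_blocks(blk, 4, NULL, offline_cb);          /* pass 1: offline everything */
    if (walk_blocks(blk, 4, NULL, check_cb) == 0)   /* pass 2: verify before removal */
        printf("all blocks offline, safe to remove\n");
    return 0;
}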
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e2df1c1fb41f..31d26637b658 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -26,7 +26,7 @@ | |||
| 26 | * the allocation to memory nodes instead | 26 | * the allocation to memory nodes instead |
| 27 | * | 27 | * |
| 28 | * preferred Try a specific node first before normal fallback. | 28 | * preferred Try a specific node first before normal fallback. |
| 29 | * As a special case node -1 here means do the allocation | 29 | * As a special case NUMA_NO_NODE here means do the allocation |
| 30 | * on the local CPU. This is normally identical to default, | 30 | * on the local CPU. This is normally identical to default, |
| 31 | * but useful to set in a VMA when you have a non default | 31 | * but useful to set in a VMA when you have a non default |
| 32 | * process policy. | 32 | * process policy. |
| @@ -127,7 +127,7 @@ static struct mempolicy *get_task_policy(struct task_struct *p) | |||
| 127 | 127 | ||
| 128 | if (!pol) { | 128 | if (!pol) { |
| 129 | node = numa_node_id(); | 129 | node = numa_node_id(); |
| 130 | if (node != -1) | 130 | if (node != NUMA_NO_NODE) |
| 131 | pol = &preferred_node_policy[node]; | 131 | pol = &preferred_node_policy[node]; |
| 132 | 132 | ||
| 133 | /* preferred_node_policy is not initialised early in boot */ | 133 | /* preferred_node_policy is not initialised early in boot */ |
| @@ -161,19 +161,7 @@ static const struct mempolicy_operations { | |||
| 161 | /* Check that the nodemask contains at least one populated zone */ | 161 | /* Check that the nodemask contains at least one populated zone */ |
| 162 | static int is_valid_nodemask(const nodemask_t *nodemask) | 162 | static int is_valid_nodemask(const nodemask_t *nodemask) |
| 163 | { | 163 | { |
| 164 | int nd, k; | 164 | return nodes_intersects(*nodemask, node_states[N_MEMORY]); |
| 165 | |||
| 166 | for_each_node_mask(nd, *nodemask) { | ||
| 167 | struct zone *z; | ||
| 168 | |||
| 169 | for (k = 0; k <= policy_zone; k++) { | ||
| 170 | z = &NODE_DATA(nd)->node_zones[k]; | ||
| 171 | if (z->present_pages > 0) | ||
| 172 | return 1; | ||
| 173 | } | ||
| 174 | } | ||
| 175 | |||
| 176 | return 0; | ||
| 177 | } | 165 | } |
| 178 | 166 | ||
| 179 | static inline int mpol_store_user_nodemask(const struct mempolicy *pol) | 167 | static inline int mpol_store_user_nodemask(const struct mempolicy *pol) |
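
The is_valid_nodemask() simplification above replaces a per-node, per-zone scan for present pages with a single intersection against node_states[N_MEMORY]. A standalone sketch of that check, with node masks modelled as plain unsigned long bitmaps:

#include <stdio.h>

static int masks_intersect(unsigned long a, unsigned long b)
{
    return (a & b) != 0;
}

int main(void)
{
    unsigned long nodes_with_memory = 0x5;   /* nodes 0 and 2 are populated */

    /* valid as long as at least one requested node actually has memory */
    printf("nodes {1,2}: %d\n", masks_intersect(0x6, nodes_with_memory)); /* 1 */
    printf("nodes {3}:   %d\n", masks_intersect(0x8, nodes_with_memory)); /* 0 */
    return 0;
}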
| @@ -270,7 +258,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
| 270 | struct mempolicy *policy; | 258 | struct mempolicy *policy; |
| 271 | 259 | ||
| 272 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", | 260 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", |
| 273 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); | 261 | mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); |
| 274 | 262 | ||
| 275 | if (mode == MPOL_DEFAULT) { | 263 | if (mode == MPOL_DEFAULT) { |
| 276 | if (nodes && !nodes_empty(*nodes)) | 264 | if (nodes && !nodes_empty(*nodes)) |
| @@ -508,9 +496,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 508 | /* | 496 | /* |
| 509 | * vm_normal_page() filters out zero pages, but there might | 497 | * vm_normal_page() filters out zero pages, but there might |
| 510 | * still be PageReserved pages to skip, perhaps in a VDSO. | 498 | * still be PageReserved pages to skip, perhaps in a VDSO. |
| 511 | * And we cannot move PageKsm pages sensibly or safely yet. | ||
| 512 | */ | 499 | */ |
| 513 | if (PageReserved(page) || PageKsm(page)) | 500 | if (PageReserved(page)) |
| 514 | continue; | 501 | continue; |
| 515 | nid = page_to_nid(page); | 502 | nid = page_to_nid(page); |
| 516 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 503 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
| @@ -1027,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
| 1027 | 1014 | ||
| 1028 | if (!list_empty(&pagelist)) { | 1015 | if (!list_empty(&pagelist)) { |
| 1029 | err = migrate_pages(&pagelist, new_node_page, dest, | 1016 | err = migrate_pages(&pagelist, new_node_page, dest, |
| 1030 | false, MIGRATE_SYNC, | 1017 | MIGRATE_SYNC, MR_SYSCALL); |
| 1031 | MR_SYSCALL); | ||
| 1032 | if (err) | 1018 | if (err) |
| 1033 | putback_lru_pages(&pagelist); | 1019 | putback_lru_pages(&pagelist); |
| 1034 | } | 1020 | } |
| @@ -1235,7 +1221,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
| 1235 | 1221 | ||
| 1236 | pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", | 1222 | pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", |
| 1237 | start, start + len, mode, mode_flags, | 1223 | start, start + len, mode, mode_flags, |
| 1238 | nmask ? nodes_addr(*nmask)[0] : -1); | 1224 | nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); |
| 1239 | 1225 | ||
| 1240 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | 1226 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { |
| 1241 | 1227 | ||
| @@ -1272,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
| 1272 | if (!list_empty(&pagelist)) { | 1258 | if (!list_empty(&pagelist)) { |
| 1273 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | 1259 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); |
| 1274 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1260 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
| 1275 | (unsigned long)vma, | 1261 | (unsigned long)vma, |
| 1276 | false, MIGRATE_SYNC, | 1262 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); |
| 1277 | MR_MEMPOLICY_MBIND); | ||
| 1278 | if (nr_failed) | 1263 | if (nr_failed) |
| 1279 | putback_lru_pages(&pagelist); | 1264 | putback_lru_pages(&pagelist); |
| 1280 | } | 1265 | } |
| @@ -1644,6 +1629,26 @@ struct mempolicy *get_vma_policy(struct task_struct *task, | |||
| 1644 | return pol; | 1629 | return pol; |
| 1645 | } | 1630 | } |
| 1646 | 1631 | ||
| 1632 | static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) | ||
| 1633 | { | ||
| 1634 | enum zone_type dynamic_policy_zone = policy_zone; | ||
| 1635 | |||
| 1636 | BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); | ||
| 1637 | |||
| 1638 | /* | ||
| 1639 | * If policy->v.nodes has only movable memory, we apply the policy | ||
| 1640 | * only when gfp_zone(gfp) is ZONE_MOVABLE. | ||
| 1641 | * | ||
| 1642 | * policy->v.nodes already intersects node_states[N_MEMORY], so if | ||
| 1643 | * the following test fails, it implies that policy->v.nodes | ||
| 1644 | * contains only movable memory. | ||
| 1645 | */ | ||
| 1646 | if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) | ||
| 1647 | dynamic_policy_zone = ZONE_MOVABLE; | ||
| 1648 | |||
| 1649 | return zone >= dynamic_policy_zone; | ||
| 1650 | } | ||
| 1651 | |||
| 1647 | /* | 1652 | /* |
| 1648 | * Return a nodemask representing a mempolicy for filtering nodes for | 1653 | * Return a nodemask representing a mempolicy for filtering nodes for |
| 1649 | * page allocation | 1654 | * page allocation |
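
apply_policy_zone() above decides whether an MPOL_BIND nodemask should be applied for a given allocation zone: if none of the policy's nodes has non-movable memory, the threshold is raised to ZONE_MOVABLE. A standalone sketch of that decision; the zone ordering and node masks here are simplified stand-ins for policy_zone and the real node state masks.

#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_MOVABLE };

static int apply_policy_zone(unsigned long policy_nodes,
                             unsigned long nodes_with_normal_mem,
                             enum zone_type zone)
{
    enum zone_type dynamic_policy_zone = ZONE_NORMAL;  /* stands in for policy_zone */

    /* if none of the policy's nodes has non-movable memory, the bind
     * nodemask can only be honoured for ZONE_MOVABLE allocations */
    if ((policy_nodes & nodes_with_normal_mem) == 0)
        dynamic_policy_zone = ZONE_MOVABLE;

    return zone >= dynamic_policy_zone;
}

int main(void)
{
    unsigned long policy_nodes = 0x2;   /* the policy binds to node 1 only   */
    unsigned long normal_nodes = 0x1;   /* but only node 0 has normal memory */

    printf("apply for ZONE_NORMAL?  %d\n",
           apply_policy_zone(policy_nodes, normal_nodes, ZONE_NORMAL));   /* 0 */
    printf("apply for ZONE_MOVABLE? %d\n",
           apply_policy_zone(policy_nodes, normal_nodes, ZONE_MOVABLE));  /* 1 */
    return 0;
}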
| @@ -1652,7 +1657,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) | |||
| 1652 | { | 1657 | { |
| 1653 | /* Lower zones don't get a nodemask applied for MPOL_BIND */ | 1658 | /* Lower zones don't get a nodemask applied for MPOL_BIND */ |
| 1654 | if (unlikely(policy->mode == MPOL_BIND) && | 1659 | if (unlikely(policy->mode == MPOL_BIND) && |
| 1655 | gfp_zone(gfp) >= policy_zone && | 1660 | apply_policy_zone(policy, gfp_zone(gfp)) && |
| 1656 | cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) | 1661 | cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) |
| 1657 | return &policy->v.nodes; | 1662 | return &policy->v.nodes; |
| 1658 | 1663 | ||
| @@ -2308,7 +2313,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long | |||
| 2308 | * it less likely we act on an unlikely task<->page | 2313 | * it less likely we act on an unlikely task<->page |
| 2309 | * relation. | 2314 | * relation. |
| 2310 | */ | 2315 | */ |
| 2311 | last_nid = page_xchg_last_nid(page, polnid); | 2316 | last_nid = page_nid_xchg_last(page, polnid); |
| 2312 | if (last_nid != polnid) | 2317 | if (last_nid != polnid) |
| 2313 | goto out; | 2318 | goto out; |
| 2314 | } | 2319 | } |
| @@ -2483,7 +2488,7 @@ int mpol_set_shared_policy(struct shared_policy *info, | |||
| 2483 | vma->vm_pgoff, | 2488 | vma->vm_pgoff, |
| 2484 | sz, npol ? npol->mode : -1, | 2489 | sz, npol ? npol->mode : -1, |
| 2485 | npol ? npol->flags : -1, | 2490 | npol ? npol->flags : -1, |
| 2486 | npol ? nodes_addr(npol->v.nodes)[0] : -1); | 2491 | npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); |
| 2487 | 2492 | ||
| 2488 | if (npol) { | 2493 | if (npol) { |
| 2489 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); | 2494 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); |
diff --git a/mm/migrate.c b/mm/migrate.c index 2fd8b4af4744..3bbaf5d230b0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -464,7 +464,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
| 464 | 464 | ||
| 465 | mlock_migrate_page(newpage, page); | 465 | mlock_migrate_page(newpage, page); |
| 466 | ksm_migrate_page(newpage, page); | 466 | ksm_migrate_page(newpage, page); |
| 467 | 467 | /* | |
| 468 | * Please do not reorder this without considering how mm/ksm.c's | ||
| 469 | * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). | ||
| 470 | */ | ||
| 468 | ClearPageSwapCache(page); | 471 | ClearPageSwapCache(page); |
| 469 | ClearPagePrivate(page); | 472 | ClearPagePrivate(page); |
| 470 | set_page_private(page, 0); | 473 | set_page_private(page, 0); |
| @@ -698,7 +701,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
| 698 | } | 701 | } |
| 699 | 702 | ||
| 700 | static int __unmap_and_move(struct page *page, struct page *newpage, | 703 | static int __unmap_and_move(struct page *page, struct page *newpage, |
| 701 | int force, bool offlining, enum migrate_mode mode) | 704 | int force, enum migrate_mode mode) |
| 702 | { | 705 | { |
| 703 | int rc = -EAGAIN; | 706 | int rc = -EAGAIN; |
| 704 | int remap_swapcache = 1; | 707 | int remap_swapcache = 1; |
| @@ -728,20 +731,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
| 728 | lock_page(page); | 731 | lock_page(page); |
| 729 | } | 732 | } |
| 730 | 733 | ||
| 731 | /* | ||
| 732 | * Only memory hotplug's offline_pages() caller has locked out KSM, | ||
| 733 | * and can safely migrate a KSM page. The other cases have skipped | ||
| 734 | * PageKsm along with PageReserved - but it is only now when we have | ||
| 735 | * the page lock that we can be certain it will not go KSM beneath us | ||
| 736 | * (KSM will not upgrade a page from PageAnon to PageKsm when it sees | ||
| 737 | * its pagecount raised, but only here do we take the page lock which | ||
| 738 | * serializes that). | ||
| 739 | */ | ||
| 740 | if (PageKsm(page) && !offlining) { | ||
| 741 | rc = -EBUSY; | ||
| 742 | goto unlock; | ||
| 743 | } | ||
| 744 | |||
| 745 | /* charge against new page */ | 734 | /* charge against new page */ |
| 746 | mem_cgroup_prepare_migration(page, newpage, &mem); | 735 | mem_cgroup_prepare_migration(page, newpage, &mem); |
| 747 | 736 | ||
| @@ -768,7 +757,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
| 768 | * File Caches may use write_page() or lock_page() in migration, then, | 757 | * File Caches may use write_page() or lock_page() in migration, then, |
| 769 | * just care Anon page here. | 758 | * just care Anon page here. |
| 770 | */ | 759 | */ |
| 771 | if (PageAnon(page)) { | 760 | if (PageAnon(page) && !PageKsm(page)) { |
| 772 | /* | 761 | /* |
| 773 | * Only page_lock_anon_vma_read() understands the subtleties of | 762 | * Only page_lock_anon_vma_read() understands the subtleties of |
| 774 | * getting a hold on an anon_vma from outside one of its mms. | 763 | * getting a hold on an anon_vma from outside one of its mms. |
| @@ -848,7 +837,6 @@ uncharge: | |||
| 848 | mem_cgroup_end_migration(mem, page, newpage, | 837 | mem_cgroup_end_migration(mem, page, newpage, |
| 849 | (rc == MIGRATEPAGE_SUCCESS || | 838 | (rc == MIGRATEPAGE_SUCCESS || |
| 850 | rc == MIGRATEPAGE_BALLOON_SUCCESS)); | 839 | rc == MIGRATEPAGE_BALLOON_SUCCESS)); |
| 851 | unlock: | ||
| 852 | unlock_page(page); | 840 | unlock_page(page); |
| 853 | out: | 841 | out: |
| 854 | return rc; | 842 | return rc; |
| @@ -859,8 +847,7 @@ out: | |||
| 859 | * to the newly allocated page in newpage. | 847 | * to the newly allocated page in newpage. |
| 860 | */ | 848 | */ |
| 861 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 849 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, |
| 862 | struct page *page, int force, bool offlining, | 850 | struct page *page, int force, enum migrate_mode mode) |
| 863 | enum migrate_mode mode) | ||
| 864 | { | 851 | { |
| 865 | int rc = 0; | 852 | int rc = 0; |
| 866 | int *result = NULL; | 853 | int *result = NULL; |
| @@ -878,7 +865,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 878 | if (unlikely(split_huge_page(page))) | 865 | if (unlikely(split_huge_page(page))) |
| 879 | goto out; | 866 | goto out; |
| 880 | 867 | ||
| 881 | rc = __unmap_and_move(page, newpage, force, offlining, mode); | 868 | rc = __unmap_and_move(page, newpage, force, mode); |
| 882 | 869 | ||
| 883 | if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { | 870 | if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { |
| 884 | /* | 871 | /* |
| @@ -938,8 +925,7 @@ out: | |||
| 938 | */ | 925 | */ |
| 939 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 926 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
| 940 | unsigned long private, struct page *hpage, | 927 | unsigned long private, struct page *hpage, |
| 941 | int force, bool offlining, | 928 | int force, enum migrate_mode mode) |
| 942 | enum migrate_mode mode) | ||
| 943 | { | 929 | { |
| 944 | int rc = 0; | 930 | int rc = 0; |
| 945 | int *result = NULL; | 931 | int *result = NULL; |
| @@ -1001,9 +987,8 @@ out: | |||
| 1001 | * | 987 | * |
| 1002 | * Return: Number of pages not migrated or error code. | 988 | * Return: Number of pages not migrated or error code. |
| 1003 | */ | 989 | */ |
| 1004 | int migrate_pages(struct list_head *from, | 990 | int migrate_pages(struct list_head *from, new_page_t get_new_page, |
| 1005 | new_page_t get_new_page, unsigned long private, bool offlining, | 991 | unsigned long private, enum migrate_mode mode, int reason) |
| 1006 | enum migrate_mode mode, int reason) | ||
| 1007 | { | 992 | { |
| 1008 | int retry = 1; | 993 | int retry = 1; |
| 1009 | int nr_failed = 0; | 994 | int nr_failed = 0; |
| @@ -1024,8 +1009,7 @@ int migrate_pages(struct list_head *from, | |||
| 1024 | cond_resched(); | 1009 | cond_resched(); |
| 1025 | 1010 | ||
| 1026 | rc = unmap_and_move(get_new_page, private, | 1011 | rc = unmap_and_move(get_new_page, private, |
| 1027 | page, pass > 2, offlining, | 1012 | page, pass > 2, mode); |
| 1028 | mode); | ||
| 1029 | 1013 | ||
| 1030 | switch(rc) { | 1014 | switch(rc) { |
| 1031 | case -ENOMEM: | 1015 | case -ENOMEM: |
| @@ -1058,15 +1042,13 @@ out: | |||
| 1058 | } | 1042 | } |
| 1059 | 1043 | ||
| 1060 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, | 1044 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, |
| 1061 | unsigned long private, bool offlining, | 1045 | unsigned long private, enum migrate_mode mode) |
| 1062 | enum migrate_mode mode) | ||
| 1063 | { | 1046 | { |
| 1064 | int pass, rc; | 1047 | int pass, rc; |
| 1065 | 1048 | ||
| 1066 | for (pass = 0; pass < 10; pass++) { | 1049 | for (pass = 0; pass < 10; pass++) { |
| 1067 | rc = unmap_and_move_huge_page(get_new_page, | 1050 | rc = unmap_and_move_huge_page(get_new_page, private, |
| 1068 | private, hpage, pass > 2, offlining, | 1051 | hpage, pass > 2, mode); |
| 1069 | mode); | ||
| 1070 | switch (rc) { | 1052 | switch (rc) { |
| 1071 | case -ENOMEM: | 1053 | case -ENOMEM: |
| 1072 | goto out; | 1054 | goto out; |
| @@ -1152,7 +1134,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
| 1152 | goto set_status; | 1134 | goto set_status; |
| 1153 | 1135 | ||
| 1154 | /* Use PageReserved to check for zero page */ | 1136 | /* Use PageReserved to check for zero page */ |
| 1155 | if (PageReserved(page) || PageKsm(page)) | 1137 | if (PageReserved(page)) |
| 1156 | goto put_and_set; | 1138 | goto put_and_set; |
| 1157 | 1139 | ||
| 1158 | pp->page = page; | 1140 | pp->page = page; |
| @@ -1189,8 +1171,7 @@ set_status: | |||
| 1189 | err = 0; | 1171 | err = 0; |
| 1190 | if (!list_empty(&pagelist)) { | 1172 | if (!list_empty(&pagelist)) { |
| 1191 | err = migrate_pages(&pagelist, new_page_node, | 1173 | err = migrate_pages(&pagelist, new_page_node, |
| 1192 | (unsigned long)pm, 0, MIGRATE_SYNC, | 1174 | (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); |
| 1193 | MR_SYSCALL); | ||
| 1194 | if (err) | 1175 | if (err) |
| 1195 | putback_lru_pages(&pagelist); | 1176 | putback_lru_pages(&pagelist); |
| 1196 | } | 1177 | } |
| @@ -1314,7 +1295,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, | |||
| 1314 | 1295 | ||
| 1315 | err = -ENOENT; | 1296 | err = -ENOENT; |
| 1316 | /* Use PageReserved to check for zero page */ | 1297 | /* Use PageReserved to check for zero page */ |
| 1317 | if (!page || PageReserved(page) || PageKsm(page)) | 1298 | if (!page || PageReserved(page)) |
| 1318 | goto set_status; | 1299 | goto set_status; |
| 1319 | 1300 | ||
| 1320 | err = page_to_nid(page); | 1301 | err = page_to_nid(page); |
| @@ -1461,7 +1442,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, | |||
| 1461 | * pages. Currently it only checks the watermarks, which is crude. | 1442 | * pages. Currently it only checks the watermarks, which is crude. |
| 1462 | */ | 1443 | */ |
| 1463 | static bool migrate_balanced_pgdat(struct pglist_data *pgdat, | 1444 | static bool migrate_balanced_pgdat(struct pglist_data *pgdat, |
| 1464 | int nr_migrate_pages) | 1445 | unsigned long nr_migrate_pages) |
| 1465 | { | 1446 | { |
| 1466 | int z; | 1447 | int z; |
| 1467 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { | 1448 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { |
| @@ -1497,7 +1478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, | |||
| 1497 | __GFP_NOWARN) & | 1478 | __GFP_NOWARN) & |
| 1498 | ~GFP_IOFS, 0); | 1479 | ~GFP_IOFS, 0); |
| 1499 | if (newpage) | 1480 | if (newpage) |
| 1500 | page_xchg_last_nid(newpage, page_last_nid(page)); | 1481 | page_nid_xchg_last(newpage, page_nid_last(page)); |
| 1501 | 1482 | ||
| 1502 | return newpage; | 1483 | return newpage; |
| 1503 | } | 1484 | } |
| @@ -1557,39 +1538,40 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | |||
| 1557 | 1538 | ||
| 1558 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | 1539 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) |
| 1559 | { | 1540 | { |
| 1560 | int ret = 0; | 1541 | int page_lru; |
| 1542 | |||
| 1543 | VM_BUG_ON(compound_order(page) && !PageTransHuge(page)); | ||
| 1561 | 1544 | ||
| 1562 | /* Avoid migrating to a node that is nearly full */ | 1545 | /* Avoid migrating to a node that is nearly full */ |
| 1563 | if (migrate_balanced_pgdat(pgdat, 1)) { | 1546 | if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) |
| 1564 | int page_lru; | 1547 | return 0; |
| 1565 | 1548 | ||
| 1566 | if (isolate_lru_page(page)) { | 1549 | if (isolate_lru_page(page)) |
| 1567 | put_page(page); | 1550 | return 0; |
| 1568 | return 0; | ||
| 1569 | } | ||
| 1570 | 1551 | ||
| 1571 | /* Page is isolated */ | 1552 | /* |
| 1572 | ret = 1; | 1553 | * migrate_misplaced_transhuge_page() skips page migration's usual |
| 1573 | page_lru = page_is_file_cache(page); | 1554 | * check on page_count(), so we must do it here, now that the page |
| 1574 | if (!PageTransHuge(page)) | 1555 | * has been isolated: a GUP pin, or any other pin, prevents migration. |
| 1575 | inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); | 1556 | * The expected page count is 3: 1 for the page's mapcount, 1 for the |
| 1576 | else | 1557 | * caller's pin, and 1 for the reference taken by isolate_lru_page(). |
| 1577 | mod_zone_page_state(page_zone(page), | 1558 | */ |
| 1578 | NR_ISOLATED_ANON + page_lru, | 1559 | if (PageTransHuge(page) && page_count(page) != 3) { |
| 1579 | HPAGE_PMD_NR); | 1560 | putback_lru_page(page); |
| 1561 | return 0; | ||
| 1580 | } | 1562 | } |
| 1581 | 1563 | ||
| 1564 | page_lru = page_is_file_cache(page); | ||
| 1565 | mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, | ||
| 1566 | hpage_nr_pages(page)); | ||
| 1567 | |||
| 1582 | /* | 1568 | /* |
| 1583 | * Page is either isolated or there is not enough space on the target | 1569 | * Isolating the page has taken another reference, so the |
| 1584 | * node. If isolated, then it has taken a reference count and the | 1570 | * caller's reference can be safely dropped without the page |
| 1585 | * callers reference can be safely dropped without the page | 1571 | * disappearing underneath us during migration. |
| 1586 | * disappearing underneath us during migration. Otherwise the page is | ||
| 1587 | * not to be migrated but the callers reference should still be | ||
| 1588 | * dropped so it does not leak. | ||
| 1589 | */ | 1572 | */ |
| 1590 | put_page(page); | 1573 | put_page(page); |
| 1591 | 1574 | return 1; | |
| 1592 | return ret; | ||
| 1593 | } | 1575 | } |
| 1594 | 1576 | ||
| 1595 | /* | 1577 | /* |
| @@ -1600,7 +1582,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | |||
| 1600 | int migrate_misplaced_page(struct page *page, int node) | 1582 | int migrate_misplaced_page(struct page *page, int node) |
| 1601 | { | 1583 | { |
| 1602 | pg_data_t *pgdat = NODE_DATA(node); | 1584 | pg_data_t *pgdat = NODE_DATA(node); |
| 1603 | int isolated = 0; | 1585 | int isolated; |
| 1604 | int nr_remaining; | 1586 | int nr_remaining; |
| 1605 | LIST_HEAD(migratepages); | 1587 | LIST_HEAD(migratepages); |
| 1606 | 1588 | ||
| @@ -1608,42 +1590,43 @@ int migrate_misplaced_page(struct page *page, int node) | |||
| 1608 | * Don't migrate pages that are mapped in multiple processes. | 1590 | * Don't migrate pages that are mapped in multiple processes. |
| 1609 | * TODO: Handle false sharing detection instead of this hammer | 1591 | * TODO: Handle false sharing detection instead of this hammer |
| 1610 | */ | 1592 | */ |
| 1611 | if (page_mapcount(page) != 1) { | 1593 | if (page_mapcount(page) != 1) |
| 1612 | put_page(page); | ||
| 1613 | goto out; | 1594 | goto out; |
| 1614 | } | ||
| 1615 | 1595 | ||
| 1616 | /* | 1596 | /* |
| 1617 | * Rate-limit the amount of data that is being migrated to a node. | 1597 | * Rate-limit the amount of data that is being migrated to a node. |
| 1618 | * Optimal placement is no good if the memory bus is saturated and | 1598 | * Optimal placement is no good if the memory bus is saturated and |
| 1619 | * all the time is being spent migrating! | 1599 | * all the time is being spent migrating! |
| 1620 | */ | 1600 | */ |
| 1621 | if (numamigrate_update_ratelimit(pgdat, 1)) { | 1601 | if (numamigrate_update_ratelimit(pgdat, 1)) |
| 1622 | put_page(page); | ||
| 1623 | goto out; | 1602 | goto out; |
| 1624 | } | ||
| 1625 | 1603 | ||
| 1626 | isolated = numamigrate_isolate_page(pgdat, page); | 1604 | isolated = numamigrate_isolate_page(pgdat, page); |
| 1627 | if (!isolated) | 1605 | if (!isolated) |
| 1628 | goto out; | 1606 | goto out; |
| 1629 | 1607 | ||
| 1630 | list_add(&page->lru, &migratepages); | 1608 | list_add(&page->lru, &migratepages); |
| 1631 | nr_remaining = migrate_pages(&migratepages, | 1609 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, |
| 1632 | alloc_misplaced_dst_page, | 1610 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); |
| 1633 | node, false, MIGRATE_ASYNC, | ||
| 1634 | MR_NUMA_MISPLACED); | ||
| 1635 | if (nr_remaining) { | 1611 | if (nr_remaining) { |
| 1636 | putback_lru_pages(&migratepages); | 1612 | putback_lru_pages(&migratepages); |
| 1637 | isolated = 0; | 1613 | isolated = 0; |
| 1638 | } else | 1614 | } else |
| 1639 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | 1615 | count_vm_numa_event(NUMA_PAGE_MIGRATE); |
| 1640 | BUG_ON(!list_empty(&migratepages)); | 1616 | BUG_ON(!list_empty(&migratepages)); |
| 1641 | out: | ||
| 1642 | return isolated; | 1617 | return isolated; |
| 1618 | |||
| 1619 | out: | ||
| 1620 | put_page(page); | ||
| 1621 | return 0; | ||
| 1643 | } | 1622 | } |
| 1644 | #endif /* CONFIG_NUMA_BALANCING */ | 1623 | #endif /* CONFIG_NUMA_BALANCING */ |
| 1645 | 1624 | ||
| 1646 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | 1625 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) |
| 1626 | /* | ||
| 1627 | * Migrates a THP to a given target node. page must be locked and is unlocked | ||
| 1628 | * before returning. | ||
| 1629 | */ | ||
| 1647 | int migrate_misplaced_transhuge_page(struct mm_struct *mm, | 1630 | int migrate_misplaced_transhuge_page(struct mm_struct *mm, |
| 1648 | struct vm_area_struct *vma, | 1631 | struct vm_area_struct *vma, |
| 1649 | pmd_t *pmd, pmd_t entry, | 1632 | pmd_t *pmd, pmd_t entry, |
| @@ -1674,29 +1657,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
| 1674 | 1657 | ||
| 1675 | new_page = alloc_pages_node(node, | 1658 | new_page = alloc_pages_node(node, |
| 1676 | (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); | 1659 | (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); |
| 1677 | if (!new_page) { | 1660 | if (!new_page) |
| 1678 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | 1661 | goto out_fail; |
| 1679 | goto out_dropref; | ||
| 1680 | } | ||
| 1681 | page_xchg_last_nid(new_page, page_last_nid(page)); | ||
| 1682 | 1662 | ||
| 1683 | isolated = numamigrate_isolate_page(pgdat, page); | 1663 | page_nid_xchg_last(new_page, page_nid_last(page)); |
| 1684 | 1664 | ||
| 1685 | /* | 1665 | isolated = numamigrate_isolate_page(pgdat, page); |
| 1686 | * Failing to isolate or a GUP pin prevents migration. The expected | 1666 | if (!isolated) { |
| 1687 | * page count is 2. 1 for anonymous pages without a mapping and 1 | ||
| 1688 | * for the callers pin. If the page was isolated, the page will | ||
| 1689 | * need to be put back on the LRU. | ||
| 1690 | */ | ||
| 1691 | if (!isolated || page_count(page) != 2) { | ||
| 1692 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
| 1693 | put_page(new_page); | 1667 | put_page(new_page); |
| 1694 | if (isolated) { | 1668 | goto out_fail; |
| 1695 | putback_lru_page(page); | ||
| 1696 | isolated = 0; | ||
| 1697 | goto out; | ||
| 1698 | } | ||
| 1699 | goto out_keep_locked; | ||
| 1700 | } | 1669 | } |
| 1701 | 1670 | ||
| 1702 | /* Prepare a page as a migration target */ | 1671 | /* Prepare a page as a migration target */ |
| @@ -1728,6 +1697,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
| 1728 | putback_lru_page(page); | 1697 | putback_lru_page(page); |
| 1729 | 1698 | ||
| 1730 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | 1699 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); |
| 1700 | isolated = 0; | ||
| 1731 | goto out; | 1701 | goto out; |
| 1732 | } | 1702 | } |
| 1733 | 1703 | ||
| @@ -1772,9 +1742,11 @@ out: | |||
| 1772 | -HPAGE_PMD_NR); | 1742 | -HPAGE_PMD_NR); |
| 1773 | return isolated; | 1743 | return isolated; |
| 1774 | 1744 | ||
| 1745 | out_fail: | ||
| 1746 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
| 1775 | out_dropref: | 1747 | out_dropref: |
| 1748 | unlock_page(page); | ||
| 1776 | put_page(page); | 1749 | put_page(page); |
| 1777 | out_keep_locked: | ||
| 1778 | return 0; | 1750 | return 0; |
| 1779 | } | 1751 | } |
| 1780 | #endif /* CONFIG_NUMA_BALANCING */ | 1752 | #endif /* CONFIG_NUMA_BALANCING */ |
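Note: the migrate.c hunks above drop the old "offlining" argument, so migrate_pages() now takes five arguments: the page list, an allocation callback, an opaque private value, the migrate mode and a reason code. A minimal caller sketch of the new interface, modelled on the do_move_page_to_node_array() call shown above; the callback name, the allocator helper and the target-node handling are illustrative assumptions, not part of the patch.

/* Illustrative allocation callback matching the new_page_t convention. */
static struct page *new_node_page(struct page *page, unsigned long private,
				  int **result)
{
	int nid = (int)private;		/* target node chosen by the caller */

	return alloc_pages_exact_node(nid,
				      GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}

static int move_list_to_node(struct list_head *pagelist, int nid)
{
	int err = 0;

	if (!list_empty(pagelist)) {
		/* No "offlining" argument any more: mode and reason follow. */
		err = migrate_pages(pagelist, new_node_page, (unsigned long)nid,
				    MIGRATE_SYNC, MR_SYSCALL);
		if (err)
			putback_lru_pages(pagelist);
	}
	return err;
}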
diff --git a/mm/mincore.c b/mm/mincore.c index 936b4cee8cb1..da2be56a7b8f 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
| @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
| 75 | /* shmem/tmpfs may return swap: account for swapcache page too. */ | 75 | /* shmem/tmpfs may return swap: account for swapcache page too. */ |
| 76 | if (radix_tree_exceptional_entry(page)) { | 76 | if (radix_tree_exceptional_entry(page)) { |
| 77 | swp_entry_t swap = radix_to_swp_entry(page); | 77 | swp_entry_t swap = radix_to_swp_entry(page); |
| 78 | page = find_get_page(&swapper_space, swap.val); | 78 | page = find_get_page(swap_address_space(swap), swap.val); |
| 79 | } | 79 | } |
| 80 | #endif | 80 | #endif |
| 81 | if (page) { | 81 | if (page) { |
| @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 135 | } else { | 135 | } else { |
| 136 | #ifdef CONFIG_SWAP | 136 | #ifdef CONFIG_SWAP |
| 137 | pgoff = entry.val; | 137 | pgoff = entry.val; |
| 138 | *vec = mincore_page(&swapper_space, pgoff); | 138 | *vec = mincore_page(swap_address_space(entry), |
| 139 | pgoff); | ||
| 139 | #else | 140 | #else |
| 140 | WARN_ON(1); | 141 | WARN_ON(1); |
| 141 | *vec = 1; | 142 | *vec = 1; |
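Note: the mincore.c hunks switch from the single global swapper_space to swap_address_space(entry), i.e. the address space is now looked up per swap entry. A sketch of the assumed shape of that helper, inferred from the swp_type() encoding rather than quoted from this diff:

/* Assumed shape only: one swap-cache address_space per swap type. */
extern struct address_space swapper_spaces[MAX_SWAPFILES];

#define swap_address_space(entry) \
	(&swapper_spaces[swp_type(entry)])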
diff --git a/mm/mlock.c b/mm/mlock.c index c9bd528b01d2..e6638f565d42 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
| @@ -155,13 +155,12 @@ void munlock_vma_page(struct page *page) | |||
| 155 | * | 155 | * |
| 156 | * vma->vm_mm->mmap_sem must be held for at least read. | 156 | * vma->vm_mm->mmap_sem must be held for at least read. |
| 157 | */ | 157 | */ |
| 158 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | 158 | long __mlock_vma_pages_range(struct vm_area_struct *vma, |
| 159 | unsigned long start, unsigned long end, | 159 | unsigned long start, unsigned long end, int *nonblocking) |
| 160 | int *nonblocking) | ||
| 161 | { | 160 | { |
| 162 | struct mm_struct *mm = vma->vm_mm; | 161 | struct mm_struct *mm = vma->vm_mm; |
| 163 | unsigned long addr = start; | 162 | unsigned long addr = start; |
| 164 | int nr_pages = (end - start) / PAGE_SIZE; | 163 | unsigned long nr_pages = (end - start) / PAGE_SIZE; |
| 165 | int gup_flags; | 164 | int gup_flags; |
| 166 | 165 | ||
| 167 | VM_BUG_ON(start & ~PAGE_MASK); | 166 | VM_BUG_ON(start & ~PAGE_MASK); |
| @@ -186,6 +185,10 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
| 186 | if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) | 185 | if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) |
| 187 | gup_flags |= FOLL_FORCE; | 186 | gup_flags |= FOLL_FORCE; |
| 188 | 187 | ||
| 188 | /* | ||
| 189 | * We made sure addr is within a VMA, so the following will | ||
| 190 | * not result in a stack expansion that recurses back here. | ||
| 191 | */ | ||
| 189 | return __get_user_pages(current, mm, addr, nr_pages, gup_flags, | 192 | return __get_user_pages(current, mm, addr, nr_pages, gup_flags, |
| 190 | NULL, NULL, nonblocking); | 193 | NULL, NULL, nonblocking); |
| 191 | } | 194 | } |
| @@ -202,56 +205,6 @@ static int __mlock_posix_error_return(long retval) | |||
| 202 | return retval; | 205 | return retval; |
| 203 | } | 206 | } |
| 204 | 207 | ||
| 205 | /** | ||
| 206 | * mlock_vma_pages_range() - mlock pages in specified vma range. | ||
| 207 | * @vma - the vma containing the specfied address range | ||
| 208 | * @start - starting address in @vma to mlock | ||
| 209 | * @end - end address [+1] in @vma to mlock | ||
| 210 | * | ||
| 211 | * For mmap()/mremap()/expansion of mlocked vma. | ||
| 212 | * | ||
| 213 | * return 0 on success for "normal" vmas. | ||
| 214 | * | ||
| 215 | * return number of pages [> 0] to be removed from locked_vm on success | ||
| 216 | * of "special" vmas. | ||
| 217 | */ | ||
| 218 | long mlock_vma_pages_range(struct vm_area_struct *vma, | ||
| 219 | unsigned long start, unsigned long end) | ||
| 220 | { | ||
| 221 | int nr_pages = (end - start) / PAGE_SIZE; | ||
| 222 | BUG_ON(!(vma->vm_flags & VM_LOCKED)); | ||
| 223 | |||
| 224 | /* | ||
| 225 | * filter unlockable vmas | ||
| 226 | */ | ||
| 227 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | ||
| 228 | goto no_mlock; | ||
| 229 | |||
| 230 | if (!((vma->vm_flags & VM_DONTEXPAND) || | ||
| 231 | is_vm_hugetlb_page(vma) || | ||
| 232 | vma == get_gate_vma(current->mm))) { | ||
| 233 | |||
| 234 | __mlock_vma_pages_range(vma, start, end, NULL); | ||
| 235 | |||
| 236 | /* Hide errors from mmap() and other callers */ | ||
| 237 | return 0; | ||
| 238 | } | ||
| 239 | |||
| 240 | /* | ||
| 241 | * User mapped kernel pages or huge pages: | ||
| 242 | * make these pages present to populate the ptes, but | ||
| 243 | * fall thru' to reset VM_LOCKED--no need to unlock, and | ||
| 244 | * return nr_pages so these don't get counted against task's | ||
| 245 | * locked limit. huge pages are already counted against | ||
| 246 | * locked vm limit. | ||
| 247 | */ | ||
| 248 | make_pages_present(start, end); | ||
| 249 | |||
| 250 | no_mlock: | ||
| 251 | vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ | ||
| 252 | return nr_pages; /* error or pages NOT mlocked */ | ||
| 253 | } | ||
| 254 | |||
| 255 | /* | 208 | /* |
| 256 | * munlock_vma_pages_range() - munlock all pages in the vma range. | 209 | * munlock_vma_pages_range() - munlock all pages in the vma range. |
| 257 | * @vma - vma containing range to be munlock()ed. | 210 | * @vma - vma containing range to be munlock()ed. |
| @@ -303,7 +256,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
| 303 | * | 256 | * |
| 304 | * Filters out "special" vmas -- VM_LOCKED never gets set for these, and | 257 | * Filters out "special" vmas -- VM_LOCKED never gets set for these, and |
| 305 | * munlock is a no-op. However, for some special vmas, we go ahead and | 258 | * munlock is a no-op. However, for some special vmas, we go ahead and |
| 306 | * populate the ptes via make_pages_present(). | 259 | * populate the ptes. |
| 307 | * | 260 | * |
| 308 | * For vmas that pass the filters, merge/split as appropriate. | 261 | * For vmas that pass the filters, merge/split as appropriate. |
| 309 | */ | 262 | */ |
| @@ -391,9 +344,9 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
| 391 | 344 | ||
| 392 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | 345 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ |
| 393 | 346 | ||
| 394 | newflags = vma->vm_flags | VM_LOCKED; | 347 | newflags = vma->vm_flags & ~VM_LOCKED; |
| 395 | if (!on) | 348 | if (on) |
| 396 | newflags &= ~VM_LOCKED; | 349 | newflags |= VM_LOCKED | VM_POPULATE; |
| 397 | 350 | ||
| 398 | tmp = vma->vm_end; | 351 | tmp = vma->vm_end; |
| 399 | if (tmp > end) | 352 | if (tmp > end) |
| @@ -416,13 +369,20 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
| 416 | return error; | 369 | return error; |
| 417 | } | 370 | } |
| 418 | 371 | ||
| 419 | static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) | 372 | /* |
| 373 | * __mm_populate - populate and/or mlock pages within a range of address space. | ||
| 374 | * | ||
| 375 | * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap | ||
| 376 | * flags. VMAs must be already marked with the desired vm_flags, and | ||
| 377 | * mmap_sem must not be held. | ||
| 378 | */ | ||
| 379 | int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) | ||
| 420 | { | 380 | { |
| 421 | struct mm_struct *mm = current->mm; | 381 | struct mm_struct *mm = current->mm; |
| 422 | unsigned long end, nstart, nend; | 382 | unsigned long end, nstart, nend; |
| 423 | struct vm_area_struct *vma = NULL; | 383 | struct vm_area_struct *vma = NULL; |
| 424 | int locked = 0; | 384 | int locked = 0; |
| 425 | int ret = 0; | 385 | long ret = 0; |
| 426 | 386 | ||
| 427 | VM_BUG_ON(start & ~PAGE_MASK); | 387 | VM_BUG_ON(start & ~PAGE_MASK); |
| 428 | VM_BUG_ON(len != PAGE_ALIGN(len)); | 388 | VM_BUG_ON(len != PAGE_ALIGN(len)); |
| @@ -446,7 +406,8 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) | |||
| 446 | * range with the first VMA. Also, skip undesirable VMA types. | 406 | * range with the first VMA. Also, skip undesirable VMA types. |
| 447 | */ | 407 | */ |
| 448 | nend = min(end, vma->vm_end); | 408 | nend = min(end, vma->vm_end); |
| 449 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | 409 | if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) != |
| 410 | VM_POPULATE) | ||
| 450 | continue; | 411 | continue; |
| 451 | if (nstart < vma->vm_start) | 412 | if (nstart < vma->vm_start) |
| 452 | nstart = vma->vm_start; | 413 | nstart = vma->vm_start; |
| @@ -498,7 +459,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
| 498 | error = do_mlock(start, len, 1); | 459 | error = do_mlock(start, len, 1); |
| 499 | up_write(¤t->mm->mmap_sem); | 460 | up_write(¤t->mm->mmap_sem); |
| 500 | if (!error) | 461 | if (!error) |
| 501 | error = do_mlock_pages(start, len, 0); | 462 | error = __mm_populate(start, len, 0); |
| 502 | return error; | 463 | return error; |
| 503 | } | 464 | } |
| 504 | 465 | ||
| @@ -519,18 +480,18 @@ static int do_mlockall(int flags) | |||
| 519 | struct vm_area_struct * vma, * prev = NULL; | 480 | struct vm_area_struct * vma, * prev = NULL; |
| 520 | 481 | ||
| 521 | if (flags & MCL_FUTURE) | 482 | if (flags & MCL_FUTURE) |
| 522 | current->mm->def_flags |= VM_LOCKED; | 483 | current->mm->def_flags |= VM_LOCKED | VM_POPULATE; |
| 523 | else | 484 | else |
| 524 | current->mm->def_flags &= ~VM_LOCKED; | 485 | current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE); |
| 525 | if (flags == MCL_FUTURE) | 486 | if (flags == MCL_FUTURE) |
| 526 | goto out; | 487 | goto out; |
| 527 | 488 | ||
| 528 | for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { | 489 | for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { |
| 529 | vm_flags_t newflags; | 490 | vm_flags_t newflags; |
| 530 | 491 | ||
| 531 | newflags = vma->vm_flags | VM_LOCKED; | 492 | newflags = vma->vm_flags & ~VM_LOCKED; |
| 532 | if (!(flags & MCL_CURRENT)) | 493 | if (flags & MCL_CURRENT) |
| 533 | newflags &= ~VM_LOCKED; | 494 | newflags |= VM_LOCKED | VM_POPULATE; |
| 534 | 495 | ||
| 535 | /* Ignore errors */ | 496 | /* Ignore errors */ |
| 536 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); | 497 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); |
| @@ -564,10 +525,8 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
| 564 | capable(CAP_IPC_LOCK)) | 525 | capable(CAP_IPC_LOCK)) |
| 565 | ret = do_mlockall(flags); | 526 | ret = do_mlockall(flags); |
| 566 | up_write(¤t->mm->mmap_sem); | 527 | up_write(¤t->mm->mmap_sem); |
| 567 | if (!ret && (flags & MCL_CURRENT)) { | 528 | if (!ret && (flags & MCL_CURRENT)) |
| 568 | /* Ignore errors */ | 529 | mm_populate(0, TASK_SIZE); |
| 569 | do_mlock_pages(0, TASK_SIZE, 1); | ||
| 570 | } | ||
| 571 | out: | 530 | out: |
| 572 | return ret; | 531 | return ret; |
| 573 | } | 532 | } |
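Note: the mlock.c changes above separate flag setting from page faulting: do_mlock()/do_mlockall() mark the VMAs VM_LOCKED|VM_POPULATE under mmap_sem, and __mm_populate()/mm_populate() fault the pages in only after the semaphore is dropped. A compressed sketch of the resulting flow, using only functions visible in the hunks above (alignment and rlimit checks omitted); it is a sketch, not a literal copy of sys_mlock():

/* Sketch of the post-patch flow. */
static long mlock_sketch(unsigned long start, size_t len)
{
	int error;

	down_write(&current->mm->mmap_sem);
	error = do_mlock(start, len, 1);	/* sets VM_LOCKED | VM_POPULATE */
	up_write(&current->mm->mmap_sem);

	if (!error)		/* fault the pages in without mmap_sem held */
		error = __mm_populate(start, len, 0);
	return error;
}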
diff --git a/mm/mm_init.c b/mm/mm_init.c index 1ffd97ae26d7..c280a02ea11e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
| @@ -69,34 +69,41 @@ void __init mminit_verify_pageflags_layout(void) | |||
| 69 | unsigned long or_mask, add_mask; | 69 | unsigned long or_mask, add_mask; |
| 70 | 70 | ||
| 71 | shift = 8 * sizeof(unsigned long); | 71 | shift = 8 * sizeof(unsigned long); |
| 72 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; | 72 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; |
| 73 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", | 73 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", |
| 74 | "Section %d Node %d Zone %d Flags %d\n", | 74 | "Section %d Node %d Zone %d Lastnid %d Flags %d\n", |
| 75 | SECTIONS_WIDTH, | 75 | SECTIONS_WIDTH, |
| 76 | NODES_WIDTH, | 76 | NODES_WIDTH, |
| 77 | ZONES_WIDTH, | 77 | ZONES_WIDTH, |
| 78 | LAST_NID_WIDTH, | ||
| 78 | NR_PAGEFLAGS); | 79 | NR_PAGEFLAGS); |
| 79 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", | 80 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", |
| 80 | "Section %d Node %d Zone %d\n", | 81 | "Section %d Node %d Zone %d Lastnid %d\n", |
| 81 | SECTIONS_SHIFT, | 82 | SECTIONS_SHIFT, |
| 82 | NODES_SHIFT, | 83 | NODES_SHIFT, |
| 83 | ZONES_SHIFT); | 84 | ZONES_SHIFT, |
| 84 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", | 85 | LAST_NID_SHIFT); |
| 85 | "Section %lu Node %lu Zone %lu\n", | 86 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", |
| 87 | "Section %lu Node %lu Zone %lu Lastnid %lu\n", | ||
| 86 | (unsigned long)SECTIONS_PGSHIFT, | 88 | (unsigned long)SECTIONS_PGSHIFT, |
| 87 | (unsigned long)NODES_PGSHIFT, | 89 | (unsigned long)NODES_PGSHIFT, |
| 88 | (unsigned long)ZONES_PGSHIFT); | 90 | (unsigned long)ZONES_PGSHIFT, |
| 89 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", | 91 | (unsigned long)LAST_NID_PGSHIFT); |
| 90 | "Zone ID: %lu -> %lu\n", | 92 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", |
| 91 | (unsigned long)ZONEID_PGOFF, | 93 | "Node/Zone ID: %lu -> %lu\n", |
| 92 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); | 94 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), |
| 95 | (unsigned long)ZONEID_PGOFF); | ||
| 93 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", | 96 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", |
| 94 | "location: %d -> %d unused %d -> %d flags %d -> %d\n", | 97 | "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n", |
| 95 | shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); | 98 | shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); |
| 96 | #ifdef NODE_NOT_IN_PAGE_FLAGS | 99 | #ifdef NODE_NOT_IN_PAGE_FLAGS |
| 97 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", | 100 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", |
| 98 | "Node not in page flags"); | 101 | "Node not in page flags"); |
| 99 | #endif | 102 | #endif |
| 103 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS | ||
| 104 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", | ||
| 105 | "Last nid not in page flags"); | ||
| 106 | #endif | ||
| 100 | 107 | ||
| 101 | if (SECTIONS_WIDTH) { | 108 | if (SECTIONS_WIDTH) { |
| 102 | shift -= SECTIONS_WIDTH; | 109 | shift -= SECTIONS_WIDTH; |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -144,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
| 144 | */ | 144 | */ |
| 145 | free -= global_page_state(NR_SHMEM); | 145 | free -= global_page_state(NR_SHMEM); |
| 146 | 146 | ||
| 147 | free += nr_swap_pages; | 147 | free += get_nr_swap_pages(); |
| 148 | 148 | ||
| 149 | /* | 149 | /* |
| 150 | * Any slabs which are created with the | 150 | * Any slabs which are created with the |
| @@ -256,6 +256,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
| 256 | unsigned long newbrk, oldbrk; | 256 | unsigned long newbrk, oldbrk; |
| 257 | struct mm_struct *mm = current->mm; | 257 | struct mm_struct *mm = current->mm; |
| 258 | unsigned long min_brk; | 258 | unsigned long min_brk; |
| 259 | bool populate; | ||
| 259 | 260 | ||
| 260 | down_write(&mm->mmap_sem); | 261 | down_write(&mm->mmap_sem); |
| 261 | 262 | ||
| @@ -305,8 +306,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
| 305 | /* Ok, looks good - let it rip. */ | 306 | /* Ok, looks good - let it rip. */ |
| 306 | if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) | 307 | if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) |
| 307 | goto out; | 308 | goto out; |
| 309 | |||
| 308 | set_brk: | 310 | set_brk: |
| 309 | mm->brk = brk; | 311 | mm->brk = brk; |
| 312 | populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; | ||
| 313 | up_write(&mm->mmap_sem); | ||
| 314 | if (populate) | ||
| 315 | mm_populate(oldbrk, newbrk - oldbrk); | ||
| 316 | return brk; | ||
| 317 | |||
| 310 | out: | 318 | out: |
| 311 | retval = mm->brk; | 319 | retval = mm->brk; |
| 312 | up_write(&mm->mmap_sem); | 320 | up_write(&mm->mmap_sem); |
| @@ -801,7 +809,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 801 | anon_vma_interval_tree_post_update_vma(vma); | 809 | anon_vma_interval_tree_post_update_vma(vma); |
| 802 | if (adjust_next) | 810 | if (adjust_next) |
| 803 | anon_vma_interval_tree_post_update_vma(next); | 811 | anon_vma_interval_tree_post_update_vma(next); |
| 804 | anon_vma_unlock(anon_vma); | 812 | anon_vma_unlock_write(anon_vma); |
| 805 | } | 813 | } |
| 806 | if (mapping) | 814 | if (mapping) |
| 807 | mutex_unlock(&mapping->i_mmap_mutex); | 815 | mutex_unlock(&mapping->i_mmap_mutex); |
| @@ -1154,12 +1162,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint) | |||
| 1154 | 1162 | ||
| 1155 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | 1163 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
| 1156 | unsigned long len, unsigned long prot, | 1164 | unsigned long len, unsigned long prot, |
| 1157 | unsigned long flags, unsigned long pgoff) | 1165 | unsigned long flags, unsigned long pgoff, |
| 1166 | unsigned long *populate) | ||
| 1158 | { | 1167 | { |
| 1159 | struct mm_struct * mm = current->mm; | 1168 | struct mm_struct * mm = current->mm; |
| 1160 | struct inode *inode; | 1169 | struct inode *inode; |
| 1161 | vm_flags_t vm_flags; | 1170 | vm_flags_t vm_flags; |
| 1162 | 1171 | ||
| 1172 | *populate = 0; | ||
| 1173 | |||
| 1163 | /* | 1174 | /* |
| 1164 | * Does the application expect PROT_READ to imply PROT_EXEC? | 1175 | * Does the application expect PROT_READ to imply PROT_EXEC? |
| 1165 | * | 1176 | * |
| @@ -1280,7 +1291,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
| 1280 | } | 1291 | } |
| 1281 | } | 1292 | } |
| 1282 | 1293 | ||
| 1283 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); | 1294 | /* |
| 1295 | * Set 'VM_NORESERVE' if we should not account for the | ||
| 1296 | * memory use of this mapping. | ||
| 1297 | */ | ||
| 1298 | if (flags & MAP_NORESERVE) { | ||
| 1299 | /* We honor MAP_NORESERVE if allowed to overcommit */ | ||
| 1300 | if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) | ||
| 1301 | vm_flags |= VM_NORESERVE; | ||
| 1302 | |||
| 1303 | /* hugetlb applies strict overcommit unless MAP_NORESERVE */ | ||
| 1304 | if (file && is_file_hugepages(file)) | ||
| 1305 | vm_flags |= VM_NORESERVE; | ||
| 1306 | } | ||
| 1307 | |||
| 1308 | addr = mmap_region(file, addr, len, vm_flags, pgoff); | ||
| 1309 | if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE)) | ||
| 1310 | *populate = len; | ||
| 1311 | return addr; | ||
| 1284 | } | 1312 | } |
| 1285 | 1313 | ||
| 1286 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | 1314 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, |
| @@ -1395,8 +1423,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) | |||
| 1395 | } | 1423 | } |
| 1396 | 1424 | ||
| 1397 | unsigned long mmap_region(struct file *file, unsigned long addr, | 1425 | unsigned long mmap_region(struct file *file, unsigned long addr, |
| 1398 | unsigned long len, unsigned long flags, | 1426 | unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) |
| 1399 | vm_flags_t vm_flags, unsigned long pgoff) | ||
| 1400 | { | 1427 | { |
| 1401 | struct mm_struct *mm = current->mm; | 1428 | struct mm_struct *mm = current->mm; |
| 1402 | struct vm_area_struct *vma, *prev; | 1429 | struct vm_area_struct *vma, *prev; |
| @@ -1420,20 +1447,6 @@ munmap_back: | |||
| 1420 | return -ENOMEM; | 1447 | return -ENOMEM; |
| 1421 | 1448 | ||
| 1422 | /* | 1449 | /* |
| 1423 | * Set 'VM_NORESERVE' if we should not account for the | ||
| 1424 | * memory use of this mapping. | ||
| 1425 | */ | ||
| 1426 | if ((flags & MAP_NORESERVE)) { | ||
| 1427 | /* We honor MAP_NORESERVE if allowed to overcommit */ | ||
| 1428 | if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) | ||
| 1429 | vm_flags |= VM_NORESERVE; | ||
| 1430 | |||
| 1431 | /* hugetlb applies strict overcommit unless MAP_NORESERVE */ | ||
| 1432 | if (file && is_file_hugepages(file)) | ||
| 1433 | vm_flags |= VM_NORESERVE; | ||
| 1434 | } | ||
| 1435 | |||
| 1436 | /* | ||
| 1437 | * Private writable mapping: check memory availability | 1450 | * Private writable mapping: check memory availability |
| 1438 | */ | 1451 | */ |
| 1439 | if (accountable_mapping(file, vm_flags)) { | 1452 | if (accountable_mapping(file, vm_flags)) { |
| @@ -1531,10 +1544,12 @@ out: | |||
| 1531 | 1544 | ||
| 1532 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1545 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
| 1533 | if (vm_flags & VM_LOCKED) { | 1546 | if (vm_flags & VM_LOCKED) { |
| 1534 | if (!mlock_vma_pages_range(vma, addr, addr + len)) | 1547 | if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || |
| 1548 | vma == get_gate_vma(current->mm))) | ||
| 1535 | mm->locked_vm += (len >> PAGE_SHIFT); | 1549 | mm->locked_vm += (len >> PAGE_SHIFT); |
| 1536 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1550 | else |
| 1537 | make_pages_present(addr, addr + len); | 1551 | vma->vm_flags &= ~VM_LOCKED; |
| 1552 | } | ||
| 1538 | 1553 | ||
| 1539 | if (file) | 1554 | if (file) |
| 1540 | uprobe_mmap(vma); | 1555 | uprobe_mmap(vma); |
| @@ -2187,9 +2202,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
| 2187 | return vma; | 2202 | return vma; |
| 2188 | if (!prev || expand_stack(prev, addr)) | 2203 | if (!prev || expand_stack(prev, addr)) |
| 2189 | return NULL; | 2204 | return NULL; |
| 2190 | if (prev->vm_flags & VM_LOCKED) { | 2205 | if (prev->vm_flags & VM_LOCKED) |
| 2191 | mlock_vma_pages_range(prev, addr, prev->vm_end); | 2206 | __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); |
| 2192 | } | ||
| 2193 | return prev; | 2207 | return prev; |
| 2194 | } | 2208 | } |
| 2195 | #else | 2209 | #else |
| @@ -2215,9 +2229,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
| 2215 | start = vma->vm_start; | 2229 | start = vma->vm_start; |
| 2216 | if (expand_stack(vma, addr)) | 2230 | if (expand_stack(vma, addr)) |
| 2217 | return NULL; | 2231 | return NULL; |
| 2218 | if (vma->vm_flags & VM_LOCKED) { | 2232 | if (vma->vm_flags & VM_LOCKED) |
| 2219 | mlock_vma_pages_range(vma, addr, start); | 2233 | __mlock_vma_pages_range(vma, addr, start, NULL); |
| 2220 | } | ||
| 2221 | return vma; | 2234 | return vma; |
| 2222 | } | 2235 | } |
| 2223 | #endif | 2236 | #endif |
| @@ -2590,10 +2603,8 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 2590 | out: | 2603 | out: |
| 2591 | perf_event_mmap(vma); | 2604 | perf_event_mmap(vma); |
| 2592 | mm->total_vm += len >> PAGE_SHIFT; | 2605 | mm->total_vm += len >> PAGE_SHIFT; |
| 2593 | if (flags & VM_LOCKED) { | 2606 | if (flags & VM_LOCKED) |
| 2594 | if (!mlock_vma_pages_range(vma, addr, addr + len)) | 2607 | mm->locked_vm += (len >> PAGE_SHIFT); |
| 2595 | mm->locked_vm += (len >> PAGE_SHIFT); | ||
| 2596 | } | ||
| 2597 | return addr; | 2608 | return addr; |
| 2598 | } | 2609 | } |
| 2599 | 2610 | ||
| @@ -2601,10 +2612,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) | |||
| 2601 | { | 2612 | { |
| 2602 | struct mm_struct *mm = current->mm; | 2613 | struct mm_struct *mm = current->mm; |
| 2603 | unsigned long ret; | 2614 | unsigned long ret; |
| 2615 | bool populate; | ||
| 2604 | 2616 | ||
| 2605 | down_write(&mm->mmap_sem); | 2617 | down_write(&mm->mmap_sem); |
| 2606 | ret = do_brk(addr, len); | 2618 | ret = do_brk(addr, len); |
| 2619 | populate = ((mm->def_flags & VM_LOCKED) != 0); | ||
| 2607 | up_write(&mm->mmap_sem); | 2620 | up_write(&mm->mmap_sem); |
| 2621 | if (populate) | ||
| 2622 | mm_populate(addr, len); | ||
| 2608 | return ret; | 2623 | return ret; |
| 2609 | } | 2624 | } |
| 2610 | EXPORT_SYMBOL(vm_brk); | 2625 | EXPORT_SYMBOL(vm_brk); |
| @@ -3002,7 +3017,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
| 3002 | if (!__test_and_clear_bit(0, (unsigned long *) | 3017 | if (!__test_and_clear_bit(0, (unsigned long *) |
| 3003 | &anon_vma->root->rb_root.rb_node)) | 3018 | &anon_vma->root->rb_root.rb_node)) |
| 3004 | BUG(); | 3019 | BUG(); |
| 3005 | anon_vma_unlock(anon_vma); | 3020 | anon_vma_unlock_write(anon_vma); |
| 3006 | } | 3021 | } |
| 3007 | } | 3022 | } |
| 3008 | 3023 | ||
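Note: do_mmap_pgoff() now reports, through the new *populate out-parameter, how many bytes should be pre-faulted instead of calling make_pages_present() inside mmap_region(). A hedged sketch of how a wrapper is expected to consume that value; the wrapper name is an assumption, while the locking pattern mirrors the brk()/mlock() paths above:

/* Assumed caller pattern for the new out-parameter. */
static unsigned long mmap_and_populate(struct file *file, unsigned long addr,
		unsigned long len, unsigned long prot,
		unsigned long flags, unsigned long pgoff)
{
	unsigned long ret, populate = 0;

	down_write(&current->mm->mmap_sem);
	ret = do_mmap_pgoff(file, addr, len, prot, flags, pgoff, &populate);
	up_write(&current->mm->mmap_sem);

	/* MAP_POPULATE / MAP_LOCKED ranges are faulted in after unlock. */
	if (!IS_ERR_VALUE(ret) && populate)
		mm_populate(ret, populate);
	return ret;
}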
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8a5ac8c686b0..2175fb0d501c 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
| @@ -37,49 +37,51 @@ static struct srcu_struct srcu; | |||
| 37 | void __mmu_notifier_release(struct mm_struct *mm) | 37 | void __mmu_notifier_release(struct mm_struct *mm) |
| 38 | { | 38 | { |
| 39 | struct mmu_notifier *mn; | 39 | struct mmu_notifier *mn; |
| 40 | struct hlist_node *n; | ||
| 41 | int id; | 40 | int id; |
| 42 | 41 | ||
| 43 | /* | 42 | /* |
| 44 | * SRCU here will block mmu_notifier_unregister until | 43 | * srcu_read_lock() here will block synchronize_srcu() in |
| 45 | * ->release returns. | 44 | * mmu_notifier_unregister() until all registered |
| 45 | * ->release() callouts this function makes have | ||
| 46 | * returned. | ||
| 46 | */ | 47 | */ |
| 47 | id = srcu_read_lock(&srcu); | 48 | id = srcu_read_lock(&srcu); |
| 48 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | ||
| 49 | /* | ||
| 50 | * if ->release runs before mmu_notifier_unregister it | ||
| 51 | * must be handled as it's the only way for the driver | ||
| 52 | * to flush all existing sptes and stop the driver | ||
| 53 | * from establishing any more sptes before all the | ||
| 54 | * pages in the mm are freed. | ||
| 55 | */ | ||
| 56 | if (mn->ops->release) | ||
| 57 | mn->ops->release(mn, mm); | ||
| 58 | srcu_read_unlock(&srcu, id); | ||
| 59 | |||
| 60 | spin_lock(&mm->mmu_notifier_mm->lock); | 49 | spin_lock(&mm->mmu_notifier_mm->lock); |
| 61 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 50 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
| 62 | mn = hlist_entry(mm->mmu_notifier_mm->list.first, | 51 | mn = hlist_entry(mm->mmu_notifier_mm->list.first, |
| 63 | struct mmu_notifier, | 52 | struct mmu_notifier, |
| 64 | hlist); | 53 | hlist); |
| 54 | |||
| 65 | /* | 55 | /* |
| 66 | * We arrived before mmu_notifier_unregister so | 56 | * Unlink. This will prevent mmu_notifier_unregister() |
| 67 | * mmu_notifier_unregister will do nothing other than | 57 | * from also making the ->release() callout. |
| 68 | * to wait ->release to finish and | ||
| 69 | * mmu_notifier_unregister to return. | ||
| 70 | */ | 58 | */ |
| 71 | hlist_del_init_rcu(&mn->hlist); | 59 | hlist_del_init_rcu(&mn->hlist); |
| 60 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
| 61 | |||
| 62 | /* | ||
| 63 | * Clear sptes. (see 'release' description in mmu_notifier.h) | ||
| 64 | */ | ||
| 65 | if (mn->ops->release) | ||
| 66 | mn->ops->release(mn, mm); | ||
| 67 | |||
| 68 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
| 72 | } | 69 | } |
| 73 | spin_unlock(&mm->mmu_notifier_mm->lock); | 70 | spin_unlock(&mm->mmu_notifier_mm->lock); |
| 74 | 71 | ||
| 75 | /* | 72 | /* |
| 76 | * synchronize_srcu here prevents mmu_notifier_release to | 73 | * All callouts to ->release() which we have done are complete. |
| 77 | * return to exit_mmap (which would proceed freeing all pages | 74 | * Allow synchronize_srcu() in mmu_notifier_unregister() to complete |
| 78 | * in the mm) until the ->release method returns, if it was | 75 | */ |
| 79 | * invoked by mmu_notifier_unregister. | 76 | srcu_read_unlock(&srcu, id); |
| 80 | * | 77 | |
| 81 | * The mmu_notifier_mm can't go away from under us because one | 78 | /* |
| 82 | * mm_count is hold by exit_mmap. | 79 | * mmu_notifier_unregister() may have unlinked a notifier and may |
| 80 | * still be calling out to it. Additionally, other notifiers | ||
| 81 | * may have been active via vmtruncate() et al. Block here | ||
| 82 | * to ensure that all notifier callouts for this mm have been | ||
| 83 | * completed and the sptes are really cleaned up before returning | ||
| 84 | * to exit_mmap(). | ||
| 83 | */ | 85 | */ |
| 84 | synchronize_srcu(&srcu); | 86 | synchronize_srcu(&srcu); |
| 85 | } | 87 | } |
| @@ -170,6 +172,7 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | |||
| 170 | } | 172 | } |
| 171 | srcu_read_unlock(&srcu, id); | 173 | srcu_read_unlock(&srcu, id); |
| 172 | } | 174 | } |
| 175 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); | ||
| 173 | 176 | ||
| 174 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | 177 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, |
| 175 | unsigned long start, unsigned long end) | 178 | unsigned long start, unsigned long end) |
| @@ -185,6 +188,7 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | |||
| 185 | } | 188 | } |
| 186 | srcu_read_unlock(&srcu, id); | 189 | srcu_read_unlock(&srcu, id); |
| 187 | } | 190 | } |
| 191 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end); | ||
| 188 | 192 | ||
| 189 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | 193 | static int do_mmu_notifier_register(struct mmu_notifier *mn, |
| 190 | struct mm_struct *mm, | 194 | struct mm_struct *mm, |
| @@ -294,31 +298,31 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
| 294 | { | 298 | { |
| 295 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 299 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
| 296 | 300 | ||
| 301 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
| 297 | if (!hlist_unhashed(&mn->hlist)) { | 302 | if (!hlist_unhashed(&mn->hlist)) { |
| 298 | /* | ||
| 299 | * SRCU here will force exit_mmap to wait ->release to finish | ||
| 300 | * before freeing the pages. | ||
| 301 | */ | ||
| 302 | int id; | 303 | int id; |
| 303 | 304 | ||
| 304 | id = srcu_read_lock(&srcu); | ||
| 305 | /* | 305 | /* |
| 306 | * exit_mmap will block in mmu_notifier_release to | 306 | * Ensure we synchronize up with __mmu_notifier_release(). |
| 307 | * guarantee ->release is called before freeing the | ||
| 308 | * pages. | ||
| 309 | */ | 307 | */ |
| 308 | id = srcu_read_lock(&srcu); | ||
| 309 | |||
| 310 | hlist_del_rcu(&mn->hlist); | ||
| 311 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
| 312 | |||
| 310 | if (mn->ops->release) | 313 | if (mn->ops->release) |
| 311 | mn->ops->release(mn, mm); | 314 | mn->ops->release(mn, mm); |
| 312 | srcu_read_unlock(&srcu, id); | ||
| 313 | 315 | ||
| 314 | spin_lock(&mm->mmu_notifier_mm->lock); | 316 | /* |
| 315 | hlist_del_rcu(&mn->hlist); | 317 | * Allow __mmu_notifier_release() to complete. |
| 318 | */ | ||
| 319 | srcu_read_unlock(&srcu, id); | ||
| 320 | } else | ||
| 316 | spin_unlock(&mm->mmu_notifier_mm->lock); | 321 | spin_unlock(&mm->mmu_notifier_mm->lock); |
| 317 | } | ||
| 318 | 322 | ||
| 319 | /* | 323 | /* |
| 320 | * Wait any running method to finish, of course including | 324 | * Wait for any running method to finish, including ->release() if it |
| 321 | * ->release if it was run by mmu_notifier_relase instead of us. | 325 | * was run by __mmu_notifier_release() instead of us. |
| 322 | */ | 326 | */ |
| 323 | synchronize_srcu(&srcu); | 327 | synchronize_srcu(&srcu); |
| 324 | 328 | ||
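Note: the mmu_notifier.c rework changes where ->release() is called from and how __mmu_notifier_release() and mmu_notifier_unregister() serialize through SRCU and the notifier-list lock; the driver-facing API is unchanged. A minimal, hypothetical registration skeleton for orientation (the structure name and callback body are made up):

/* Hypothetical driver-side use of the interface exercised above. */
struct my_ctx {
	struct mmu_notifier mn;
};

static void my_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* Tear down secondary mappings; may be called from exit_mmap()
	 * via __mmu_notifier_release() or from mmu_notifier_unregister(). */
}

static const struct mmu_notifier_ops my_ops = {
	.release = my_release,
};

static int my_attach(struct my_ctx *ctx, struct mm_struct *mm)
{
	ctx->mn.ops = &my_ops;
	return mmu_notifier_register(&ctx->mn, mm);
}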
diff --git a/mm/mmzone.c b/mm/mmzone.c index 4596d81b89b1..2ac0afbd68f3 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * linux/mm/mmzone.c | 2 | * linux/mm/mmzone.c |
| 3 | * | 3 | * |
| 4 | * management codes for pgdats and zones. | 4 | * management codes for pgdats, zones and page flags |
| 5 | */ | 5 | */ |
| 6 | 6 | ||
| 7 | 7 | ||
| @@ -96,3 +96,21 @@ void lruvec_init(struct lruvec *lruvec) | |||
| 96 | for_each_lru(lru) | 96 | for_each_lru(lru) |
| 97 | INIT_LIST_HEAD(&lruvec->lists[lru]); | 97 | INIT_LIST_HEAD(&lruvec->lists[lru]); |
| 98 | } | 98 | } |
| 99 | |||
| 100 | #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) | ||
| 101 | int page_nid_xchg_last(struct page *page, int nid) | ||
| 102 | { | ||
| 103 | unsigned long old_flags, flags; | ||
| 104 | int last_nid; | ||
| 105 | |||
| 106 | do { | ||
| 107 | old_flags = flags = page->flags; | ||
| 108 | last_nid = page_nid_last(page); | ||
| 109 | |||
| 110 | flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); | ||
| 111 | flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; | ||
| 112 | } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); | ||
| 113 | |||
| 114 | return last_nid; | ||
| 115 | } | ||
| 116 | #endif | ||
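Note: page_nid_xchg_last() above updates the last-NID bits packed into page->flags with a cmpxchg retry loop, so a racing update to other flag bits is never lost. The same read-modify-write pattern as a standalone C11 program; the field width and shift are made-up example values:

#include <stdatomic.h>
#include <stdio.h>

#define FIELD_SHIFT 4
#define FIELD_MASK  0xffUL	/* 8-bit field packed into a larger flags word */

/* Replace the packed field atomically and return its previous value. */
static unsigned long field_xchg(_Atomic unsigned long *flags, unsigned long val)
{
	unsigned long old = atomic_load(flags), new;

	do {
		new = old & ~(FIELD_MASK << FIELD_SHIFT);
		new |= (val & FIELD_MASK) << FIELD_SHIFT;
		/* On failure, 'old' is refreshed with the current value. */
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return (old >> FIELD_SHIFT) & FIELD_MASK;
}

int main(void)
{
	_Atomic unsigned long flags = 0;

	field_xchg(&flags, 0x2a);
	printf("flags now %#lx\n", atomic_load(&flags));
	return 0;
}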
diff --git a/mm/mremap.c b/mm/mremap.c index f9766f460299..463a25705ac6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -135,7 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
| 135 | pte_unmap(new_pte - 1); | 135 | pte_unmap(new_pte - 1); |
| 136 | pte_unmap_unlock(old_pte - 1, old_ptl); | 136 | pte_unmap_unlock(old_pte - 1, old_ptl); |
| 137 | if (anon_vma) | 137 | if (anon_vma) |
| 138 | anon_vma_unlock(anon_vma); | 138 | anon_vma_unlock_write(anon_vma); |
| 139 | if (mapping) | 139 | if (mapping) |
| 140 | mutex_unlock(&mapping->i_mmap_mutex); | 140 | mutex_unlock(&mapping->i_mmap_mutex); |
| 141 | } | 141 | } |
| @@ -209,7 +209,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
| 209 | 209 | ||
| 210 | static unsigned long move_vma(struct vm_area_struct *vma, | 210 | static unsigned long move_vma(struct vm_area_struct *vma, |
| 211 | unsigned long old_addr, unsigned long old_len, | 211 | unsigned long old_addr, unsigned long old_len, |
| 212 | unsigned long new_len, unsigned long new_addr) | 212 | unsigned long new_len, unsigned long new_addr, bool *locked) |
| 213 | { | 213 | { |
| 214 | struct mm_struct *mm = vma->vm_mm; | 214 | struct mm_struct *mm = vma->vm_mm; |
| 215 | struct vm_area_struct *new_vma; | 215 | struct vm_area_struct *new_vma; |
| @@ -300,9 +300,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
| 300 | 300 | ||
| 301 | if (vm_flags & VM_LOCKED) { | 301 | if (vm_flags & VM_LOCKED) { |
| 302 | mm->locked_vm += new_len >> PAGE_SHIFT; | 302 | mm->locked_vm += new_len >> PAGE_SHIFT; |
| 303 | if (new_len > old_len) | 303 | *locked = true; |
| 304 | mlock_vma_pages_range(new_vma, new_addr + old_len, | ||
| 305 | new_addr + new_len); | ||
| 306 | } | 304 | } |
| 307 | 305 | ||
| 308 | return new_addr; | 306 | return new_addr; |
| @@ -367,9 +365,8 @@ Eagain: | |||
| 367 | return ERR_PTR(-EAGAIN); | 365 | return ERR_PTR(-EAGAIN); |
| 368 | } | 366 | } |
| 369 | 367 | ||
| 370 | static unsigned long mremap_to(unsigned long addr, | 368 | static unsigned long mremap_to(unsigned long addr, unsigned long old_len, |
| 371 | unsigned long old_len, unsigned long new_addr, | 369 | unsigned long new_addr, unsigned long new_len, bool *locked) |
| 372 | unsigned long new_len) | ||
| 373 | { | 370 | { |
| 374 | struct mm_struct *mm = current->mm; | 371 | struct mm_struct *mm = current->mm; |
| 375 | struct vm_area_struct *vma; | 372 | struct vm_area_struct *vma; |
| @@ -419,7 +416,7 @@ static unsigned long mremap_to(unsigned long addr, | |||
| 419 | if (ret & ~PAGE_MASK) | 416 | if (ret & ~PAGE_MASK) |
| 420 | goto out1; | 417 | goto out1; |
| 421 | 418 | ||
| 422 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | 419 | ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); |
| 423 | if (!(ret & ~PAGE_MASK)) | 420 | if (!(ret & ~PAGE_MASK)) |
| 424 | goto out; | 421 | goto out; |
| 425 | out1: | 422 | out1: |
| @@ -457,6 +454,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
| 457 | struct vm_area_struct *vma; | 454 | struct vm_area_struct *vma; |
| 458 | unsigned long ret = -EINVAL; | 455 | unsigned long ret = -EINVAL; |
| 459 | unsigned long charged = 0; | 456 | unsigned long charged = 0; |
| 457 | bool locked = false; | ||
| 460 | 458 | ||
| 461 | down_write(¤t->mm->mmap_sem); | 459 | down_write(¤t->mm->mmap_sem); |
| 462 | 460 | ||
| @@ -479,7 +477,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
| 479 | 477 | ||
| 480 | if (flags & MREMAP_FIXED) { | 478 | if (flags & MREMAP_FIXED) { |
| 481 | if (flags & MREMAP_MAYMOVE) | 479 | if (flags & MREMAP_MAYMOVE) |
| 482 | ret = mremap_to(addr, old_len, new_addr, new_len); | 480 | ret = mremap_to(addr, old_len, new_addr, new_len, |
| 481 | &locked); | ||
| 483 | goto out; | 482 | goto out; |
| 484 | } | 483 | } |
| 485 | 484 | ||
| @@ -521,8 +520,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
| 521 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 520 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
| 522 | if (vma->vm_flags & VM_LOCKED) { | 521 | if (vma->vm_flags & VM_LOCKED) { |
| 523 | mm->locked_vm += pages; | 522 | mm->locked_vm += pages; |
| 524 | mlock_vma_pages_range(vma, addr + old_len, | 523 | locked = true; |
| 525 | addr + new_len); | 524 | new_addr = addr; |
| 526 | } | 525 | } |
| 527 | ret = addr; | 526 | ret = addr; |
| 528 | goto out; | 527 | goto out; |
| @@ -548,11 +547,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
| 548 | goto out; | 547 | goto out; |
| 549 | } | 548 | } |
| 550 | 549 | ||
| 551 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | 550 | ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); |
| 552 | } | 551 | } |
| 553 | out: | 552 | out: |
| 554 | if (ret & ~PAGE_MASK) | 553 | if (ret & ~PAGE_MASK) |
| 555 | vm_unacct_memory(charged); | 554 | vm_unacct_memory(charged); |
| 556 | up_write(¤t->mm->mmap_sem); | 555 | up_write(¤t->mm->mmap_sem); |
| 556 | if (locked && new_len > old_len) | ||
| 557 | mm_populate(new_addr + old_len, new_len - old_len); | ||
| 557 | return ret; | 558 | return ret; |
| 558 | } | 559 | } |
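Note: with the mremap() changes above, growing a VM_LOCKED mapping no longer faults the new tail in while mmap_sem is held; the syscall records locked/new_addr and calls mm_populate() on the grown range after up_write(). User-visible behaviour is meant to stay the same; a small user-space example that exercises this path (sizes chosen to fit a typical default RLIMIT_MEMLOCK):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t old_len = 16 * 1024, new_len = 32 * 1024;
	void *p, *q;

	p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Growing a locked mapping: the new tail is mlocked and populated. */
	q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
	if (q == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("remapped %zu -> %zu bytes at %p\n", old_len, new_len, q);
	return 0;
}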
diff --git a/mm/nommu.c b/mm/nommu.c index b20db4e22263..da0d210fd403 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -140,10 +140,10 @@ unsigned int kobjsize(const void *objp) | |||
| 140 | return PAGE_SIZE << compound_order(page); | 140 | return PAGE_SIZE << compound_order(page); |
| 141 | } | 141 | } |
| 142 | 142 | ||
| 143 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 143 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 144 | unsigned long start, int nr_pages, unsigned int foll_flags, | 144 | unsigned long start, unsigned long nr_pages, |
| 145 | struct page **pages, struct vm_area_struct **vmas, | 145 | unsigned int foll_flags, struct page **pages, |
| 146 | int *retry) | 146 | struct vm_area_struct **vmas, int *nonblocking) |
| 147 | { | 147 | { |
| 148 | struct vm_area_struct *vma; | 148 | struct vm_area_struct *vma; |
| 149 | unsigned long vm_flags; | 149 | unsigned long vm_flags; |
| @@ -190,9 +190,10 @@ finish_or_fault: | |||
| 190 | * slab page or a secondary page from a compound page | 190 | * slab page or a secondary page from a compound page |
| 191 | * - don't permit access to VMAs that don't support it, such as I/O mappings | 191 | * - don't permit access to VMAs that don't support it, such as I/O mappings |
| 192 | */ | 192 | */ |
| 193 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 193 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 194 | unsigned long start, int nr_pages, int write, int force, | 194 | unsigned long start, unsigned long nr_pages, |
| 195 | struct page **pages, struct vm_area_struct **vmas) | 195 | int write, int force, struct page **pages, |
| 196 | struct vm_area_struct **vmas) | ||
| 196 | { | 197 | { |
| 197 | int flags = 0; | 198 | int flags = 0; |
| 198 | 199 | ||
| @@ -1250,7 +1251,8 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1250 | unsigned long len, | 1251 | unsigned long len, |
| 1251 | unsigned long prot, | 1252 | unsigned long prot, |
| 1252 | unsigned long flags, | 1253 | unsigned long flags, |
| 1253 | unsigned long pgoff) | 1254 | unsigned long pgoff, |
| 1255 | unsigned long *populate) | ||
| 1254 | { | 1256 | { |
| 1255 | struct vm_area_struct *vma; | 1257 | struct vm_area_struct *vma; |
| 1256 | struct vm_region *region; | 1258 | struct vm_region *region; |
| @@ -1260,6 +1262,8 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1260 | 1262 | ||
| 1261 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); | 1263 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); |
| 1262 | 1264 | ||
| 1265 | *populate = 0; | ||
| 1266 | |||
| 1263 | /* decide whether we should attempt the mapping, and if so what sort of | 1267 | /* decide whether we should attempt the mapping, and if so what sort of |
| 1264 | * mapping */ | 1268 | * mapping */ |
| 1265 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, | 1269 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, |
| @@ -1815,9 +1819,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
| 1815 | return ret; | 1819 | return ret; |
| 1816 | } | 1820 | } |
| 1817 | 1821 | ||
| 1818 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | 1822 | struct page *follow_page_mask(struct vm_area_struct *vma, |
| 1819 | unsigned int foll_flags) | 1823 | unsigned long address, unsigned int flags, |
| 1824 | unsigned int *page_mask) | ||
| 1820 | { | 1825 | { |
| 1826 | *page_mask = 0; | ||
| 1821 | return NULL; | 1827 | return NULL; |
| 1822 | } | 1828 | } |
| 1823 | 1829 | ||
| @@ -1904,7 +1910,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
| 1904 | */ | 1910 | */ |
| 1905 | free -= global_page_state(NR_SHMEM); | 1911 | free -= global_page_state(NR_SHMEM); |
| 1906 | 1912 | ||
| 1907 | free += nr_swap_pages; | 1913 | free += get_nr_swap_pages(); |
| 1908 | 1914 | ||
| 1909 | /* | 1915 | /* |
| 1910 | * Any slabs which are created with the | 1916 | * Any slabs which are created with the |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0399f146ae49..79e451a78c9e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -386,8 +386,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 386 | cpuset_print_task_mems_allowed(current); | 386 | cpuset_print_task_mems_allowed(current); |
| 387 | task_unlock(current); | 387 | task_unlock(current); |
| 388 | dump_stack(); | 388 | dump_stack(); |
| 389 | mem_cgroup_print_oom_info(memcg, p); | 389 | if (memcg) |
| 390 | show_mem(SHOW_MEM_FILTER_NODES); | 390 | mem_cgroup_print_oom_info(memcg, p); |
| 391 | else | ||
| 392 | show_mem(SHOW_MEM_FILTER_NODES); | ||
| 391 | if (sysctl_oom_dump_tasks) | 393 | if (sysctl_oom_dump_tasks) |
| 392 | dump_tasks(memcg, nodemask); | 394 | dump_tasks(memcg, nodemask); |
| 393 | } | 395 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7300c9d5e1d9..cdc377c456c0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -241,6 +241,9 @@ static unsigned long global_dirtyable_memory(void) | |||
| 241 | if (!vm_highmem_is_dirtyable) | 241 | if (!vm_highmem_is_dirtyable) |
| 242 | x -= highmem_dirtyable_memory(x); | 242 | x -= highmem_dirtyable_memory(x); |
| 243 | 243 | ||
| 244 | /* Subtract min_free_kbytes */ | ||
| 245 | x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10)); | ||
| 246 | |||
| 244 | return x + 1; /* Ensure that we never return 0 */ | 247 | return x + 1; /* Ensure that we never return 0 */ |
| 245 | } | 248 | } |
| 246 | 249 | ||
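Note: the page-writeback.c hunk withholds min_free_kbytes from the dirtyable total; the `>> (PAGE_SHIFT - 10)` shift is just the kilobytes-to-pages conversion (divide by PAGE_SIZE/1024). A worked check of that arithmetic; the 4 KiB page size and the sample value are assumptions for the example:

#include <stdio.h>

#define PAGE_SHIFT 12			/* assume 4 KiB pages */

int main(void)
{
	unsigned long min_free_kbytes = 67584;	/* sample autotuned value */
	unsigned long pages = min_free_kbytes >> (PAGE_SHIFT - 10);

	/* 67584 KiB / 4 KiB per page = 16896 pages */
	printf("%lu kB reserved -> %lu pages removed from dirtyable memory\n",
	       min_free_kbytes, pages);
	return 0;
}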
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d1107adf174a..e9075fdef695 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -202,11 +202,18 @@ static unsigned long __meminitdata nr_all_pages; | |||
| 202 | static unsigned long __meminitdata dma_reserve; | 202 | static unsigned long __meminitdata dma_reserve; |
| 203 | 203 | ||
| 204 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 204 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
| 205 | /* Movable memory ranges, will also be used by memblock subsystem. */ | ||
| 206 | struct movablemem_map movablemem_map = { | ||
| 207 | .acpi = false, | ||
| 208 | .nr_map = 0, | ||
| 209 | }; | ||
| 210 | |||
| 205 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 211 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
| 206 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 212 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
| 207 | static unsigned long __initdata required_kernelcore; | 213 | static unsigned long __initdata required_kernelcore; |
| 208 | static unsigned long __initdata required_movablecore; | 214 | static unsigned long __initdata required_movablecore; |
| 209 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 215 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
| 216 | static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES]; | ||
| 210 | 217 | ||
| 211 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 218 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
| 212 | int movable_zone; | 219 | int movable_zone; |
| @@ -240,15 +247,20 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | |||
| 240 | int ret = 0; | 247 | int ret = 0; |
| 241 | unsigned seq; | 248 | unsigned seq; |
| 242 | unsigned long pfn = page_to_pfn(page); | 249 | unsigned long pfn = page_to_pfn(page); |
| 250 | unsigned long sp, start_pfn; | ||
| 243 | 251 | ||
| 244 | do { | 252 | do { |
| 245 | seq = zone_span_seqbegin(zone); | 253 | seq = zone_span_seqbegin(zone); |
| 246 | if (pfn >= zone->zone_start_pfn + zone->spanned_pages) | 254 | start_pfn = zone->zone_start_pfn; |
| 247 | ret = 1; | 255 | sp = zone->spanned_pages; |
| 248 | else if (pfn < zone->zone_start_pfn) | 256 | if (!zone_spans_pfn(zone, pfn)) |
| 249 | ret = 1; | 257 | ret = 1; |
| 250 | } while (zone_span_seqretry(zone, seq)); | 258 | } while (zone_span_seqretry(zone, seq)); |
| 251 | 259 | ||
| 260 | if (ret) | ||
| 261 | pr_err("page %lu outside zone [ %lu - %lu ]\n", | ||
| 262 | pfn, start_pfn, start_pfn + sp); | ||
| 263 | |||
| 252 | return ret; | 264 | return ret; |
| 253 | } | 265 | } |
| 254 | 266 | ||
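Several hunks in this file replace open-coded span checks with zone_spans_pfn(). The helper itself is defined outside this excerpt (in include/linux/mmzone.h in this series), so the sketch below only captures its assumed semantics: a pfn is in the zone if it falls inside [zone_start_pfn, zone_start_pfn + spanned_pages).

    /* Standalone sketch of the assumed zone_spans_pfn() predicate;
     * not the kernel definition. */
    #include <stdbool.h>

    struct zone_span {
            unsigned long zone_start_pfn;
            unsigned long spanned_pages;
    };

    static bool sketch_zone_spans_pfn(const struct zone_span *z, unsigned long pfn)
    {
            return pfn >= z->zone_start_pfn &&
                   pfn < z->zone_start_pfn + z->spanned_pages;
    }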
| @@ -288,7 +300,7 @@ static void bad_page(struct page *page) | |||
| 288 | 300 | ||
| 289 | /* Don't complain about poisoned pages */ | 301 | /* Don't complain about poisoned pages */ |
| 290 | if (PageHWPoison(page)) { | 302 | if (PageHWPoison(page)) { |
| 291 | reset_page_mapcount(page); /* remove PageBuddy */ | 303 | page_mapcount_reset(page); /* remove PageBuddy */ |
| 292 | return; | 304 | return; |
| 293 | } | 305 | } |
| 294 | 306 | ||
| @@ -320,7 +332,7 @@ static void bad_page(struct page *page) | |||
| 320 | dump_stack(); | 332 | dump_stack(); |
| 321 | out: | 333 | out: |
| 322 | /* Leave bad fields for debug, except PageBuddy could make trouble */ | 334 | /* Leave bad fields for debug, except PageBuddy could make trouble */ |
| 323 | reset_page_mapcount(page); /* remove PageBuddy */ | 335 | page_mapcount_reset(page); /* remove PageBuddy */ |
| 324 | add_taint(TAINT_BAD_PAGE); | 336 | add_taint(TAINT_BAD_PAGE); |
| 325 | } | 337 | } |
| 326 | 338 | ||
| @@ -533,6 +545,8 @@ static inline void __free_one_page(struct page *page, | |||
| 533 | unsigned long uninitialized_var(buddy_idx); | 545 | unsigned long uninitialized_var(buddy_idx); |
| 534 | struct page *buddy; | 546 | struct page *buddy; |
| 535 | 547 | ||
| 548 | VM_BUG_ON(!zone_is_initialized(zone)); | ||
| 549 | |||
| 536 | if (unlikely(PageCompound(page))) | 550 | if (unlikely(PageCompound(page))) |
| 537 | if (unlikely(destroy_compound_page(page, order))) | 551 | if (unlikely(destroy_compound_page(page, order))) |
| 538 | return; | 552 | return; |
| @@ -606,7 +620,7 @@ static inline int free_pages_check(struct page *page) | |||
| 606 | bad_page(page); | 620 | bad_page(page); |
| 607 | return 1; | 621 | return 1; |
| 608 | } | 622 | } |
| 609 | reset_page_last_nid(page); | 623 | page_nid_reset_last(page); |
| 610 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 624 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
| 611 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 625 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
| 612 | return 0; | 626 | return 0; |
| @@ -666,7 +680,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
| 666 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 680 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
| 667 | __free_one_page(page, zone, 0, mt); | 681 | __free_one_page(page, zone, 0, mt); |
| 668 | trace_mm_page_pcpu_drain(page, 0, mt); | 682 | trace_mm_page_pcpu_drain(page, 0, mt); |
| 669 | if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { | 683 | if (likely(!is_migrate_isolate_page(page))) { |
| 670 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); | 684 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); |
| 671 | if (is_migrate_cma(mt)) | 685 | if (is_migrate_cma(mt)) |
| 672 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); | 686 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); |
| @@ -684,7 +698,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
| 684 | zone->pages_scanned = 0; | 698 | zone->pages_scanned = 0; |
| 685 | 699 | ||
| 686 | __free_one_page(page, zone, order, migratetype); | 700 | __free_one_page(page, zone, order, migratetype); |
| 687 | if (unlikely(migratetype != MIGRATE_ISOLATE)) | 701 | if (unlikely(!is_migrate_isolate(migratetype))) |
| 688 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | 702 | __mod_zone_freepage_state(zone, 1 << order, migratetype); |
| 689 | spin_unlock(&zone->lock); | 703 | spin_unlock(&zone->lock); |
| 690 | } | 704 | } |
| @@ -916,7 +930,9 @@ static int fallbacks[MIGRATE_TYPES][4] = { | |||
| 916 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | 930 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, |
| 917 | #endif | 931 | #endif |
| 918 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ | 932 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ |
| 933 | #ifdef CONFIG_MEMORY_ISOLATION | ||
| 919 | [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ | 934 | [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ |
| 935 | #endif | ||
| 920 | }; | 936 | }; |
| 921 | 937 | ||
| 922 | /* | 938 | /* |
| @@ -981,9 +997,9 @@ int move_freepages_block(struct zone *zone, struct page *page, | |||
| 981 | end_pfn = start_pfn + pageblock_nr_pages - 1; | 997 | end_pfn = start_pfn + pageblock_nr_pages - 1; |
| 982 | 998 | ||
| 983 | /* Do not cross zone boundaries */ | 999 | /* Do not cross zone boundaries */ |
| 984 | if (start_pfn < zone->zone_start_pfn) | 1000 | if (!zone_spans_pfn(zone, start_pfn)) |
| 985 | start_page = page; | 1001 | start_page = page; |
| 986 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) | 1002 | if (!zone_spans_pfn(zone, end_pfn)) |
| 987 | return 0; | 1003 | return 0; |
| 988 | 1004 | ||
| 989 | return move_freepages(zone, start_page, end_page, migratetype); | 1005 | return move_freepages(zone, start_page, end_page, migratetype); |
| @@ -1142,7 +1158,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
| 1142 | list_add_tail(&page->lru, list); | 1158 | list_add_tail(&page->lru, list); |
| 1143 | if (IS_ENABLED(CONFIG_CMA)) { | 1159 | if (IS_ENABLED(CONFIG_CMA)) { |
| 1144 | mt = get_pageblock_migratetype(page); | 1160 | mt = get_pageblock_migratetype(page); |
| 1145 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) | 1161 | if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) |
| 1146 | mt = migratetype; | 1162 | mt = migratetype; |
| 1147 | } | 1163 | } |
| 1148 | set_freepage_migratetype(page, mt); | 1164 | set_freepage_migratetype(page, mt); |
| @@ -1277,7 +1293,7 @@ void mark_free_pages(struct zone *zone) | |||
| 1277 | 1293 | ||
| 1278 | spin_lock_irqsave(&zone->lock, flags); | 1294 | spin_lock_irqsave(&zone->lock, flags); |
| 1279 | 1295 | ||
| 1280 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 1296 | max_zone_pfn = zone_end_pfn(zone); |
| 1281 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1297 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
| 1282 | if (pfn_valid(pfn)) { | 1298 | if (pfn_valid(pfn)) { |
| 1283 | struct page *page = pfn_to_page(pfn); | 1299 | struct page *page = pfn_to_page(pfn); |
| @@ -1326,7 +1342,7 @@ void free_hot_cold_page(struct page *page, int cold) | |||
| 1326 | * excessively into the page allocator | 1342 | * excessively into the page allocator |
| 1327 | */ | 1343 | */ |
| 1328 | if (migratetype >= MIGRATE_PCPTYPES) { | 1344 | if (migratetype >= MIGRATE_PCPTYPES) { |
| 1329 | if (unlikely(migratetype == MIGRATE_ISOLATE)) { | 1345 | if (unlikely(is_migrate_isolate(migratetype))) { |
| 1330 | free_one_page(zone, page, 0, migratetype); | 1346 | free_one_page(zone, page, 0, migratetype); |
| 1331 | goto out; | 1347 | goto out; |
| 1332 | } | 1348 | } |
| @@ -1400,7 +1416,7 @@ static int __isolate_free_page(struct page *page, unsigned int order) | |||
| 1400 | zone = page_zone(page); | 1416 | zone = page_zone(page); |
| 1401 | mt = get_pageblock_migratetype(page); | 1417 | mt = get_pageblock_migratetype(page); |
| 1402 | 1418 | ||
| 1403 | if (mt != MIGRATE_ISOLATE) { | 1419 | if (!is_migrate_isolate(mt)) { |
| 1404 | /* Obey watermarks as if the page was being allocated */ | 1420 | /* Obey watermarks as if the page was being allocated */ |
| 1405 | watermark = low_wmark_pages(zone) + (1 << order); | 1421 | watermark = low_wmark_pages(zone) + (1 << order); |
| 1406 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | 1422 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) |
| @@ -1419,7 +1435,7 @@ static int __isolate_free_page(struct page *page, unsigned int order) | |||
| 1419 | struct page *endpage = page + (1 << order) - 1; | 1435 | struct page *endpage = page + (1 << order) - 1; |
| 1420 | for (; page < endpage; page += pageblock_nr_pages) { | 1436 | for (; page < endpage; page += pageblock_nr_pages) { |
| 1421 | int mt = get_pageblock_migratetype(page); | 1437 | int mt = get_pageblock_migratetype(page); |
| 1422 | if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) | 1438 | if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) |
| 1423 | set_pageblock_migratetype(page, | 1439 | set_pageblock_migratetype(page, |
| 1424 | MIGRATE_MOVABLE); | 1440 | MIGRATE_MOVABLE); |
| 1425 | } | 1441 | } |
| @@ -2615,10 +2631,17 @@ retry_cpuset: | |||
| 2615 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2631 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
| 2616 | zonelist, high_zoneidx, alloc_flags, | 2632 | zonelist, high_zoneidx, alloc_flags, |
| 2617 | preferred_zone, migratetype); | 2633 | preferred_zone, migratetype); |
| 2618 | if (unlikely(!page)) | 2634 | if (unlikely(!page)) { |
| 2635 | /* | ||
| 2636 | * Runtime PM, block IO and its error handling path | ||
| 2637 | * can deadlock because I/O on the device might not | ||
| 2638 | * complete. | ||
| 2639 | */ | ||
| 2640 | gfp_mask = memalloc_noio_flags(gfp_mask); | ||
| 2619 | page = __alloc_pages_slowpath(gfp_mask, order, | 2641 | page = __alloc_pages_slowpath(gfp_mask, order, |
| 2620 | zonelist, high_zoneidx, nodemask, | 2642 | zonelist, high_zoneidx, nodemask, |
| 2621 | preferred_zone, migratetype); | 2643 | preferred_zone, migratetype); |
| 2644 | } | ||
| 2622 | 2645 | ||
| 2623 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2646 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
| 2624 | 2647 | ||
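For the allocator hunk above: memalloc_noio_flags() is added elsewhere in this series, and the assumption here is that it strips the I/O-capable GFP bits when the calling task has flagged itself as running in a no-I/O context (for example a runtime-PM resume path), so the slow path cannot recurse into block I/O. A rough standalone sketch of that idea, not the kernel implementation:

    /* Conceptual stand-in for memalloc_noio_flags(); the flag values are
     * illustrative and the real helper keys off current->flags. */
    #include <stdbool.h>

    typedef unsigned int gfp_t;

    #define SKETCH_GFP_IO 0x40u
    #define SKETCH_GFP_FS 0x80u

    static gfp_t sketch_memalloc_noio_flags(gfp_t flags, bool task_in_noio_section)
    {
            if (task_in_noio_section)
                    flags &= ~(SKETCH_GFP_IO | SKETCH_GFP_FS);
            return flags;
    }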
| @@ -2790,18 +2813,27 @@ void free_pages_exact(void *virt, size_t size) | |||
| 2790 | } | 2813 | } |
| 2791 | EXPORT_SYMBOL(free_pages_exact); | 2814 | EXPORT_SYMBOL(free_pages_exact); |
| 2792 | 2815 | ||
| 2793 | static unsigned int nr_free_zone_pages(int offset) | 2816 | /** |
| 2817 | * nr_free_zone_pages - count number of pages beyond high watermark | ||
| 2818 | * @offset: The zone index of the highest zone | ||
| 2819 | * | ||
| 2820 | * nr_free_zone_pages() counts the number of pages which are beyond the | ||
| 2821 | * high watermark within all zones at or below a given zone index. For each | ||
| 2822 | * zone, the number of pages is calculated as: | ||
| 2823 | * managed_pages - high_pages | ||
| 2824 | */ | ||
| 2825 | static unsigned long nr_free_zone_pages(int offset) | ||
| 2794 | { | 2826 | { |
| 2795 | struct zoneref *z; | 2827 | struct zoneref *z; |
| 2796 | struct zone *zone; | 2828 | struct zone *zone; |
| 2797 | 2829 | ||
| 2798 | /* Just pick one node, since fallback list is circular */ | 2830 | /* Just pick one node, since fallback list is circular */ |
| 2799 | unsigned int sum = 0; | 2831 | unsigned long sum = 0; |
| 2800 | 2832 | ||
| 2801 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); | 2833 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); |
| 2802 | 2834 | ||
| 2803 | for_each_zone_zonelist(zone, z, zonelist, offset) { | 2835 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
| 2804 | unsigned long size = zone->present_pages; | 2836 | unsigned long size = zone->managed_pages; |
| 2805 | unsigned long high = high_wmark_pages(zone); | 2837 | unsigned long high = high_wmark_pages(zone); |
| 2806 | if (size > high) | 2838 | if (size > high) |
| 2807 | sum += size - high; | 2839 | sum += size - high; |
| @@ -2810,19 +2842,25 @@ static unsigned int nr_free_zone_pages(int offset) | |||
| 2810 | return sum; | 2842 | return sum; |
| 2811 | } | 2843 | } |
| 2812 | 2844 | ||
| 2813 | /* | 2845 | /** |
| 2814 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL | 2846 | * nr_free_buffer_pages - count number of pages beyond high watermark |
| 2847 | * | ||
| 2848 | * nr_free_buffer_pages() counts the number of pages which are beyond the high | ||
| 2849 | * watermark within ZONE_DMA and ZONE_NORMAL. | ||
| 2815 | */ | 2850 | */ |
| 2816 | unsigned int nr_free_buffer_pages(void) | 2851 | unsigned long nr_free_buffer_pages(void) |
| 2817 | { | 2852 | { |
| 2818 | return nr_free_zone_pages(gfp_zone(GFP_USER)); | 2853 | return nr_free_zone_pages(gfp_zone(GFP_USER)); |
| 2819 | } | 2854 | } |
| 2820 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); | 2855 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); |
| 2821 | 2856 | ||
| 2822 | /* | 2857 | /** |
| 2823 | * Amount of free RAM allocatable within all zones | 2858 | * nr_free_pagecache_pages - count number of pages beyond high watermark |
| 2859 | * | ||
| 2860 | * nr_free_pagecache_pages() counts the number of pages which are beyond the | ||
| 2861 | * high watermark within all zones. | ||
| 2824 | */ | 2862 | */ |
| 2825 | unsigned int nr_free_pagecache_pages(void) | 2863 | unsigned long nr_free_pagecache_pages(void) |
| 2826 | { | 2864 | { |
| 2827 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); | 2865 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); |
| 2828 | } | 2866 | } |
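The kernel-doc added for nr_free_zone_pages() above can be made concrete: for every zone usable by the allocation type, count the managed pages above the high watermark and sum the results. A standalone sketch with made-up numbers:

    /* Worked example of the calculation documented above; the zone
     * sizes and watermarks are invented. */
    #include <stdio.h>

    struct zone_sketch {
            const char *name;
            unsigned long managed_pages;
            unsigned long high_wmark;
    };

    int main(void)
    {
            struct zone_sketch zones[] = {
                    { "DMA",    3968,    40 },
                    { "Normal", 215000, 2200 },
            };
            unsigned long sum = 0;

            for (unsigned int i = 0; i < sizeof(zones) / sizeof(zones[0]); i++)
                    if (zones[i].managed_pages > zones[i].high_wmark)
                            sum += zones[i].managed_pages - zones[i].high_wmark;

            printf("pages beyond the high watermark: %lu\n", sum);
            return 0;
    }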
| @@ -2854,7 +2892,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
| 2854 | val->totalram = pgdat->node_present_pages; | 2892 | val->totalram = pgdat->node_present_pages; |
| 2855 | val->freeram = node_page_state(nid, NR_FREE_PAGES); | 2893 | val->freeram = node_page_state(nid, NR_FREE_PAGES); |
| 2856 | #ifdef CONFIG_HIGHMEM | 2894 | #ifdef CONFIG_HIGHMEM |
| 2857 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; | 2895 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; |
| 2858 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], | 2896 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], |
| 2859 | NR_FREE_PAGES); | 2897 | NR_FREE_PAGES); |
| 2860 | #else | 2898 | #else |
| @@ -2897,7 +2935,9 @@ static void show_migration_types(unsigned char type) | |||
| 2897 | #ifdef CONFIG_CMA | 2935 | #ifdef CONFIG_CMA |
| 2898 | [MIGRATE_CMA] = 'C', | 2936 | [MIGRATE_CMA] = 'C', |
| 2899 | #endif | 2937 | #endif |
| 2938 | #ifdef CONFIG_MEMORY_ISOLATION | ||
| 2900 | [MIGRATE_ISOLATE] = 'I', | 2939 | [MIGRATE_ISOLATE] = 'I', |
| 2940 | #endif | ||
| 2901 | }; | 2941 | }; |
| 2902 | char tmp[MIGRATE_TYPES + 1]; | 2942 | char tmp[MIGRATE_TYPES + 1]; |
| 2903 | char *p = tmp; | 2943 | char *p = tmp; |
| @@ -3236,7 +3276,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
| 3236 | { | 3276 | { |
| 3237 | int n, val; | 3277 | int n, val; |
| 3238 | int min_val = INT_MAX; | 3278 | int min_val = INT_MAX; |
| 3239 | int best_node = -1; | 3279 | int best_node = NUMA_NO_NODE; |
| 3240 | const struct cpumask *tmp = cpumask_of_node(0); | 3280 | const struct cpumask *tmp = cpumask_of_node(0); |
| 3241 | 3281 | ||
| 3242 | /* Use the local node if we haven't already */ | 3282 | /* Use the local node if we haven't already */ |
| @@ -3780,7 +3820,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
| 3780 | * the block. | 3820 | * the block. |
| 3781 | */ | 3821 | */ |
| 3782 | start_pfn = zone->zone_start_pfn; | 3822 | start_pfn = zone->zone_start_pfn; |
| 3783 | end_pfn = start_pfn + zone->spanned_pages; | 3823 | end_pfn = zone_end_pfn(zone); |
| 3784 | start_pfn = roundup(start_pfn, pageblock_nr_pages); | 3824 | start_pfn = roundup(start_pfn, pageblock_nr_pages); |
| 3785 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> | 3825 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> |
| 3786 | pageblock_order; | 3826 | pageblock_order; |
| @@ -3876,8 +3916,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
| 3876 | set_page_links(page, zone, nid, pfn); | 3916 | set_page_links(page, zone, nid, pfn); |
| 3877 | mminit_verify_page_links(page, zone, nid, pfn); | 3917 | mminit_verify_page_links(page, zone, nid, pfn); |
| 3878 | init_page_count(page); | 3918 | init_page_count(page); |
| 3879 | reset_page_mapcount(page); | 3919 | page_mapcount_reset(page); |
| 3880 | reset_page_last_nid(page); | 3920 | page_nid_reset_last(page); |
| 3881 | SetPageReserved(page); | 3921 | SetPageReserved(page); |
| 3882 | /* | 3922 | /* |
| 3883 | * Mark the block movable so that blocks are reserved for | 3923 | * Mark the block movable so that blocks are reserved for |
| @@ -3894,7 +3934,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
| 3894 | * pfn out of zone. | 3934 | * pfn out of zone. |
| 3895 | */ | 3935 | */ |
| 3896 | if ((z->zone_start_pfn <= pfn) | 3936 | if ((z->zone_start_pfn <= pfn) |
| 3897 | && (pfn < z->zone_start_pfn + z->spanned_pages) | 3937 | && (pfn < zone_end_pfn(z)) |
| 3898 | && !(pfn & (pageblock_nr_pages - 1))) | 3938 | && !(pfn & (pageblock_nr_pages - 1))) |
| 3899 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 3939 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
| 3900 | 3940 | ||
| @@ -3932,7 +3972,7 @@ static int __meminit zone_batchsize(struct zone *zone) | |||
| 3932 | * | 3972 | * |
| 3933 | * OK, so we don't know how big the cache is. So guess. | 3973 | * OK, so we don't know how big the cache is. So guess. |
| 3934 | */ | 3974 | */ |
| 3935 | batch = zone->present_pages / 1024; | 3975 | batch = zone->managed_pages / 1024; |
| 3936 | if (batch * PAGE_SIZE > 512 * 1024) | 3976 | if (batch * PAGE_SIZE > 512 * 1024) |
| 3937 | batch = (512 * 1024) / PAGE_SIZE; | 3977 | batch = (512 * 1024) / PAGE_SIZE; |
| 3938 | batch /= 4; /* We effectively *= 4 below */ | 3978 | batch /= 4; /* We effectively *= 4 below */ |
| @@ -4016,7 +4056,7 @@ static void __meminit setup_zone_pageset(struct zone *zone) | |||
| 4016 | 4056 | ||
| 4017 | if (percpu_pagelist_fraction) | 4057 | if (percpu_pagelist_fraction) |
| 4018 | setup_pagelist_highmark(pcp, | 4058 | setup_pagelist_highmark(pcp, |
| 4019 | (zone->present_pages / | 4059 | (zone->managed_pages / |
| 4020 | percpu_pagelist_fraction)); | 4060 | percpu_pagelist_fraction)); |
| 4021 | } | 4061 | } |
| 4022 | } | 4062 | } |
| @@ -4372,6 +4412,77 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
| 4372 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | 4412 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); |
| 4373 | } | 4413 | } |
| 4374 | 4414 | ||
| 4415 | /** | ||
| 4416 | * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array. | ||
| 4417 | * | ||
| 4418 | * zone_movable_limit is initialized to 0. This function will try to get | ||
| 4419 | * the first ZONE_MOVABLE pfn of each node from movablemem_map, and | ||
| 4420 | * assign it to zone_movable_limit[nid]. | ||
| 4421 | * zone_movable_limit[nid] == 0 means no limit for the node. | ||
| 4422 | * | ||
| 4423 | * Note: Each range is represented as [start_pfn, end_pfn) | ||
| 4424 | */ | ||
| 4425 | static void __meminit sanitize_zone_movable_limit(void) | ||
| 4426 | { | ||
| 4427 | int map_pos = 0, i, nid; | ||
| 4428 | unsigned long start_pfn, end_pfn; | ||
| 4429 | |||
| 4430 | if (!movablemem_map.nr_map) | ||
| 4431 | return; | ||
| 4432 | |||
| 4433 | /* Iterate all ranges from minimum to maximum */ | ||
| 4434 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { | ||
| 4435 | /* | ||
| 4436 | * If we have already found the lowest pfn of ZONE_MOVABLE for the | ||
| 4437 | * node specified by the user, just go on to check the next range. | ||
| 4438 | */ | ||
| 4439 | if (zone_movable_limit[nid]) | ||
| 4440 | continue; | ||
| 4441 | |||
| 4442 | #ifdef CONFIG_ZONE_DMA | ||
| 4443 | /* Skip DMA memory. */ | ||
| 4444 | if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA]) | ||
| 4445 | start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA]; | ||
| 4446 | #endif | ||
| 4447 | |||
| 4448 | #ifdef CONFIG_ZONE_DMA32 | ||
| 4449 | /* Skip DMA32 memory. */ | ||
| 4450 | if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32]) | ||
| 4451 | start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32]; | ||
| 4452 | #endif | ||
| 4453 | |||
| 4454 | #ifdef CONFIG_HIGHMEM | ||
| 4455 | /* Skip lowmem if ZONE_MOVABLE is highmem. */ | ||
| 4456 | if (zone_movable_is_highmem() && | ||
| 4457 | start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]) | ||
| 4458 | start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]; | ||
| 4459 | #endif | ||
| 4460 | |||
| 4461 | if (start_pfn >= end_pfn) | ||
| 4462 | continue; | ||
| 4463 | |||
| 4464 | while (map_pos < movablemem_map.nr_map) { | ||
| 4465 | if (end_pfn <= movablemem_map.map[map_pos].start_pfn) | ||
| 4466 | break; | ||
| 4467 | |||
| 4468 | if (start_pfn >= movablemem_map.map[map_pos].end_pfn) { | ||
| 4469 | map_pos++; | ||
| 4470 | continue; | ||
| 4471 | } | ||
| 4472 | |||
| 4473 | /* | ||
| 4474 | * The start_pfn of ZONE_MOVABLE is either the minimum | ||
| 4475 | * pfn specified by movablemem_map, or 0, which means | ||
| 4476 | * the node has no ZONE_MOVABLE. | ||
| 4477 | */ | ||
| 4478 | zone_movable_limit[nid] = max(start_pfn, | ||
| 4479 | movablemem_map.map[map_pos].start_pfn); | ||
| 4480 | |||
| 4481 | break; | ||
| 4482 | } | ||
| 4483 | } | ||
| 4484 | } | ||
| 4485 | |||
| 4375 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4486 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
| 4376 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, | 4487 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, |
| 4377 | unsigned long zone_type, | 4488 | unsigned long zone_type, |
| @@ -4389,7 +4500,6 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
| 4389 | 4500 | ||
| 4390 | return zholes_size[zone_type]; | 4501 | return zholes_size[zone_type]; |
| 4391 | } | 4502 | } |
| 4392 | |||
| 4393 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4503 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
| 4394 | 4504 | ||
| 4395 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | 4505 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, |
| @@ -4573,7 +4683,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 4573 | nr_all_pages += freesize; | 4683 | nr_all_pages += freesize; |
| 4574 | 4684 | ||
| 4575 | zone->spanned_pages = size; | 4685 | zone->spanned_pages = size; |
| 4576 | zone->present_pages = freesize; | 4686 | zone->present_pages = realsize; |
| 4577 | /* | 4687 | /* |
| 4578 | * Set an approximate value for lowmem here, it will be adjusted | 4688 | * Set an approximate value for lowmem here, it will be adjusted |
| 4579 | * when the bootmem allocator frees pages into the buddy system. | 4689 | * when the bootmem allocator frees pages into the buddy system. |
| @@ -4625,7 +4735,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
| 4625 | * for the buddy allocator to function correctly. | 4735 | * for the buddy allocator to function correctly. |
| 4626 | */ | 4736 | */ |
| 4627 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); | 4737 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); |
| 4628 | end = pgdat->node_start_pfn + pgdat->node_spanned_pages; | 4738 | end = pgdat_end_pfn(pgdat); |
| 4629 | end = ALIGN(end, MAX_ORDER_NR_PAGES); | 4739 | end = ALIGN(end, MAX_ORDER_NR_PAGES); |
| 4630 | size = (end - start) * sizeof(struct page); | 4740 | size = (end - start) * sizeof(struct page); |
| 4631 | map = alloc_remap(pgdat->node_id, size); | 4741 | map = alloc_remap(pgdat->node_id, size); |
| @@ -4831,12 +4941,19 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
| 4831 | required_kernelcore = max(required_kernelcore, corepages); | 4941 | required_kernelcore = max(required_kernelcore, corepages); |
| 4832 | } | 4942 | } |
| 4833 | 4943 | ||
| 4834 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ | 4944 | /* |
| 4835 | if (!required_kernelcore) | 4945 | * If neither kernelcore/movablecore nor movablemem_map is specified, |
| 4946 | * there is no ZONE_MOVABLE. But if movablemem_map is specified, the | ||
| 4947 | * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[]. | ||
| 4948 | */ | ||
| 4949 | if (!required_kernelcore) { | ||
| 4950 | if (movablemem_map.nr_map) | ||
| 4951 | memcpy(zone_movable_pfn, zone_movable_limit, | ||
| 4952 | sizeof(zone_movable_pfn)); | ||
| 4836 | goto out; | 4953 | goto out; |
| 4954 | } | ||
| 4837 | 4955 | ||
| 4838 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 4956 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
| 4839 | find_usable_zone_for_movable(); | ||
| 4840 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; | 4957 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; |
| 4841 | 4958 | ||
| 4842 | restart: | 4959 | restart: |
| @@ -4864,10 +4981,24 @@ restart: | |||
| 4864 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { | 4981 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { |
| 4865 | unsigned long size_pages; | 4982 | unsigned long size_pages; |
| 4866 | 4983 | ||
| 4984 | /* | ||
| 4985 | * Find more memory for kernelcore in | ||
| 4986 | * [zone_movable_pfn[nid], zone_movable_limit[nid]). | ||
| 4987 | */ | ||
| 4867 | start_pfn = max(start_pfn, zone_movable_pfn[nid]); | 4988 | start_pfn = max(start_pfn, zone_movable_pfn[nid]); |
| 4868 | if (start_pfn >= end_pfn) | 4989 | if (start_pfn >= end_pfn) |
| 4869 | continue; | 4990 | continue; |
| 4870 | 4991 | ||
| 4992 | if (zone_movable_limit[nid]) { | ||
| 4993 | end_pfn = min(end_pfn, zone_movable_limit[nid]); | ||
| 4994 | /* No range left for kernelcore in this node */ | ||
| 4995 | if (start_pfn >= end_pfn) { | ||
| 4996 | zone_movable_pfn[nid] = | ||
| 4997 | zone_movable_limit[nid]; | ||
| 4998 | break; | ||
| 4999 | } | ||
| 5000 | } | ||
| 5001 | |||
| 4871 | /* Account for what is only usable for kernelcore */ | 5002 | /* Account for what is only usable for kernelcore */ |
| 4872 | if (start_pfn < usable_startpfn) { | 5003 | if (start_pfn < usable_startpfn) { |
| 4873 | unsigned long kernel_pages; | 5004 | unsigned long kernel_pages; |
| @@ -4927,12 +5058,12 @@ restart: | |||
| 4927 | if (usable_nodes && required_kernelcore > usable_nodes) | 5058 | if (usable_nodes && required_kernelcore > usable_nodes) |
| 4928 | goto restart; | 5059 | goto restart; |
| 4929 | 5060 | ||
| 5061 | out: | ||
| 4930 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ | 5062 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ |
| 4931 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 5063 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
| 4932 | zone_movable_pfn[nid] = | 5064 | zone_movable_pfn[nid] = |
| 4933 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); | 5065 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
| 4934 | 5066 | ||
| 4935 | out: | ||
| 4936 | /* restore the node_state */ | 5067 | /* restore the node_state */ |
| 4937 | node_states[N_MEMORY] = saved_node_state; | 5068 | node_states[N_MEMORY] = saved_node_state; |
| 4938 | } | 5069 | } |
| @@ -4995,6 +5126,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
| 4995 | 5126 | ||
| 4996 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ | 5127 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ |
| 4997 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); | 5128 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); |
| 5129 | find_usable_zone_for_movable(); | ||
| 5130 | sanitize_zone_movable_limit(); | ||
| 4998 | find_zone_movable_pfns_for_nodes(); | 5131 | find_zone_movable_pfns_for_nodes(); |
| 4999 | 5132 | ||
| 5000 | /* Print out the zone ranges */ | 5133 | /* Print out the zone ranges */ |
| @@ -5078,6 +5211,181 @@ static int __init cmdline_parse_movablecore(char *p) | |||
| 5078 | early_param("kernelcore", cmdline_parse_kernelcore); | 5211 | early_param("kernelcore", cmdline_parse_kernelcore); |
| 5079 | early_param("movablecore", cmdline_parse_movablecore); | 5212 | early_param("movablecore", cmdline_parse_movablecore); |
| 5080 | 5213 | ||
| 5214 | /** | ||
| 5215 | * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[]. | ||
| 5216 | * @start_pfn: start pfn of the range to be checked | ||
| 5217 | * @end_pfn: end pfn of the range to be checked (exclusive) | ||
| 5218 | * | ||
| 5219 | * This function checks if a given memory range [start_pfn, end_pfn) overlaps | ||
| 5220 | * the movablemem_map.map[] array. | ||
| 5221 | * | ||
| 5222 | * Return: index of the first overlapped element in movablemem_map.map[] | ||
| 5223 | * or -1 if they don't overlap each other. | ||
| 5224 | */ | ||
| 5225 | int __init movablemem_map_overlap(unsigned long start_pfn, | ||
| 5226 | unsigned long end_pfn) | ||
| 5227 | { | ||
| 5228 | int overlap; | ||
| 5229 | |||
| 5230 | if (!movablemem_map.nr_map) | ||
| 5231 | return -1; | ||
| 5232 | |||
| 5233 | for (overlap = 0; overlap < movablemem_map.nr_map; overlap++) | ||
| 5234 | if (start_pfn < movablemem_map.map[overlap].end_pfn) | ||
| 5235 | break; | ||
| 5236 | |||
| 5237 | if (overlap == movablemem_map.nr_map || | ||
| 5238 | end_pfn <= movablemem_map.map[overlap].start_pfn) | ||
| 5239 | return -1; | ||
| 5240 | |||
| 5241 | return overlap; | ||
| 5242 | } | ||
| 5243 | |||
| 5244 | /** | ||
| 5245 | * insert_movablemem_map - Insert a memory range into movablemem_map.map. | ||
| 5246 | * @start_pfn: start pfn of the range | ||
| 5247 | * @end_pfn: end pfn of the range | ||
| 5248 | * | ||
| 5249 | * This function will also merge the overlapped ranges, and sort the array | ||
| 5250 | * by start_pfn in monotonic increasing order. | ||
| 5251 | */ | ||
| 5252 | void __init insert_movablemem_map(unsigned long start_pfn, | ||
| 5253 | unsigned long end_pfn) | ||
| 5254 | { | ||
| 5255 | int pos, overlap; | ||
| 5256 | |||
| 5257 | /* | ||
| 5258 | * pos will be at the 1st overlapped range, or the position | ||
| 5259 | * where the element should be inserted. | ||
| 5260 | */ | ||
| 5261 | for (pos = 0; pos < movablemem_map.nr_map; pos++) | ||
| 5262 | if (start_pfn <= movablemem_map.map[pos].end_pfn) | ||
| 5263 | break; | ||
| 5264 | |||
| 5265 | /* If there is no overlapped range, just insert the element. */ | ||
| 5266 | if (pos == movablemem_map.nr_map || | ||
| 5267 | end_pfn < movablemem_map.map[pos].start_pfn) { | ||
| 5268 | /* | ||
| 5269 | * If pos is not at the end of the array, we need to move | ||
| 5270 | * all the remaining elements backward. | ||
| 5271 | */ | ||
| 5272 | if (pos < movablemem_map.nr_map) | ||
| 5273 | memmove(&movablemem_map.map[pos+1], | ||
| 5274 | &movablemem_map.map[pos], | ||
| 5275 | sizeof(struct movablemem_entry) * | ||
| 5276 | (movablemem_map.nr_map - pos)); | ||
| 5277 | movablemem_map.map[pos].start_pfn = start_pfn; | ||
| 5278 | movablemem_map.map[pos].end_pfn = end_pfn; | ||
| 5279 | movablemem_map.nr_map++; | ||
| 5280 | return; | ||
| 5281 | } | ||
| 5282 | |||
| 5283 | /* overlap will be at the last overlapped range */ | ||
| 5284 | for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++) | ||
| 5285 | if (end_pfn < movablemem_map.map[overlap].start_pfn) | ||
| 5286 | break; | ||
| 5287 | |||
| 5288 | /* | ||
| 5289 | * If more ranges overlap, we need to merge them and move | ||
| 5290 | * the remaining elements forward. | ||
| 5291 | */ | ||
| 5292 | overlap--; | ||
| 5293 | movablemem_map.map[pos].start_pfn = min(start_pfn, | ||
| 5294 | movablemem_map.map[pos].start_pfn); | ||
| 5295 | movablemem_map.map[pos].end_pfn = max(end_pfn, | ||
| 5296 | movablemem_map.map[overlap].end_pfn); | ||
| 5297 | |||
| 5298 | if (pos != overlap && overlap + 1 != movablemem_map.nr_map) | ||
| 5299 | memmove(&movablemem_map.map[pos+1], | ||
| 5300 | &movablemem_map.map[overlap+1], | ||
| 5301 | sizeof(struct movablemem_entry) * | ||
| 5302 | (movablemem_map.nr_map - overlap - 1)); | ||
| 5303 | |||
| 5304 | movablemem_map.nr_map -= overlap - pos; | ||
| 5305 | } | ||
| 5306 | |||
| 5307 | /** | ||
| 5308 | * movablemem_map_add_region - Add a memory range into movablemem_map. | ||
| 5309 | * @start: physical start address of range | ||
| 5310 | * @size: size of the range in bytes | ||
| 5311 | * | ||
| 5312 | * This function transforms the physical address range into pfns, and then adds the | ||
| 5313 | * range into movablemem_map by calling insert_movablemem_map(). | ||
| 5314 | */ | ||
| 5315 | static void __init movablemem_map_add_region(u64 start, u64 size) | ||
| 5316 | { | ||
| 5317 | unsigned long start_pfn, end_pfn; | ||
| 5318 | |||
| 5319 | /* In case size == 0 or start + size overflows */ | ||
| 5320 | if (start + size <= start) | ||
| 5321 | return; | ||
| 5322 | |||
| 5323 | if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) { | ||
| 5324 | pr_err("movablemem_map: too many entries;" | ||
| 5325 | " ignoring [mem %#010llx-%#010llx]\n", | ||
| 5326 | (unsigned long long) start, | ||
| 5327 | (unsigned long long) (start + size - 1)); | ||
| 5328 | return; | ||
| 5329 | } | ||
| 5330 | |||
| 5331 | start_pfn = PFN_DOWN(start); | ||
| 5332 | end_pfn = PFN_UP(start + size); | ||
| 5333 | insert_movablemem_map(start_pfn, end_pfn); | ||
| 5334 | } | ||
| 5335 | |||
| 5336 | /* | ||
| 5337 | * cmdline_parse_movablemem_map - Parse boot option movablemem_map. | ||
| 5338 | * @p: The boot option of the following format: | ||
| 5339 | * movablemem_map=nn[KMG]@ss[KMG] | ||
| 5340 | * | ||
| 5341 | * This option sets the memory range [ss, ss+nn) to be used as movable memory. | ||
| 5342 | * | ||
| 5343 | * Return: 0 on success or -EINVAL on failure. | ||
| 5344 | */ | ||
| 5345 | static int __init cmdline_parse_movablemem_map(char *p) | ||
| 5346 | { | ||
| 5347 | char *oldp; | ||
| 5348 | u64 start_at, mem_size; | ||
| 5349 | |||
| 5350 | if (!p) | ||
| 5351 | goto err; | ||
| 5352 | |||
| 5353 | if (!strcmp(p, "acpi")) | ||
| 5354 | movablemem_map.acpi = true; | ||
| 5355 | |||
| 5356 | /* | ||
| 5357 | * If the user decides to use the info from the BIOS, all the other | ||
| 5358 | * user-specified ranges will be ignored. | ||
| 5359 | */ | ||
| 5360 | if (movablemem_map.acpi) { | ||
| 5361 | if (movablemem_map.nr_map) { | ||
| 5362 | memset(movablemem_map.map, 0, | ||
| 5363 | sizeof(struct movablemem_entry) | ||
| 5364 | * movablemem_map.nr_map); | ||
| 5365 | movablemem_map.nr_map = 0; | ||
| 5366 | } | ||
| 5367 | return 0; | ||
| 5368 | } | ||
| 5369 | |||
| 5370 | oldp = p; | ||
| 5371 | mem_size = memparse(p, &p); | ||
| 5372 | if (p == oldp) | ||
| 5373 | goto err; | ||
| 5374 | |||
| 5375 | if (*p == '@') { | ||
| 5376 | oldp = ++p; | ||
| 5377 | start_at = memparse(p, &p); | ||
| 5378 | if (p == oldp || *p != '\0') | ||
| 5379 | goto err; | ||
| 5380 | |||
| 5381 | movablemem_map_add_region(start_at, mem_size); | ||
| 5382 | return 0; | ||
| 5383 | } | ||
| 5384 | err: | ||
| 5385 | return -EINVAL; | ||
| 5386 | } | ||
| 5387 | early_param("movablemem_map", cmdline_parse_movablemem_map); | ||
| 5388 | |||
| 5081 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 5389 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
| 5082 | 5390 | ||
| 5083 | /** | 5391 | /** |
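Taken together, the parsing code above accepts either movablemem_map=acpi (trust the firmware's hot-pluggable information and drop any user-specified ranges) or movablemem_map=nn[KMG]@ss[KMG]; for example, movablemem_map=4G@8G marks [8 GiB, 12 GiB) as movable, and repeated options accumulate in movablemem_map.map[]. The insert-and-merge behaviour of insert_movablemem_map() can be illustrated with a small standalone sketch (array capacity and types are illustrative):

    /* Standalone sketch of the insert-and-merge behaviour implemented by
     * insert_movablemem_map() above: ranges are kept sorted by start and
     * overlapping/adjacent entries are coalesced. */
    #include <stdio.h>
    #include <string.h>

    struct range { unsigned long start, end; };     /* [start, end) */

    static struct range map[32];
    static int nr_map;

    static void insert_range(unsigned long start, unsigned long end)
    {
            int pos, last;

            for (pos = 0; pos < nr_map; pos++)      /* first entry we may touch */
                    if (start <= map[pos].end)
                            break;

            if (pos == nr_map || end < map[pos].start) {    /* no overlap */
                    memmove(&map[pos + 1], &map[pos],
                            (nr_map - pos) * sizeof(map[0]));
                    map[pos].start = start;
                    map[pos].end = end;
                    nr_map++;
                    return;
            }

            for (last = pos + 1; last < nr_map; last++)     /* last overlap */
                    if (end < map[last].start)
                            break;
            last--;

            map[pos].start = start < map[pos].start ? start : map[pos].start;
            map[pos].end = end > map[last].end ? end : map[last].end;
            memmove(&map[pos + 1], &map[last + 1],
                    (nr_map - last - 1) * sizeof(map[0]));
            nr_map -= last - pos;
    }

    int main(void)
    {
            insert_range(10, 20);
            insert_range(40, 50);
            insert_range(15, 45);                   /* merges both entries */
            for (int i = 0; i < nr_map; i++)
                    printf("[%lu, %lu)\n", map[i].start, map[i].end);
            return 0;
    }

The sketch prints a single merged range [10, 50), which is what overlapping movablemem_map= ranges would collapse to.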
| @@ -5160,8 +5468,8 @@ static void calculate_totalreserve_pages(void) | |||
| 5160 | /* we treat the high watermark as reserved pages. */ | 5468 | /* we treat the high watermark as reserved pages. */ |
| 5161 | max += high_wmark_pages(zone); | 5469 | max += high_wmark_pages(zone); |
| 5162 | 5470 | ||
| 5163 | if (max > zone->present_pages) | 5471 | if (max > zone->managed_pages) |
| 5164 | max = zone->present_pages; | 5472 | max = zone->managed_pages; |
| 5165 | reserve_pages += max; | 5473 | reserve_pages += max; |
| 5166 | /* | 5474 | /* |
| 5167 | * Lowmem reserves are not available to | 5475 | * Lowmem reserves are not available to |
| @@ -5193,7 +5501,7 @@ static void setup_per_zone_lowmem_reserve(void) | |||
| 5193 | for_each_online_pgdat(pgdat) { | 5501 | for_each_online_pgdat(pgdat) { |
| 5194 | for (j = 0; j < MAX_NR_ZONES; j++) { | 5502 | for (j = 0; j < MAX_NR_ZONES; j++) { |
| 5195 | struct zone *zone = pgdat->node_zones + j; | 5503 | struct zone *zone = pgdat->node_zones + j; |
| 5196 | unsigned long present_pages = zone->present_pages; | 5504 | unsigned long managed_pages = zone->managed_pages; |
| 5197 | 5505 | ||
| 5198 | zone->lowmem_reserve[j] = 0; | 5506 | zone->lowmem_reserve[j] = 0; |
| 5199 | 5507 | ||
| @@ -5207,9 +5515,9 @@ static void setup_per_zone_lowmem_reserve(void) | |||
| 5207 | sysctl_lowmem_reserve_ratio[idx] = 1; | 5515 | sysctl_lowmem_reserve_ratio[idx] = 1; |
| 5208 | 5516 | ||
| 5209 | lower_zone = pgdat->node_zones + idx; | 5517 | lower_zone = pgdat->node_zones + idx; |
| 5210 | lower_zone->lowmem_reserve[j] = present_pages / | 5518 | lower_zone->lowmem_reserve[j] = managed_pages / |
| 5211 | sysctl_lowmem_reserve_ratio[idx]; | 5519 | sysctl_lowmem_reserve_ratio[idx]; |
| 5212 | present_pages += lower_zone->present_pages; | 5520 | managed_pages += lower_zone->managed_pages; |
| 5213 | } | 5521 | } |
| 5214 | } | 5522 | } |
| 5215 | } | 5523 | } |
| @@ -5228,14 +5536,14 @@ static void __setup_per_zone_wmarks(void) | |||
| 5228 | /* Calculate total number of !ZONE_HIGHMEM pages */ | 5536 | /* Calculate total number of !ZONE_HIGHMEM pages */ |
| 5229 | for_each_zone(zone) { | 5537 | for_each_zone(zone) { |
| 5230 | if (!is_highmem(zone)) | 5538 | if (!is_highmem(zone)) |
| 5231 | lowmem_pages += zone->present_pages; | 5539 | lowmem_pages += zone->managed_pages; |
| 5232 | } | 5540 | } |
| 5233 | 5541 | ||
| 5234 | for_each_zone(zone) { | 5542 | for_each_zone(zone) { |
| 5235 | u64 tmp; | 5543 | u64 tmp; |
| 5236 | 5544 | ||
| 5237 | spin_lock_irqsave(&zone->lock, flags); | 5545 | spin_lock_irqsave(&zone->lock, flags); |
| 5238 | tmp = (u64)pages_min * zone->present_pages; | 5546 | tmp = (u64)pages_min * zone->managed_pages; |
| 5239 | do_div(tmp, lowmem_pages); | 5547 | do_div(tmp, lowmem_pages); |
| 5240 | if (is_highmem(zone)) { | 5548 | if (is_highmem(zone)) { |
| 5241 | /* | 5549 | /* |
| @@ -5247,13 +5555,10 @@ static void __setup_per_zone_wmarks(void) | |||
| 5247 | * deltas controls asynch page reclaim, and so should | 5555 | * deltas controls asynch page reclaim, and so should |
| 5248 | * not be capped for highmem. | 5556 | * not be capped for highmem. |
| 5249 | */ | 5557 | */ |
| 5250 | int min_pages; | 5558 | unsigned long min_pages; |
| 5251 | 5559 | ||
| 5252 | min_pages = zone->present_pages / 1024; | 5560 | min_pages = zone->managed_pages / 1024; |
| 5253 | if (min_pages < SWAP_CLUSTER_MAX) | 5561 | min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); |
| 5254 | min_pages = SWAP_CLUSTER_MAX; | ||
| 5255 | if (min_pages > 128) | ||
| 5256 | min_pages = 128; | ||
| 5257 | zone->watermark[WMARK_MIN] = min_pages; | 5562 | zone->watermark[WMARK_MIN] = min_pages; |
| 5258 | } else { | 5563 | } else { |
| 5259 | /* | 5564 | /* |
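The clamp() conversion in the highmem branch above is behaviour-preserving: clamp(val, lo, hi) bounds val to [lo, hi], exactly what the two removed if statements did with lo = SWAP_CLUSTER_MAX and hi = 128. A simplified stand-in (the kernel's clamp() is a macro that also enforces matching types):

    /* Simplified equivalent of clamp() for unsigned long values. */
    static unsigned long clamp_ul(unsigned long val, unsigned long lo,
                                  unsigned long hi)
    {
            if (val < lo)
                    return lo;
            if (val > hi)
                    return hi;
            return val;
    }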
| @@ -5314,7 +5619,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone) | |||
| 5314 | unsigned int gb, ratio; | 5619 | unsigned int gb, ratio; |
| 5315 | 5620 | ||
| 5316 | /* Zone size in gigabytes */ | 5621 | /* Zone size in gigabytes */ |
| 5317 | gb = zone->present_pages >> (30 - PAGE_SHIFT); | 5622 | gb = zone->managed_pages >> (30 - PAGE_SHIFT); |
| 5318 | if (gb) | 5623 | if (gb) |
| 5319 | ratio = int_sqrt(10 * gb); | 5624 | ratio = int_sqrt(10 * gb); |
| 5320 | else | 5625 | else |
| @@ -5400,7 +5705,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | |||
| 5400 | return rc; | 5705 | return rc; |
| 5401 | 5706 | ||
| 5402 | for_each_zone(zone) | 5707 | for_each_zone(zone) |
| 5403 | zone->min_unmapped_pages = (zone->present_pages * | 5708 | zone->min_unmapped_pages = (zone->managed_pages * |
| 5404 | sysctl_min_unmapped_ratio) / 100; | 5709 | sysctl_min_unmapped_ratio) / 100; |
| 5405 | return 0; | 5710 | return 0; |
| 5406 | } | 5711 | } |
| @@ -5416,7 +5721,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | |||
| 5416 | return rc; | 5721 | return rc; |
| 5417 | 5722 | ||
| 5418 | for_each_zone(zone) | 5723 | for_each_zone(zone) |
| 5419 | zone->min_slab_pages = (zone->present_pages * | 5724 | zone->min_slab_pages = (zone->managed_pages * |
| 5420 | sysctl_min_slab_ratio) / 100; | 5725 | sysctl_min_slab_ratio) / 100; |
| 5421 | return 0; | 5726 | return 0; |
| 5422 | } | 5727 | } |
| @@ -5458,7 +5763,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
| 5458 | for_each_populated_zone(zone) { | 5763 | for_each_populated_zone(zone) { |
| 5459 | for_each_possible_cpu(cpu) { | 5764 | for_each_possible_cpu(cpu) { |
| 5460 | unsigned long high; | 5765 | unsigned long high; |
| 5461 | high = zone->present_pages / percpu_pagelist_fraction; | 5766 | high = zone->managed_pages / percpu_pagelist_fraction; |
| 5462 | setup_pagelist_highmark( | 5767 | setup_pagelist_highmark( |
| 5463 | per_cpu_ptr(zone->pageset, cpu), high); | 5768 | per_cpu_ptr(zone->pageset, cpu), high); |
| 5464 | } | 5769 | } |
| @@ -5645,8 +5950,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
| 5645 | pfn = page_to_pfn(page); | 5950 | pfn = page_to_pfn(page); |
| 5646 | bitmap = get_pageblock_bitmap(zone, pfn); | 5951 | bitmap = get_pageblock_bitmap(zone, pfn); |
| 5647 | bitidx = pfn_to_bitidx(zone, pfn); | 5952 | bitidx = pfn_to_bitidx(zone, pfn); |
| 5648 | VM_BUG_ON(pfn < zone->zone_start_pfn); | 5953 | VM_BUG_ON(!zone_spans_pfn(zone, pfn)); |
| 5649 | VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); | ||
| 5650 | 5954 | ||
| 5651 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 5955 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) |
| 5652 | if (flags & value) | 5956 | if (flags & value) |
| @@ -5744,8 +6048,7 @@ bool is_pageblock_removable_nolock(struct page *page) | |||
| 5744 | 6048 | ||
| 5745 | zone = page_zone(page); | 6049 | zone = page_zone(page); |
| 5746 | pfn = page_to_pfn(page); | 6050 | pfn = page_to_pfn(page); |
| 5747 | if (zone->zone_start_pfn > pfn || | 6051 | if (!zone_spans_pfn(zone, pfn)) |
| 5748 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | ||
| 5749 | return false; | 6052 | return false; |
| 5750 | 6053 | ||
| 5751 | return !has_unmovable_pages(zone, page, 0, true); | 6054 | return !has_unmovable_pages(zone, page, 0, true); |
| @@ -5801,14 +6104,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
| 5801 | &cc->migratepages); | 6104 | &cc->migratepages); |
| 5802 | cc->nr_migratepages -= nr_reclaimed; | 6105 | cc->nr_migratepages -= nr_reclaimed; |
| 5803 | 6106 | ||
| 5804 | ret = migrate_pages(&cc->migratepages, | 6107 | ret = migrate_pages(&cc->migratepages, alloc_migrate_target, |
| 5805 | alloc_migrate_target, | 6108 | 0, MIGRATE_SYNC, MR_CMA); |
| 5806 | 0, false, MIGRATE_SYNC, | ||
| 5807 | MR_CMA); | ||
| 5808 | } | 6109 | } |
| 5809 | 6110 | if (ret < 0) { | |
| 5810 | putback_movable_pages(&cc->migratepages); | 6111 | putback_movable_pages(&cc->migratepages); |
| 5811 | return ret > 0 ? 0 : ret; | 6112 | return ret; |
| 6113 | } | ||
| 6114 | return 0; | ||
| 5812 | } | 6115 | } |
| 5813 | 6116 | ||
| 5814 | /** | 6117 | /** |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -105,7 +105,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
| 105 | */ | 105 | */ |
| 106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { | 106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { |
| 107 | anon_vma_lock_write(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
| 108 | anon_vma_unlock(anon_vma); | 108 | anon_vma_unlock_write(anon_vma); |
| 109 | } | 109 | } |
| 110 | 110 | ||
| 111 | kmem_cache_free(anon_vma_cachep, anon_vma); | 111 | kmem_cache_free(anon_vma_cachep, anon_vma); |
| @@ -191,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
| 191 | avc = NULL; | 191 | avc = NULL; |
| 192 | } | 192 | } |
| 193 | spin_unlock(&mm->page_table_lock); | 193 | spin_unlock(&mm->page_table_lock); |
| 194 | anon_vma_unlock(anon_vma); | 194 | anon_vma_unlock_write(anon_vma); |
| 195 | 195 | ||
| 196 | if (unlikely(allocated)) | 196 | if (unlikely(allocated)) |
| 197 | put_anon_vma(allocated); | 197 | put_anon_vma(allocated); |
| @@ -308,7 +308,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
| 308 | vma->anon_vma = anon_vma; | 308 | vma->anon_vma = anon_vma; |
| 309 | anon_vma_lock_write(anon_vma); | 309 | anon_vma_lock_write(anon_vma); |
| 310 | anon_vma_chain_link(vma, avc, anon_vma); | 310 | anon_vma_chain_link(vma, avc, anon_vma); |
| 311 | anon_vma_unlock(anon_vma); | 311 | anon_vma_unlock_write(anon_vma); |
| 312 | 312 | ||
| 313 | return 0; | 313 | return 0; |
| 314 | 314 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index 5dd56f6efdbd..1ad79243cb7b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -335,19 +335,19 @@ static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, | |||
| 335 | pgoff_t start, unsigned int nr_pages, | 335 | pgoff_t start, unsigned int nr_pages, |
| 336 | struct page **pages, pgoff_t *indices) | 336 | struct page **pages, pgoff_t *indices) |
| 337 | { | 337 | { |
| 338 | unsigned int i; | 338 | void **slot; |
| 339 | unsigned int ret; | 339 | unsigned int ret = 0; |
| 340 | unsigned int nr_found; | 340 | struct radix_tree_iter iter; |
| 341 | |||
| 342 | if (!nr_pages) | ||
| 343 | return 0; | ||
| 341 | 344 | ||
| 342 | rcu_read_lock(); | 345 | rcu_read_lock(); |
| 343 | restart: | 346 | restart: |
| 344 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | 347 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { |
| 345 | (void ***)pages, indices, start, nr_pages); | ||
| 346 | ret = 0; | ||
| 347 | for (i = 0; i < nr_found; i++) { | ||
| 348 | struct page *page; | 348 | struct page *page; |
| 349 | repeat: | 349 | repeat: |
| 350 | page = radix_tree_deref_slot((void **)pages[i]); | 350 | page = radix_tree_deref_slot(slot); |
| 351 | if (unlikely(!page)) | 351 | if (unlikely(!page)) |
| 352 | continue; | 352 | continue; |
| 353 | if (radix_tree_exception(page)) { | 353 | if (radix_tree_exception(page)) { |
| @@ -364,17 +364,16 @@ repeat: | |||
| 364 | goto repeat; | 364 | goto repeat; |
| 365 | 365 | ||
| 366 | /* Has the page moved? */ | 366 | /* Has the page moved? */ |
| 367 | if (unlikely(page != *((void **)pages[i]))) { | 367 | if (unlikely(page != *slot)) { |
| 368 | page_cache_release(page); | 368 | page_cache_release(page); |
| 369 | goto repeat; | 369 | goto repeat; |
| 370 | } | 370 | } |
| 371 | export: | 371 | export: |
| 372 | indices[ret] = indices[i]; | 372 | indices[ret] = iter.index; |
| 373 | pages[ret] = page; | 373 | pages[ret] = page; |
| 374 | ret++; | 374 | if (++ret == nr_pages) |
| 375 | break; | ||
| 375 | } | 376 | } |
| 376 | if (unlikely(!ret && nr_found)) | ||
| 377 | goto restart; | ||
| 378 | rcu_read_unlock(); | 377 | rcu_read_unlock(); |
| 379 | return ret; | 378 | return ret; |
| 380 | } | 379 | } |
| @@ -2386,6 +2385,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
| 2386 | bool remount) | 2385 | bool remount) |
| 2387 | { | 2386 | { |
| 2388 | char *this_char, *value, *rest; | 2387 | char *this_char, *value, *rest; |
| 2388 | struct mempolicy *mpol = NULL; | ||
| 2389 | uid_t uid; | 2389 | uid_t uid; |
| 2390 | gid_t gid; | 2390 | gid_t gid; |
| 2391 | 2391 | ||
| @@ -2414,7 +2414,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
| 2414 | printk(KERN_ERR | 2414 | printk(KERN_ERR |
| 2415 | "tmpfs: No value for mount option '%s'\n", | 2415 | "tmpfs: No value for mount option '%s'\n", |
| 2416 | this_char); | 2416 | this_char); |
| 2417 | return 1; | 2417 | goto error; |
| 2418 | } | 2418 | } |
| 2419 | 2419 | ||
| 2420 | if (!strcmp(this_char,"size")) { | 2420 | if (!strcmp(this_char,"size")) { |
| @@ -2463,19 +2463,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
| 2463 | if (!gid_valid(sbinfo->gid)) | 2463 | if (!gid_valid(sbinfo->gid)) |
| 2464 | goto bad_val; | 2464 | goto bad_val; |
| 2465 | } else if (!strcmp(this_char,"mpol")) { | 2465 | } else if (!strcmp(this_char,"mpol")) { |
| 2466 | if (mpol_parse_str(value, &sbinfo->mpol)) | 2466 | mpol_put(mpol); |
| 2467 | mpol = NULL; | ||
| 2468 | if (mpol_parse_str(value, &mpol)) | ||
| 2467 | goto bad_val; | 2469 | goto bad_val; |
| 2468 | } else { | 2470 | } else { |
| 2469 | printk(KERN_ERR "tmpfs: Bad mount option %s\n", | 2471 | printk(KERN_ERR "tmpfs: Bad mount option %s\n", |
| 2470 | this_char); | 2472 | this_char); |
| 2471 | return 1; | 2473 | goto error; |
| 2472 | } | 2474 | } |
| 2473 | } | 2475 | } |
| 2476 | sbinfo->mpol = mpol; | ||
| 2474 | return 0; | 2477 | return 0; |
| 2475 | 2478 | ||
| 2476 | bad_val: | 2479 | bad_val: |
| 2477 | printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", | 2480 | printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", |
| 2478 | value, this_char); | 2481 | value, this_char); |
| 2482 | error: | ||
| 2483 | mpol_put(mpol); | ||
| 2479 | return 1; | 2484 | return 1; |
| 2480 | 2485 | ||
| 2481 | } | 2486 | } |
| @@ -2487,6 +2492,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | |||
| 2487 | unsigned long inodes; | 2492 | unsigned long inodes; |
| 2488 | int error = -EINVAL; | 2493 | int error = -EINVAL; |
| 2489 | 2494 | ||
| 2495 | config.mpol = NULL; | ||
| 2490 | if (shmem_parse_options(data, &config, true)) | 2496 | if (shmem_parse_options(data, &config, true)) |
| 2491 | return error; | 2497 | return error; |
| 2492 | 2498 | ||
| @@ -2511,8 +2517,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | |||
| 2511 | sbinfo->max_inodes = config.max_inodes; | 2517 | sbinfo->max_inodes = config.max_inodes; |
| 2512 | sbinfo->free_inodes = config.max_inodes - inodes; | 2518 | sbinfo->free_inodes = config.max_inodes - inodes; |
| 2513 | 2519 | ||
| 2514 | mpol_put(sbinfo->mpol); | 2520 | /* |
| 2515 | sbinfo->mpol = config.mpol; /* transfers initial ref */ | 2521 | * Preserve previous mempolicy unless mpol remount option was specified. |
| 2522 | */ | ||
| 2523 | if (config.mpol) { | ||
| 2524 | mpol_put(sbinfo->mpol); | ||
| 2525 | sbinfo->mpol = config.mpol; /* transfers initial ref */ | ||
| 2526 | } | ||
| 2516 | out: | 2527 | out: |
| 2517 | spin_unlock(&sbinfo->stat_lock); | 2528 | spin_unlock(&sbinfo->stat_lock); |
| 2518 | return error; | 2529 | return error; |
| @@ -2545,6 +2556,7 @@ static void shmem_put_super(struct super_block *sb) | |||
| 2545 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | 2556 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
| 2546 | 2557 | ||
| 2547 | percpu_counter_destroy(&sbinfo->used_blocks); | 2558 | percpu_counter_destroy(&sbinfo->used_blocks); |
| 2559 | mpol_put(sbinfo->mpol); | ||
| 2548 | kfree(sbinfo); | 2560 | kfree(sbinfo); |
| 2549 | sb->s_fs_info = NULL; | 2561 | sb->s_fs_info = NULL; |
| 2550 | } | 2562 | } |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
| @@ -360,7 +360,7 @@ static void slob_free(void *block, int size) | |||
| 360 | clear_slob_page_free(sp); | 360 | clear_slob_page_free(sp); |
| 361 | spin_unlock_irqrestore(&slob_lock, flags); | 361 | spin_unlock_irqrestore(&slob_lock, flags); |
| 362 | __ClearPageSlab(sp); | 362 | __ClearPageSlab(sp); |
| 363 | reset_page_mapcount(sp); | 363 | page_mapcount_reset(sp); |
| 364 | slob_free_pages(b, 0); | 364 | slob_free_pages(b, 0); |
| 365 | return; | 365 | return; |
| 366 | } | 366 | } |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
| @@ -1408,7 +1408,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
| 1408 | __ClearPageSlab(page); | 1408 | __ClearPageSlab(page); |
| 1409 | 1409 | ||
| 1410 | memcg_release_pages(s, order); | 1410 | memcg_release_pages(s, order); |
| 1411 | reset_page_mapcount(page); | 1411 | page_mapcount_reset(page); |
| 1412 | if (current->reclaim_state) | 1412 | if (current->reclaim_state) |
| 1413 | current->reclaim_state->reclaimed_slab += pages; | 1413 | current->reclaim_state->reclaimed_slab += pages; |
| 1414 | __free_memcg_kmem_pages(page, order); | 1414 | __free_memcg_kmem_pages(page, order); |
diff --git a/mm/sparse.c b/mm/sparse.c index 6b5fb762e2ca..7ca6dc847947 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
| @@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | |||
| 615 | } | 615 | } |
| 616 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | 616 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) |
| 617 | { | 617 | { |
| 618 | return; /* XXX: Not implemented yet */ | 618 | vmemmap_free(memmap, nr_pages); |
| 619 | } | 619 | } |
| 620 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) | 620 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) |
| 621 | { | 621 | { |
| 622 | vmemmap_free(memmap, nr_pages); | ||
| 622 | } | 623 | } |
| 623 | #else | 624 | #else |
| 624 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | 625 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) |
| @@ -697,7 +698,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) | |||
| 697 | /* | 698 | /* |
| 698 | * Check to see if allocation came from hot-plug-add | 699 | * Check to see if allocation came from hot-plug-add |
| 699 | */ | 700 | */ |
| 700 | if (PageSlab(usemap_page)) { | 701 | if (PageSlab(usemap_page) || PageCompound(usemap_page)) { |
| 701 | kfree(usemap); | 702 | kfree(usemap); |
| 702 | if (memmap) | 703 | if (memmap) |
| 703 | __kfree_section_memmap(memmap, PAGES_PER_SECTION); | 704 | __kfree_section_memmap(memmap, PAGES_PER_SECTION); |
| @@ -782,7 +783,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
| 782 | 783 | ||
| 783 | for (i = 0; i < PAGES_PER_SECTION; i++) { | 784 | for (i = 0; i < PAGES_PER_SECTION; i++) { |
| 784 | if (PageHWPoison(&memmap[i])) { | 785 | if (PageHWPoison(&memmap[i])) { |
| 785 | atomic_long_sub(1, &mce_bad_pages); | 786 | atomic_long_sub(1, &num_poisoned_pages); |
| 786 | ClearPageHWPoison(&memmap[i]); | 787 | ClearPageHWPoison(&memmap[i]); |
| 787 | } | 788 | } |
| 788 | } | 789 | } |
| @@ -796,8 +797,10 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
| 796 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | 797 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) |
| 797 | { | 798 | { |
| 798 | struct page *memmap = NULL; | 799 | struct page *memmap = NULL; |
| 799 | unsigned long *usemap = NULL; | 800 | unsigned long *usemap = NULL, flags; |
| 801 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
| 800 | 802 | ||
| 803 | pgdat_resize_lock(pgdat, &flags); | ||
| 801 | if (ms->section_mem_map) { | 804 | if (ms->section_mem_map) { |
| 802 | usemap = ms->pageblock_flags; | 805 | usemap = ms->pageblock_flags; |
| 803 | memmap = sparse_decode_mem_map(ms->section_mem_map, | 806 | memmap = sparse_decode_mem_map(ms->section_mem_map, |
| @@ -805,6 +808,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | |||
| 805 | ms->section_mem_map = 0; | 808 | ms->section_mem_map = 0; |
| 806 | ms->pageblock_flags = NULL; | 809 | ms->pageblock_flags = NULL; |
| 807 | } | 810 | } |
| 811 | pgdat_resize_unlock(pgdat, &flags); | ||
| 808 | 812 | ||
| 809 | clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); | 813 | clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); |
| 810 | free_section_usemap(memmap, usemap); | 814 | free_section_usemap(memmap, usemap); |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); | |||
| 855 | void __init swap_setup(void) | 855 | void __init swap_setup(void) |
| 856 | { | 856 | { |
| 857 | unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); | 857 | unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); |
| 858 | |||
| 859 | #ifdef CONFIG_SWAP | 858 | #ifdef CONFIG_SWAP |
| 860 | bdi_init(swapper_space.backing_dev_info); | 859 | int i; |
| 860 | |||
| 861 | bdi_init(swapper_spaces[0].backing_dev_info); | ||
| 862 | for (i = 0; i < MAX_SWAPFILES; i++) { | ||
| 863 | spin_lock_init(&swapper_spaces[i].tree_lock); | ||
| 864 | INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); | ||
| 865 | } | ||
| 861 | #endif | 866 | #endif |
| 862 | 867 | ||
| 863 | /* Use a smaller cluster for small-memory machines */ | 868 | /* Use a smaller cluster for small-memory machines */ |
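The single global swapper_space is replaced here by a per-swap-type array, and the hunks that follow look entries up through a swap_address_space() helper whose definition sits outside this excerpt. Its shape is implied by the call sites; a minimal sketch, assuming the helper simply indexes the array by swap type:

    extern struct address_space swapper_spaces[MAX_SWAPFILES];

    static inline struct address_space *swap_address_space(swp_entry_t entry)
    {
            /* One radix tree (and one tree_lock) per swap device/file. */
            return &swapper_spaces[swp_type(entry)];
    }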
diff --git a/mm/swap_state.c b/mm/swap_state.c index 0cb36fb1f61c..7efcf1525921 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -36,12 +36,12 @@ static struct backing_dev_info swap_backing_dev_info = { | |||
| 36 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | 36 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
| 37 | }; | 37 | }; |
| 38 | 38 | ||
| 39 | struct address_space swapper_space = { | 39 | struct address_space swapper_spaces[MAX_SWAPFILES] = { |
| 40 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | 40 | [0 ... MAX_SWAPFILES - 1] = { |
| 41 | .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), | 41 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), |
| 42 | .a_ops = &swap_aops, | 42 | .a_ops = &swap_aops, |
| 43 | .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), | 43 | .backing_dev_info = &swap_backing_dev_info, |
| 44 | .backing_dev_info = &swap_backing_dev_info, | 44 | } |
| 45 | }; | 45 | }; |
| 46 | 46 | ||
| 47 | #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) | 47 | #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) |
| @@ -53,13 +53,24 @@ static struct { | |||
| 53 | unsigned long find_total; | 53 | unsigned long find_total; |
| 54 | } swap_cache_info; | 54 | } swap_cache_info; |
| 55 | 55 | ||
| 56 | unsigned long total_swapcache_pages(void) | ||
| 57 | { | ||
| 58 | int i; | ||
| 59 | unsigned long ret = 0; | ||
| 60 | |||
| 61 | for (i = 0; i < MAX_SWAPFILES; i++) | ||
| 62 | ret += swapper_spaces[i].nrpages; | ||
| 63 | return ret; | ||
| 64 | } | ||
| 65 | |||
| 56 | void show_swap_cache_info(void) | 66 | void show_swap_cache_info(void) |
| 57 | { | 67 | { |
| 58 | printk("%lu pages in swap cache\n", total_swapcache_pages); | 68 | printk("%lu pages in swap cache\n", total_swapcache_pages()); |
| 59 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", | 69 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", |
| 60 | swap_cache_info.add_total, swap_cache_info.del_total, | 70 | swap_cache_info.add_total, swap_cache_info.del_total, |
| 61 | swap_cache_info.find_success, swap_cache_info.find_total); | 71 | swap_cache_info.find_success, swap_cache_info.find_total); |
| 62 | printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); | 72 | printk("Free swap = %ldkB\n", |
| 73 | get_nr_swap_pages() << (PAGE_SHIFT - 10)); | ||
| 63 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); | 74 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); |
| 64 | } | 75 | } |
| 65 | 76 | ||
| @@ -70,6 +81,7 @@ void show_swap_cache_info(void) | |||
| 70 | static int __add_to_swap_cache(struct page *page, swp_entry_t entry) | 81 | static int __add_to_swap_cache(struct page *page, swp_entry_t entry) |
| 71 | { | 82 | { |
| 72 | int error; | 83 | int error; |
| 84 | struct address_space *address_space; | ||
| 73 | 85 | ||
| 74 | VM_BUG_ON(!PageLocked(page)); | 86 | VM_BUG_ON(!PageLocked(page)); |
| 75 | VM_BUG_ON(PageSwapCache(page)); | 87 | VM_BUG_ON(PageSwapCache(page)); |
| @@ -79,14 +91,16 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry) | |||
| 79 | SetPageSwapCache(page); | 91 | SetPageSwapCache(page); |
| 80 | set_page_private(page, entry.val); | 92 | set_page_private(page, entry.val); |
| 81 | 93 | ||
| 82 | spin_lock_irq(&swapper_space.tree_lock); | 94 | address_space = swap_address_space(entry); |
| 83 | error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); | 95 | spin_lock_irq(&address_space->tree_lock); |
| 96 | error = radix_tree_insert(&address_space->page_tree, | ||
| 97 | entry.val, page); | ||
| 84 | if (likely(!error)) { | 98 | if (likely(!error)) { |
| 85 | total_swapcache_pages++; | 99 | address_space->nrpages++; |
| 86 | __inc_zone_page_state(page, NR_FILE_PAGES); | 100 | __inc_zone_page_state(page, NR_FILE_PAGES); |
| 87 | INC_CACHE_INFO(add_total); | 101 | INC_CACHE_INFO(add_total); |
| 88 | } | 102 | } |
| 89 | spin_unlock_irq(&swapper_space.tree_lock); | 103 | spin_unlock_irq(&address_space->tree_lock); |
| 90 | 104 | ||
| 91 | if (unlikely(error)) { | 105 | if (unlikely(error)) { |
| 92 | /* | 106 | /* |
| @@ -122,14 +136,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
| 122 | */ | 136 | */ |
| 123 | void __delete_from_swap_cache(struct page *page) | 137 | void __delete_from_swap_cache(struct page *page) |
| 124 | { | 138 | { |
| 139 | swp_entry_t entry; | ||
| 140 | struct address_space *address_space; | ||
| 141 | |||
| 125 | VM_BUG_ON(!PageLocked(page)); | 142 | VM_BUG_ON(!PageLocked(page)); |
| 126 | VM_BUG_ON(!PageSwapCache(page)); | 143 | VM_BUG_ON(!PageSwapCache(page)); |
| 127 | VM_BUG_ON(PageWriteback(page)); | 144 | VM_BUG_ON(PageWriteback(page)); |
| 128 | 145 | ||
| 129 | radix_tree_delete(&swapper_space.page_tree, page_private(page)); | 146 | entry.val = page_private(page); |
| 147 | address_space = swap_address_space(entry); | ||
| 148 | radix_tree_delete(&address_space->page_tree, page_private(page)); | ||
| 130 | set_page_private(page, 0); | 149 | set_page_private(page, 0); |
| 131 | ClearPageSwapCache(page); | 150 | ClearPageSwapCache(page); |
| 132 | total_swapcache_pages--; | 151 | address_space->nrpages--; |
| 133 | __dec_zone_page_state(page, NR_FILE_PAGES); | 152 | __dec_zone_page_state(page, NR_FILE_PAGES); |
| 134 | INC_CACHE_INFO(del_total); | 153 | INC_CACHE_INFO(del_total); |
| 135 | } | 154 | } |
| @@ -195,12 +214,14 @@ int add_to_swap(struct page *page) | |||
| 195 | void delete_from_swap_cache(struct page *page) | 214 | void delete_from_swap_cache(struct page *page) |
| 196 | { | 215 | { |
| 197 | swp_entry_t entry; | 216 | swp_entry_t entry; |
| 217 | struct address_space *address_space; | ||
| 198 | 218 | ||
| 199 | entry.val = page_private(page); | 219 | entry.val = page_private(page); |
| 200 | 220 | ||
| 201 | spin_lock_irq(&swapper_space.tree_lock); | 221 | address_space = swap_address_space(entry); |
| 222 | spin_lock_irq(&address_space->tree_lock); | ||
| 202 | __delete_from_swap_cache(page); | 223 | __delete_from_swap_cache(page); |
| 203 | spin_unlock_irq(&swapper_space.tree_lock); | 224 | spin_unlock_irq(&address_space->tree_lock); |
| 204 | 225 | ||
| 205 | swapcache_free(entry, page); | 226 | swapcache_free(entry, page); |
| 206 | page_cache_release(page); | 227 | page_cache_release(page); |
| @@ -263,7 +284,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) | |||
| 263 | { | 284 | { |
| 264 | struct page *page; | 285 | struct page *page; |
| 265 | 286 | ||
| 266 | page = find_get_page(&swapper_space, entry.val); | 287 | page = find_get_page(swap_address_space(entry), entry.val); |
| 267 | 288 | ||
| 268 | if (page) | 289 | if (page) |
| 269 | INC_CACHE_INFO(find_success); | 290 | INC_CACHE_INFO(find_success); |
| @@ -290,7 +311,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
| 290 | * called after lookup_swap_cache() failed, re-calling | 311 | * called after lookup_swap_cache() failed, re-calling |
| 291 | * that would confuse statistics. | 312 | * that would confuse statistics. |
| 292 | */ | 313 | */ |
| 293 | found_page = find_get_page(&swapper_space, entry.val); | 314 | found_page = find_get_page(swap_address_space(entry), |
| 315 | entry.val); | ||
| 294 | if (found_page) | 316 | if (found_page) |
| 295 | break; | 317 | break; |
| 296 | 318 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index e97a0e5aea91..c72c648f750c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -47,9 +47,11 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**); | |||
| 47 | 47 | ||
| 48 | DEFINE_SPINLOCK(swap_lock); | 48 | DEFINE_SPINLOCK(swap_lock); |
| 49 | static unsigned int nr_swapfiles; | 49 | static unsigned int nr_swapfiles; |
| 50 | long nr_swap_pages; | 50 | atomic_long_t nr_swap_pages; |
| 51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ | ||
| 51 | long total_swap_pages; | 52 | long total_swap_pages; |
| 52 | static int least_priority; | 53 | static int least_priority; |
| 54 | static atomic_t highest_priority_index = ATOMIC_INIT(-1); | ||
| 53 | 55 | ||
| 54 | static const char Bad_file[] = "Bad swap file entry "; | 56 | static const char Bad_file[] = "Bad swap file entry "; |
| 55 | static const char Unused_file[] = "Unused swap file entry "; | 57 | static const char Unused_file[] = "Unused swap file entry "; |
| @@ -79,7 +81,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | |||
| 79 | struct page *page; | 81 | struct page *page; |
| 80 | int ret = 0; | 82 | int ret = 0; |
| 81 | 83 | ||
| 82 | page = find_get_page(&swapper_space, entry.val); | 84 | page = find_get_page(swap_address_space(entry), entry.val); |
| 83 | if (!page) | 85 | if (!page) |
| 84 | return 0; | 86 | return 0; |
| 85 | /* | 87 | /* |
| @@ -223,7 +225,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
| 223 | si->lowest_alloc = si->max; | 225 | si->lowest_alloc = si->max; |
| 224 | si->highest_alloc = 0; | 226 | si->highest_alloc = 0; |
| 225 | } | 227 | } |
| 226 | spin_unlock(&swap_lock); | 228 | spin_unlock(&si->lock); |
| 227 | 229 | ||
| 228 | /* | 230 | /* |
| 229 | * If seek is expensive, start searching for new cluster from | 231 | * If seek is expensive, start searching for new cluster from |
| @@ -242,7 +244,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
| 242 | if (si->swap_map[offset]) | 244 | if (si->swap_map[offset]) |
| 243 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 245 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
| 244 | else if (offset == last_in_cluster) { | 246 | else if (offset == last_in_cluster) { |
| 245 | spin_lock(&swap_lock); | 247 | spin_lock(&si->lock); |
| 246 | offset -= SWAPFILE_CLUSTER - 1; | 248 | offset -= SWAPFILE_CLUSTER - 1; |
| 247 | si->cluster_next = offset; | 249 | si->cluster_next = offset; |
| 248 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 250 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
| @@ -263,7 +265,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
| 263 | if (si->swap_map[offset]) | 265 | if (si->swap_map[offset]) |
| 264 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 266 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
| 265 | else if (offset == last_in_cluster) { | 267 | else if (offset == last_in_cluster) { |
| 266 | spin_lock(&swap_lock); | 268 | spin_lock(&si->lock); |
| 267 | offset -= SWAPFILE_CLUSTER - 1; | 269 | offset -= SWAPFILE_CLUSTER - 1; |
| 268 | si->cluster_next = offset; | 270 | si->cluster_next = offset; |
| 269 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 271 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
| @@ -277,7 +279,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
| 277 | } | 279 | } |
| 278 | 280 | ||
| 279 | offset = scan_base; | 281 | offset = scan_base; |
| 280 | spin_lock(&swap_lock); | 282 | spin_lock(&si->lock); |
| 281 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 283 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
| 282 | si->lowest_alloc = 0; | 284 | si->lowest_alloc = 0; |
| 283 | } | 285 | } |
| @@ -293,9 +295,9 @@ checks: | |||
| 293 | /* reuse swap entry of cache-only swap if not busy. */ | 295 | /* reuse swap entry of cache-only swap if not busy. */ |
| 294 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 296 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
| 295 | int swap_was_freed; | 297 | int swap_was_freed; |
| 296 | spin_unlock(&swap_lock); | 298 | spin_unlock(&si->lock); |
| 297 | swap_was_freed = __try_to_reclaim_swap(si, offset); | 299 | swap_was_freed = __try_to_reclaim_swap(si, offset); |
| 298 | spin_lock(&swap_lock); | 300 | spin_lock(&si->lock); |
| 299 | /* entry was freed successfully, try to use this again */ | 301 | /* entry was freed successfully, try to use this again */ |
| 300 | if (swap_was_freed) | 302 | if (swap_was_freed) |
| 301 | goto checks; | 303 | goto checks; |
| @@ -335,13 +337,13 @@ checks: | |||
| 335 | si->lowest_alloc <= last_in_cluster) | 337 | si->lowest_alloc <= last_in_cluster) |
| 336 | last_in_cluster = si->lowest_alloc - 1; | 338 | last_in_cluster = si->lowest_alloc - 1; |
| 337 | si->flags |= SWP_DISCARDING; | 339 | si->flags |= SWP_DISCARDING; |
| 338 | spin_unlock(&swap_lock); | 340 | spin_unlock(&si->lock); |
| 339 | 341 | ||
| 340 | if (offset < last_in_cluster) | 342 | if (offset < last_in_cluster) |
| 341 | discard_swap_cluster(si, offset, | 343 | discard_swap_cluster(si, offset, |
| 342 | last_in_cluster - offset + 1); | 344 | last_in_cluster - offset + 1); |
| 343 | 345 | ||
| 344 | spin_lock(&swap_lock); | 346 | spin_lock(&si->lock); |
| 345 | si->lowest_alloc = 0; | 347 | si->lowest_alloc = 0; |
| 346 | si->flags &= ~SWP_DISCARDING; | 348 | si->flags &= ~SWP_DISCARDING; |
| 347 | 349 | ||
| @@ -355,10 +357,10 @@ checks: | |||
| 355 | * could defer that delay until swap_writepage, | 357 | * could defer that delay until swap_writepage, |
| 356 | * but it's easier to keep this self-contained. | 358 | * but it's easier to keep this self-contained. |
| 357 | */ | 359 | */ |
| 358 | spin_unlock(&swap_lock); | 360 | spin_unlock(&si->lock); |
| 359 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), | 361 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), |
| 360 | wait_for_discard, TASK_UNINTERRUPTIBLE); | 362 | wait_for_discard, TASK_UNINTERRUPTIBLE); |
| 361 | spin_lock(&swap_lock); | 363 | spin_lock(&si->lock); |
| 362 | } else { | 364 | } else { |
| 363 | /* | 365 | /* |
| 364 | * Note pages allocated by racing tasks while | 366 | * Note pages allocated by racing tasks while |
| @@ -374,14 +376,14 @@ checks: | |||
| 374 | return offset; | 376 | return offset; |
| 375 | 377 | ||
| 376 | scan: | 378 | scan: |
| 377 | spin_unlock(&swap_lock); | 379 | spin_unlock(&si->lock); |
| 378 | while (++offset <= si->highest_bit) { | 380 | while (++offset <= si->highest_bit) { |
| 379 | if (!si->swap_map[offset]) { | 381 | if (!si->swap_map[offset]) { |
| 380 | spin_lock(&swap_lock); | 382 | spin_lock(&si->lock); |
| 381 | goto checks; | 383 | goto checks; |
| 382 | } | 384 | } |
| 383 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 385 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
| 384 | spin_lock(&swap_lock); | 386 | spin_lock(&si->lock); |
| 385 | goto checks; | 387 | goto checks; |
| 386 | } | 388 | } |
| 387 | if (unlikely(--latency_ration < 0)) { | 389 | if (unlikely(--latency_ration < 0)) { |
| @@ -392,11 +394,11 @@ scan: | |||
| 392 | offset = si->lowest_bit; | 394 | offset = si->lowest_bit; |
| 393 | while (++offset < scan_base) { | 395 | while (++offset < scan_base) { |
| 394 | if (!si->swap_map[offset]) { | 396 | if (!si->swap_map[offset]) { |
| 395 | spin_lock(&swap_lock); | 397 | spin_lock(&si->lock); |
| 396 | goto checks; | 398 | goto checks; |
| 397 | } | 399 | } |
| 398 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 400 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
| 399 | spin_lock(&swap_lock); | 401 | spin_lock(&si->lock); |
| 400 | goto checks; | 402 | goto checks; |
| 401 | } | 403 | } |
| 402 | if (unlikely(--latency_ration < 0)) { | 404 | if (unlikely(--latency_ration < 0)) { |
| @@ -404,7 +406,7 @@ scan: | |||
| 404 | latency_ration = LATENCY_LIMIT; | 406 | latency_ration = LATENCY_LIMIT; |
| 405 | } | 407 | } |
| 406 | } | 408 | } |
| 407 | spin_lock(&swap_lock); | 409 | spin_lock(&si->lock); |
| 408 | 410 | ||
| 409 | no_page: | 411 | no_page: |
| 410 | si->flags -= SWP_SCANNING; | 412 | si->flags -= SWP_SCANNING; |
| @@ -417,13 +419,34 @@ swp_entry_t get_swap_page(void) | |||
| 417 | pgoff_t offset; | 419 | pgoff_t offset; |
| 418 | int type, next; | 420 | int type, next; |
| 419 | int wrapped = 0; | 421 | int wrapped = 0; |
| 422 | int hp_index; | ||
| 420 | 423 | ||
| 421 | spin_lock(&swap_lock); | 424 | spin_lock(&swap_lock); |
| 422 | if (nr_swap_pages <= 0) | 425 | if (atomic_long_read(&nr_swap_pages) <= 0) |
| 423 | goto noswap; | 426 | goto noswap; |
| 424 | nr_swap_pages--; | 427 | atomic_long_dec(&nr_swap_pages); |
| 425 | 428 | ||
| 426 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 429 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { |
| 430 | hp_index = atomic_xchg(&highest_priority_index, -1); | ||
| 431 | /* | ||
| 432 | * highest_priority_index records current highest priority swap | ||
| 433 | * type which just frees swap entries. If its priority is | ||
| 434 | * higher than that of swap_list.next swap type, we use it. It | ||
| 435 | * isn't protected by swap_lock, so it can be an invalid value | ||
| 436 | * if the corresponding swap type is swapoff. We double check | ||
| 437 | * the flags here. It's even possible the swap type is swapoff | ||
| 438 | * and swapon again and its priority is changed. In such rare | ||
| 439 | * case, low priority swap type might be used, but eventually | ||
| 440 | * high priority swap will be used after several rounds of | ||
| 441 | * swap. | ||
| 442 | */ | ||
| 443 | if (hp_index != -1 && hp_index != type && | ||
| 444 | swap_info[type]->prio < swap_info[hp_index]->prio && | ||
| 445 | (swap_info[hp_index]->flags & SWP_WRITEOK)) { | ||
| 446 | type = hp_index; | ||
| 447 | swap_list.next = type; | ||
| 448 | } | ||
| 449 | |||
| 427 | si = swap_info[type]; | 450 | si = swap_info[type]; |
| 428 | next = si->next; | 451 | next = si->next; |
| 429 | if (next < 0 || | 452 | if (next < 0 || |
| @@ -432,22 +455,29 @@ swp_entry_t get_swap_page(void) | |||
| 432 | wrapped++; | 455 | wrapped++; |
| 433 | } | 456 | } |
| 434 | 457 | ||
| 435 | if (!si->highest_bit) | 458 | spin_lock(&si->lock); |
| 459 | if (!si->highest_bit) { | ||
| 460 | spin_unlock(&si->lock); | ||
| 436 | continue; | 461 | continue; |
| 437 | if (!(si->flags & SWP_WRITEOK)) | 462 | } |
| 463 | if (!(si->flags & SWP_WRITEOK)) { | ||
| 464 | spin_unlock(&si->lock); | ||
| 438 | continue; | 465 | continue; |
| 466 | } | ||
| 439 | 467 | ||
| 440 | swap_list.next = next; | 468 | swap_list.next = next; |
| 469 | |||
| 470 | spin_unlock(&swap_lock); | ||
| 441 | /* This is called for allocating swap entry for cache */ | 471 | /* This is called for allocating swap entry for cache */ |
| 442 | offset = scan_swap_map(si, SWAP_HAS_CACHE); | 472 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
| 443 | if (offset) { | 473 | spin_unlock(&si->lock); |
| 444 | spin_unlock(&swap_lock); | 474 | if (offset) |
| 445 | return swp_entry(type, offset); | 475 | return swp_entry(type, offset); |
| 446 | } | 476 | spin_lock(&swap_lock); |
| 447 | next = swap_list.next; | 477 | next = swap_list.next; |
| 448 | } | 478 | } |
| 449 | 479 | ||
| 450 | nr_swap_pages++; | 480 | atomic_long_inc(&nr_swap_pages); |
| 451 | noswap: | 481 | noswap: |
| 452 | spin_unlock(&swap_lock); | 482 | spin_unlock(&swap_lock); |
| 453 | return (swp_entry_t) {0}; | 483 | return (swp_entry_t) {0}; |
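Taken together, the scan_swap_map() and get_swap_page() hunks split the old global swap_lock into the global lock plus a per-device si->lock. The ordering they imply (a reading of the diff, not wording from the patch) is: take swap_lock for swap_list/priority decisions, take the chosen device's lock for its swap_map, then drop swap_lock before the potentially long per-device scan:

    spin_lock(&swap_lock);            /* global: swap_list.next, priorities     */
    spin_lock(&si->lock);             /* per device: swap_map, highest_bit, ... */
    swap_list.next = next;
    spin_unlock(&swap_lock);          /* long scan proceeds under si->lock only */
    offset = scan_swap_map(si, SWAP_HAS_CACHE);
    spin_unlock(&si->lock);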
| @@ -459,19 +489,19 @@ swp_entry_t get_swap_page_of_type(int type) | |||
| 459 | struct swap_info_struct *si; | 489 | struct swap_info_struct *si; |
| 460 | pgoff_t offset; | 490 | pgoff_t offset; |
| 461 | 491 | ||
| 462 | spin_lock(&swap_lock); | ||
| 463 | si = swap_info[type]; | 492 | si = swap_info[type]; |
| 493 | spin_lock(&si->lock); | ||
| 464 | if (si && (si->flags & SWP_WRITEOK)) { | 494 | if (si && (si->flags & SWP_WRITEOK)) { |
| 465 | nr_swap_pages--; | 495 | atomic_long_dec(&nr_swap_pages); |
| 466 | /* This is called for allocating swap entry, not cache */ | 496 | /* This is called for allocating swap entry, not cache */ |
| 467 | offset = scan_swap_map(si, 1); | 497 | offset = scan_swap_map(si, 1); |
| 468 | if (offset) { | 498 | if (offset) { |
| 469 | spin_unlock(&swap_lock); | 499 | spin_unlock(&si->lock); |
| 470 | return swp_entry(type, offset); | 500 | return swp_entry(type, offset); |
| 471 | } | 501 | } |
| 472 | nr_swap_pages++; | 502 | atomic_long_inc(&nr_swap_pages); |
| 473 | } | 503 | } |
| 474 | spin_unlock(&swap_lock); | 504 | spin_unlock(&si->lock); |
| 475 | return (swp_entry_t) {0}; | 505 | return (swp_entry_t) {0}; |
| 476 | } | 506 | } |
| 477 | 507 | ||
| @@ -493,7 +523,7 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) | |||
| 493 | goto bad_offset; | 523 | goto bad_offset; |
| 494 | if (!p->swap_map[offset]) | 524 | if (!p->swap_map[offset]) |
| 495 | goto bad_free; | 525 | goto bad_free; |
| 496 | spin_lock(&swap_lock); | 526 | spin_lock(&p->lock); |
| 497 | return p; | 527 | return p; |
| 498 | 528 | ||
| 499 | bad_free: | 529 | bad_free: |
| @@ -511,6 +541,27 @@ out: | |||
| 511 | return NULL; | 541 | return NULL; |
| 512 | } | 542 | } |
| 513 | 543 | ||
| 544 | /* | ||
| 545 | * This swap type frees swap entry, check if it is the highest priority swap | ||
| 546 | * type which just frees swap entry. get_swap_page() uses | ||
| 547 | * highest_priority_index to search highest priority swap type. The | ||
| 548 | * swap_info_struct.lock can't protect us if there are multiple swap types | ||
| 549 | * active, so we use atomic_cmpxchg. | ||
| 550 | */ | ||
| 551 | static void set_highest_priority_index(int type) | ||
| 552 | { | ||
| 553 | int old_hp_index, new_hp_index; | ||
| 554 | |||
| 555 | do { | ||
| 556 | old_hp_index = atomic_read(&highest_priority_index); | ||
| 557 | if (old_hp_index != -1 && | ||
| 558 | swap_info[old_hp_index]->prio >= swap_info[type]->prio) | ||
| 559 | break; | ||
| 560 | new_hp_index = type; | ||
| 561 | } while (atomic_cmpxchg(&highest_priority_index, | ||
| 562 | old_hp_index, new_hp_index) != old_hp_index); | ||
| 563 | } | ||
| 564 | |||
| 514 | static unsigned char swap_entry_free(struct swap_info_struct *p, | 565 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
| 515 | swp_entry_t entry, unsigned char usage) | 566 | swp_entry_t entry, unsigned char usage) |
| 516 | { | 567 | { |
| @@ -553,10 +604,8 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
| 553 | p->lowest_bit = offset; | 604 | p->lowest_bit = offset; |
| 554 | if (offset > p->highest_bit) | 605 | if (offset > p->highest_bit) |
| 555 | p->highest_bit = offset; | 606 | p->highest_bit = offset; |
| 556 | if (swap_list.next >= 0 && | 607 | set_highest_priority_index(p->type); |
| 557 | p->prio > swap_info[swap_list.next]->prio) | 608 | atomic_long_inc(&nr_swap_pages); |
| 558 | swap_list.next = p->type; | ||
| 559 | nr_swap_pages++; | ||
| 560 | p->inuse_pages--; | 609 | p->inuse_pages--; |
| 561 | frontswap_invalidate_page(p->type, offset); | 610 | frontswap_invalidate_page(p->type, offset); |
| 562 | if (p->flags & SWP_BLKDEV) { | 611 | if (p->flags & SWP_BLKDEV) { |
| @@ -581,7 +630,7 @@ void swap_free(swp_entry_t entry) | |||
| 581 | p = swap_info_get(entry); | 630 | p = swap_info_get(entry); |
| 582 | if (p) { | 631 | if (p) { |
| 583 | swap_entry_free(p, entry, 1); | 632 | swap_entry_free(p, entry, 1); |
| 584 | spin_unlock(&swap_lock); | 633 | spin_unlock(&p->lock); |
| 585 | } | 634 | } |
| 586 | } | 635 | } |
| 587 | 636 | ||
| @@ -598,7 +647,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) | |||
| 598 | count = swap_entry_free(p, entry, SWAP_HAS_CACHE); | 647 | count = swap_entry_free(p, entry, SWAP_HAS_CACHE); |
| 599 | if (page) | 648 | if (page) |
| 600 | mem_cgroup_uncharge_swapcache(page, entry, count != 0); | 649 | mem_cgroup_uncharge_swapcache(page, entry, count != 0); |
| 601 | spin_unlock(&swap_lock); | 650 | spin_unlock(&p->lock); |
| 602 | } | 651 | } |
| 603 | } | 652 | } |
| 604 | 653 | ||
| @@ -617,7 +666,7 @@ int page_swapcount(struct page *page) | |||
| 617 | p = swap_info_get(entry); | 666 | p = swap_info_get(entry); |
| 618 | if (p) { | 667 | if (p) { |
| 619 | count = swap_count(p->swap_map[swp_offset(entry)]); | 668 | count = swap_count(p->swap_map[swp_offset(entry)]); |
| 620 | spin_unlock(&swap_lock); | 669 | spin_unlock(&p->lock); |
| 621 | } | 670 | } |
| 622 | return count; | 671 | return count; |
| 623 | } | 672 | } |
| @@ -699,13 +748,14 @@ int free_swap_and_cache(swp_entry_t entry) | |||
| 699 | p = swap_info_get(entry); | 748 | p = swap_info_get(entry); |
| 700 | if (p) { | 749 | if (p) { |
| 701 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { | 750 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { |
| 702 | page = find_get_page(&swapper_space, entry.val); | 751 | page = find_get_page(swap_address_space(entry), |
| 752 | entry.val); | ||
| 703 | if (page && !trylock_page(page)) { | 753 | if (page && !trylock_page(page)) { |
| 704 | page_cache_release(page); | 754 | page_cache_release(page); |
| 705 | page = NULL; | 755 | page = NULL; |
| 706 | } | 756 | } |
| 707 | } | 757 | } |
| 708 | spin_unlock(&swap_lock); | 758 | spin_unlock(&p->lock); |
| 709 | } | 759 | } |
| 710 | if (page) { | 760 | if (page) { |
| 711 | /* | 761 | /* |
| @@ -803,11 +853,13 @@ unsigned int count_swap_pages(int type, int free) | |||
| 803 | if ((unsigned int)type < nr_swapfiles) { | 853 | if ((unsigned int)type < nr_swapfiles) { |
| 804 | struct swap_info_struct *sis = swap_info[type]; | 854 | struct swap_info_struct *sis = swap_info[type]; |
| 805 | 855 | ||
| 856 | spin_lock(&sis->lock); | ||
| 806 | if (sis->flags & SWP_WRITEOK) { | 857 | if (sis->flags & SWP_WRITEOK) { |
| 807 | n = sis->pages; | 858 | n = sis->pages; |
| 808 | if (free) | 859 | if (free) |
| 809 | n -= sis->inuse_pages; | 860 | n -= sis->inuse_pages; |
| 810 | } | 861 | } |
| 862 | spin_unlock(&sis->lock); | ||
| 811 | } | 863 | } |
| 812 | spin_unlock(&swap_lock); | 864 | spin_unlock(&swap_lock); |
| 813 | return n; | 865 | return n; |
| @@ -822,11 +874,17 @@ unsigned int count_swap_pages(int type, int free) | |||
| 822 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | 874 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, |
| 823 | unsigned long addr, swp_entry_t entry, struct page *page) | 875 | unsigned long addr, swp_entry_t entry, struct page *page) |
| 824 | { | 876 | { |
| 877 | struct page *swapcache; | ||
| 825 | struct mem_cgroup *memcg; | 878 | struct mem_cgroup *memcg; |
| 826 | spinlock_t *ptl; | 879 | spinlock_t *ptl; |
| 827 | pte_t *pte; | 880 | pte_t *pte; |
| 828 | int ret = 1; | 881 | int ret = 1; |
| 829 | 882 | ||
| 883 | swapcache = page; | ||
| 884 | page = ksm_might_need_to_copy(page, vma, addr); | ||
| 885 | if (unlikely(!page)) | ||
| 886 | return -ENOMEM; | ||
| 887 | |||
| 830 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, | 888 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, |
| 831 | GFP_KERNEL, &memcg)) { | 889 | GFP_KERNEL, &memcg)) { |
| 832 | ret = -ENOMEM; | 890 | ret = -ENOMEM; |
| @@ -845,7 +903,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 845 | get_page(page); | 903 | get_page(page); |
| 846 | set_pte_at(vma->vm_mm, addr, pte, | 904 | set_pte_at(vma->vm_mm, addr, pte, |
| 847 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 905 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
| 848 | page_add_anon_rmap(page, vma, addr); | 906 | if (page == swapcache) |
| 907 | page_add_anon_rmap(page, vma, addr); | ||
| 908 | else /* ksm created a completely new copy */ | ||
| 909 | page_add_new_anon_rmap(page, vma, addr); | ||
| 849 | mem_cgroup_commit_charge_swapin(page, memcg); | 910 | mem_cgroup_commit_charge_swapin(page, memcg); |
| 850 | swap_free(entry); | 911 | swap_free(entry); |
| 851 | /* | 912 | /* |
| @@ -856,6 +917,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 856 | out: | 917 | out: |
| 857 | pte_unmap_unlock(pte, ptl); | 918 | pte_unmap_unlock(pte, ptl); |
| 858 | out_nolock: | 919 | out_nolock: |
| 920 | if (page != swapcache) { | ||
| 921 | unlock_page(page); | ||
| 922 | put_page(page); | ||
| 923 | } | ||
| 859 | return ret; | 924 | return ret; |
| 860 | } | 925 | } |
| 861 | 926 | ||
| @@ -1456,7 +1521,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
| 1456 | p->swap_map = swap_map; | 1521 | p->swap_map = swap_map; |
| 1457 | frontswap_map_set(p, frontswap_map); | 1522 | frontswap_map_set(p, frontswap_map); |
| 1458 | p->flags |= SWP_WRITEOK; | 1523 | p->flags |= SWP_WRITEOK; |
| 1459 | nr_swap_pages += p->pages; | 1524 | atomic_long_add(p->pages, &nr_swap_pages); |
| 1460 | total_swap_pages += p->pages; | 1525 | total_swap_pages += p->pages; |
| 1461 | 1526 | ||
| 1462 | /* insert swap space into swap_list: */ | 1527 | /* insert swap space into swap_list: */ |
| @@ -1478,15 +1543,19 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
| 1478 | unsigned long *frontswap_map) | 1543 | unsigned long *frontswap_map) |
| 1479 | { | 1544 | { |
| 1480 | spin_lock(&swap_lock); | 1545 | spin_lock(&swap_lock); |
| 1546 | spin_lock(&p->lock); | ||
| 1481 | _enable_swap_info(p, prio, swap_map, frontswap_map); | 1547 | _enable_swap_info(p, prio, swap_map, frontswap_map); |
| 1482 | frontswap_init(p->type); | 1548 | frontswap_init(p->type); |
| 1549 | spin_unlock(&p->lock); | ||
| 1483 | spin_unlock(&swap_lock); | 1550 | spin_unlock(&swap_lock); |
| 1484 | } | 1551 | } |
| 1485 | 1552 | ||
| 1486 | static void reinsert_swap_info(struct swap_info_struct *p) | 1553 | static void reinsert_swap_info(struct swap_info_struct *p) |
| 1487 | { | 1554 | { |
| 1488 | spin_lock(&swap_lock); | 1555 | spin_lock(&swap_lock); |
| 1556 | spin_lock(&p->lock); | ||
| 1489 | _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); | 1557 | _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); |
| 1558 | spin_unlock(&p->lock); | ||
| 1490 | spin_unlock(&swap_lock); | 1559 | spin_unlock(&swap_lock); |
| 1491 | } | 1560 | } |
| 1492 | 1561 | ||
| @@ -1546,14 +1615,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1546 | /* just pick something that's safe... */ | 1615 | /* just pick something that's safe... */ |
| 1547 | swap_list.next = swap_list.head; | 1616 | swap_list.next = swap_list.head; |
| 1548 | } | 1617 | } |
| 1618 | spin_lock(&p->lock); | ||
| 1549 | if (p->prio < 0) { | 1619 | if (p->prio < 0) { |
| 1550 | for (i = p->next; i >= 0; i = swap_info[i]->next) | 1620 | for (i = p->next; i >= 0; i = swap_info[i]->next) |
| 1551 | swap_info[i]->prio = p->prio--; | 1621 | swap_info[i]->prio = p->prio--; |
| 1552 | least_priority++; | 1622 | least_priority++; |
| 1553 | } | 1623 | } |
| 1554 | nr_swap_pages -= p->pages; | 1624 | atomic_long_sub(p->pages, &nr_swap_pages); |
| 1555 | total_swap_pages -= p->pages; | 1625 | total_swap_pages -= p->pages; |
| 1556 | p->flags &= ~SWP_WRITEOK; | 1626 | p->flags &= ~SWP_WRITEOK; |
| 1627 | spin_unlock(&p->lock); | ||
| 1557 | spin_unlock(&swap_lock); | 1628 | spin_unlock(&swap_lock); |
| 1558 | 1629 | ||
| 1559 | set_current_oom_origin(); | 1630 | set_current_oom_origin(); |
| @@ -1572,14 +1643,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1572 | 1643 | ||
| 1573 | mutex_lock(&swapon_mutex); | 1644 | mutex_lock(&swapon_mutex); |
| 1574 | spin_lock(&swap_lock); | 1645 | spin_lock(&swap_lock); |
| 1646 | spin_lock(&p->lock); | ||
| 1575 | drain_mmlist(); | 1647 | drain_mmlist(); |
| 1576 | 1648 | ||
| 1577 | /* wait for anyone still in scan_swap_map */ | 1649 | /* wait for anyone still in scan_swap_map */ |
| 1578 | p->highest_bit = 0; /* cuts scans short */ | 1650 | p->highest_bit = 0; /* cuts scans short */ |
| 1579 | while (p->flags >= SWP_SCANNING) { | 1651 | while (p->flags >= SWP_SCANNING) { |
| 1652 | spin_unlock(&p->lock); | ||
| 1580 | spin_unlock(&swap_lock); | 1653 | spin_unlock(&swap_lock); |
| 1581 | schedule_timeout_uninterruptible(1); | 1654 | schedule_timeout_uninterruptible(1); |
| 1582 | spin_lock(&swap_lock); | 1655 | spin_lock(&swap_lock); |
| 1656 | spin_lock(&p->lock); | ||
| 1583 | } | 1657 | } |
| 1584 | 1658 | ||
| 1585 | swap_file = p->swap_file; | 1659 | swap_file = p->swap_file; |
| @@ -1589,6 +1663,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1589 | p->swap_map = NULL; | 1663 | p->swap_map = NULL; |
| 1590 | p->flags = 0; | 1664 | p->flags = 0; |
| 1591 | frontswap_invalidate_area(type); | 1665 | frontswap_invalidate_area(type); |
| 1666 | spin_unlock(&p->lock); | ||
| 1592 | spin_unlock(&swap_lock); | 1667 | spin_unlock(&swap_lock); |
| 1593 | mutex_unlock(&swapon_mutex); | 1668 | mutex_unlock(&swapon_mutex); |
| 1594 | vfree(swap_map); | 1669 | vfree(swap_map); |
| @@ -1794,6 +1869,7 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
| 1794 | p->flags = SWP_USED; | 1869 | p->flags = SWP_USED; |
| 1795 | p->next = -1; | 1870 | p->next = -1; |
| 1796 | spin_unlock(&swap_lock); | 1871 | spin_unlock(&swap_lock); |
| 1872 | spin_lock_init(&p->lock); | ||
| 1797 | 1873 | ||
| 1798 | return p; | 1874 | return p; |
| 1799 | } | 1875 | } |
| @@ -2116,7 +2192,7 @@ void si_swapinfo(struct sysinfo *val) | |||
| 2116 | if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) | 2192 | if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) |
| 2117 | nr_to_be_unused += si->inuse_pages; | 2193 | nr_to_be_unused += si->inuse_pages; |
| 2118 | } | 2194 | } |
| 2119 | val->freeswap = nr_swap_pages + nr_to_be_unused; | 2195 | val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; |
| 2120 | val->totalswap = total_swap_pages + nr_to_be_unused; | 2196 | val->totalswap = total_swap_pages + nr_to_be_unused; |
| 2121 | spin_unlock(&swap_lock); | 2197 | spin_unlock(&swap_lock); |
| 2122 | } | 2198 | } |
| @@ -2149,7 +2225,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) | |||
| 2149 | p = swap_info[type]; | 2225 | p = swap_info[type]; |
| 2150 | offset = swp_offset(entry); | 2226 | offset = swp_offset(entry); |
| 2151 | 2227 | ||
| 2152 | spin_lock(&swap_lock); | 2228 | spin_lock(&p->lock); |
| 2153 | if (unlikely(offset >= p->max)) | 2229 | if (unlikely(offset >= p->max)) |
| 2154 | goto unlock_out; | 2230 | goto unlock_out; |
| 2155 | 2231 | ||
| @@ -2184,7 +2260,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) | |||
| 2184 | p->swap_map[offset] = count | has_cache; | 2260 | p->swap_map[offset] = count | has_cache; |
| 2185 | 2261 | ||
| 2186 | unlock_out: | 2262 | unlock_out: |
| 2187 | spin_unlock(&swap_lock); | 2263 | spin_unlock(&p->lock); |
| 2188 | out: | 2264 | out: |
| 2189 | return err; | 2265 | return err; |
| 2190 | 2266 | ||
| @@ -2309,7 +2385,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | |||
| 2309 | } | 2385 | } |
| 2310 | 2386 | ||
| 2311 | if (!page) { | 2387 | if (!page) { |
| 2312 | spin_unlock(&swap_lock); | 2388 | spin_unlock(&si->lock); |
| 2313 | return -ENOMEM; | 2389 | return -ENOMEM; |
| 2314 | } | 2390 | } |
| 2315 | 2391 | ||
| @@ -2357,7 +2433,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | |||
| 2357 | list_add_tail(&page->lru, &head->lru); | 2433 | list_add_tail(&page->lru, &head->lru); |
| 2358 | page = NULL; /* now it's attached, don't free it */ | 2434 | page = NULL; /* now it's attached, don't free it */ |
| 2359 | out: | 2435 | out: |
| 2360 | spin_unlock(&swap_lock); | 2436 | spin_unlock(&si->lock); |
| 2361 | outer: | 2437 | outer: |
| 2362 | if (page) | 2438 | if (page) |
| 2363 | __free_page(page); | 2439 | __free_page(page); |
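With nr_swap_pages converted to an atomic_long_t, readers elsewhere in the series (show_swap_cache_info(), get_scan_count(), si_swapinfo()) go through a get_nr_swap_pages() accessor that is not part of this excerpt; presumably it is just a thin wrapper around atomic_long_read(), e.g.:

    static inline long get_nr_swap_pages(void)
    {
            return atomic_long_read(&nr_swap_pages);
    }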
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
| @@ -5,6 +5,8 @@ | |||
| 5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
| 6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
| 7 | #include <linux/security.h> | 7 | #include <linux/security.h> |
| 8 | #include <linux/swap.h> | ||
| 9 | #include <linux/swapops.h> | ||
| 8 | #include <asm/uaccess.h> | 10 | #include <asm/uaccess.h> |
| 9 | 11 | ||
| 10 | #include "internal.h" | 12 | #include "internal.h" |
| @@ -355,12 +357,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, | |||
| 355 | { | 357 | { |
| 356 | unsigned long ret; | 358 | unsigned long ret; |
| 357 | struct mm_struct *mm = current->mm; | 359 | struct mm_struct *mm = current->mm; |
| 360 | unsigned long populate; | ||
| 358 | 361 | ||
| 359 | ret = security_mmap_file(file, prot, flag); | 362 | ret = security_mmap_file(file, prot, flag); |
| 360 | if (!ret) { | 363 | if (!ret) { |
| 361 | down_write(&mm->mmap_sem); | 364 | down_write(&mm->mmap_sem); |
| 362 | ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); | 365 | ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, |
| 366 | &populate); | ||
| 363 | up_write(&mm->mmap_sem); | 367 | up_write(&mm->mmap_sem); |
| 368 | if (populate) | ||
| 369 | mm_populate(ret, populate); | ||
| 364 | } | 370 | } |
| 365 | return ret; | 371 | return ret; |
| 366 | } | 372 | } |
| @@ -378,6 +384,24 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, | |||
| 378 | } | 384 | } |
| 379 | EXPORT_SYMBOL(vm_mmap); | 385 | EXPORT_SYMBOL(vm_mmap); |
| 380 | 386 | ||
| 387 | struct address_space *page_mapping(struct page *page) | ||
| 388 | { | ||
| 389 | struct address_space *mapping = page->mapping; | ||
| 390 | |||
| 391 | VM_BUG_ON(PageSlab(page)); | ||
| 392 | #ifdef CONFIG_SWAP | ||
| 393 | if (unlikely(PageSwapCache(page))) { | ||
| 394 | swp_entry_t entry; | ||
| 395 | |||
| 396 | entry.val = page_private(page); | ||
| 397 | mapping = swap_address_space(entry); | ||
| 398 | } else | ||
| 399 | #endif | ||
| 400 | if ((unsigned long)mapping & PAGE_MAPPING_ANON) | ||
| 401 | mapping = NULL; | ||
| 402 | return mapping; | ||
| 403 | } | ||
| 404 | |||
| 381 | /* Tracepoints definitions. */ | 405 | /* Tracepoints definitions. */ |
| 382 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | 406 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); |
| 383 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | 407 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5123a169ab7b..0f751f2068c3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -1376,8 +1376,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
| 1376 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 1376 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
| 1377 | unsigned long start, unsigned long end) | 1377 | unsigned long start, unsigned long end) |
| 1378 | { | 1378 | { |
| 1379 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, | 1379 | return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, |
| 1380 | __builtin_return_address(0)); | 1380 | GFP_KERNEL, __builtin_return_address(0)); |
| 1381 | } | 1381 | } |
| 1382 | EXPORT_SYMBOL_GPL(__get_vm_area); | 1382 | EXPORT_SYMBOL_GPL(__get_vm_area); |
| 1383 | 1383 | ||
| @@ -1385,8 +1385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | |||
| 1385 | unsigned long start, unsigned long end, | 1385 | unsigned long start, unsigned long end, |
| 1386 | const void *caller) | 1386 | const void *caller) |
| 1387 | { | 1387 | { |
| 1388 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, | 1388 | return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, |
| 1389 | caller); | 1389 | GFP_KERNEL, caller); |
| 1390 | } | 1390 | } |
| 1391 | 1391 | ||
| 1392 | /** | 1392 | /** |
| @@ -1401,14 +1401,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | |||
| 1401 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | 1401 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) |
| 1402 | { | 1402 | { |
| 1403 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, | 1403 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
| 1404 | -1, GFP_KERNEL, __builtin_return_address(0)); | 1404 | NUMA_NO_NODE, GFP_KERNEL, |
| 1405 | __builtin_return_address(0)); | ||
| 1405 | } | 1406 | } |
| 1406 | 1407 | ||
| 1407 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | 1408 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, |
| 1408 | const void *caller) | 1409 | const void *caller) |
| 1409 | { | 1410 | { |
| 1410 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, | 1411 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
| 1411 | -1, GFP_KERNEL, caller); | 1412 | NUMA_NO_NODE, GFP_KERNEL, caller); |
| 1412 | } | 1413 | } |
| 1413 | 1414 | ||
| 1414 | /** | 1415 | /** |
| @@ -1650,7 +1651,7 @@ fail: | |||
| 1650 | * @end: vm area range end | 1651 | * @end: vm area range end |
| 1651 | * @gfp_mask: flags for the page level allocator | 1652 | * @gfp_mask: flags for the page level allocator |
| 1652 | * @prot: protection mask for the allocated pages | 1653 | * @prot: protection mask for the allocated pages |
| 1653 | * @node: node to use for allocation or -1 | 1654 | * @node: node to use for allocation or NUMA_NO_NODE |
| 1654 | * @caller: caller's return address | 1655 | * @caller: caller's return address |
| 1655 | * | 1656 | * |
| 1656 | * Allocate enough pages to cover @size from the page level | 1657 | * Allocate enough pages to cover @size from the page level |
| @@ -1706,7 +1707,7 @@ fail: | |||
| 1706 | * @align: desired alignment | 1707 | * @align: desired alignment |
| 1707 | * @gfp_mask: flags for the page level allocator | 1708 | * @gfp_mask: flags for the page level allocator |
| 1708 | * @prot: protection mask for the allocated pages | 1709 | * @prot: protection mask for the allocated pages |
| 1709 | * @node: node to use for allocation or -1 | 1710 | * @node: node to use for allocation or NUMA_NO_NODE |
| 1710 | * @caller: caller's return address | 1711 | * @caller: caller's return address |
| 1711 | * | 1712 | * |
| 1712 | * Allocate enough pages to cover @size from the page level | 1713 | * Allocate enough pages to cover @size from the page level |
| @@ -1723,7 +1724,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
| 1723 | 1724 | ||
| 1724 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1725 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
| 1725 | { | 1726 | { |
| 1726 | return __vmalloc_node(size, 1, gfp_mask, prot, -1, | 1727 | return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, |
| 1727 | __builtin_return_address(0)); | 1728 | __builtin_return_address(0)); |
| 1728 | } | 1729 | } |
| 1729 | EXPORT_SYMBOL(__vmalloc); | 1730 | EXPORT_SYMBOL(__vmalloc); |
| @@ -1746,7 +1747,8 @@ static inline void *__vmalloc_node_flags(unsigned long size, | |||
| 1746 | */ | 1747 | */ |
| 1747 | void *vmalloc(unsigned long size) | 1748 | void *vmalloc(unsigned long size) |
| 1748 | { | 1749 | { |
| 1749 | return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); | 1750 | return __vmalloc_node_flags(size, NUMA_NO_NODE, |
| 1751 | GFP_KERNEL | __GFP_HIGHMEM); | ||
| 1750 | } | 1752 | } |
| 1751 | EXPORT_SYMBOL(vmalloc); | 1753 | EXPORT_SYMBOL(vmalloc); |
| 1752 | 1754 | ||
| @@ -1762,7 +1764,7 @@ EXPORT_SYMBOL(vmalloc); | |||
| 1762 | */ | 1764 | */ |
| 1763 | void *vzalloc(unsigned long size) | 1765 | void *vzalloc(unsigned long size) |
| 1764 | { | 1766 | { |
| 1765 | return __vmalloc_node_flags(size, -1, | 1767 | return __vmalloc_node_flags(size, NUMA_NO_NODE, |
| 1766 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); | 1768 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); |
| 1767 | } | 1769 | } |
| 1768 | EXPORT_SYMBOL(vzalloc); | 1770 | EXPORT_SYMBOL(vzalloc); |
| @@ -1781,7 +1783,8 @@ void *vmalloc_user(unsigned long size) | |||
| 1781 | 1783 | ||
| 1782 | ret = __vmalloc_node(size, SHMLBA, | 1784 | ret = __vmalloc_node(size, SHMLBA, |
| 1783 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | 1785 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, |
| 1784 | PAGE_KERNEL, -1, __builtin_return_address(0)); | 1786 | PAGE_KERNEL, NUMA_NO_NODE, |
| 1787 | __builtin_return_address(0)); | ||
| 1785 | if (ret) { | 1788 | if (ret) { |
| 1786 | area = find_vm_area(ret); | 1789 | area = find_vm_area(ret); |
| 1787 | area->flags |= VM_USERMAP; | 1790 | area->flags |= VM_USERMAP; |
| @@ -1846,7 +1849,7 @@ EXPORT_SYMBOL(vzalloc_node); | |||
| 1846 | void *vmalloc_exec(unsigned long size) | 1849 | void *vmalloc_exec(unsigned long size) |
| 1847 | { | 1850 | { |
| 1848 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, | 1851 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, |
| 1849 | -1, __builtin_return_address(0)); | 1852 | NUMA_NO_NODE, __builtin_return_address(0)); |
| 1850 | } | 1853 | } |
| 1851 | 1854 | ||
| 1852 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) | 1855 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) |
| @@ -1867,7 +1870,7 @@ void *vmalloc_exec(unsigned long size) | |||
| 1867 | void *vmalloc_32(unsigned long size) | 1870 | void *vmalloc_32(unsigned long size) |
| 1868 | { | 1871 | { |
| 1869 | return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, | 1872 | return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, |
| 1870 | -1, __builtin_return_address(0)); | 1873 | NUMA_NO_NODE, __builtin_return_address(0)); |
| 1871 | } | 1874 | } |
| 1872 | EXPORT_SYMBOL(vmalloc_32); | 1875 | EXPORT_SYMBOL(vmalloc_32); |
| 1873 | 1876 | ||
| @@ -1884,7 +1887,7 @@ void *vmalloc_32_user(unsigned long size) | |||
| 1884 | void *ret; | 1887 | void *ret; |
| 1885 | 1888 | ||
| 1886 | ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, | 1889 | ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, |
| 1887 | -1, __builtin_return_address(0)); | 1890 | NUMA_NO_NODE, __builtin_return_address(0)); |
| 1888 | if (ret) { | 1891 | if (ret) { |
| 1889 | area = find_vm_area(ret); | 1892 | area = find_vm_area(ret); |
| 1890 | area->flags |= VM_USERMAP; | 1893 | area->flags |= VM_USERMAP; |
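The vmalloc changes above are mechanical: every literal -1 node argument becomes NUMA_NO_NODE. The macro is the conventional "no preferred node" sentinel and, as far as these call sites are concerned, expands to the same value:

    #define NUMA_NO_NODE    (-1)    /* no node preference for the allocation */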
diff --git a/mm/vmscan.c b/mm/vmscan.c index 196709f5ee58..88c5fed8b9a4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -128,7 +128,7 @@ struct scan_control { | |||
| 128 | * From 0 .. 100. Higher means more swappy. | 128 | * From 0 .. 100. Higher means more swappy. |
| 129 | */ | 129 | */ |
| 130 | int vm_swappiness = 60; | 130 | int vm_swappiness = 60; |
| 131 | long vm_total_pages; /* The total number of pages which the VM controls */ | 131 | unsigned long vm_total_pages; /* The total number of pages which the VM controls */ |
| 132 | 132 | ||
| 133 | static LIST_HEAD(shrinker_list); | 133 | static LIST_HEAD(shrinker_list); |
| 134 | static DECLARE_RWSEM(shrinker_rwsem); | 134 | static DECLARE_RWSEM(shrinker_rwsem); |
| @@ -1579,16 +1579,6 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec) | |||
| 1579 | } | 1579 | } |
| 1580 | #endif | 1580 | #endif |
| 1581 | 1581 | ||
| 1582 | static int inactive_file_is_low_global(struct zone *zone) | ||
| 1583 | { | ||
| 1584 | unsigned long active, inactive; | ||
| 1585 | |||
| 1586 | active = zone_page_state(zone, NR_ACTIVE_FILE); | ||
| 1587 | inactive = zone_page_state(zone, NR_INACTIVE_FILE); | ||
| 1588 | |||
| 1589 | return (active > inactive); | ||
| 1590 | } | ||
| 1591 | |||
| 1592 | /** | 1582 | /** |
| 1593 | * inactive_file_is_low - check if file pages need to be deactivated | 1583 | * inactive_file_is_low - check if file pages need to be deactivated |
| 1594 | * @lruvec: LRU vector to check | 1584 | * @lruvec: LRU vector to check |
| @@ -1605,10 +1595,13 @@ static int inactive_file_is_low_global(struct zone *zone) | |||
| 1605 | */ | 1595 | */ |
| 1606 | static int inactive_file_is_low(struct lruvec *lruvec) | 1596 | static int inactive_file_is_low(struct lruvec *lruvec) |
| 1607 | { | 1597 | { |
| 1608 | if (!mem_cgroup_disabled()) | 1598 | unsigned long inactive; |
| 1609 | return mem_cgroup_inactive_file_is_low(lruvec); | 1599 | unsigned long active; |
| 1600 | |||
| 1601 | inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); | ||
| 1602 | active = get_lru_size(lruvec, LRU_ACTIVE_FILE); | ||
| 1610 | 1603 | ||
| 1611 | return inactive_file_is_low_global(lruvec_zone(lruvec)); | 1604 | return active > inactive; |
| 1612 | } | 1605 | } |
| 1613 | 1606 | ||
| 1614 | static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) | 1607 | static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) |
| @@ -1638,6 +1631,13 @@ static int vmscan_swappiness(struct scan_control *sc) | |||
| 1638 | return mem_cgroup_swappiness(sc->target_mem_cgroup); | 1631 | return mem_cgroup_swappiness(sc->target_mem_cgroup); |
| 1639 | } | 1632 | } |
| 1640 | 1633 | ||
| 1634 | enum scan_balance { | ||
| 1635 | SCAN_EQUAL, | ||
| 1636 | SCAN_FRACT, | ||
| 1637 | SCAN_ANON, | ||
| 1638 | SCAN_FILE, | ||
| 1639 | }; | ||
| 1640 | |||
| 1641 | /* | 1641 | /* |
| 1642 | * Determine how aggressively the anon and file LRU lists should be | 1642 | * Determine how aggressively the anon and file LRU lists should be |
| 1643 | * scanned. The relative value of each set of LRU lists is determined | 1643 | * scanned. The relative value of each set of LRU lists is determined |
| @@ -1650,15 +1650,16 @@ static int vmscan_swappiness(struct scan_control *sc) | |||
| 1650 | static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | 1650 | static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, |
| 1651 | unsigned long *nr) | 1651 | unsigned long *nr) |
| 1652 | { | 1652 | { |
| 1653 | unsigned long anon, file, free; | 1653 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
| 1654 | u64 fraction[2]; | ||
| 1655 | u64 denominator = 0; /* gcc */ | ||
| 1656 | struct zone *zone = lruvec_zone(lruvec); | ||
| 1654 | unsigned long anon_prio, file_prio; | 1657 | unsigned long anon_prio, file_prio; |
| 1658 | enum scan_balance scan_balance; | ||
| 1659 | unsigned long anon, file, free; | ||
| 1660 | bool force_scan = false; | ||
| 1655 | unsigned long ap, fp; | 1661 | unsigned long ap, fp; |
| 1656 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | ||
| 1657 | u64 fraction[2], denominator; | ||
| 1658 | enum lru_list lru; | 1662 | enum lru_list lru; |
| 1659 | int noswap = 0; | ||
| 1660 | bool force_scan = false; | ||
| 1661 | struct zone *zone = lruvec_zone(lruvec); | ||
| 1662 | 1663 | ||
| 1663 | /* | 1664 | /* |
| 1664 | * If the zone or memcg is small, nr[l] can be 0. This | 1665 | * If the zone or memcg is small, nr[l] can be 0. This |
| @@ -1676,11 +1677,30 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
| 1676 | force_scan = true; | 1677 | force_scan = true; |
| 1677 | 1678 | ||
| 1678 | /* If we have no swap space, do not bother scanning anon pages. */ | 1679 | /* If we have no swap space, do not bother scanning anon pages. */ |
| 1679 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1680 | if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { |
| 1680 | noswap = 1; | 1681 | scan_balance = SCAN_FILE; |
| 1681 | fraction[0] = 0; | 1682 | goto out; |
| 1682 | fraction[1] = 1; | 1683 | } |
| 1683 | denominator = 1; | 1684 | |
| 1685 | /* | ||
| 1686 | * Global reclaim will swap to prevent OOM even with no | ||
| 1687 | * swappiness, but memcg users want to use this knob to | ||
| 1688 | * disable swapping for individual groups completely when | ||
| 1689 | * using the memory controller's swap limit feature would be | ||
| 1690 | * too expensive. | ||
| 1691 | */ | ||
| 1692 | if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { | ||
| 1693 | scan_balance = SCAN_FILE; | ||
| 1694 | goto out; | ||
| 1695 | } | ||
| 1696 | |||
| 1697 | /* | ||
| 1698 | * Do not apply any pressure balancing cleverness when the | ||
| 1699 | * system is close to OOM, scan both anon and file equally | ||
| 1700 | * (unless the swappiness setting disagrees with swapping). | ||
| 1701 | */ | ||
| 1702 | if (!sc->priority && vmscan_swappiness(sc)) { | ||
| 1703 | scan_balance = SCAN_EQUAL; | ||
| 1684 | goto out; | 1704 | goto out; |
| 1685 | } | 1705 | } |
| 1686 | 1706 | ||
| @@ -1689,30 +1709,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
| 1689 | file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + | 1709 | file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + |
| 1690 | get_lru_size(lruvec, LRU_INACTIVE_FILE); | 1710 | get_lru_size(lruvec, LRU_INACTIVE_FILE); |
| 1691 | 1711 | ||
| 1712 | /* | ||
| 1713 | * If it's foreseeable that reclaiming the file cache won't be | ||
| 1714 | * enough to get the zone back into a desirable shape, we have | ||
| 1715 | * to swap. Better start now and leave the - probably heavily | ||
| 1716 | * thrashing - remaining file pages alone. | ||
| 1717 | */ | ||
| 1692 | if (global_reclaim(sc)) { | 1718 | if (global_reclaim(sc)) { |
| 1693 | free = zone_page_state(zone, NR_FREE_PAGES); | 1719 | free = zone_page_state(zone, NR_FREE_PAGES); |
| 1694 | if (unlikely(file + free <= high_wmark_pages(zone))) { | 1720 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
| 1695 | /* | 1721 | scan_balance = SCAN_ANON; |
| 1696 | * If we have very few page cache pages, force-scan | ||
| 1697 | * anon pages. | ||
| 1698 | */ | ||
| 1699 | fraction[0] = 1; | ||
| 1700 | fraction[1] = 0; | ||
| 1701 | denominator = 1; | ||
| 1702 | goto out; | ||
| 1703 | } else if (!inactive_file_is_low_global(zone)) { | ||
| 1704 | /* | ||
| 1705 | * There is enough inactive page cache, do not | ||
| 1706 | * reclaim anything from the working set right now. | ||
| 1707 | */ | ||
| 1708 | fraction[0] = 0; | ||
| 1709 | fraction[1] = 1; | ||
| 1710 | denominator = 1; | ||
| 1711 | goto out; | 1722 | goto out; |
| 1712 | } | 1723 | } |
| 1713 | } | 1724 | } |
| 1714 | 1725 | ||
| 1715 | /* | 1726 | /* |
| 1727 | * There is enough inactive page cache, do not reclaim | ||
| 1728 | * anything from the anonymous working set right now. | ||
| 1729 | */ | ||
| 1730 | if (!inactive_file_is_low(lruvec)) { | ||
| 1731 | scan_balance = SCAN_FILE; | ||
| 1732 | goto out; | ||
| 1733 | } | ||
| 1734 | |||
| 1735 | scan_balance = SCAN_FRACT; | ||
| 1736 | |||
| 1737 | /* | ||
| 1716 | * With swappiness at 100, anonymous and file have the same priority. | 1738 | * With swappiness at 100, anonymous and file have the same priority. |
| 1717 | * This scanning priority is essentially the inverse of IO cost. | 1739 | * This scanning priority is essentially the inverse of IO cost. |
| 1718 | */ | 1740 | */ |
| @@ -1759,19 +1781,92 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
| 1759 | out: | 1781 | out: |
| 1760 | for_each_evictable_lru(lru) { | 1782 | for_each_evictable_lru(lru) { |
| 1761 | int file = is_file_lru(lru); | 1783 | int file = is_file_lru(lru); |
| 1784 | unsigned long size; | ||
| 1762 | unsigned long scan; | 1785 | unsigned long scan; |
| 1763 | 1786 | ||
| 1764 | scan = get_lru_size(lruvec, lru); | 1787 | size = get_lru_size(lruvec, lru); |
| 1765 | if (sc->priority || noswap || !vmscan_swappiness(sc)) { | 1788 | scan = size >> sc->priority; |
| 1766 | scan >>= sc->priority; | 1789 | |
| 1767 | if (!scan && force_scan) | 1790 | if (!scan && force_scan) |
| 1768 | scan = SWAP_CLUSTER_MAX; | 1791 | scan = min(size, SWAP_CLUSTER_MAX); |
| 1792 | |||
| 1793 | switch (scan_balance) { | ||
| 1794 | case SCAN_EQUAL: | ||
| 1795 | /* Scan lists relative to size */ | ||
| 1796 | break; | ||
| 1797 | case SCAN_FRACT: | ||
| 1798 | /* | ||
| 1799 | * Scan types proportional to swappiness and | ||
| 1800 | * their relative recent reclaim efficiency. | ||
| 1801 | */ | ||
| 1769 | scan = div64_u64(scan * fraction[file], denominator); | 1802 | scan = div64_u64(scan * fraction[file], denominator); |
| 1803 | break; | ||
| 1804 | case SCAN_FILE: | ||
| 1805 | case SCAN_ANON: | ||
| 1806 | /* Scan one type exclusively */ | ||
| 1807 | if ((scan_balance == SCAN_FILE) != file) | ||
| 1808 | scan = 0; | ||
| 1809 | break; | ||
| 1810 | default: | ||
| 1811 | /* Look ma, no brain */ | ||
| 1812 | BUG(); | ||
| 1770 | } | 1813 | } |
| 1771 | nr[lru] = scan; | 1814 | nr[lru] = scan; |
| 1772 | } | 1815 | } |
| 1773 | } | 1816 | } |
| 1774 | 1817 | ||
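To make the rewritten out: loop easier to follow in isolation, here is a hedged userspace model of the per-LRU target calculation it performs: size >> priority, a SWAP_CLUSTER_MAX floor under force_scan, then one of the four scan_balance cases. The constants and sample fraction values are illustrative only.

    /*
     * Sketch of the per-LRU scan target: size >> priority, a floor of
     * SWAP_CLUSTER_MAX when force_scan is set, then the scan_balance
     * switch.  Types and constants are simplified for a standalone
     * example; this is not the kernel implementation.
     */
    #include <stdio.h>
    #include <stdint.h>

    #define SWAP_CLUSTER_MAX 32UL

    enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_FILE, SCAN_ANON };

    static unsigned long scan_target(unsigned long size, int priority,
                                     int force_scan, int is_file,
                                     enum scan_balance balance,
                                     const uint64_t fraction[2],
                                     uint64_t denominator)
    {
        unsigned long scan = size >> priority;

        if (!scan && force_scan)
            scan = size < SWAP_CLUSTER_MAX ? size : SWAP_CLUSTER_MAX;

        switch (balance) {
        case SCAN_EQUAL:        /* scan lists relative to their size */
            break;
        case SCAN_FRACT:        /* proportional to swappiness/rotation stats */
            scan = (unsigned long)(scan * fraction[is_file] / denominator);
            break;
        case SCAN_FILE:
        case SCAN_ANON:         /* scan one type exclusively */
            if ((balance == SCAN_FILE) != is_file)
                scan = 0;
            break;
        }
        return scan;
    }

    int main(void)
    {
        uint64_t fraction[2] = { 60, 140 };   /* anon, file (illustrative) */

        printf("anon target: %lu\n",
               scan_target(1UL << 20, 12, 1, 0, SCAN_FRACT, fraction, 200));
        printf("file target: %lu\n",
               scan_target(1UL << 20, 12, 1, 1, SCAN_FRACT, fraction, 200));
        return 0;
    }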
| 1818 | /* | ||
| 1819 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | ||
| 1820 | */ | ||
| 1821 | static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | ||
| 1822 | { | ||
| 1823 | unsigned long nr[NR_LRU_LISTS]; | ||
| 1824 | unsigned long nr_to_scan; | ||
| 1825 | enum lru_list lru; | ||
| 1826 | unsigned long nr_reclaimed = 0; | ||
| 1827 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | ||
| 1828 | struct blk_plug plug; | ||
| 1829 | |||
| 1830 | get_scan_count(lruvec, sc, nr); | ||
| 1831 | |||
| 1832 | blk_start_plug(&plug); | ||
| 1833 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | ||
| 1834 | nr[LRU_INACTIVE_FILE]) { | ||
| 1835 | for_each_evictable_lru(lru) { | ||
| 1836 | if (nr[lru]) { | ||
| 1837 | nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); | ||
| 1838 | nr[lru] -= nr_to_scan; | ||
| 1839 | |||
| 1840 | nr_reclaimed += shrink_list(lru, nr_to_scan, | ||
| 1841 | lruvec, sc); | ||
| 1842 | } | ||
| 1843 | } | ||
| 1844 | /* | ||
| 1845 | * On large memory systems, scan >> priority can become | ||
| 1846 | * really large. This is fine for the starting priority; | ||
| 1847 | * we want to put equal scanning pressure on each zone. | ||
| 1848 | * However, if the VM has a harder time of freeing pages, | ||
| 1849 | * with multiple processes reclaiming pages, the total | ||
| 1850 | * freeing target can get unreasonably large. | ||
| 1851 | */ | ||
| 1852 | if (nr_reclaimed >= nr_to_reclaim && | ||
| 1853 | sc->priority < DEF_PRIORITY) | ||
| 1854 | break; | ||
| 1855 | } | ||
| 1856 | blk_finish_plug(&plug); | ||
| 1857 | sc->nr_reclaimed += nr_reclaimed; | ||
| 1858 | |||
| 1859 | /* | ||
| 1860 | * Even if we did not try to evict anon pages at all, we want to | ||
| 1861 | * rebalance the anon lru active/inactive ratio. | ||
| 1862 | */ | ||
| 1863 | if (inactive_anon_is_low(lruvec)) | ||
| 1864 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, | ||
| 1865 | sc, LRU_ACTIVE_ANON); | ||
| 1866 | |||
| 1867 | throttle_vm_writeout(sc->gfp_mask); | ||
| 1868 | } | ||
| 1869 | |||
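The relocated shrink_lruvec() above works through the LRU targets in SWAP_CLUSTER_MAX batches and bails out early once the reclaim target is met at any priority below the starting one. A standalone model of that loop shape, with the actual page freeing stubbed out and DEF_PRIORITY assumed to be 12 as in this kernel series:

    /*
     * Userspace model of the shrink_lruvec() scan loop: consume the LRU
     * targets in SWAP_CLUSTER_MAX batches and stop early once
     * nr_to_reclaim has been met, unless we are still at the starting
     * priority.  The "reclaim" step is a stub for illustration.
     */
    #include <stdio.h>

    #define SWAP_CLUSTER_MAX 32UL
    #define DEF_PRIORITY     12
    #define NR_LISTS         4

    static unsigned long shrink_list_stub(unsigned long nr_to_scan)
    {
        return nr_to_scan / 2;    /* pretend half of each batch is freed */
    }

    int main(void)
    {
        unsigned long nr[NR_LISTS] = { 300, 40, 500, 260 };
        unsigned long nr_reclaimed = 0, nr_to_reclaim = 128;
        int priority = DEF_PRIORITY - 4;
        int more;

        do {
            more = 0;
            for (int lru = 0; lru < NR_LISTS; lru++) {
                unsigned long batch;

                if (!nr[lru])
                    continue;
                batch = nr[lru] < SWAP_CLUSTER_MAX ? nr[lru]
                                                   : SWAP_CLUSTER_MAX;
                nr[lru] -= batch;
                nr_reclaimed += shrink_list_stub(batch);
                if (nr[lru])
                    more = 1;
            }
            if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
                break;    /* don't overshoot the freeing target */
        } while (more);

        printf("reclaimed %lu of %lu requested\n", nr_reclaimed, nr_to_reclaim);
        return 0;
    }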
| 1775 | /* Use reclaim/compaction for costly allocs or under memory pressure */ | 1870 | /* Use reclaim/compaction for costly allocs or under memory pressure */ |
| 1776 | static bool in_reclaim_compaction(struct scan_control *sc) | 1871 | static bool in_reclaim_compaction(struct scan_control *sc) |
| 1777 | { | 1872 | { |
| @@ -1790,7 +1885,7 @@ static bool in_reclaim_compaction(struct scan_control *sc) | |||
| 1790 | * calls try_to_compact_zone() that it will have enough free pages to succeed. | 1885 | * calls try_to_compact_zone() that it will have enough free pages to succeed. |
| 1791 | * It will give up earlier than that if there is difficulty reclaiming pages. | 1886 | * It will give up earlier than that if there is difficulty reclaiming pages. |
| 1792 | */ | 1887 | */ |
| 1793 | static inline bool should_continue_reclaim(struct lruvec *lruvec, | 1888 | static inline bool should_continue_reclaim(struct zone *zone, |
| 1794 | unsigned long nr_reclaimed, | 1889 | unsigned long nr_reclaimed, |
| 1795 | unsigned long nr_scanned, | 1890 | unsigned long nr_scanned, |
| 1796 | struct scan_control *sc) | 1891 | struct scan_control *sc) |
| @@ -1830,15 +1925,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, | |||
| 1830 | * inactive lists are large enough, continue reclaiming | 1925 | * inactive lists are large enough, continue reclaiming |
| 1831 | */ | 1926 | */ |
| 1832 | pages_for_compaction = (2UL << sc->order); | 1927 | pages_for_compaction = (2UL << sc->order); |
| 1833 | inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); | 1928 | inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); |
| 1834 | if (nr_swap_pages > 0) | 1929 | if (get_nr_swap_pages() > 0) |
| 1835 | inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); | 1930 | inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); |
| 1836 | if (sc->nr_reclaimed < pages_for_compaction && | 1931 | if (sc->nr_reclaimed < pages_for_compaction && |
| 1837 | inactive_lru_pages > pages_for_compaction) | 1932 | inactive_lru_pages > pages_for_compaction) |
| 1838 | return true; | 1933 | return true; |
| 1839 | 1934 | ||
| 1840 | /* If compaction would go ahead or the allocation would succeed, stop */ | 1935 | /* If compaction would go ahead or the allocation would succeed, stop */ |
| 1841 | switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) { | 1936 | switch (compaction_suitable(zone, sc->order)) { |
| 1842 | case COMPACT_PARTIAL: | 1937 | case COMPACT_PARTIAL: |
| 1843 | case COMPACT_CONTINUE: | 1938 | case COMPACT_CONTINUE: |
| 1844 | return false; | 1939 | return false; |
| @@ -1847,98 +1942,48 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, | |||
| 1847 | } | 1942 | } |
| 1848 | } | 1943 | } |
| 1849 | 1944 | ||
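Read together with the hunk above, the continuation rule is: keep reclaiming while fewer than 2 << order pages have been freed and the inactive lists could still supply that many, otherwise defer to the compaction_suitable() verdict. A sketch under those assumptions, with the compaction check reduced to a boolean stub and the earlier no-progress bailouts left out:

    /*
     * Sketch of the "keep reclaiming for compaction?" decision: aim for
     * about 2 << order free pages before handing over to the compactor,
     * and only keep going while the inactive lists are big enough to
     * provide them.  compaction_suitable() is stubbed; this is not the
     * kernel implementation.
     */
    #include <stdio.h>
    #include <stdbool.h>

    static bool compaction_would_succeed_stub(void)
    {
        return true;    /* pretend the zone already suits compaction */
    }

    static bool should_continue_reclaim(int order, unsigned long nr_reclaimed,
                                        unsigned long inactive_file,
                                        unsigned long inactive_anon,
                                        bool swap_available)
    {
        unsigned long pages_for_compaction = 2UL << order;
        unsigned long inactive_lru_pages = inactive_file;

        if (swap_available)
            inactive_lru_pages += inactive_anon;

        if (nr_reclaimed < pages_for_compaction &&
            inactive_lru_pages > pages_for_compaction)
            return true;

        /* if compaction could already go ahead, stop reclaiming */
        return !compaction_would_succeed_stub();
    }

    int main(void)
    {
        printf("order-9, 200 freed, plenty inactive: %d\n",
               should_continue_reclaim(9, 200, 4096, 8192, true));
        printf("order-9, 2000 freed: %d\n",
               should_continue_reclaim(9, 2000, 4096, 8192, true));
        return 0;
    }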
| 1850 | /* | 1945 | static void shrink_zone(struct zone *zone, struct scan_control *sc) |
| 1851 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | ||
| 1852 | */ | ||
| 1853 | static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | ||
| 1854 | { | 1946 | { |
| 1855 | unsigned long nr[NR_LRU_LISTS]; | ||
| 1856 | unsigned long nr_to_scan; | ||
| 1857 | enum lru_list lru; | ||
| 1858 | unsigned long nr_reclaimed, nr_scanned; | 1947 | unsigned long nr_reclaimed, nr_scanned; |
| 1859 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | ||
| 1860 | struct blk_plug plug; | ||
| 1861 | |||
| 1862 | restart: | ||
| 1863 | nr_reclaimed = 0; | ||
| 1864 | nr_scanned = sc->nr_scanned; | ||
| 1865 | get_scan_count(lruvec, sc, nr); | ||
| 1866 | |||
| 1867 | blk_start_plug(&plug); | ||
| 1868 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | ||
| 1869 | nr[LRU_INACTIVE_FILE]) { | ||
| 1870 | for_each_evictable_lru(lru) { | ||
| 1871 | if (nr[lru]) { | ||
| 1872 | nr_to_scan = min_t(unsigned long, | ||
| 1873 | nr[lru], SWAP_CLUSTER_MAX); | ||
| 1874 | nr[lru] -= nr_to_scan; | ||
| 1875 | |||
| 1876 | nr_reclaimed += shrink_list(lru, nr_to_scan, | ||
| 1877 | lruvec, sc); | ||
| 1878 | } | ||
| 1879 | } | ||
| 1880 | /* | ||
| 1881 | * On large memory systems, scan >> priority can become | ||
| 1882 | * really large. This is fine for the starting priority; | ||
| 1883 | * we want to put equal scanning pressure on each zone. | ||
| 1884 | * However, if the VM has a harder time of freeing pages, | ||
| 1885 | * with multiple processes reclaiming pages, the total | ||
| 1886 | * freeing target can get unreasonably large. | ||
| 1887 | */ | ||
| 1888 | if (nr_reclaimed >= nr_to_reclaim && | ||
| 1889 | sc->priority < DEF_PRIORITY) | ||
| 1890 | break; | ||
| 1891 | } | ||
| 1892 | blk_finish_plug(&plug); | ||
| 1893 | sc->nr_reclaimed += nr_reclaimed; | ||
| 1894 | 1948 | ||
| 1895 | /* | 1949 | do { |
| 1896 | * Even if we did not try to evict anon pages at all, we want to | 1950 | struct mem_cgroup *root = sc->target_mem_cgroup; |
| 1897 | * rebalance the anon lru active/inactive ratio. | 1951 | struct mem_cgroup_reclaim_cookie reclaim = { |
| 1898 | */ | 1952 | .zone = zone, |
| 1899 | if (inactive_anon_is_low(lruvec)) | 1953 | .priority = sc->priority, |
| 1900 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, | 1954 | }; |
| 1901 | sc, LRU_ACTIVE_ANON); | 1955 | struct mem_cgroup *memcg; |
| 1902 | |||
| 1903 | /* reclaim/compaction might need reclaim to continue */ | ||
| 1904 | if (should_continue_reclaim(lruvec, nr_reclaimed, | ||
| 1905 | sc->nr_scanned - nr_scanned, sc)) | ||
| 1906 | goto restart; | ||
| 1907 | 1956 | ||
| 1908 | throttle_vm_writeout(sc->gfp_mask); | 1957 | nr_reclaimed = sc->nr_reclaimed; |
| 1909 | } | 1958 | nr_scanned = sc->nr_scanned; |
| 1910 | 1959 | ||
| 1911 | static void shrink_zone(struct zone *zone, struct scan_control *sc) | 1960 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
| 1912 | { | 1961 | do { |
| 1913 | struct mem_cgroup *root = sc->target_mem_cgroup; | 1962 | struct lruvec *lruvec; |
| 1914 | struct mem_cgroup_reclaim_cookie reclaim = { | ||
| 1915 | .zone = zone, | ||
| 1916 | .priority = sc->priority, | ||
| 1917 | }; | ||
| 1918 | struct mem_cgroup *memcg; | ||
| 1919 | 1963 | ||
| 1920 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 1964 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
| 1921 | do { | ||
| 1922 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); | ||
| 1923 | 1965 | ||
| 1924 | shrink_lruvec(lruvec, sc); | 1966 | shrink_lruvec(lruvec, sc); |
| 1925 | 1967 | ||
| 1926 | /* | 1968 | /* |
| 1927 | * Limit reclaim has historically picked one memcg and | 1969 | * Direct reclaim and kswapd have to scan all memory |
| 1928 | * scanned it with decreasing priority levels until | 1970 | * cgroups to fulfill the overall scan target for the |
| 1929 | * nr_to_reclaim had been reclaimed. This priority | 1971 | * zone. |
| 1930 | * cycle is thus over after a single memcg. | 1972 | * |
| 1931 | * | 1973 | * Limit reclaim, on the other hand, only cares about |
| 1932 | * Direct reclaim and kswapd, on the other hand, have | 1974 | * nr_to_reclaim pages to be reclaimed and it will |
| 1933 | * to scan all memory cgroups to fulfill the overall | 1975 | * retry with decreasing priority if one round over the |
| 1934 | * scan target for the zone. | 1976 | * whole hierarchy is not sufficient. |
| 1935 | */ | 1977 | */ |
| 1936 | if (!global_reclaim(sc)) { | 1978 | if (!global_reclaim(sc) && |
| 1937 | mem_cgroup_iter_break(root, memcg); | 1979 | sc->nr_reclaimed >= sc->nr_to_reclaim) { |
| 1938 | break; | 1980 | mem_cgroup_iter_break(root, memcg); |
| 1939 | } | 1981 | break; |
| 1940 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 1982 | } |
| 1941 | } while (memcg); | 1983 | memcg = mem_cgroup_iter(root, memcg, &reclaim); |
| 1984 | } while (memcg); | ||
| 1985 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, | ||
| 1986 | sc->nr_scanned - nr_scanned, sc)); | ||
| 1942 | } | 1987 | } |
| 1943 | 1988 | ||
| 1944 | /* Returns true if compaction should go ahead for a high-order request */ | 1989 | /* Returns true if compaction should go ahead for a high-order request */ |
| @@ -1958,7 +2003,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | |||
| 1958 | * a reasonable chance of completing and allocating the page | 2003 | * a reasonable chance of completing and allocating the page |
| 1959 | */ | 2004 | */ |
| 1960 | balance_gap = min(low_wmark_pages(zone), | 2005 | balance_gap = min(low_wmark_pages(zone), |
| 1961 | (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2006 | (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / |
| 1962 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | 2007 | KSWAPD_ZONE_BALANCE_GAP_RATIO); |
| 1963 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); | 2008 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); |
| 1964 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); | 2009 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); |
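The switch from present_pages to managed_pages means the balance gap now scales with memory the buddy allocator actually manages. The arithmetic itself is easy to check by hand; the sketch below assumes KSWAPD_ZONE_BALANCE_GAP_RATIO is 100, as in this kernel series, and uses made-up watermark numbers.

    /*
     * Arithmetic sketch for the compaction_ready() threshold: the balance
     * gap is the smaller of the low watermark and roughly 1% of managed
     * pages (assuming KSWAPD_ZONE_BALANCE_GAP_RATIO == 100), and the
     * watermark that must be met is high_wmark + balance_gap + 2 << order.
     */
    #include <stdio.h>

    #define KSWAPD_ZONE_BALANCE_GAP_RATIO 100UL    /* assumed value */

    int main(void)
    {
        unsigned long managed_pages = 1UL << 20;   /* 4GB of 4K pages */
        unsigned long low_wmark = 12288, high_wmark = 16384;
        int order = 9;                             /* THP-sized request */
        unsigned long balance_gap, watermark;

        balance_gap = (managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
                      KSWAPD_ZONE_BALANCE_GAP_RATIO;
        if (low_wmark < balance_gap)
            balance_gap = low_wmark;

        watermark = high_wmark + balance_gap + (2UL << order);
        printf("balance_gap=%lu watermark=%lu pages\n", balance_gap, watermark);
        return 0;
    }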
| @@ -2150,6 +2195,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 2150 | goto out; | 2195 | goto out; |
| 2151 | 2196 | ||
| 2152 | /* | 2197 | /* |
| 2198 | * If we're getting trouble reclaiming, start doing | ||
| 2199 | * writepage even in laptop mode. | ||
| 2200 | */ | ||
| 2201 | if (sc->priority < DEF_PRIORITY - 2) | ||
| 2202 | sc->may_writepage = 1; | ||
| 2203 | |||
| 2204 | /* | ||
| 2153 | * Try to write back as many pages as we just scanned. This | 2205 | * Try to write back as many pages as we just scanned. This |
| 2154 | * tends to cause slow streaming writers to write data to the | 2206 | * tends to cause slow streaming writers to write data to the |
| 2155 | * disk smoothly, at the dirtying rate, which is nice. But | 2207 | * disk smoothly, at the dirtying rate, which is nice. But |
| @@ -2300,7 +2352,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
| 2300 | { | 2352 | { |
| 2301 | unsigned long nr_reclaimed; | 2353 | unsigned long nr_reclaimed; |
| 2302 | struct scan_control sc = { | 2354 | struct scan_control sc = { |
| 2303 | .gfp_mask = gfp_mask, | 2355 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), |
| 2304 | .may_writepage = !laptop_mode, | 2356 | .may_writepage = !laptop_mode, |
| 2305 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2357 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
| 2306 | .may_unmap = 1, | 2358 | .may_unmap = 1, |
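The gfp_mask line folds in memalloc_noio_flags(), which, as used elsewhere in this series, strips the I/O-capable GFP bits for tasks that have entered a memalloc-noio section (for example runtime-PM resume paths), so reclaim triggered here cannot recurse into the block layer. A simplified model of that masking, with made-up flag values:

    /*
     * Simplified model of memalloc_noio_flags(): if the calling task is in
     * a memalloc-noio section, the I/O-capable GFP bits are masked off.
     * Bit values here are illustrative, not the kernel's.
     */
    #include <stdio.h>

    #define GFP_IO           0x40u
    #define GFP_FS           0x80u
    #define GFP_KERNEL       (0x10u | GFP_IO | GFP_FS)
    #define PF_MEMALLOC_NOIO 0x080000u

    static unsigned int memalloc_noio_flags(unsigned int task_flags,
                                            unsigned int gfp_mask)
    {
        if (task_flags & PF_MEMALLOC_NOIO)
            gfp_mask &= ~(GFP_IO | GFP_FS);
        return gfp_mask;
    }

    int main(void)
    {
        printf("normal task: %#x\n", memalloc_noio_flags(0, GFP_KERNEL));
        printf("noio task:   %#x\n",
               memalloc_noio_flags(PF_MEMALLOC_NOIO, GFP_KERNEL));
        return 0;
    }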
| @@ -2473,7 +2525,7 @@ static bool zone_balanced(struct zone *zone, int order, | |||
| 2473 | */ | 2525 | */ |
| 2474 | static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | 2526 | static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) |
| 2475 | { | 2527 | { |
| 2476 | unsigned long present_pages = 0; | 2528 | unsigned long managed_pages = 0; |
| 2477 | unsigned long balanced_pages = 0; | 2529 | unsigned long balanced_pages = 0; |
| 2478 | int i; | 2530 | int i; |
| 2479 | 2531 | ||
| @@ -2484,7 +2536,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | |||
| 2484 | if (!populated_zone(zone)) | 2536 | if (!populated_zone(zone)) |
| 2485 | continue; | 2537 | continue; |
| 2486 | 2538 | ||
| 2487 | present_pages += zone->present_pages; | 2539 | managed_pages += zone->managed_pages; |
| 2488 | 2540 | ||
| 2489 | /* | 2541 | /* |
| 2490 | * A special case here: | 2542 | * A special case here: |
| @@ -2494,18 +2546,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | |||
| 2494 | * they must be considered balanced here as well! | 2546 | * they must be considered balanced here as well! |
| 2495 | */ | 2547 | */ |
| 2496 | if (zone->all_unreclaimable) { | 2548 | if (zone->all_unreclaimable) { |
| 2497 | balanced_pages += zone->present_pages; | 2549 | balanced_pages += zone->managed_pages; |
| 2498 | continue; | 2550 | continue; |
| 2499 | } | 2551 | } |
| 2500 | 2552 | ||
| 2501 | if (zone_balanced(zone, order, 0, i)) | 2553 | if (zone_balanced(zone, order, 0, i)) |
| 2502 | balanced_pages += zone->present_pages; | 2554 | balanced_pages += zone->managed_pages; |
| 2503 | else if (!order) | 2555 | else if (!order) |
| 2504 | return false; | 2556 | return false; |
| 2505 | } | 2557 | } |
| 2506 | 2558 | ||
| 2507 | if (order) | 2559 | if (order) |
| 2508 | return balanced_pages >= (present_pages >> 2); | 2560 | return balanced_pages >= (managed_pages >> 2); |
| 2509 | else | 2561 | else |
| 2510 | return true; | 2562 | return true; |
| 2511 | } | 2563 | } |
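With present_pages replaced by managed_pages, the policy reads: every populated zone must be balanced for an order-0 wakeup, while higher orders only need balanced zones covering a quarter of the node's managed pages (all_unreclaimable zones counting as balanced). A sketch of that check with zone state faked as plain arrays; the all_unreclaimable special case is left out for brevity:

    /*
     * Sketch of the pgdat_balanced() policy: order-0 requires every
     * populated zone to meet its watermark, higher orders require balanced
     * zones to cover at least 25% of the node's managed pages.
     */
    #include <stdio.h>
    #include <stdbool.h>

    static bool pgdat_balanced(int order, int nr_zones,
                               const unsigned long *managed,
                               const bool *zone_ok)
    {
        unsigned long managed_pages = 0, balanced_pages = 0;

        for (int i = 0; i < nr_zones; i++) {
            if (!managed[i])            /* unpopulated zone */
                continue;
            managed_pages += managed[i];
            if (zone_ok[i])
                balanced_pages += managed[i];
            else if (!order)
                return false;           /* order-0: one bad zone is enough */
        }

        if (order)
            return balanced_pages >= (managed_pages >> 2);
        return true;
    }

    int main(void)
    {
        unsigned long managed[] = { 4096, 262144, 786432 };
        bool ok[] = { true, true, false };    /* biggest zone not balanced */

        printf("order-0 balanced: %d\n", pgdat_balanced(0, 3, managed, ok));
        printf("order-3 balanced: %d\n", pgdat_balanced(3, 3, managed, ok));
        return 0;
    }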
| @@ -2564,7 +2616,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
| 2564 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | 2616 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
| 2565 | int *classzone_idx) | 2617 | int *classzone_idx) |
| 2566 | { | 2618 | { |
| 2567 | struct zone *unbalanced_zone; | 2619 | bool pgdat_is_balanced = false; |
| 2568 | int i; | 2620 | int i; |
| 2569 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2621 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
| 2570 | unsigned long total_scanned; | 2622 | unsigned long total_scanned; |
| @@ -2595,9 +2647,6 @@ loop_again: | |||
| 2595 | 2647 | ||
| 2596 | do { | 2648 | do { |
| 2597 | unsigned long lru_pages = 0; | 2649 | unsigned long lru_pages = 0; |
| 2598 | int has_under_min_watermark_zone = 0; | ||
| 2599 | |||
| 2600 | unbalanced_zone = NULL; | ||
| 2601 | 2650 | ||
| 2602 | /* | 2651 | /* |
| 2603 | * Scan in the highmem->dma direction for the highest | 2652 | * Scan in the highmem->dma direction for the highest |
| @@ -2638,8 +2687,11 @@ loop_again: | |||
| 2638 | zone_clear_flag(zone, ZONE_CONGESTED); | 2687 | zone_clear_flag(zone, ZONE_CONGESTED); |
| 2639 | } | 2688 | } |
| 2640 | } | 2689 | } |
| 2641 | if (i < 0) | 2690 | |
| 2691 | if (i < 0) { | ||
| 2692 | pgdat_is_balanced = true; | ||
| 2642 | goto out; | 2693 | goto out; |
| 2694 | } | ||
| 2643 | 2695 | ||
| 2644 | for (i = 0; i <= end_zone; i++) { | 2696 | for (i = 0; i <= end_zone; i++) { |
| 2645 | struct zone *zone = pgdat->node_zones + i; | 2697 | struct zone *zone = pgdat->node_zones + i; |
| @@ -2689,7 +2741,7 @@ loop_again: | |||
| 2689 | * of the zone, whichever is smaller. | 2741 | * of the zone, whichever is smaller. |
| 2690 | */ | 2742 | */ |
| 2691 | balance_gap = min(low_wmark_pages(zone), | 2743 | balance_gap = min(low_wmark_pages(zone), |
| 2692 | (zone->present_pages + | 2744 | (zone->managed_pages + |
| 2693 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2745 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / |
| 2694 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | 2746 | KSWAPD_ZONE_BALANCE_GAP_RATIO); |
| 2695 | /* | 2747 | /* |
| @@ -2720,12 +2772,10 @@ loop_again: | |||
| 2720 | } | 2772 | } |
| 2721 | 2773 | ||
| 2722 | /* | 2774 | /* |
| 2723 | * If we've done a decent amount of scanning and | 2775 | * If we're getting trouble reclaiming, start doing |
| 2724 | * the reclaim ratio is low, start doing writepage | 2776 | * writepage even in laptop mode. |
| 2725 | * even in laptop mode | ||
| 2726 | */ | 2777 | */ |
| 2727 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 2778 | if (sc.priority < DEF_PRIORITY - 2) |
| 2728 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | ||
| 2729 | sc.may_writepage = 1; | 2779 | sc.may_writepage = 1; |
| 2730 | 2780 | ||
| 2731 | if (zone->all_unreclaimable) { | 2781 | if (zone->all_unreclaimable) { |
| @@ -2734,17 +2784,7 @@ loop_again: | |||
| 2734 | continue; | 2784 | continue; |
| 2735 | } | 2785 | } |
| 2736 | 2786 | ||
| 2737 | if (!zone_balanced(zone, testorder, 0, end_zone)) { | 2787 | if (zone_balanced(zone, testorder, 0, end_zone)) |
| 2738 | unbalanced_zone = zone; | ||
| 2739 | /* | ||
| 2740 | * We are still under min water mark. This | ||
| 2741 | * means that we have a GFP_ATOMIC allocation | ||
| 2742 | * failure risk. Hurry up! | ||
| 2743 | */ | ||
| 2744 | if (!zone_watermark_ok_safe(zone, order, | ||
| 2745 | min_wmark_pages(zone), end_zone, 0)) | ||
| 2746 | has_under_min_watermark_zone = 1; | ||
| 2747 | } else { | ||
| 2748 | /* | 2788 | /* |
| 2749 | * If a zone reaches its high watermark, | 2789 | * If a zone reaches its high watermark, |
| 2750 | * consider it to be no longer congested. It's | 2790 | * consider it to be no longer congested. It's |
| @@ -2753,8 +2793,6 @@ loop_again: | |||
| 2753 | * speculatively avoid congestion waits | 2793 | * speculatively avoid congestion waits |
| 2754 | */ | 2794 | */ |
| 2755 | zone_clear_flag(zone, ZONE_CONGESTED); | 2795 | zone_clear_flag(zone, ZONE_CONGESTED); |
| 2756 | } | ||
| 2757 | |||
| 2758 | } | 2796 | } |
| 2759 | 2797 | ||
| 2760 | /* | 2798 | /* |
| @@ -2766,17 +2804,9 @@ loop_again: | |||
| 2766 | pfmemalloc_watermark_ok(pgdat)) | 2804 | pfmemalloc_watermark_ok(pgdat)) |
| 2767 | wake_up(&pgdat->pfmemalloc_wait); | 2805 | wake_up(&pgdat->pfmemalloc_wait); |
| 2768 | 2806 | ||
| 2769 | if (pgdat_balanced(pgdat, order, *classzone_idx)) | 2807 | if (pgdat_balanced(pgdat, order, *classzone_idx)) { |
| 2808 | pgdat_is_balanced = true; | ||
| 2770 | break; /* kswapd: all done */ | 2809 | break; /* kswapd: all done */ |
| 2771 | /* | ||
| 2772 | * OK, kswapd is getting into trouble. Take a nap, then take | ||
| 2773 | * another pass across the zones. | ||
| 2774 | */ | ||
| 2775 | if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { | ||
| 2776 | if (has_under_min_watermark_zone) | ||
| 2777 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); | ||
| 2778 | else if (unbalanced_zone) | ||
| 2779 | wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10); | ||
| 2780 | } | 2810 | } |
| 2781 | 2811 | ||
| 2782 | /* | 2812 | /* |
| @@ -2788,9 +2818,9 @@ loop_again: | |||
| 2788 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) | 2818 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) |
| 2789 | break; | 2819 | break; |
| 2790 | } while (--sc.priority >= 0); | 2820 | } while (--sc.priority >= 0); |
| 2791 | out: | ||
| 2792 | 2821 | ||
| 2793 | if (!pgdat_balanced(pgdat, order, *classzone_idx)) { | 2822 | out: |
| 2823 | if (!pgdat_is_balanced) { | ||
| 2794 | cond_resched(); | 2824 | cond_resched(); |
| 2795 | 2825 | ||
| 2796 | try_to_freeze(); | 2826 | try_to_freeze(); |
| @@ -3053,7 +3083,7 @@ unsigned long global_reclaimable_pages(void) | |||
| 3053 | nr = global_page_state(NR_ACTIVE_FILE) + | 3083 | nr = global_page_state(NR_ACTIVE_FILE) + |
| 3054 | global_page_state(NR_INACTIVE_FILE); | 3084 | global_page_state(NR_INACTIVE_FILE); |
| 3055 | 3085 | ||
| 3056 | if (nr_swap_pages > 0) | 3086 | if (get_nr_swap_pages() > 0) |
| 3057 | nr += global_page_state(NR_ACTIVE_ANON) + | 3087 | nr += global_page_state(NR_ACTIVE_ANON) + |
| 3058 | global_page_state(NR_INACTIVE_ANON); | 3088 | global_page_state(NR_INACTIVE_ANON); |
| 3059 | 3089 | ||
| @@ -3067,7 +3097,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) | |||
| 3067 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + | 3097 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + |
| 3068 | zone_page_state(zone, NR_INACTIVE_FILE); | 3098 | zone_page_state(zone, NR_INACTIVE_FILE); |
| 3069 | 3099 | ||
| 3070 | if (nr_swap_pages > 0) | 3100 | if (get_nr_swap_pages() > 0) |
| 3071 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + | 3101 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + |
| 3072 | zone_page_state(zone, NR_INACTIVE_ANON); | 3102 | zone_page_state(zone, NR_INACTIVE_ANON); |
| 3073 | 3103 | ||
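Both reclaimable-page helpers in this hunk follow the same pattern: file LRU pages always count, anon LRU pages only when swap space remains, now queried through get_nr_swap_pages() instead of reading the old nr_swap_pages counter directly. A minimal sketch of that accounting:

    /*
     * Sketch of the reclaimable-pages accounting: the file LRUs always
     * count, the anon LRUs only when there is swap left to write them to.
     */
    #include <stdio.h>

    static unsigned long reclaimable_pages(unsigned long active_file,
                                           unsigned long inactive_file,
                                           unsigned long active_anon,
                                           unsigned long inactive_anon,
                                           unsigned long nr_swap_pages)
    {
        unsigned long nr = active_file + inactive_file;

        if (nr_swap_pages > 0)
            nr += active_anon + inactive_anon;
        return nr;
    }

    int main(void)
    {
        printf("with swap:    %lu\n",
               reclaimable_pages(1000, 3000, 500, 1500, 1UL << 18));
        printf("without swap: %lu\n",
               reclaimable_pages(1000, 3000, 500, 1500, 0));
        return 0;
    }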
| @@ -3280,9 +3310,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 3280 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 3310 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
| 3281 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 3311 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
| 3282 | .may_swap = 1, | 3312 | .may_swap = 1, |
| 3283 | .nr_to_reclaim = max_t(unsigned long, nr_pages, | 3313 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), |
| 3284 | SWAP_CLUSTER_MAX), | 3314 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), |
| 3285 | .gfp_mask = gfp_mask, | ||
| 3286 | .order = order, | 3315 | .order = order, |
| 3287 | .priority = ZONE_RECLAIM_PRIORITY, | 3316 | .priority = ZONE_RECLAIM_PRIORITY, |
| 3288 | }; | 3317 | }; |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 9800306c8195..e1d8ed172c42 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -142,7 +142,7 @@ int calculate_normal_threshold(struct zone *zone) | |||
| 142 | * 125 1024 10 16-32 GB 9 | 142 | * 125 1024 10 16-32 GB 9 |
| 143 | */ | 143 | */ |
| 144 | 144 | ||
| 145 | mem = zone->present_pages >> (27 - PAGE_SHIFT); | 145 | mem = zone->managed_pages >> (27 - PAGE_SHIFT); |
| 146 | 146 | ||
| 147 | threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); | 147 | threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); |
| 148 | 148 | ||
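The drift threshold now scales with managed_pages rather than present_pages; the formula is 2 * fls(num_online_cpus()) * (1 + fls(zone size in 128MB units)), and the table in the surrounding comment tops out at 125, so the sketch below clamps there as an assumption about code outside this hunk. fls() is reimplemented for userspace and PAGE_SHIFT is taken to be 12.

    /*
     * Sketch of the calculate_normal_threshold() formula for the per-cpu
     * vmstat drift threshold: 2 * fls(cpus) * (1 + fls(zone size in 128MB
     * units)), clamped at 125 (assumed from the comment's table).
     */
    #include <stdio.h>

    static int fls_ul(unsigned long x)
    {
        int r = 0;

        while (x) {
            x >>= 1;
            r++;
        }
        return r;
    }

    static int normal_threshold(unsigned long managed_pages, int online_cpus)
    {
        unsigned long mem = managed_pages >> (27 - 12);   /* 128MB units */
        int threshold = 2 * fls_ul(online_cpus) * (1 + fls_ul(mem));

        return threshold < 125 ? threshold : 125;
    }

    int main(void)
    {
        /* 16GB zone (4K pages), 8 CPUs */
        printf("threshold = %d\n", normal_threshold(16UL << 18, 8));
        return 0;
    }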
| @@ -628,7 +628,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = { | |||
| 628 | #ifdef CONFIG_CMA | 628 | #ifdef CONFIG_CMA |
| 629 | "CMA", | 629 | "CMA", |
| 630 | #endif | 630 | #endif |
| 631 | #ifdef CONFIG_MEMORY_ISOLATION | ||
| 631 | "Isolate", | 632 | "Isolate", |
| 633 | #endif | ||
| 632 | }; | 634 | }; |
| 633 | 635 | ||
| 634 | static void *frag_start(struct seq_file *m, loff_t *pos) | 636 | static void *frag_start(struct seq_file *m, loff_t *pos) |
| @@ -768,7 +770,6 @@ const char * const vmstat_text[] = { | |||
| 768 | "kswapd_inodesteal", | 770 | "kswapd_inodesteal", |
| 769 | "kswapd_low_wmark_hit_quickly", | 771 | "kswapd_low_wmark_hit_quickly", |
| 770 | "kswapd_high_wmark_hit_quickly", | 772 | "kswapd_high_wmark_hit_quickly", |
| 771 | "kswapd_skip_congestion_wait", | ||
| 772 | "pageoutrun", | 773 | "pageoutrun", |
| 773 | "allocstall", | 774 | "allocstall", |
| 774 | 775 | ||
| @@ -890,7 +891,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, | |||
| 890 | int mtype; | 891 | int mtype; |
| 891 | unsigned long pfn; | 892 | unsigned long pfn; |
| 892 | unsigned long start_pfn = zone->zone_start_pfn; | 893 | unsigned long start_pfn = zone->zone_start_pfn; |
| 893 | unsigned long end_pfn = start_pfn + zone->spanned_pages; | 894 | unsigned long end_pfn = zone_end_pfn(zone); |
| 894 | unsigned long count[MIGRATE_TYPES] = { 0, }; | 895 | unsigned long count[MIGRATE_TYPES] = { 0, }; |
| 895 | 896 | ||
| 896 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 897 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index fd05c81cb348..de2e950a0a7a 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c | |||
| @@ -87,7 +87,7 @@ struct virtio_chan { | |||
| 87 | /* This is global limit. Since we don't have a global structure, | 87 | /* This is global limit. Since we don't have a global structure, |
| 88 | * will be placing it in each channel. | 88 | * will be placing it in each channel. |
| 89 | */ | 89 | */ |
| 90 | int p9_max_pages; | 90 | unsigned long p9_max_pages; |
| 91 | /* Scatterlist: can be too big for stack. */ | 91 | /* Scatterlist: can be too big for stack. */ |
| 92 | struct scatterlist sg[VIRTQUEUE_NUM]; | 92 | struct scatterlist sg[VIRTQUEUE_NUM]; |
| 93 | 93 | ||
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a5b89a6fec6d..7427ab5e27d8 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/vmalloc.h> | 21 | #include <linux/vmalloc.h> |
| 22 | #include <linux/export.h> | 22 | #include <linux/export.h> |
| 23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
| 24 | #include <linux/pm_runtime.h> | ||
| 24 | 25 | ||
| 25 | #include "net-sysfs.h" | 26 | #include "net-sysfs.h" |
| 26 | 27 | ||
| @@ -1257,6 +1258,8 @@ void netdev_unregister_kobject(struct net_device * net) | |||
| 1257 | 1258 | ||
| 1258 | remove_queue_kobjects(net); | 1259 | remove_queue_kobjects(net); |
| 1259 | 1260 | ||
| 1261 | pm_runtime_set_memalloc_noio(dev, false); | ||
| 1262 | |||
| 1260 | device_del(dev); | 1263 | device_del(dev); |
| 1261 | } | 1264 | } |
| 1262 | 1265 | ||
| @@ -1301,6 +1304,8 @@ int netdev_register_kobject(struct net_device *net) | |||
| 1301 | return error; | 1304 | return error; |
| 1302 | } | 1305 | } |
| 1303 | 1306 | ||
| 1307 | pm_runtime_set_memalloc_noio(dev, true); | ||
| 1308 | |||
| 1304 | return error; | 1309 | return error; |
| 1305 | } | 1310 | } |
| 1306 | 1311 | ||
