path: root/mm
author     Linus Torvalds <torvalds@linux-foundation.org>  2013-11-13 01:45:43 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-11-13 01:45:43 -0500
commit     5cbb3d216e2041700231bcfc383ee5f8b7fc8b74 (patch)
tree       a738fa82dbcefa9bd283c08bc67f38827be63937 /mm
parent     9bc9ccd7db1c9f043f75380b5a5b94912046a60e (diff)
parent     4e9b45a19241354daec281d7a785739829b52359 (diff)
Merge branch 'akpm' (patches from Andrew Morton)
Merge first patch-bomb from Andrew Morton:
 "Quite a lot of other stuff is banked up awaiting further next->mainline
  merging, but this batch contains:

   - Lots of random misc patches
   - OCFS2
   - Most of MM
   - backlight updates
   - lib/ updates
   - printk updates
   - checkpatch updates
   - epoll tweaking
   - rtc updates
   - hfs
   - hfsplus
   - documentation
   - procfs
   - update gcov to gcc-4.7 format
   - IPC"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (269 commits)
  ipc, msg: fix message length check for negative values
  ipc/util.c: remove unnecessary work pending test
  devpts: plug the memory leak in kill_sb
  ./Makefile: export initial ramdisk compression config option
  init/Kconfig: add option to disable kernel compression
  drivers: w1: make w1_slave::flags long to avoid memory corruption
  drivers/w1/masters/ds1wm.c: use dev_get_platdata()
  drivers/memstick/core/ms_block.c: fix unreachable state in h_msb_read_page()
  drivers/memstick/core/mspro_block.c: fix attributes array allocation
  drivers/pps/clients/pps-gpio.c: remove redundant of_match_ptr
  kernel/panic.c: reduce 1 byte usage for print tainted buffer
  gcov: reuse kbasename helper
  kernel/gcov/fs.c: use pr_warn()
  kernel/module.c: use pr_foo()
  gcov: compile specific gcov implementation based on gcc version
  gcov: add support for gcc 4.7 gcov format
  gcov: move gcov structs definitions to a gcc version specific file
  kernel/taskstats.c: return -ENOMEM when alloc memory fails in add_del_listener()
  kernel/taskstats.c: add nla_nest_cancel() for failure processing between nla_nest_start() and nla_nest_end()
  kernel/sysctl_binary.c: use scnprintf() instead of snprintf()
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            17
-rw-r--r--  mm/bootmem.c           8
-rw-r--r--  mm/compaction.c        7
-rw-r--r--  mm/huge_memory.c      78
-rw-r--r--  mm/kmemleak.c          4
-rw-r--r--  mm/ksm.c               4
-rw-r--r--  mm/memblock.c        124
-rw-r--r--  mm/memcontrol.c       97
-rw-r--r--  mm/memory-failure.c   36
-rw-r--r--  mm/memory.c            2
-rw-r--r--  mm/memory_hotplug.c   65
-rw-r--r--  mm/mempolicy.c        62
-rw-r--r--  mm/mmap.c             16
-rw-r--r--  mm/mprotect.c         10
-rw-r--r--  mm/nobootmem.c        25
-rw-r--r--  mm/nommu.c             3
-rw-r--r--  mm/page_alloc.c       34
-rw-r--r--  mm/readahead.c         8
-rw-r--r--  mm/slab.c              2
-rw-r--r--  mm/slab.h              6
-rw-r--r--  mm/slab_common.c       2
-rw-r--r--  mm/slub.c              2
-rw-r--r--  mm/sparse.c           53
-rw-r--r--  mm/swapfile.c         16
-rw-r--r--  mm/util.c             13
-rw-r--r--  mm/vmalloc.c          48
-rw-r--r--  mm/vmstat.c           22
-rw-r--r--  mm/zswap.c           195
28 files changed, 569 insertions, 390 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 394838f489eb..3f4ffda152bb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,11 +153,18 @@ config MOVABLE_NODE
 	  help
 	  Allow a node to have only movable memory. Pages used by the kernel,
 	  such as direct mapping pages cannot be migrated. So the corresponding
-	  memory device cannot be hotplugged. This option allows users to
-	  online all the memory of a node as movable memory so that the whole
-	  node can be hotplugged. Users who don't use the memory hotplug
-	  feature are fine with this option on since they don't online memory
-	  as movable.
+	  memory device cannot be hotplugged. This option allows the following
+	  two things:
+	  - When the system is booting, node full of hotpluggable memory can
+	    be arranged to have only movable memory so that the whole node can
+	    be hot-removed. (need movable_node boot option specified).
+	  - After the system is up, the option allows users to online all the
+	    memory of a node as movable memory so that the whole node can be
+	    hot-removed.
+
+	  Users who don't use the memory hotplug feature are fine with this
+	  option on since they don't specify movable_node boot option or they
+	  don't online memory as movable.
 
 	  Say Y here if you want to hotplug a whole node.
 	  Say N here if you want kernel to use memory on all nodes evenly.
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 6ab7744e692e..90bd3507b413 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -172,11 +172,12 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 {
 	struct page *page;
-	unsigned long start, end, pages, count = 0;
+	unsigned long *map, start, end, pages, count = 0;
 
 	if (!bdata->node_bootmem_map)
 		return 0;
 
+	map = bdata->node_bootmem_map;
 	start = bdata->node_min_pfn;
 	end = bdata->node_low_pfn;
 
@@ -184,10 +185,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 		bdata - bootmem_node_data, start, end);
 
 	while (start < end) {
-		unsigned long *map, idx, vec;
+		unsigned long idx, vec;
 		unsigned shift;
 
-		map = bdata->node_bootmem_map;
 		idx = start - bdata->node_min_pfn;
 		shift = idx & (BITS_PER_LONG - 1);
 		/*
@@ -784,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
 	/* update goal according ...MAX_DMA32_PFN */
-	end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	end_pfn = pgdat_end_pfn(pgdat);
 
 	if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
 	    (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
diff --git a/mm/compaction.c b/mm/compaction.c
index b5326b141a25..805165bcd3dd 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -235,10 +235,9 @@ static bool suitable_migration_target(struct page *page)
 }
 
 /*
- * Isolate free pages onto a private freelist. Caller must hold zone->lock.
- * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
- * pages inside of the pageblock (even though it may still end up isolating
- * some pages).
+ * Isolate free pages onto a private freelist. If @strict is true, will abort
+ * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
+ * (even though it may still end up isolating some pages).
  */
 static unsigned long isolate_freepages_block(struct compact_control *cc,
 					unsigned long blockpfn,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2612f60f53ee..0556c6a44959 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -27,11 +27,12 @@
 #include "internal.h"
 
 /*
- * By default transparent hugepage support is enabled for all mappings
- * and khugepaged scans all mappings. Defrag is only invoked by
- * khugepaged hugepage allocations and by page faults inside
- * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
- * allocations.
+ * By default transparent hugepage support is disabled in order that avoid
+ * to risk increase the memory footprint of applications without a guaranteed
+ * benefit. When transparent hugepage support is enabled, is for all mappings,
+ * and khugepaged scans all mappings.
+ * Defrag is invoked by khugepaged hugepage allocations and by page faults
+ * for all hugepage allocations.
  */
 unsigned long transparent_hugepage_flags __read_mostly =
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
@@ -758,14 +759,6 @@ static inline struct page *alloc_hugepage_vma(int defrag,
 			       HPAGE_PMD_ORDER, vma, haddr, nd);
 }
 
-#ifndef CONFIG_NUMA
-static inline struct page *alloc_hugepage(int defrag)
-{
-	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-			   HPAGE_PMD_ORDER);
-}
-#endif
-
 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
 		struct page *zero_page)
@@ -2198,7 +2191,34 @@ static void khugepaged_alloc_sleep(void)
 			msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
 }
 
+static int khugepaged_node_load[MAX_NUMNODES];
+
 #ifdef CONFIG_NUMA
+static int khugepaged_find_target_node(void)
+{
+	static int last_khugepaged_target_node = NUMA_NO_NODE;
+	int nid, target_node = 0, max_value = 0;
+
+	/* find first node with max normal pages hit */
+	for (nid = 0; nid < MAX_NUMNODES; nid++)
+		if (khugepaged_node_load[nid] > max_value) {
+			max_value = khugepaged_node_load[nid];
+			target_node = nid;
+		}
+
+	/* do some balance if several nodes have the same hit record */
+	if (target_node <= last_khugepaged_target_node)
+		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
+				nid++)
+			if (max_value == khugepaged_node_load[nid]) {
+				target_node = nid;
+				break;
+			}
+
+	last_khugepaged_target_node = target_node;
+	return target_node;
+}
+
 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 {
 	if (IS_ERR(*hpage)) {
@@ -2232,9 +2252,8 @@ static struct page
 	 * mmap_sem in read mode is good idea also to allow greater
 	 * scalability.
 	 */
-	*hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
-				      node, __GFP_OTHER_NODE);
-
+	*hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
+		khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
 	/*
 	 * After allocating the hugepage, release the mmap_sem read lock in
 	 * preparation for taking it in write mode.
@@ -2250,6 +2269,17 @@ static struct page
 	return *hpage;
 }
 #else
+static int khugepaged_find_target_node(void)
+{
+	return 0;
+}
+
+static inline struct page *alloc_hugepage(int defrag)
+{
+	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+			   HPAGE_PMD_ORDER);
+}
+
 static struct page *khugepaged_alloc_hugepage(bool *wait)
 {
 	struct page *hpage;
@@ -2456,6 +2486,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	if (pmd_trans_huge(*pmd))
 		goto out;
 
+	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, _address += PAGE_SIZE) {
@@ -2472,12 +2503,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		if (unlikely(!page))
 			goto out_unmap;
 		/*
-		 * Chose the node of the first page. This could
-		 * be more sophisticated and look at more pages,
-		 * but isn't for now.
+		 * Record which node the original page is from and save this
+		 * information to khugepaged_node_load[].
+		 * Khupaged will allocate hugepage from the node has the max
+		 * hit record.
 		 */
-		if (node == NUMA_NO_NODE)
-			node = page_to_nid(page);
+		node = page_to_nid(page);
+		khugepaged_node_load[node]++;
 		VM_BUG_ON(PageCompound(page));
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
 			goto out_unmap;
@@ -2492,9 +2524,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	ret = 1;
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
-	if (ret)
+	if (ret) {
+		node = khugepaged_find_target_node();
 		/* collapse_huge_page will return with the mmap_sem released */
 		collapse_huge_page(mm, address, hpage, vma, node);
+	}
 out:
 	return ret;
 }
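To see how the khugepaged_find_target_node() policy added above behaves when several nodes tie for the highest hit count, here is a small userspace re-creation of the same selection loop; the array size, the node loads and the function names are made-up stand-ins for this sketch, not kernel code.

#include <stdio.h>

#define MAX_NUMNODES 4
#define NUMA_NO_NODE (-1)

static int node_load[MAX_NUMNODES];

static int find_target_node(void)
{
	static int last_target_node = NUMA_NO_NODE;
	int nid, target_node = 0, max_value = 0;

	/* first node with the highest hit count wins by default */
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		if (node_load[nid] > max_value) {
			max_value = node_load[nid];
			target_node = nid;
		}

	/* rotate among nodes that tie with the previous winner */
	if (target_node <= last_target_node)
		for (nid = last_target_node + 1; nid < MAX_NUMNODES; nid++)
			if (node_load[nid] == max_value) {
				target_node = nid;
				break;
			}

	last_target_node = target_node;
	return target_node;
}

int main(void)
{
	/* nodes 1 and 3 tie: repeated calls alternate between them */
	node_load[1] = 8;
	node_load[3] = 8;
	for (int i = 0; i < 4; i++)
		printf("pass %d -> node %d\n", i, find_target_node());
	return 0;
}

Running this prints nodes 1, 3, 1, 3 in turn, which is the "do some balance" behaviour the comment in the hunk describes.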
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e126b0ef9ad2..31f01c5011e5 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -753,7 +753,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
 	}
 
 	spin_lock_irqsave(&object->lock, flags);
-	if (ptr + size > object->pointer + object->size) {
+	if (size == SIZE_MAX) {
+		size = object->pointer + object->size - ptr;
+	} else if (ptr + size > object->pointer + object->size) {
 		kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
 		dump_object_info(object);
 		kmem_cache_free(scan_area_cache, area);
diff --git a/mm/ksm.c b/mm/ksm.c
index 0bea2b262a47..175fff79dc95 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2309,8 +2309,8 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj,
 	 * Allocate stable and unstable together:
 	 * MAXSMP NODES_SHIFT 10 will use 16kB.
 	 */
-	buf = kcalloc(nr_node_ids + nr_node_ids,
-		      sizeof(*buf), GFP_KERNEL | __GFP_ZERO);
+	buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
+		      GFP_KERNEL);
 	/* Let us assume that RB_ROOT is NULL is zero */
 	if (!buf)
 		err = -ENOMEM;
diff --git a/mm/memblock.c b/mm/memblock.c
index 0ac412a0a7ee..53e477bb5558 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,6 +20,8 @@
 #include <linux/seq_file.h>
 #include <linux/memblock.h>
 
+#include <asm-generic/sections.h>
+
 static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
 static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
 
@@ -32,6 +34,7 @@ struct memblock memblock __initdata_memblock = {
 	.reserved.cnt		= 1,	/* empty dummy entry */
 	.reserved.max		= INIT_MEMBLOCK_REGIONS,
 
+	.bottom_up		= false,
 	.current_limit		= MEMBLOCK_ALLOC_ANYWHERE,
 };
 
@@ -82,6 +85,73 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
 	return (i < type->cnt) ? i : -1;
 }
 
+/*
+ * __memblock_find_range_bottom_up - find free area utility in bottom-up
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * Utility called from memblock_find_in_range_node(), find free area bottom-up.
+ *
+ * RETURNS:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
+				phys_addr_t size, phys_addr_t align, int nid)
+{
+	phys_addr_t this_start, this_end, cand;
+	u64 i;
+
+	for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
+		this_start = clamp(this_start, start, end);
+		this_end = clamp(this_end, start, end);
+
+		cand = round_up(this_start, align);
+		if (cand < this_end && this_end - cand >= size)
+			return cand;
+	}
+
+	return 0;
+}
+
+/**
+ * __memblock_find_range_top_down - find free area utility, in top-down
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * Utility called from memblock_find_in_range_node(), find free area top-down.
+ *
+ * RETURNS:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
+			       phys_addr_t size, phys_addr_t align, int nid)
+{
+	phys_addr_t this_start, this_end, cand;
+	u64 i;
+
+	for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+		this_start = clamp(this_start, start, end);
+		this_end = clamp(this_end, start, end);
+
+		if (this_end < size)
+			continue;
+
+		cand = round_down(this_end - size, align);
+		if (cand >= this_start)
+			return cand;
+	}
+
+	return 0;
+}
+
 /**
  * memblock_find_in_range_node - find free area in given range and node
  * @start: start of candidate range
@@ -92,15 +162,23 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
  *
  * Find @size free area aligned to @align in the specified range and node.
  *
+ * When allocation direction is bottom-up, the @start should be greater
+ * than the end of the kernel image. Otherwise, it will be trimmed. The
+ * reason is that we want the bottom-up allocation just near the kernel
+ * image so it is highly likely that the allocated memory and the kernel
+ * will reside in the same node.
+ *
+ * If bottom-up allocation failed, will try to allocate memory top-down.
+ *
  * RETURNS:
- * Found address on success, %0 on failure.
+ * Found address on success, 0 on failure.
  */
 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
 					phys_addr_t end, phys_addr_t size,
 					phys_addr_t align, int nid)
 {
-	phys_addr_t this_start, this_end, cand;
-	u64 i;
+	int ret;
+	phys_addr_t kernel_end;
 
 	/* pump up @end */
 	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
@@ -109,19 +187,39 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
 	/* avoid allocating the first page */
 	start = max_t(phys_addr_t, start, PAGE_SIZE);
 	end = max(start, end);
+	kernel_end = __pa_symbol(_end);
 
-	for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
-		this_start = clamp(this_start, start, end);
-		this_end = clamp(this_end, start, end);
+	/*
+	 * try bottom-up allocation only when bottom-up mode
+	 * is set and @end is above the kernel image.
+	 */
+	if (memblock_bottom_up() && end > kernel_end) {
+		phys_addr_t bottom_up_start;
 
-		if (this_end < size)
-			continue;
+		/* make sure we will allocate above the kernel */
+		bottom_up_start = max(start, kernel_end);
 
-		cand = round_down(this_end - size, align);
-		if (cand >= this_start)
-			return cand;
+		/* ok, try bottom-up allocation first */
+		ret = __memblock_find_range_bottom_up(bottom_up_start, end,
+						      size, align, nid);
+		if (ret)
+			return ret;
+
+		/*
+		 * we always limit bottom-up allocation above the kernel,
+		 * but top-down allocation doesn't have the limit, so
+		 * retrying top-down allocation may succeed when bottom-up
+		 * allocation failed.
+		 *
+		 * bottom-up allocation is expected to be fail very rarely,
+		 * so we use WARN_ONCE() here to see the stack trace if
+		 * fail happens.
		 */
+		WARN_ONCE(1, "memblock: bottom-up allocation failed, "
+			     "memory hotunplug may be affected\n");
 	}
-	return 0;
+
+	return __memblock_find_range_top_down(start, end, size, align, nid);
 }
 
 /**
@@ -134,7 +232,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
 * Find @size free area aligned to @align in the specified range.
 *
 * RETURNS:
- * Found address on success, %0 on failure.
+ * Found address on success, 0 on failure.
 */
 phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
					phys_addr_t end, phys_addr_t size,
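As a rough illustration of the two search orders introduced above, the following standalone program mimics the candidate arithmetic of __memblock_find_range_bottom_up() and __memblock_find_range_top_down() over two invented free ranges; the addresses, the fake kernel end and the helper names here are assumptions made for the demo only, not values from the patch.

#include <stdio.h>

typedef unsigned long long phys_t;

struct range { phys_t start, end; };

/* pretend free memory: [16M,64M) and [512M,1G) */
static const struct range free_ranges[] = {
	{ 0x01000000ULL, 0x04000000ULL },
	{ 0x20000000ULL, 0x40000000ULL },
};

static phys_t round_up_p(phys_t x, phys_t a)   { return (x + a - 1) & ~(a - 1); }
static phys_t round_down_p(phys_t x, phys_t a) { return x & ~(a - 1); }

static phys_t find_bottom_up(phys_t start, phys_t end, phys_t size, phys_t align)
{
	for (unsigned i = 0; i < 2; i++) {
		phys_t lo = free_ranges[i].start > start ? free_ranges[i].start : start;
		phys_t hi = free_ranges[i].end < end ? free_ranges[i].end : end;
		phys_t cand = round_up_p(lo, align);

		if (cand < hi && hi - cand >= size)
			return cand;	/* lowest suitable address */
	}
	return 0;
}

static phys_t find_top_down(phys_t start, phys_t end, phys_t size, phys_t align)
{
	for (int i = 1; i >= 0; i--) {
		phys_t lo = free_ranges[i].start > start ? free_ranges[i].start : start;
		phys_t hi = free_ranges[i].end < end ? free_ranges[i].end : end;

		if (hi < size)
			continue;
		phys_t cand = round_down_p(hi - size, align);
		if (cand >= lo)
			return cand;	/* highest suitable address */
	}
	return 0;
}

int main(void)
{
	phys_t kernel_end = 0x02000000ULL;	/* pretend _end is at 32M */
	phys_t size = 0x00100000ULL, align = 0x1000ULL;

	/* bottom-up is constrained to start above the kernel image */
	printf("bottom-up: %#llx\n", find_bottom_up(kernel_end, ~0ULL, size, align));
	printf("top-down : %#llx\n", find_top_down(0, ~0ULL, size, align));
	return 0;
}

With these made-up ranges the bottom-up search lands just above the fake kernel end (0x2000000) while the top-down search picks the highest fitting candidate (0x3ff00000), which is the placement difference the movable_node handling relies on.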
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 796820925de0..f20a57b7faf2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,6 +59,7 @@
 #include <net/sock.h>
 #include <net/ip.h>
 #include <net/tcp_memcontrol.h>
+#include "slab.h"
 
 #include <asm/uaccess.h>
 
@@ -2968,7 +2969,7 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
 
 	VM_BUG_ON(p->is_root_cache);
 	cachep = p->root_cache;
-	return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
+	return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
 }
 
 #ifdef CONFIG_SLABINFO
@@ -2997,21 +2998,14 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
 	struct res_counter *fail_res;
 	struct mem_cgroup *_memcg;
 	int ret = 0;
-	bool may_oom;
 
 	ret = res_counter_charge(&memcg->kmem, size, &fail_res);
 	if (ret)
 		return ret;
 
-	/*
-	 * Conditions under which we can wait for the oom_killer. Those are
-	 * the same conditions tested by the core page allocator
-	 */
-	may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
-
 	_memcg = memcg;
 	ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
-				      &_memcg, may_oom);
+				      &_memcg, oom_gfp_allowed(gfp));
 
 	if (ret == -EINTR) {
 		/*
@@ -3151,7 +3145,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
 {
 	struct memcg_cache_params *cur_params = s->memcg_params;
 
-	VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+	VM_BUG_ON(!is_root_cache(s));
 
 	if (num_groups > memcg_limited_groups_array_size) {
 		int i;
@@ -3412,7 +3406,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
 	idx = memcg_cache_id(memcg);
 
 	mutex_lock(&memcg_cache_mutex);
-	new_cachep = cachep->memcg_params->memcg_caches[idx];
+	new_cachep = cache_from_memcg_idx(cachep, idx);
 	if (new_cachep) {
 		css_put(&memcg->css);
 		goto out;
@@ -3458,8 +3452,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
 	 * we'll take the set_limit_mutex to protect ourselves against this.
 	 */
 	mutex_lock(&set_limit_mutex);
-	for (i = 0; i < memcg_limited_groups_array_size; i++) {
-		c = s->memcg_params->memcg_caches[i];
+	for_each_memcg_cache_index(i) {
+		c = cache_from_memcg_idx(s, i);
 		if (!c)
 			continue;
 
@@ -3592,8 +3586,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 	 * code updating memcg_caches will issue a write barrier to match this.
 	 */
 	read_barrier_depends();
-	if (likely(cachep->memcg_params->memcg_caches[idx])) {
-		cachep = cachep->memcg_params->memcg_caches[idx];
+	if (likely(cache_from_memcg_idx(cachep, idx))) {
+		cachep = cache_from_memcg_idx(cachep, idx);
 		goto out;
 	}
 
@@ -5389,45 +5383,50 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
 static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
 				struct cftype *cft, struct seq_file *m)
 {
+	struct numa_stat {
+		const char *name;
+		unsigned int lru_mask;
+	};
+
+	static const struct numa_stat stats[] = {
+		{ "total", LRU_ALL },
+		{ "file", LRU_ALL_FILE },
+		{ "anon", LRU_ALL_ANON },
+		{ "unevictable", BIT(LRU_UNEVICTABLE) },
+	};
+	const struct numa_stat *stat;
 	int nid;
-	unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
-	unsigned long node_nr;
+	unsigned long nr;
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
-	total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
-	seq_printf(m, "total=%lu", total_nr);
-	for_each_node_state(nid, N_MEMORY) {
-		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
-		seq_printf(m, " N%d=%lu", nid, node_nr);
-	}
-	seq_putc(m, '\n');
-
-	file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
-	seq_printf(m, "file=%lu", file_nr);
-	for_each_node_state(nid, N_MEMORY) {
-		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
-						       LRU_ALL_FILE);
-		seq_printf(m, " N%d=%lu", nid, node_nr);
-	}
-	seq_putc(m, '\n');
-
-	anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
-	seq_printf(m, "anon=%lu", anon_nr);
-	for_each_node_state(nid, N_MEMORY) {
-		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
-						       LRU_ALL_ANON);
-		seq_printf(m, " N%d=%lu", nid, node_nr);
-	}
-	seq_putc(m, '\n');
+	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+		nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
+		seq_printf(m, "%s=%lu", stat->name, nr);
+		for_each_node_state(nid, N_MEMORY) {
+			nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+							  stat->lru_mask);
+			seq_printf(m, " N%d=%lu", nid, nr);
+		}
+		seq_putc(m, '\n');
+	}
+
+	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+		struct mem_cgroup *iter;
+
+		nr = 0;
+		for_each_mem_cgroup_tree(iter, memcg)
+			nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
+		seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
+		for_each_node_state(nid, N_MEMORY) {
+			nr = 0;
+			for_each_mem_cgroup_tree(iter, memcg)
+				nr += mem_cgroup_node_nr_lru_pages(
+					iter, nid, stat->lru_mask);
+			seq_printf(m, " N%d=%lu", nid, nr);
+		}
+		seq_putc(m, '\n');
+	}
 
-	unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
-	seq_printf(m, "unevictable=%lu", unevictable_nr);
-	for_each_node_state(nid, N_MEMORY) {
-		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
-						       BIT(LRU_UNEVICTABLE));
-		seq_printf(m, " N%d=%lu", nid, node_nr);
-	}
-	seq_putc(m, '\n');
 	return 0;
 }
 #endif /* CONFIG_NUMA */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index bf3351b5115e..f9d78ec7831f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1423,19 +1423,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 		return 1;
 
 	/*
-	 * The lock_memory_hotplug prevents a race with memory hotplug.
-	 * This is a big hammer, a better would be nicer.
-	 */
-	lock_memory_hotplug();
-
-	/*
-	 * Isolate the page, so that it doesn't get reallocated if it
-	 * was free. This flag should be kept set until the source page
-	 * is freed and PG_hwpoison on it is set.
-	 */
-	if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE)
-		set_migratetype_isolate(p, true);
-	/*
 	 * When the target page is a free hugepage, just remove it
 	 * from free hugepage list.
 	 */
@@ -1455,7 +1442,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 		/* Not a free page */
 		ret = 1;
 	}
-	unlock_memory_hotplug();
 	return ret;
 }
 
@@ -1654,15 +1640,28 @@ int soft_offline_page(struct page *page, int flags)
 		}
 	}
 
+	/*
+	 * The lock_memory_hotplug prevents a race with memory hotplug.
+	 * This is a big hammer, a better would be nicer.
+	 */
+	lock_memory_hotplug();
+
+	/*
+	 * Isolate the page, so that it doesn't get reallocated if it
+	 * was free. This flag should be kept set until the source page
+	 * is freed and PG_hwpoison on it is set.
+	 */
+	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+		set_migratetype_isolate(page, true);
+
 	ret = get_any_page(page, pfn, flags);
-	if (ret < 0)
-		goto unset;
-	if (ret) { /* for in-use pages */
+	unlock_memory_hotplug();
+	if (ret > 0) { /* for in-use pages */
 		if (PageHuge(page))
 			ret = soft_offline_huge_page(page, flags);
 		else
 			ret = __soft_offline_page(page, flags);
-	} else { /* for free pages */
+	} else if (ret == 0) { /* for free pages */
 		if (PageHuge(page)) {
 			set_page_hwpoison_huge_page(hpage);
 			dequeue_hwpoisoned_huge_page(hpage);
@@ -1673,7 +1672,6 @@ int soft_offline_page(struct page *page, int flags)
 			atomic_long_inc(&num_poisoned_pages);
 		}
 	}
-unset:
 	unset_migratetype_isolate(page, MIGRATE_MOVABLE);
 	return ret;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 33a3dbec3cc8..bf8665849a5f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -453,8 +453,6 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 
 /*
  * This function frees user-level page tables of a process.
- *
- * Must be called with pagetable lock held.
  */
 void free_pgd_range(struct mmu_gather *tlb,
 			unsigned long addr, unsigned long end,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ed85fe3870e2..489f235502db 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -31,6 +31,7 @@
 #include <linux/firmware-map.h>
 #include <linux/stop_machine.h>
 #include <linux/hugetlb.h>
+#include <linux/memblock.h>
 
 #include <asm/tlbflush.h>
 
@@ -365,8 +366,7 @@ out_fail:
 static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
 			    unsigned long end_pfn)
 {
-	unsigned long old_pgdat_end_pfn =
-		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
 
 	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
 		pgdat->node_start_pfn = start_pfn;
@@ -402,13 +402,12 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 static int __meminit __add_section(int nid, struct zone *zone,
 					unsigned long phys_start_pfn)
 {
-	int nr_pages = PAGES_PER_SECTION;
 	int ret;
 
 	if (pfn_valid(phys_start_pfn))
 		return -EEXIST;
 
-	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
+	ret = sparse_add_one_section(zone, phys_start_pfn);
 
 	if (ret < 0)
 		return ret;
@@ -579,9 +578,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 static void shrink_pgdat_span(struct pglist_data *pgdat,
 			      unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
-	unsigned long pgdat_end_pfn =
-		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
+	unsigned long pgdat_end_pfn = p;
 	unsigned long pfn;
 	struct mem_section *ms;
 	int nid = pgdat->node_id;
@@ -935,7 +934,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	arg.nr_pages = nr_pages;
 	node_states_check_changes_online(nr_pages, zone, &arg);
 
-	nid = page_to_nid(pfn_to_page(pfn));
+	nid = pfn_to_nid(pfn);
 
 	ret = memory_notify(MEM_GOING_ONLINE, &arg);
 	ret = notifier_to_errno(ret);
@@ -1044,17 +1043,23 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 }
 
 
-/*
+/**
+ * try_online_node - online a node if offlined
+ *
  * called by cpu_up() to online a node without onlined memory.
 */
-int mem_online_node(int nid)
+int try_online_node(int nid)
 {
 	pg_data_t *pgdat;
 	int ret;
 
+	if (node_online(nid))
+		return 0;
+
 	lock_memory_hotplug();
 	pgdat = hotadd_new_pgdat(nid, 0);
 	if (!pgdat) {
+		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -1062,6 +1067,12 @@ int mem_online_node(int nid)
 	ret = register_one_node(nid);
 	BUG_ON(ret);
 
+	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
+		mutex_lock(&zonelists_mutex);
+		build_all_zonelists(NULL, NULL);
+		mutex_unlock(&zonelists_mutex);
+	}
+
 out:
 	unlock_memory_hotplug();
 	return ret;
@@ -1412,6 +1423,36 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
 }
 #endif /* CONFIG_MOVABLE_NODE */
 
+static int __init cmdline_parse_movable_node(char *p)
+{
+#ifdef CONFIG_MOVABLE_NODE
+	/*
+	 * Memory used by the kernel cannot be hot-removed because Linux
+	 * cannot migrate the kernel pages. When memory hotplug is
+	 * enabled, we should prevent memblock from allocating memory
+	 * for the kernel.
+	 *
+	 * ACPI SRAT records all hotpluggable memory ranges. But before
+	 * SRAT is parsed, we don't know about it.
+	 *
+	 * The kernel image is loaded into memory at very early time. We
+	 * cannot prevent this anyway. So on NUMA system, we set any
+	 * node the kernel resides in as un-hotpluggable.
+	 *
+	 * Since on modern servers, one node could have double-digit
+	 * gigabytes memory, we can assume the memory around the kernel
+	 * image is also un-hotpluggable. So before SRAT is parsed, just
+	 * allocate memory near the kernel image to try the best to keep
+	 * the kernel away from hotpluggable memory.
+	 */
+	memblock_set_bottom_up(true);
+#else
+	pr_warn("movable_node option not supported\n");
+#endif
+	return 0;
+}
+early_param("movable_node", cmdline_parse_movable_node);
+
 /* check which state of node_states will be changed when offline memory */
 static void node_states_check_changes_offline(unsigned long nr_pages,
 		struct zone *zone, struct memory_notify *arg)
@@ -1702,7 +1743,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
+static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 {
 	int ret = !is_memblock_offlined(mem);
 
@@ -1854,7 +1895,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
 	 * if this is not the case.
 	 */
 	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
-				is_memblock_offlined_cb);
+				check_memblock_offlined_cb);
 	if (ret) {
 		unlock_memory_hotplug();
 		BUG();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 71cb253368cb..4cc19f6ab6c6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1125,7 +1125,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	tmp = *from;
 	while (!nodes_empty(tmp)) {
 		int s,d;
-		int source = -1;
+		int source = NUMA_NO_NODE;
 		int dest = 0;
 
 		for_each_node_mask(s, tmp) {
@@ -1160,7 +1160,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 			if (!node_isset(dest, tmp))
 				break;
 		}
-		if (source == -1)
+		if (source == NUMA_NO_NODE)
 			break;
 
 		node_clear(source, tmp);
@@ -1835,7 +1835,7 @@ static unsigned offset_il_node(struct mempolicy *pol,
 	unsigned nnodes = nodes_weight(pol->v.nodes);
 	unsigned target;
 	int c;
-	int nid = -1;
+	int nid = NUMA_NO_NODE;
 
 	if (!nnodes)
 		return numa_node_id();
@@ -1872,11 +1872,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 
 /*
  * Return the bit number of a random bit set in the nodemask.
- * (returns -1 if nodemask is empty)
+ * (returns NUMA_NO_NODE if nodemask is empty)
 */
 int node_random(const nodemask_t *maskp)
 {
-	int w, bit = -1;
+	int w, bit = NUMA_NO_NODE;
 
 	w = nodes_weight(*maskp);
 	if (w)
@@ -2914,62 +2914,45 @@ out:
 * @maxlen: length of @buffer
 * @pol: pointer to mempolicy to be formatted
 *
- * Convert a mempolicy into a string.
- * Returns the number of characters in buffer (if positive)
- * or an error (negative)
+ * Convert @pol into a string. If @buffer is too short, truncate the string.
+ * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
+ * longest flag, "relative", and to display at least a few node ids.
 */
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 {
 	char *p = buffer;
-	int l;
-	nodemask_t nodes;
-	unsigned short mode;
-	unsigned short flags = pol ? pol->flags : 0;
-
-	/*
-	 * Sanity check: room for longest mode, flag and some nodes
-	 */
-	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
+	nodemask_t nodes = NODE_MASK_NONE;
+	unsigned short mode = MPOL_DEFAULT;
+	unsigned short flags = 0;
 
-	if (!pol || pol == &default_policy)
-		mode = MPOL_DEFAULT;
-	else
+	if (pol && pol != &default_policy) {
 		mode = pol->mode;
+		flags = pol->flags;
+	}
 
 	switch (mode) {
 	case MPOL_DEFAULT:
-		nodes_clear(nodes);
 		break;
-
 	case MPOL_PREFERRED:
-		nodes_clear(nodes);
 		if (flags & MPOL_F_LOCAL)
 			mode = MPOL_LOCAL;
 		else
 			node_set(pol->v.preferred_node, nodes);
 		break;
-
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes = pol->v.nodes;
 		break;
-
 	default:
-		return -EINVAL;
+		WARN_ON_ONCE(1);
+		snprintf(p, maxlen, "unknown");
+		return;
 	}
 
-	l = strlen(policy_modes[mode]);
-	if (buffer + maxlen < p + l + 1)
-		return -ENOSPC;
-
-	strcpy(p, policy_modes[mode]);
-	p += l;
+	p += snprintf(p, maxlen, policy_modes[mode]);
 
 	if (flags & MPOL_MODE_FLAGS) {
-		if (buffer + maxlen < p + 2)
-			return -ENOSPC;
-		*p++ = '=';
+		p += snprintf(p, buffer + maxlen - p, "=");
 
 		/*
 		 * Currently, the only defined flags are mutually exclusive
@@ -2981,10 +2964,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	}
 
 	if (!nodes_empty(nodes)) {
-		if (buffer + maxlen < p + 2)
-			return -ENOSPC;
-		*p++ = ':';
+		p += snprintf(p, buffer + maxlen - p, ":");
 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
 	}
-	return p - buffer;
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index ab199dfc9e26..5a6baddde15d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -179,14 +179,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		goto error;
 	}
 
-	allowed = (totalram_pages - hugetlb_total_pages())
-		* sysctl_overcommit_ratio / 100;
+	allowed = vm_commit_limit();
 	/*
 	 * Reserve some for root
 	 */
 	if (!cap_sys_admin)
 		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-	allowed += total_swap_pages;
 
 	/*
 	 * Don't let a single process grow so big a user can't recover
@@ -1856,7 +1854,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	struct vm_area_struct *vma;
 	struct vm_unmapped_area_info info;
 
-	if (len > TASK_SIZE)
+	if (len > TASK_SIZE - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -1865,14 +1863,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
+		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
 
 	info.flags = 0;
 	info.length = len;
-	info.low_limit = TASK_UNMAPPED_BASE;
+	info.low_limit = mm->mmap_base;
 	info.high_limit = TASK_SIZE;
 	info.align_mask = 0;
 	return vm_unmapped_area(&info);
@@ -1895,7 +1893,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	struct vm_unmapped_area_info info;
 
 	/* requested length too big for entire address space */
-	if (len > TASK_SIZE)
+	if (len > TASK_SIZE - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -1905,14 +1903,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
+		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
 
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
-	info.low_limit = PAGE_SIZE;
+	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
 	info.high_limit = mm->mmap_base;
 	info.align_mask = 0;
 	addr = vm_unmapped_area(&info);
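For reference, the arithmetic the removed lines in __vm_enough_memory() performed (and which the new vm_commit_limit() helper presumably centralizes, given that the separate total_swap_pages addition also goes away) works out as in this toy calculation; the machine sizes are invented and the admin reserve is left out.

#include <stdio.h>

int main(void)
{
	unsigned long long page_size = 4096;
	unsigned long long totalram_pages   = (16ULL << 30) / page_size; /* 16 GiB RAM  */
	unsigned long long hugetlb_pages    = (1ULL  << 30) / page_size; /*  1 GiB huge */
	unsigned long long total_swap_pages = (8ULL  << 30) / page_size; /*  8 GiB swap */
	unsigned long long overcommit_ratio = 50;                        /* vm.overcommit_ratio */

	/* (totalram - hugetlb) * ratio / 100 + swap, as in the lines removed above */
	unsigned long long allowed = (totalram_pages - hugetlb_pages)
				     * overcommit_ratio / 100
				     + total_swap_pages;

	printf("commit limit: %llu pages (%llu MiB)\n",
	       allowed, allowed * page_size >> 20);
	return 0;
}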
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a597f2ffcd6f..26667971c824 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -112,6 +112,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 	pmd_t *pmd;
 	unsigned long next;
 	unsigned long pages = 0;
+	unsigned long nr_huge_updates = 0;
 
 	pmd = pmd_offset(pud, addr);
 	do {
@@ -126,9 +127,10 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 						newprot, prot_numa);
 
 			if (nr_ptes) {
-				if (nr_ptes == HPAGE_PMD_NR)
-					pages++;
-
+				if (nr_ptes == HPAGE_PMD_NR) {
+					pages += HPAGE_PMD_NR;
+					nr_huge_updates++;
+				}
 				continue;
 			}
 		}
@@ -141,6 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		pages += this_pages;
 	} while (pmd++, addr = next, addr != end);
 
+	if (nr_huge_updates)
+		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
 	return pages;
 }
 
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 61107cf55bb3..2c254d374655 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -82,27 +82,18 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
 
 static void __init __free_pages_memory(unsigned long start, unsigned long end)
 {
-	unsigned long i, start_aligned, end_aligned;
-	int order = ilog2(BITS_PER_LONG);
-
-	start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
-	end_aligned = end & ~(BITS_PER_LONG - 1);
-
-	if (end_aligned <= start_aligned) {
-		for (i = start; i < end; i++)
-			__free_pages_bootmem(pfn_to_page(i), 0);
-
-		return;
-	}
-
-	for (i = start; i < start_aligned; i++)
-		__free_pages_bootmem(pfn_to_page(i), 0);
-
-	for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
-		__free_pages_bootmem(pfn_to_page(i), order);
-
-	for (i = end_aligned; i < end; i++)
-		__free_pages_bootmem(pfn_to_page(i), 0);
+	int order;
+
+	while (start < end) {
+		order = min(MAX_ORDER - 1UL, __ffs(start));
+
+		while (start + (1UL << order) > end)
+			order--;
+
+		__free_pages_bootmem(pfn_to_page(start), order);
+
+		start += (1UL << order);
+	}
 }
 
 static unsigned long __init __free_memory_core(phys_addr_t start,
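The order selection in the rewritten __free_pages_memory() loop above can be tried out in userspace; in this sketch __ffs() is approximated with __builtin_ctzl() and MAX_ORDER is assumed to be 11, the common default, so treat it as an illustration of the arithmetic rather than kernel code.

#include <stdio.h>

#define MAX_ORDER 11UL

static unsigned long pick_order(unsigned long start, unsigned long end)
{
	/* largest power-of-two block that start (non-zero) is aligned to ... */
	unsigned long order = MAX_ORDER - 1;
	unsigned long align = (unsigned long)__builtin_ctzl(start);

	if (align < order)
		order = align;

	/* ... shrunk until the block no longer overshoots end */
	while (start + (1UL << order) > end)
		order--;

	return order;
}

int main(void)
{
	unsigned long start = 0x12345, end = 0x20000;

	while (start < end) {
		unsigned long order = pick_order(start, end);

		printf("free pfn %#lx, order %lu (%lu pages)\n",
		       start, order, 1UL << order);
		start += 1UL << order;
	}
	return 0;
}

The trace it prints shows the freed blocks growing from single pages up to order-10 chunks as the start pfn becomes better aligned, which is the point of dropping the old BITS_PER_LONG-sized batching.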
diff --git a/mm/nommu.c b/mm/nommu.c
index 9e6cb02cba64..fec093adad9c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1948,13 +1948,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		goto error;
 	}
 
-	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+	allowed = vm_commit_limit();
 	/*
 	 * Reserve some 3% for root
 	 */
 	if (!cap_sys_admin)
 		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-	allowed += total_swap_pages;
 
 	/*
 	 * Don't let a single process grow so big a user can't recover
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 73d812f16dde..580a5f075ed0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -234,8 +234,8 @@ int page_group_by_mobility_disabled __read_mostly;
234 234
235void set_pageblock_migratetype(struct page *page, int migratetype) 235void set_pageblock_migratetype(struct page *page, int migratetype)
236{ 236{
237 237 if (unlikely(page_group_by_mobility_disabled &&
238 if (unlikely(page_group_by_mobility_disabled)) 238 migratetype < MIGRATE_PCPTYPES))
239 migratetype = MIGRATE_UNMOVABLE; 239 migratetype = MIGRATE_UNMOVABLE;
240 240
241 set_pageblock_flags_group(page, (unsigned long)migratetype, 241 set_pageblock_flags_group(page, (unsigned long)migratetype,
@@ -1027,6 +1027,10 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1027{ 1027{
1028 int current_order = page_order(page); 1028 int current_order = page_order(page);
1029 1029
1030 /*
1031 * When borrowing from MIGRATE_CMA, we need to release the excess
1032 * buddy pages to CMA itself.
1033 */
1030 if (is_migrate_cma(fallback_type)) 1034 if (is_migrate_cma(fallback_type))
1031 return fallback_type; 1035 return fallback_type;
1032 1036
@@ -1091,21 +1095,11 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1091 list_del(&page->lru); 1095 list_del(&page->lru);
1092 rmv_page_order(page); 1096 rmv_page_order(page);
1093 1097
1094 /*
1095 * Borrow the excess buddy pages as well, irrespective
1096 * of whether we stole freepages, or took ownership of
1097 * the pageblock or not.
1098 *
1099 * Exception: When borrowing from MIGRATE_CMA, release
1100 * the excess buddy pages to CMA itself.
1101 */
1102 expand(zone, page, order, current_order, area, 1098 expand(zone, page, order, current_order, area,
1103 is_migrate_cma(migratetype) 1099 new_type);
1104 ? migratetype : start_migratetype);
1105 1100
1106 trace_mm_page_alloc_extfrag(page, order, 1101 trace_mm_page_alloc_extfrag(page, order, current_order,
1107 current_order, start_migratetype, migratetype, 1102 start_migratetype, migratetype, new_type);
1108 new_type == start_migratetype);
1109 1103
1110 return page; 1104 return page;
1111 } 1105 }
@@ -1711,7 +1705,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1711 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1705 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1712 * that have to skip over a lot of full or unallowed zones. 1706 * that have to skip over a lot of full or unallowed zones.
1713 * 1707 *
1714 * If the zonelist cache is present in the passed in zonelist, then 1708 * If the zonelist cache is present in the passed zonelist, then
1715 * returns a pointer to the allowed node mask (either the current 1709 * returns a pointer to the allowed node mask (either the current
1716 * tasks mems_allowed, or node_states[N_MEMORY].) 1710 * tasks mems_allowed, or node_states[N_MEMORY].)
1717 * 1711 *
@@ -2593,7 +2587,7 @@ rebalance:
2593 * running out of options and have to consider going OOM 2587 * running out of options and have to consider going OOM
2594 */ 2588 */
2595 if (!did_some_progress) { 2589 if (!did_some_progress) {
2596 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2590 if (oom_gfp_allowed(gfp_mask)) {
2597 if (oom_killer_disabled) 2591 if (oom_killer_disabled)
2598 goto nopage; 2592 goto nopage;
2599 /* Coredumps can quickly deplete all memory reserves */ 2593 /* Coredumps can quickly deplete all memory reserves */
@@ -3881,8 +3875,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
3881 return ffz(~size); 3875 return ffz(~size);
3882} 3876}
3883 3877
3884#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3885
3886/* 3878/*
3887 * Check if a pageblock contains reserved pages 3879 * Check if a pageblock contains reserved pages
3888 */ 3880 */
@@ -4266,7 +4258,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
4266 */ 4258 */
4267 zone->pageset = &boot_pageset; 4259 zone->pageset = &boot_pageset;
4268 4260
4269 if (zone->present_pages) 4261 if (populated_zone(zone))
4270 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4262 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4271 zone->name, zone->present_pages, 4263 zone->name, zone->present_pages,
4272 zone_batchsize(zone)); 4264 zone_batchsize(zone));
@@ -5160,7 +5152,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
5160 5152
5161 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5153 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5162 struct zone *zone = &pgdat->node_zones[zone_type]; 5154 struct zone *zone = &pgdat->node_zones[zone_type];
5163 if (zone->present_pages) { 5155 if (populated_zone(zone)) {
5164 node_set_state(nid, N_HIGH_MEMORY); 5156 node_set_state(nid, N_HIGH_MEMORY);
5165 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5157 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5166 zone_type <= ZONE_NORMAL) 5158 zone_type <= ZONE_NORMAL)
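The page_alloc.c hunk in the slow path folds the open-coded "(gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)" test into the named helper oom_gfp_allowed(). Below is a minimal userspace sketch of that predicate, not kernel code; the flag values are stand-in assumptions chosen only for illustration.

/* Models the predicate factored out as oom_gfp_allowed() above.
 * Flag values are illustrative stand-ins, not the kernel's. */
#include <stdbool.h>
#include <stdio.h>

#define GFP_FS      (1u << 0)   /* assumption: stand-in for __GFP_FS */
#define GFP_NORETRY (1u << 1)   /* assumption: stand-in for __GFP_NORETRY */

static bool oom_gfp_allowed(unsigned int gfp_mask)
{
        /* OOM killing is only worthwhile for allocations that may do
         * filesystem work and have not asked to fail fast. */
        return (gfp_mask & GFP_FS) && !(gfp_mask & GFP_NORETRY);
}

int main(void)
{
        printf("%d\n", oom_gfp_allowed(GFP_FS));                /* 1 */
        printf("%d\n", oom_gfp_allowed(GFP_FS | GFP_NORETRY));  /* 0 */
        return 0;
}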
diff --git a/mm/readahead.c b/mm/readahead.c
index e4ed04149785..7cdbb44aa90b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -401,6 +401,7 @@ ondemand_readahead(struct address_space *mapping,
401 unsigned long req_size) 401 unsigned long req_size)
402{ 402{
403 unsigned long max = max_sane_readahead(ra->ra_pages); 403 unsigned long max = max_sane_readahead(ra->ra_pages);
404 pgoff_t prev_offset;
404 405
405 /* 406 /*
406 * start of file 407 * start of file
@@ -452,8 +453,11 @@ ondemand_readahead(struct address_space *mapping,
452 453
453 /* 454 /*
454 * sequential cache miss 455 * sequential cache miss
456 * trivial case: (offset - prev_offset) == 1
457 * unaligned reads: (offset - prev_offset) == 0
455 */ 458 */
456 if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) 459 prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
460 if (offset - prev_offset <= 1UL)
457 goto initial_readahead; 461 goto initial_readahead;
458 462
459 /* 463 /*
@@ -569,7 +573,7 @@ static ssize_t
569do_readahead(struct address_space *mapping, struct file *filp, 573do_readahead(struct address_space *mapping, struct file *filp,
570 pgoff_t index, unsigned long nr) 574 pgoff_t index, unsigned long nr)
571{ 575{
572 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 576 if (!mapping || !mapping->a_ops)
573 return -EINVAL; 577 return -EINVAL;
574 578
575 force_page_cache_readahead(mapping, filp, index, nr); 579 force_page_cache_readahead(mapping, filp, index, nr);
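The ondemand_readahead() hunk computes prev_offset once and treats an access as a sequential cache miss when the page offset advances by 0 or 1 pages. The following userspace model illustrates that heuristic under an assumed 4 KiB page size; it is a sketch of the test, not the readahead code itself.

/* Sequential-miss test from the hunk above, modeled in userspace. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12  /* assumption: 4 KiB pages */

static bool is_sequential(unsigned long offset, unsigned long long prev_pos)
{
        unsigned long prev_offset = prev_pos >> PAGE_SHIFT;

        /* (offset - prev_offset) == 1: trivial sequential read
         * (offset - prev_offset) == 0: unaligned reads within one page
         * The subtraction is unsigned, so backward jumps wrap to a large
         * value and correctly fail the test. */
        return offset - prev_offset <= 1UL;
}

int main(void)
{
        printf("%d\n", is_sequential(11, 10ULL << PAGE_SHIFT)); /* 1 */
        printf("%d\n", is_sequential(10, 10ULL << PAGE_SHIFT)); /* 1 */
        printf("%d\n", is_sequential(50, 10ULL << PAGE_SHIFT)); /* 0 */
        return 0;
}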
diff --git a/mm/slab.c b/mm/slab.c
index 2580db062df9..0c8967bb2018 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3982,7 +3982,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3982 3982
3983 VM_BUG_ON(!mutex_is_locked(&slab_mutex)); 3983 VM_BUG_ON(!mutex_is_locked(&slab_mutex));
3984 for_each_memcg_cache_index(i) { 3984 for_each_memcg_cache_index(i) {
3985 c = cache_from_memcg(cachep, i); 3985 c = cache_from_memcg_idx(cachep, i);
3986 if (c) 3986 if (c)
3987 /* return value determined by the parent cache only */ 3987 /* return value determined by the parent cache only */
3988 __do_tune_cpucache(c, limit, batchcount, shared, gfp); 3988 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
diff --git a/mm/slab.h b/mm/slab.h
index a535033f7e9a..0859c4241ba1 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -160,7 +160,8 @@ static inline const char *cache_name(struct kmem_cache *s)
160 return s->name; 160 return s->name;
161} 161}
162 162
163static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) 163static inline struct kmem_cache *
164cache_from_memcg_idx(struct kmem_cache *s, int idx)
164{ 165{
165 if (!s->memcg_params) 166 if (!s->memcg_params)
166 return NULL; 167 return NULL;
@@ -204,7 +205,8 @@ static inline const char *cache_name(struct kmem_cache *s)
204 return s->name; 205 return s->name;
205} 206}
206 207
207static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) 208static inline struct kmem_cache *
209cache_from_memcg_idx(struct kmem_cache *s, int idx)
208{ 210{
209 return NULL; 211 return NULL;
210} 212}
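The rename above (cache_from_memcg to cache_from_memcg_idx) only changes the accessor's name, to make clear that it looks a child cache up by memcg index. The sketch below models that contract in userspace; the struct layout and table size are assumptions, not the slab internals.

/* Userspace model of the renamed accessor's contract. */
#include <stddef.h>
#include <stdio.h>

struct kmem_cache;

struct memcg_cache_params {
        struct kmem_cache *memcg_caches[8];     /* assumption: fixed-size table */
};

struct kmem_cache {
        const char *name;
        struct memcg_cache_params *memcg_params;
};

static struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
        if (!s->memcg_params)                   /* no per-memcg children at all */
                return NULL;
        return s->memcg_params->memcg_caches[idx];
}

int main(void)
{
        struct kmem_cache plain = { "plain", NULL };
        struct memcg_cache_params params = { { &plain } };
        struct kmem_cache parent = { "parent", &params };

        printf("%s\n", cache_from_memcg_idx(&parent, 0)->name);   /* plain */
        printf("%d\n", cache_from_memcg_idx(&plain, 0) == NULL);  /* 1 */
        return 0;
}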
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e2e98af703ea..0b7bb399b0e4 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -571,7 +571,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
571 return; 571 return;
572 572
573 for_each_memcg_cache_index(i) { 573 for_each_memcg_cache_index(i) {
574 c = cache_from_memcg(s, i); 574 c = cache_from_memcg_idx(s, i);
575 if (!c) 575 if (!c)
576 continue; 576 continue;
577 577
diff --git a/mm/slub.c b/mm/slub.c
index c3eb3d3ca835..92737a0b787b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4983,7 +4983,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
4983 * through the descendants with best-effort propagation. 4983 * through the descendants with best-effort propagation.
4984 */ 4984 */
4985 for_each_memcg_cache_index(i) { 4985 for_each_memcg_cache_index(i) {
4986 struct kmem_cache *c = cache_from_memcg(s, i); 4986 struct kmem_cache *c = cache_from_memcg_idx(s, i);
4987 if (c) 4987 if (c)
4988 attribute->store(c, buf, len); 4988 attribute->store(c, buf, len);
4989 } 4989 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 4ac1d7ef548f..8cc7be0e9590 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -590,33 +590,32 @@ void __init sparse_init(void)
590 590
591#ifdef CONFIG_MEMORY_HOTPLUG 591#ifdef CONFIG_MEMORY_HOTPLUG
592#ifdef CONFIG_SPARSEMEM_VMEMMAP 592#ifdef CONFIG_SPARSEMEM_VMEMMAP
593static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, 593static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
594 unsigned long nr_pages)
595{ 594{
596 /* This will make the necessary allocations eventually. */ 595 /* This will make the necessary allocations eventually. */
597 return sparse_mem_map_populate(pnum, nid); 596 return sparse_mem_map_populate(pnum, nid);
598} 597}
599static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 598static void __kfree_section_memmap(struct page *memmap)
600{ 599{
601 unsigned long start = (unsigned long)memmap; 600 unsigned long start = (unsigned long)memmap;
602 unsigned long end = (unsigned long)(memmap + nr_pages); 601 unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
603 602
604 vmemmap_free(start, end); 603 vmemmap_free(start, end);
605} 604}
606#ifdef CONFIG_MEMORY_HOTREMOVE 605#ifdef CONFIG_MEMORY_HOTREMOVE
607static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 606static void free_map_bootmem(struct page *memmap)
608{ 607{
609 unsigned long start = (unsigned long)memmap; 608 unsigned long start = (unsigned long)memmap;
610 unsigned long end = (unsigned long)(memmap + nr_pages); 609 unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
611 610
612 vmemmap_free(start, end); 611 vmemmap_free(start, end);
613} 612}
614#endif /* CONFIG_MEMORY_HOTREMOVE */ 613#endif /* CONFIG_MEMORY_HOTREMOVE */
615#else 614#else
616static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 615static struct page *__kmalloc_section_memmap(void)
617{ 616{
618 struct page *page, *ret; 617 struct page *page, *ret;
619 unsigned long memmap_size = sizeof(struct page) * nr_pages; 618 unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
620 619
621 page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); 620 page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
622 if (page) 621 if (page)
@@ -634,28 +633,30 @@ got_map_ptr:
634 return ret; 633 return ret;
635} 634}
636 635
637static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, 636static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
638 unsigned long nr_pages)
639{ 637{
640 return __kmalloc_section_memmap(nr_pages); 638 return __kmalloc_section_memmap();
641} 639}
642 640
643static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 641static void __kfree_section_memmap(struct page *memmap)
644{ 642{
645 if (is_vmalloc_addr(memmap)) 643 if (is_vmalloc_addr(memmap))
646 vfree(memmap); 644 vfree(memmap);
647 else 645 else
648 free_pages((unsigned long)memmap, 646 free_pages((unsigned long)memmap,
649 get_order(sizeof(struct page) * nr_pages)); 647 get_order(sizeof(struct page) * PAGES_PER_SECTION));
650} 648}
651 649
652#ifdef CONFIG_MEMORY_HOTREMOVE 650#ifdef CONFIG_MEMORY_HOTREMOVE
653static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 651static void free_map_bootmem(struct page *memmap)
654{ 652{
655 unsigned long maps_section_nr, removing_section_nr, i; 653 unsigned long maps_section_nr, removing_section_nr, i;
656 unsigned long magic; 654 unsigned long magic, nr_pages;
657 struct page *page = virt_to_page(memmap); 655 struct page *page = virt_to_page(memmap);
658 656
657 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
658 >> PAGE_SHIFT;
659
659 for (i = 0; i < nr_pages; i++, page++) { 660 for (i = 0; i < nr_pages; i++, page++) {
660 magic = (unsigned long) page->lru.next; 661 magic = (unsigned long) page->lru.next;
661 662
@@ -684,8 +685,7 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
684 * set. If this is <=0, then that means that the passed-in 685 * set. If this is <=0, then that means that the passed-in
685 * map was not consumed and must be freed. 686 * map was not consumed and must be freed.
686 */ 687 */
687int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, 688int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
688 int nr_pages)
689{ 689{
690 unsigned long section_nr = pfn_to_section_nr(start_pfn); 690 unsigned long section_nr = pfn_to_section_nr(start_pfn);
691 struct pglist_data *pgdat = zone->zone_pgdat; 691 struct pglist_data *pgdat = zone->zone_pgdat;
@@ -702,12 +702,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
702 ret = sparse_index_init(section_nr, pgdat->node_id); 702 ret = sparse_index_init(section_nr, pgdat->node_id);
703 if (ret < 0 && ret != -EEXIST) 703 if (ret < 0 && ret != -EEXIST)
704 return ret; 704 return ret;
705 memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); 705 memmap = kmalloc_section_memmap(section_nr, pgdat->node_id);
706 if (!memmap) 706 if (!memmap)
707 return -ENOMEM; 707 return -ENOMEM;
708 usemap = __kmalloc_section_usemap(); 708 usemap = __kmalloc_section_usemap();
709 if (!usemap) { 709 if (!usemap) {
710 __kfree_section_memmap(memmap, nr_pages); 710 __kfree_section_memmap(memmap);
711 return -ENOMEM; 711 return -ENOMEM;
712 } 712 }
713 713
@@ -719,7 +719,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
719 goto out; 719 goto out;
720 } 720 }
721 721
722 memset(memmap, 0, sizeof(struct page) * nr_pages); 722 memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
723 723
724 ms->section_mem_map |= SECTION_MARKED_PRESENT; 724 ms->section_mem_map |= SECTION_MARKED_PRESENT;
725 725
@@ -729,7 +729,7 @@ out:
729 pgdat_resize_unlock(pgdat, &flags); 729 pgdat_resize_unlock(pgdat, &flags);
730 if (ret <= 0) { 730 if (ret <= 0) {
731 kfree(usemap); 731 kfree(usemap);
732 __kfree_section_memmap(memmap, nr_pages); 732 __kfree_section_memmap(memmap);
733 } 733 }
734 return ret; 734 return ret;
735} 735}
@@ -759,7 +759,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
759static void free_section_usemap(struct page *memmap, unsigned long *usemap) 759static void free_section_usemap(struct page *memmap, unsigned long *usemap)
760{ 760{
761 struct page *usemap_page; 761 struct page *usemap_page;
762 unsigned long nr_pages;
763 762
764 if (!usemap) 763 if (!usemap)
765 return; 764 return;
@@ -771,7 +770,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
771 if (PageSlab(usemap_page) || PageCompound(usemap_page)) { 770 if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
772 kfree(usemap); 771 kfree(usemap);
773 if (memmap) 772 if (memmap)
774 __kfree_section_memmap(memmap, PAGES_PER_SECTION); 773 __kfree_section_memmap(memmap);
775 return; 774 return;
776 } 775 }
777 776
@@ -780,12 +779,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
780 * on the section which has pgdat at boot time. Just keep it as is now. 779 * on the section which has pgdat at boot time. Just keep it as is now.
781 */ 780 */
782 781
783 if (memmap) { 782 if (memmap)
784 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) 783 free_map_bootmem(memmap);
785 >> PAGE_SHIFT;
786
787 free_map_bootmem(memmap, nr_pages);
788 }
789} 784}
790 785
791void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 786void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
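The sparse.c changes drop the nr_pages parameters because a memory section always spans PAGES_PER_SECTION pages, so the memmap size can be derived from that constant alone. Below is a userspace sketch of that sizing; the section size, page size, and struct page stand-in are assumptions (the real values are architecture dependent).

/* Derive the per-section memmap size from PAGES_PER_SECTION only. */
#include <stdio.h>

#define SECTION_SIZE_BITS 27                     /* assumption: 128 MiB sections */
#define PAGE_SHIFT        12                     /* assumption: 4 KiB pages */
#define PAGES_PER_SECTION (1UL << (SECTION_SIZE_BITS - PAGE_SHIFT))

struct page_stub { unsigned long flags; void *lru[2]; };  /* stand-in for struct page */

int main(void)
{
        unsigned long memmap_size = sizeof(struct page_stub) * PAGES_PER_SECTION;

        printf("pages per section: %lu\n", PAGES_PER_SECTION);
        printf("memmap bytes per section (stub struct page): %lu\n", memmap_size);
        return 0;
}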
diff --git a/mm/swapfile.c b/mm/swapfile.c
index de7c904e52e5..612a7c9795f6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -707,7 +707,7 @@ noswap:
707 return (swp_entry_t) {0}; 707 return (swp_entry_t) {0};
708} 708}
709 709
710/* The only caller of this function is now susupend routine */ 710/* The only caller of this function is now suspend routine */
711swp_entry_t get_swap_page_of_type(int type) 711swp_entry_t get_swap_page_of_type(int type)
712{ 712{
713 struct swap_info_struct *si; 713 struct swap_info_struct *si;
@@ -845,7 +845,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
845} 845}
846 846
847/* 847/*
848 * Caller has made sure that the swapdevice corresponding to entry 848 * Caller has made sure that the swap device corresponding to entry
849 * is still around or has not been recycled. 849 * is still around or has not been recycled.
850 */ 850 */
851void swap_free(swp_entry_t entry) 851void swap_free(swp_entry_t entry)
@@ -947,7 +947,7 @@ int try_to_free_swap(struct page *page)
947 * original page might be freed under memory pressure, then 947 * original page might be freed under memory pressure, then
948 * later read back in from swap, now with the wrong data. 948 * later read back in from swap, now with the wrong data.
949 * 949 *
950 * Hibration suspends storage while it is writing the image 950 * Hibernation suspends storage while it is writing the image
951 * to disk so check that here. 951 * to disk so check that here.
952 */ 952 */
953 if (pm_suspended_storage()) 953 if (pm_suspended_storage())
@@ -1179,7 +1179,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1179 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse 1179 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
1180 * of unmatched parts which look like swp_pte, so unuse_pte must 1180 * of unmatched parts which look like swp_pte, so unuse_pte must
1181 * recheck under pte lock. Scanning without pte lock lets it be 1181 * recheck under pte lock. Scanning without pte lock lets it be
1182 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. 1182 * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
1183 */ 1183 */
1184 pte = pte_offset_map(pmd, addr); 1184 pte = pte_offset_map(pmd, addr);
1185 do { 1185 do {
@@ -1924,17 +1924,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1924 p->cluster_info = NULL; 1924 p->cluster_info = NULL;
1925 p->flags = 0; 1925 p->flags = 0;
1926 frontswap_map = frontswap_map_get(p); 1926 frontswap_map = frontswap_map_get(p);
1927 frontswap_map_set(p, NULL);
1928 spin_unlock(&p->lock); 1927 spin_unlock(&p->lock);
1929 spin_unlock(&swap_lock); 1928 spin_unlock(&swap_lock);
1930 frontswap_invalidate_area(type); 1929 frontswap_invalidate_area(type);
1930 frontswap_map_set(p, NULL);
1931 mutex_unlock(&swapon_mutex); 1931 mutex_unlock(&swapon_mutex);
1932 free_percpu(p->percpu_cluster); 1932 free_percpu(p->percpu_cluster);
1933 p->percpu_cluster = NULL; 1933 p->percpu_cluster = NULL;
1934 vfree(swap_map); 1934 vfree(swap_map);
1935 vfree(cluster_info); 1935 vfree(cluster_info);
1936 vfree(frontswap_map); 1936 vfree(frontswap_map);
1937 /* Destroy swap account informatin */ 1937 /* Destroy swap account information */
1938 swap_cgroup_swapoff(type); 1938 swap_cgroup_swapoff(type);
1939 1939
1940 inode = mapping->host; 1940 inode = mapping->host;
@@ -2786,8 +2786,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2786 2786
2787 /* 2787 /*
2788 * We are fortunate that although vmalloc_to_page uses pte_offset_map, 2788 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
2789 * no architecture is using highmem pages for kernel pagetables: so it 2789 * no architecture is using highmem pages for kernel page tables: so it
2790 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. 2790 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
2791 */ 2791 */
2792 head = vmalloc_to_page(si->swap_map + offset); 2792 head = vmalloc_to_page(si->swap_map + offset);
2793 offset &= ~PAGE_MASK; 2793 offset &= ~PAGE_MASK;
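Besides the comment typo fixes, the swapoff hunk moves frontswap_map_set(p, NULL) to after frontswap_invalidate_area(type), so the invalidation path still sees the map it is clearing. The sketch below is a simplified userspace model of that teardown ordering under assumed names; it is not the frontswap code.

/* Teardown order: invalidate while the map is still visible, then detach. */
#include <stdio.h>
#include <stdlib.h>

static unsigned long *frontswap_map;    /* assumption: one global map */

static void invalidate_area(void)
{
        /* needs the map to walk and clear per-page state */
        if (frontswap_map)
                printf("invalidating using map %p\n", (void *)frontswap_map);
}

int main(void)
{
        frontswap_map = calloc(1024, sizeof(*frontswap_map));

        invalidate_area();              /* 1: invalidate while the map is visible */
        unsigned long *map = frontswap_map;
        frontswap_map = NULL;           /* 2: only then detach the pointer */
        free(map);                      /* 3: finally free the backing memory */
        return 0;
}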
diff --git a/mm/util.c b/mm/util.c
index eaf63fc2c92f..f7bc2096071c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,6 +7,9 @@
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/swap.h> 8#include <linux/swap.h>
9#include <linux/swapops.h> 9#include <linux/swapops.h>
10#include <linux/mman.h>
11#include <linux/hugetlb.h>
12
10#include <asm/uaccess.h> 13#include <asm/uaccess.h>
11 14
12#include "internal.h" 15#include "internal.h"
@@ -398,6 +401,16 @@ struct address_space *page_mapping(struct page *page)
398 return mapping; 401 return mapping;
399} 402}
400 403
404/*
405 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
406 */
407unsigned long vm_commit_limit(void)
408{
409 return ((totalram_pages - hugetlb_total_pages())
410 * sysctl_overcommit_ratio / 100) + total_swap_pages;
411}
412
413
401/* Tracepoints definitions. */ 414/* Tracepoints definitions. */
402EXPORT_TRACEPOINT_SYMBOL(kmalloc); 415EXPORT_TRACEPOINT_SYMBOL(kmalloc);
403EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 416EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
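vm_commit_limit(), added above, centralizes the OVERCOMMIT_NEVER limit: (RAM pages minus hugetlb pages) scaled by sysctl_overcommit_ratio, plus swap pages. A worked userspace example of that arithmetic follows; the page counts and ratio are made-up numbers for illustration only.

/* Worked example of the commit-limit formula introduced above. */
#include <stdio.h>

static unsigned long vm_commit_limit(unsigned long totalram_pages,
                                     unsigned long hugetlb_pages,
                                     unsigned long overcommit_ratio,
                                     unsigned long total_swap_pages)
{
        return (totalram_pages - hugetlb_pages) * overcommit_ratio / 100
                + total_swap_pages;
}

int main(void)
{
        /* 4 GiB RAM, 512 MiB in hugetlb, ratio 50, 2 GiB swap (4 KiB pages) */
        unsigned long limit = vm_commit_limit(1048576, 131072, 50, 524288);

        printf("commit limit: %lu pages (~%lu MiB)\n", limit, limit * 4 / 1024);
        return 0;
}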
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 107454312d5e..0fdf96803c5b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -359,6 +359,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
359 if (unlikely(!va)) 359 if (unlikely(!va))
360 return ERR_PTR(-ENOMEM); 360 return ERR_PTR(-ENOMEM);
361 361
362 /*
363 * Only scan the relevant parts containing pointers to other objects
364 * to avoid false negatives.
365 */
366 kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
367
362retry: 368retry:
363 spin_lock(&vmap_area_lock); 369 spin_lock(&vmap_area_lock);
364 /* 370 /*
@@ -1546,7 +1552,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1546 gfp_t gfp_mask, pgprot_t prot, 1552 gfp_t gfp_mask, pgprot_t prot,
1547 int node, const void *caller); 1553 int node, const void *caller);
1548static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1554static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1549 pgprot_t prot, int node, const void *caller) 1555 pgprot_t prot, int node)
1550{ 1556{
1551 const int order = 0; 1557 const int order = 0;
1552 struct page **pages; 1558 struct page **pages;
@@ -1560,13 +1566,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1560 /* Please note that the recursion is strictly bounded. */ 1566 /* Please note that the recursion is strictly bounded. */
1561 if (array_size > PAGE_SIZE) { 1567 if (array_size > PAGE_SIZE) {
1562 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, 1568 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
1563 PAGE_KERNEL, node, caller); 1569 PAGE_KERNEL, node, area->caller);
1564 area->flags |= VM_VPAGES; 1570 area->flags |= VM_VPAGES;
1565 } else { 1571 } else {
1566 pages = kmalloc_node(array_size, nested_gfp, node); 1572 pages = kmalloc_node(array_size, nested_gfp, node);
1567 } 1573 }
1568 area->pages = pages; 1574 area->pages = pages;
1569 area->caller = caller;
1570 if (!area->pages) { 1575 if (!area->pages) {
1571 remove_vm_area(area->addr); 1576 remove_vm_area(area->addr);
1572 kfree(area); 1577 kfree(area);
@@ -1577,7 +1582,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1577 struct page *page; 1582 struct page *page;
1578 gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; 1583 gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
1579 1584
1580 if (node < 0) 1585 if (node == NUMA_NO_NODE)
1581 page = alloc_page(tmp_mask); 1586 page = alloc_page(tmp_mask);
1582 else 1587 else
1583 page = alloc_pages_node(node, tmp_mask, order); 1588 page = alloc_pages_node(node, tmp_mask, order);
@@ -1634,9 +1639,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1634 if (!area) 1639 if (!area)
1635 goto fail; 1640 goto fail;
1636 1641
1637 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1642 addr = __vmalloc_area_node(area, gfp_mask, prot, node);
1638 if (!addr) 1643 if (!addr)
1639 goto fail; 1644 return NULL;
1640 1645
1641 /* 1646 /*
1642 * In this function, newly allocated vm_struct has VM_UNINITIALIZED 1647 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -1646,11 +1651,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1646 clear_vm_uninitialized_flag(area); 1651 clear_vm_uninitialized_flag(area);
1647 1652
1648 /* 1653 /*
1649 * A ref_count = 3 is needed because the vm_struct and vmap_area 1654 * A ref_count = 2 is needed because vm_struct allocated in
1650 * structures allocated in the __get_vm_area_node() function contain 1655 * __get_vm_area_node() contains a reference to the virtual address of
1651 * references to the virtual address of the vmalloc'ed block. 1656 * the vmalloc'ed block.
1652 */ 1657 */
1653 kmemleak_alloc(addr, real_size, 3, gfp_mask); 1658 kmemleak_alloc(addr, real_size, 2, gfp_mask);
1654 1659
1655 return addr; 1660 return addr;
1656 1661
@@ -2563,6 +2568,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2563 if (!counters) 2568 if (!counters)
2564 return; 2569 return;
2565 2570
2571 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2572 smp_rmb();
2573 if (v->flags & VM_UNINITIALIZED)
2574 return;
2575
2566 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2576 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2567 2577
2568 for (nr = 0; nr < v->nr_pages; nr++) 2578 for (nr = 0; nr < v->nr_pages; nr++)
@@ -2579,23 +2589,15 @@ static int s_show(struct seq_file *m, void *p)
2579 struct vmap_area *va = p; 2589 struct vmap_area *va = p;
2580 struct vm_struct *v; 2590 struct vm_struct *v;
2581 2591
2582 if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) 2592 /*
 2593 * s_show() can race with remove_vm_area(); !VM_VM_AREA means the
 2594 * vmap area is being torn down or backs a vm_map_ram allocation.
2595 */
2596 if (!(va->flags & VM_VM_AREA))
2583 return 0; 2597 return 0;
2584 2598
2585 if (!(va->flags & VM_VM_AREA)) {
2586 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
2587 (void *)va->va_start, (void *)va->va_end,
2588 va->va_end - va->va_start);
2589 return 0;
2590 }
2591
2592 v = va->vm; 2599 v = va->vm;
2593 2600
2594 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2595 smp_rmb();
2596 if (v->flags & VM_UNINITIALIZED)
2597 return 0;
2598
2599 seq_printf(m, "0x%pK-0x%pK %7ld", 2601 seq_printf(m, "0x%pK-0x%pK %7ld",
2600 v->addr, v->addr + v->size, v->size); 2602 v->addr, v->addr + v->size, v->size);
2601 2603
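One part of the vmalloc.c diff moves the VM_UNINITIALIZED check (and the smp_rmb() that pairs with the writer's smp_wmb()) into show_numa_info(), so readers skip areas that are still being set up. The sketch below models that publish/observe pattern with C11 atomics in userspace; it is an analogy for the barrier pairing, not the kernel's implementation.

/* Publish a structure, then let readers observe it only once complete. */
#include <stdatomic.h>
#include <stdio.h>

struct vm_area_model {
        unsigned long size;             /* payload filled in before publish */
        atomic_int uninitialized;       /* stands in for VM_UNINITIALIZED */
};

static void publish(struct vm_area_model *v, unsigned long size)
{
        v->size = size;
        /* release ordering plays the role of smp_wmb() before the clear */
        atomic_store_explicit(&v->uninitialized, 0, memory_order_release);
}

static void show(struct vm_area_model *v)
{
        /* acquire ordering plays the role of smp_rmb() after the load */
        if (atomic_load_explicit(&v->uninitialized, memory_order_acquire)) {
                printf("still initializing, skip\n");
                return;
        }
        printf("size=%lu\n", v->size);
}

int main(void)
{
        struct vm_area_model v;

        atomic_init(&v.uninitialized, 1);       /* starts unpublished */
        v.size = 0;

        show(&v);                /* skipped: not yet published */
        publish(&v, 4096);
        show(&v);                /* prints size=4096 */
        return 0;
}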
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9bb314577911..72496140ac08 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -812,6 +812,7 @@ const char * const vmstat_text[] = {
812 812
813#ifdef CONFIG_NUMA_BALANCING 813#ifdef CONFIG_NUMA_BALANCING
814 "numa_pte_updates", 814 "numa_pte_updates",
815 "numa_huge_pte_updates",
815 "numa_hint_faults", 816 "numa_hint_faults",
816 "numa_hint_faults_local", 817 "numa_hint_faults_local",
817 "numa_pages_migrated", 818 "numa_pages_migrated",
@@ -1229,6 +1230,20 @@ static void start_cpu_timer(int cpu)
1229 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); 1230 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
1230} 1231}
1231 1232
1233static void vmstat_cpu_dead(int node)
1234{
1235 int cpu;
1236
1237 get_online_cpus();
1238 for_each_online_cpu(cpu)
1239 if (cpu_to_node(cpu) == node)
1240 goto end;
1241
1242 node_clear_state(node, N_CPU);
1243end:
1244 put_online_cpus();
1245}
1246
1232/* 1247/*
1233 * Use the cpu notifier to insure that the thresholds are recalculated 1248 * Use the cpu notifier to insure that the thresholds are recalculated
1234 * when necessary. 1249 * when necessary.
@@ -1258,6 +1273,7 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
1258 case CPU_DEAD: 1273 case CPU_DEAD:
1259 case CPU_DEAD_FROZEN: 1274 case CPU_DEAD_FROZEN:
1260 refresh_zone_stat_thresholds(); 1275 refresh_zone_stat_thresholds();
1276 vmstat_cpu_dead(cpu_to_node(cpu));
1261 break; 1277 break;
1262 default: 1278 default:
1263 break; 1279 break;
@@ -1276,8 +1292,12 @@ static int __init setup_vmstat(void)
1276 1292
1277 register_cpu_notifier(&vmstat_notifier); 1293 register_cpu_notifier(&vmstat_notifier);
1278 1294
1279 for_each_online_cpu(cpu) 1295 get_online_cpus();
1296 for_each_online_cpu(cpu) {
1280 start_cpu_timer(cpu); 1297 start_cpu_timer(cpu);
1298 node_set_state(cpu_to_node(cpu), N_CPU);
1299 }
1300 put_online_cpus();
1281#endif 1301#endif
1282#ifdef CONFIG_PROC_FS 1302#ifdef CONFIG_PROC_FS
1283 proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); 1303 proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
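The vmstat.c hunks keep the N_CPU node state in sync with CPU hotplug: setup_vmstat() marks each online CPU's node, and vmstat_cpu_dead() clears the state only when a node has lost its last online CPU. Here is a small userspace model of that bookkeeping; the CPU count and topology are assumptions for illustration.

/* Track whether a node still has any online CPU. */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS  8
#define NR_NODES 2

static bool cpu_online[NR_CPUS];
static bool node_has_cpu[NR_NODES];                 /* stands in for N_CPU */
static int  cpu_to_node_map[NR_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };

static void cpu_up(int cpu)
{
        cpu_online[cpu] = true;
        node_has_cpu[cpu_to_node_map[cpu]] = true;
}

static void cpu_dead(int cpu)
{
        int node = cpu_to_node_map[cpu];

        cpu_online[cpu] = false;
        /* like vmstat_cpu_dead(): clear N_CPU only if no online CPU remains */
        for (int i = 0; i < NR_CPUS; i++)
                if (cpu_online[i] && cpu_to_node_map[i] == node)
                        return;
        node_has_cpu[node] = false;
}

int main(void)
{
        cpu_up(4);
        cpu_up(5);
        cpu_dead(4);
        printf("node 1 has cpu: %d\n", node_has_cpu[1]);  /* 1 */
        cpu_dead(5);
        printf("node 1 has cpu: %d\n", node_has_cpu[1]);  /* 0 */
        return 0;
}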
diff --git a/mm/zswap.c b/mm/zswap.c
index d93510c6aa2d..5a63f78a5601 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -217,6 +217,7 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
217 if (!entry) 217 if (!entry)
218 return NULL; 218 return NULL;
219 entry->refcount = 1; 219 entry->refcount = 1;
220 RB_CLEAR_NODE(&entry->rbnode);
220 return entry; 221 return entry;
221} 222}
222 223
@@ -225,19 +226,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
225 kmem_cache_free(zswap_entry_cache, entry); 226 kmem_cache_free(zswap_entry_cache, entry);
226} 227}
227 228
228/* caller must hold the tree lock */
229static void zswap_entry_get(struct zswap_entry *entry)
230{
231 entry->refcount++;
232}
233
234/* caller must hold the tree lock */
235static int zswap_entry_put(struct zswap_entry *entry)
236{
237 entry->refcount--;
238 return entry->refcount;
239}
240
241/********************************* 229/*********************************
242* rbtree functions 230* rbtree functions
243**********************************/ 231**********************************/
@@ -285,6 +273,61 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
285 return 0; 273 return 0;
286} 274}
287 275
276static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
277{
278 if (!RB_EMPTY_NODE(&entry->rbnode)) {
279 rb_erase(&entry->rbnode, root);
280 RB_CLEAR_NODE(&entry->rbnode);
281 }
282}
283
284/*
285 * Carries out the common pattern of freeing and entry's zsmalloc allocation,
286 * freeing the entry itself, and decrementing the number of stored pages.
287 */
288static void zswap_free_entry(struct zswap_tree *tree,
289 struct zswap_entry *entry)
290{
291 zbud_free(tree->pool, entry->handle);
292 zswap_entry_cache_free(entry);
293 atomic_dec(&zswap_stored_pages);
294 zswap_pool_pages = zbud_get_pool_size(tree->pool);
295}
296
297/* caller must hold the tree lock */
298static void zswap_entry_get(struct zswap_entry *entry)
299{
300 entry->refcount++;
301}
302
303/* caller must hold the tree lock
304 * remove from the tree and free it, if nobody references the entry
305 */
306static void zswap_entry_put(struct zswap_tree *tree,
307 struct zswap_entry *entry)
308{
309 int refcount = --entry->refcount;
310
311 BUG_ON(refcount < 0);
312 if (refcount == 0) {
313 zswap_rb_erase(&tree->rbroot, entry);
314 zswap_free_entry(tree, entry);
315 }
316}
317
318/* caller must hold the tree lock */
319static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
320 pgoff_t offset)
321{
322 struct zswap_entry *entry = NULL;
323
324 entry = zswap_rb_search(root, offset);
325 if (entry)
326 zswap_entry_get(entry);
327
328 return entry;
329}
330
288/********************************* 331/*********************************
289* per-cpu code 332* per-cpu code
290**********************************/ 333**********************************/
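The helpers added above give zswap entries a simple lifetime rule: the rbtree owns the initial reference, lookups via zswap_entry_find_get() take an extra one, and the final zswap_entry_put() both unlinks and frees the entry. The following is a minimal userspace model of that pattern (no rbtree or locking), not the zswap code.

/* Refcounted entry whose final put unlinks and frees it. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
        int refcount;
        int on_tree;            /* stands in for membership in the rbtree */
};

static struct entry *entry_create(void)
{
        struct entry *e = calloc(1, sizeof(*e));

        e->refcount = 1;        /* initial reference owned by the tree */
        e->on_tree = 1;
        return e;
}

static void entry_get(struct entry *e)
{
        e->refcount++;
}

static void entry_put(struct entry *e)
{
        int refcount = --e->refcount;

        assert(refcount >= 0);
        if (refcount == 0) {    /* last reference: unlink and free */
                e->on_tree = 0;
                free(e);
        }
}

int main(void)
{
        struct entry *e = entry_create();

        entry_get(e);           /* lookup takes a reference */
        entry_put(e);           /* lookup done */
        entry_put(e);           /* invalidate drops the tree's reference, frees */
        printf("done\n");
        return 0;
}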
@@ -368,18 +411,6 @@ static bool zswap_is_full(void)
368 zswap_pool_pages); 411 zswap_pool_pages);
369} 412}
370 413
371/*
372 * Carries out the common pattern of freeing and entry's zsmalloc allocation,
373 * freeing the entry itself, and decrementing the number of stored pages.
374 */
375static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
376{
377 zbud_free(tree->pool, entry->handle);
378 zswap_entry_cache_free(entry);
379 atomic_dec(&zswap_stored_pages);
380 zswap_pool_pages = zbud_get_pool_size(tree->pool);
381}
382
383/********************************* 414/*********************************
384* writeback code 415* writeback code
385**********************************/ 416**********************************/
@@ -387,7 +418,7 @@ static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
387enum zswap_get_swap_ret { 418enum zswap_get_swap_ret {
388 ZSWAP_SWAPCACHE_NEW, 419 ZSWAP_SWAPCACHE_NEW,
389 ZSWAP_SWAPCACHE_EXIST, 420 ZSWAP_SWAPCACHE_EXIST,
390 ZSWAP_SWAPCACHE_NOMEM 421 ZSWAP_SWAPCACHE_FAIL,
391}; 422};
392 423
393/* 424/*
@@ -401,9 +432,10 @@ enum zswap_get_swap_ret {
401 * added to the swap cache, and returned in retpage. 432 * added to the swap cache, and returned in retpage.
402 * 433 *
403 * If success, the swap cache page is returned in retpage 434 * If success, the swap cache page is returned in retpage
404 * Returns 0 if page was already in the swap cache, page is not locked 435 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
405 * Returns 1 if the new page needs to be populated, page is locked 436 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
406 * Returns <0 on error 437 * the new page is added to swapcache and locked
438 * Returns ZSWAP_SWAPCACHE_FAIL on error
407 */ 439 */
408static int zswap_get_swap_cache_page(swp_entry_t entry, 440static int zswap_get_swap_cache_page(swp_entry_t entry,
409 struct page **retpage) 441 struct page **retpage)
@@ -475,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
475 if (new_page) 507 if (new_page)
476 page_cache_release(new_page); 508 page_cache_release(new_page);
477 if (!found_page) 509 if (!found_page)
478 return ZSWAP_SWAPCACHE_NOMEM; 510 return ZSWAP_SWAPCACHE_FAIL;
479 *retpage = found_page; 511 *retpage = found_page;
480 return ZSWAP_SWAPCACHE_EXIST; 512 return ZSWAP_SWAPCACHE_EXIST;
481} 513}
@@ -502,7 +534,7 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
502 struct page *page; 534 struct page *page;
503 u8 *src, *dst; 535 u8 *src, *dst;
504 unsigned int dlen; 536 unsigned int dlen;
505 int ret, refcount; 537 int ret;
506 struct writeback_control wbc = { 538 struct writeback_control wbc = {
507 .sync_mode = WB_SYNC_NONE, 539 .sync_mode = WB_SYNC_NONE,
508 }; 540 };
@@ -517,23 +549,22 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
517 549
518 /* find and ref zswap entry */ 550 /* find and ref zswap entry */
519 spin_lock(&tree->lock); 551 spin_lock(&tree->lock);
520 entry = zswap_rb_search(&tree->rbroot, offset); 552 entry = zswap_entry_find_get(&tree->rbroot, offset);
521 if (!entry) { 553 if (!entry) {
522 /* entry was invalidated */ 554 /* entry was invalidated */
523 spin_unlock(&tree->lock); 555 spin_unlock(&tree->lock);
524 return 0; 556 return 0;
525 } 557 }
526 zswap_entry_get(entry);
527 spin_unlock(&tree->lock); 558 spin_unlock(&tree->lock);
528 BUG_ON(offset != entry->offset); 559 BUG_ON(offset != entry->offset);
529 560
530 /* try to allocate swap cache page */ 561 /* try to allocate swap cache page */
531 switch (zswap_get_swap_cache_page(swpentry, &page)) { 562 switch (zswap_get_swap_cache_page(swpentry, &page)) {
532 case ZSWAP_SWAPCACHE_NOMEM: /* no memory */ 563 case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
533 ret = -ENOMEM; 564 ret = -ENOMEM;
534 goto fail; 565 goto fail;
535 566
536 case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */ 567 case ZSWAP_SWAPCACHE_EXIST:
537 /* page is already in the swap cache, ignore for now */ 568 /* page is already in the swap cache, ignore for now */
538 page_cache_release(page); 569 page_cache_release(page);
539 ret = -EEXIST; 570 ret = -EEXIST;
@@ -556,43 +587,44 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
556 SetPageUptodate(page); 587 SetPageUptodate(page);
557 } 588 }
558 589
590 /* move it to the tail of the inactive list after end_writeback */
591 SetPageReclaim(page);
592
559 /* start writeback */ 593 /* start writeback */
560 __swap_writepage(page, &wbc, end_swap_bio_write); 594 __swap_writepage(page, &wbc, end_swap_bio_write);
561 page_cache_release(page); 595 page_cache_release(page);
562 zswap_written_back_pages++; 596 zswap_written_back_pages++;
563 597
564 spin_lock(&tree->lock); 598 spin_lock(&tree->lock);
565
566 /* drop local reference */ 599 /* drop local reference */
567 zswap_entry_put(entry); 600 zswap_entry_put(tree, entry);
568 /* drop the initial reference from entry creation */
569 refcount = zswap_entry_put(entry);
570 601
571 /* 602 /*
572 * There are three possible values for refcount here: 603 * There are two possible situations for entry here:
573 * (1) refcount is 1, load is in progress, unlink from rbtree, 604 * (1) refcount is 1(normal case), entry is valid and on the tree
574 * load will free 605 * (2) refcount is 0, entry is freed and not on the tree
575 * (2) refcount is 0, (normal case) entry is valid, 606 * because invalidate happened during writeback
576 * remove from rbtree and free entry 607 * search the tree and free the entry if it is found
577 * (3) refcount is -1, invalidate happened during writeback, 608 */
578 * free entry 609 if (entry == zswap_rb_search(&tree->rbroot, offset))
579 */ 610 zswap_entry_put(tree, entry);
580 if (refcount >= 0) {
581 /* no invalidate yet, remove from rbtree */
582 rb_erase(&entry->rbnode, &tree->rbroot);
583 }
584 spin_unlock(&tree->lock); 611 spin_unlock(&tree->lock);
585 if (refcount <= 0) {
586 /* free the entry */
587 zswap_free_entry(tree, entry);
588 return 0;
589 }
590 return -EAGAIN;
591 612
613 goto end;
614
615 /*
616 * if we get here due to ZSWAP_SWAPCACHE_EXIST
617 * a load may be happening concurrently;
618 * it is safe and okay to not free the entry,
619 * and even if we free the entry in the following put
620 * it is still okay to return !0
621 */
592fail: 622fail:
593 spin_lock(&tree->lock); 623 spin_lock(&tree->lock);
594 zswap_entry_put(entry); 624 zswap_entry_put(tree, entry);
595 spin_unlock(&tree->lock); 625 spin_unlock(&tree->lock);
626
627end:
596 return ret; 628 return ret;
597} 629}
598 630
@@ -676,11 +708,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
676 if (ret == -EEXIST) { 708 if (ret == -EEXIST) {
677 zswap_duplicate_entry++; 709 zswap_duplicate_entry++;
678 /* remove from rbtree */ 710 /* remove from rbtree */
679 rb_erase(&dupentry->rbnode, &tree->rbroot); 711 zswap_rb_erase(&tree->rbroot, dupentry);
680 if (!zswap_entry_put(dupentry)) { 712 zswap_entry_put(tree, dupentry);
681 /* free */
682 zswap_free_entry(tree, dupentry);
683 }
684 } 713 }
685 } while (ret == -EEXIST); 714 } while (ret == -EEXIST);
686 spin_unlock(&tree->lock); 715 spin_unlock(&tree->lock);
@@ -709,17 +738,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
709 struct zswap_entry *entry; 738 struct zswap_entry *entry;
710 u8 *src, *dst; 739 u8 *src, *dst;
711 unsigned int dlen; 740 unsigned int dlen;
712 int refcount, ret; 741 int ret;
713 742
714 /* find */ 743 /* find */
715 spin_lock(&tree->lock); 744 spin_lock(&tree->lock);
716 entry = zswap_rb_search(&tree->rbroot, offset); 745 entry = zswap_entry_find_get(&tree->rbroot, offset);
717 if (!entry) { 746 if (!entry) {
718 /* entry was written back */ 747 /* entry was written back */
719 spin_unlock(&tree->lock); 748 spin_unlock(&tree->lock);
720 return -1; 749 return -1;
721 } 750 }
722 zswap_entry_get(entry);
723 spin_unlock(&tree->lock); 751 spin_unlock(&tree->lock);
724 752
725 /* decompress */ 753 /* decompress */
@@ -734,22 +762,9 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
734 BUG_ON(ret); 762 BUG_ON(ret);
735 763
736 spin_lock(&tree->lock); 764 spin_lock(&tree->lock);
737 refcount = zswap_entry_put(entry); 765 zswap_entry_put(tree, entry);
738 if (likely(refcount)) {
739 spin_unlock(&tree->lock);
740 return 0;
741 }
742 spin_unlock(&tree->lock); 766 spin_unlock(&tree->lock);
743 767
744 /*
745 * We don't have to unlink from the rbtree because
746 * zswap_writeback_entry() or zswap_frontswap_invalidate page()
747 * has already done this for us if we are the last reference.
748 */
749 /* free */
750
751 zswap_free_entry(tree, entry);
752
753 return 0; 768 return 0;
754} 769}
755 770
@@ -758,7 +773,6 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
758{ 773{
759 struct zswap_tree *tree = zswap_trees[type]; 774 struct zswap_tree *tree = zswap_trees[type];
760 struct zswap_entry *entry; 775 struct zswap_entry *entry;
761 int refcount;
762 776
763 /* find */ 777 /* find */
764 spin_lock(&tree->lock); 778 spin_lock(&tree->lock);
@@ -770,20 +784,12 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
770 } 784 }
771 785
772 /* remove from rbtree */ 786 /* remove from rbtree */
773 rb_erase(&entry->rbnode, &tree->rbroot); 787 zswap_rb_erase(&tree->rbroot, entry);
774 788
775 /* drop the initial reference from entry creation */ 789 /* drop the initial reference from entry creation */
776 refcount = zswap_entry_put(entry); 790 zswap_entry_put(tree, entry);
777 791
778 spin_unlock(&tree->lock); 792 spin_unlock(&tree->lock);
779
780 if (refcount) {
781 /* writeback in progress, writeback will free */
782 return;
783 }
784
785 /* free */
786 zswap_free_entry(tree, entry);
787} 793}
788 794
789/* frees all zswap entries for the given swap type */ 795/* frees all zswap entries for the given swap type */
@@ -797,11 +803,8 @@ static void zswap_frontswap_invalidate_area(unsigned type)
797 803
798 /* walk the tree and free everything */ 804 /* walk the tree and free everything */
799 spin_lock(&tree->lock); 805 spin_lock(&tree->lock);
800 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) { 806 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
801 zbud_free(tree->pool, entry->handle); 807 zswap_free_entry(tree, entry);
802 zswap_entry_cache_free(entry);
803 atomic_dec(&zswap_stored_pages);
804 }
805 tree->rbroot = RB_ROOT; 808 tree->rbroot = RB_ROOT;
806 spin_unlock(&tree->lock); 809 spin_unlock(&tree->lock);
807 810
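With zswap_free_entry() doing the full cleanup, zswap_frontswap_invalidate_area() above can simply walk the tree postorder and free every entry; postorder guarantees children are visited before their parent, so each node can be freed as soon as it is reached. The sketch below shows the same property on a plain binary tree in userspace, not the kernel rbtree helper.

/* Postorder teardown: free both children before freeing the parent. */
#include <stdio.h>
#include <stdlib.h>

struct node {
        long offset;
        struct node *left, *right;
};

static void free_postorder(struct node *n)
{
        if (!n)
                return;
        free_postorder(n->left);
        free_postorder(n->right);
        printf("freeing entry at offset %ld\n", n->offset);
        free(n);                /* safe: both children are already gone */
}

static struct node *make(long offset, struct node *l, struct node *r)
{
        struct node *n = malloc(sizeof(*n));

        n->offset = offset;
        n->left = l;
        n->right = r;
        return n;
}

int main(void)
{
        struct node *root = make(2, make(1, NULL, NULL), make(3, NULL, NULL));

        free_postorder(root);   /* frees 1, 3, then 2 */
        return 0;
}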