Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig.debug         1
-rw-r--r--  mm/Makefile              2
-rw-r--r--  mm/cma.c                 2
-rw-r--r--  mm/debug.c               5
-rw-r--r--  mm/filemap.c           221
-rw-r--r--  mm/hmm.c                 3
-rw-r--r--  mm/huge_memory.c        78
-rw-r--r--  mm/hugetlb.c            16
-rw-r--r--  mm/kasan/kasan.c         2
-rw-r--r--  mm/khugepaged.c          2
-rw-r--r--  mm/kmemcheck.c         125
-rw-r--r--  mm/kmemleak.c           11
-rw-r--r--  mm/ksm.c                15
-rw-r--r--  mm/list_lru.c            1
-rw-r--r--  mm/memblock.c           68
-rw-r--r--  mm/memcontrol.c          2
-rw-r--r--  mm/memory-failure.c      2
-rw-r--r--  mm/memory.c             90
-rw-r--r--  mm/memory_hotplug.c     50
-rw-r--r--  mm/mempolicy.c          16
-rw-r--r--  mm/mempool.c             2
-rw-r--r--  mm/migrate.c            15
-rw-r--r--  mm/mlock.c               9
-rw-r--r--  mm/mmu_notifier.c       11
-rw-r--r--  mm/oom_kill.c           60
-rw-r--r--  mm/page-writeback.c     47
-rw-r--r--  mm/page_alloc.c        465
-rw-r--r--  mm/page_ext.c            4
-rw-r--r--  mm/page_io.c             6
-rw-r--r--  mm/page_isolation.c     10
-rw-r--r--  mm/page_owner.c          4
-rw-r--r--  mm/percpu-vm.c           2
-rw-r--r--  mm/rmap.c               65
-rw-r--r--  mm/shmem.c              17
-rw-r--r--  mm/slab.c               45
-rw-r--r--  mm/slab.h               41
-rw-r--r--  mm/slab_common.c        59
-rw-r--r--  mm/slob.c                4
-rw-r--r--  mm/slub.c               67
-rw-r--r--  mm/sparse-vmemmap.c     34
-rw-r--r--  mm/sparse.c              6
-rw-r--r--  mm/swap.c               35
-rw-r--r--  mm/swap_slots.c         11
-rw-r--r--  mm/swap_state.c         11
-rw-r--r--  mm/swapfile.c           21
-rw-r--r--  mm/truncate.c          149
-rw-r--r--  mm/vmscan.c              8
-rw-r--r--  mm/vmstat.c             77
-rw-r--r--  mm/workingset.c         10
-rw-r--r--  mm/zsmalloc.c            2
50 files changed, 1196 insertions, 813 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 5b0adf1435de..e5e606ee5f71 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -11,7 +11,6 @@ config DEBUG_PAGEALLOC
11 bool "Debug page memory allocations" 11 bool "Debug page memory allocations"
12 depends on DEBUG_KERNEL 12 depends on DEBUG_KERNEL
13 depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC 13 depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
14 depends on !KMEMCHECK
15 select PAGE_EXTENSION 14 select PAGE_EXTENSION
16 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC 15 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
17 ---help--- 16 ---help---
diff --git a/mm/Makefile b/mm/Makefile
index 4659b93cba43..e7ebd176fb93 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,6 @@ KCOV_INSTRUMENT_slub.o := n
17KCOV_INSTRUMENT_page_alloc.o := n 17KCOV_INSTRUMENT_page_alloc.o := n
18KCOV_INSTRUMENT_debug-pagealloc.o := n 18KCOV_INSTRUMENT_debug-pagealloc.o := n
19KCOV_INSTRUMENT_kmemleak.o := n 19KCOV_INSTRUMENT_kmemleak.o := n
20KCOV_INSTRUMENT_kmemcheck.o := n
21KCOV_INSTRUMENT_memcontrol.o := n 20KCOV_INSTRUMENT_memcontrol.o := n
22KCOV_INSTRUMENT_mmzone.o := n 21KCOV_INSTRUMENT_mmzone.o := n
23KCOV_INSTRUMENT_vmstat.o := n 22KCOV_INSTRUMENT_vmstat.o := n
@@ -70,7 +69,6 @@ obj-$(CONFIG_KSM) += ksm.o
70obj-$(CONFIG_PAGE_POISONING) += page_poison.o 69obj-$(CONFIG_PAGE_POISONING) += page_poison.o
71obj-$(CONFIG_SLAB) += slab.o 70obj-$(CONFIG_SLAB) += slab.o
72obj-$(CONFIG_SLUB) += slub.o 71obj-$(CONFIG_SLUB) += slub.o
73obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
74obj-$(CONFIG_KASAN) += kasan/ 72obj-$(CONFIG_KASAN) += kasan/
75obj-$(CONFIG_FAILSLAB) += failslab.o 73obj-$(CONFIG_FAILSLAB) += failslab.o
76obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 74obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
diff --git a/mm/cma.c b/mm/cma.c
index 022e52bd8370..0607729abf3b 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -461,7 +461,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
461 trace_cma_alloc(pfn, page, count, align); 461 trace_cma_alloc(pfn, page, count, align);
462 462
463 if (ret && !(gfp_mask & __GFP_NOWARN)) { 463 if (ret && !(gfp_mask & __GFP_NOWARN)) {
464 pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n", 464 pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
465 __func__, count, ret); 465 __func__, count, ret);
466 cma_debug_show_areas(cma); 466 cma_debug_show_areas(cma);
467 } 467 }
diff --git a/mm/debug.c b/mm/debug.c
index 6726bec731c9..d947f3e03b0d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -105,7 +105,7 @@ void dump_mm(const struct mm_struct *mm)
105 "get_unmapped_area %p\n" 105 "get_unmapped_area %p\n"
106#endif 106#endif
107 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" 107 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
108 "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" 108 "pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
109 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" 109 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
110 "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" 110 "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
111 "start_code %lx end_code %lx start_data %lx end_data %lx\n" 111 "start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -135,8 +135,7 @@ void dump_mm(const struct mm_struct *mm)
135 mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, 135 mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
136 mm->pgd, atomic_read(&mm->mm_users), 136 mm->pgd, atomic_read(&mm->mm_users),
137 atomic_read(&mm->mm_count), 137 atomic_read(&mm->mm_count),
138 atomic_long_read((atomic_long_t *)&mm->nr_ptes), 138 mm_pgtables_bytes(mm),
139 mm_nr_pmds((struct mm_struct *)mm),
140 mm->map_count, 139 mm->map_count,
141 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, 140 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
142 mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, 141 mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
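
dump_mm() now reports a single pgtables_bytes figure instead of separate nr_ptes/nr_pmds counts, and the hunks further down switch the per-level atomic_long_inc/dec calls over to mm_inc_nr_ptes(), mm_dec_nr_ptes(), mm_inc_nr_puds() and mm_dec_nr_puds(). The accessors themselves are added to include/linux/mm.h and are not part of this mm/ diff; a minimal sketch of their assumed shape, folding every page-table level into one byte-based atomic_long_t counter on mm_struct, looks like this:

        /*
         * Sketch only - the real helpers live in include/linux/mm.h and may
         * differ in detail.  Assumes mm_struct gains an atomic_long_t
         * pgtables_bytes field.
         */
        static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
        {
                return atomic_long_read(&mm->pgtables_bytes);
        }

        static inline void mm_inc_nr_ptes(struct mm_struct *mm)
        {
                atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
        }

        static inline void mm_dec_nr_ptes(struct mm_struct *mm)
        {
                atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
        }

        static inline void mm_inc_nr_puds(struct mm_struct *mm)
        {
                atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
        }

        static inline void mm_dec_nr_puds(struct mm_struct *mm)
        {
                atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
        }

A byte-based counter is what lets dump_mm() print one %lu value regardless of how many page-table levels the architecture actually has.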
diff --git a/mm/filemap.c b/mm/filemap.c
index 594d73fef8b4..923fc2ebd74a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -35,6 +35,7 @@
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/memcontrol.h> 36#include <linux/memcontrol.h>
37#include <linux/cleancache.h> 37#include <linux/cleancache.h>
38#include <linux/shmem_fs.h>
38#include <linux/rmap.h> 39#include <linux/rmap.h>
39#include "internal.h" 40#include "internal.h"
40 41
@@ -134,7 +135,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
134 *shadowp = p; 135 *shadowp = p;
135 } 136 }
136 __radix_tree_replace(&mapping->page_tree, node, slot, page, 137 __radix_tree_replace(&mapping->page_tree, node, slot, page,
137 workingset_update_node, mapping); 138 workingset_lookup_update(mapping));
138 mapping->nrpages++; 139 mapping->nrpages++;
139 return 0; 140 return 0;
140} 141}
@@ -162,9 +163,12 @@ static void page_cache_tree_delete(struct address_space *mapping,
162 163
163 radix_tree_clear_tags(&mapping->page_tree, node, slot); 164 radix_tree_clear_tags(&mapping->page_tree, node, slot);
164 __radix_tree_replace(&mapping->page_tree, node, slot, shadow, 165 __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
165 workingset_update_node, mapping); 166 workingset_lookup_update(mapping));
166 } 167 }
167 168
169 page->mapping = NULL;
170 /* Leave page->index set: truncation lookup relies upon it */
171
168 if (shadow) { 172 if (shadow) {
169 mapping->nrexceptional += nr; 173 mapping->nrexceptional += nr;
170 /* 174 /*
@@ -178,17 +182,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
178 mapping->nrpages -= nr; 182 mapping->nrpages -= nr;
179} 183}
180 184
181/* 185static void unaccount_page_cache_page(struct address_space *mapping,
182 * Delete a page from the page cache and free it. Caller has to make 186 struct page *page)
183 * sure the page is locked and that nobody else uses it - or that usage
184 * is safe. The caller must hold the mapping's tree_lock.
185 */
186void __delete_from_page_cache(struct page *page, void *shadow)
187{ 187{
188 struct address_space *mapping = page->mapping; 188 int nr;
189 int nr = hpage_nr_pages(page);
190 189
191 trace_mm_filemap_delete_from_page_cache(page);
192 /* 190 /*
193 * if we're uptodate, flush out into the cleancache, otherwise 191 * if we're uptodate, flush out into the cleancache, otherwise
194 * invalidate any existing cleancache entries. We can't leave 192 * invalidate any existing cleancache entries. We can't leave
@@ -224,15 +222,12 @@ void __delete_from_page_cache(struct page *page, void *shadow)
224 } 222 }
225 } 223 }
226 224
227 page_cache_tree_delete(mapping, page, shadow);
228
229 page->mapping = NULL;
230 /* Leave page->index set: truncation lookup relies upon it */
231
232 /* hugetlb pages do not participate in page cache accounting. */ 225 /* hugetlb pages do not participate in page cache accounting. */
233 if (PageHuge(page)) 226 if (PageHuge(page))
234 return; 227 return;
235 228
229 nr = hpage_nr_pages(page);
230
236 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); 231 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
237 if (PageSwapBacked(page)) { 232 if (PageSwapBacked(page)) {
238 __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); 233 __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
@@ -243,17 +238,51 @@ void __delete_from_page_cache(struct page *page, void *shadow)
243 } 238 }
244 239
245 /* 240 /*
246 * At this point page must be either written or cleaned by truncate. 241 * At this point page must be either written or cleaned by
247 * Dirty page here signals a bug and loss of unwritten data. 242 * truncate. Dirty page here signals a bug and loss of
243 * unwritten data.
248 * 244 *
249 * This fixes dirty accounting after removing the page entirely but 245 * This fixes dirty accounting after removing the page entirely
250 * leaves PageDirty set: it has no effect for truncated page and 246 * but leaves PageDirty set: it has no effect for truncated
251 * anyway will be cleared before returning page into buddy allocator. 247 * page and anyway will be cleared before returning page into
248 * buddy allocator.
252 */ 249 */
253 if (WARN_ON_ONCE(PageDirty(page))) 250 if (WARN_ON_ONCE(PageDirty(page)))
254 account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); 251 account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
255} 252}
256 253
254/*
255 * Delete a page from the page cache and free it. Caller has to make
256 * sure the page is locked and that nobody else uses it - or that usage
257 * is safe. The caller must hold the mapping's tree_lock.
258 */
259void __delete_from_page_cache(struct page *page, void *shadow)
260{
261 struct address_space *mapping = page->mapping;
262
263 trace_mm_filemap_delete_from_page_cache(page);
264
265 unaccount_page_cache_page(mapping, page);
266 page_cache_tree_delete(mapping, page, shadow);
267}
268
269static void page_cache_free_page(struct address_space *mapping,
270 struct page *page)
271{
272 void (*freepage)(struct page *);
273
274 freepage = mapping->a_ops->freepage;
275 if (freepage)
276 freepage(page);
277
278 if (PageTransHuge(page) && !PageHuge(page)) {
279 page_ref_sub(page, HPAGE_PMD_NR);
280 VM_BUG_ON_PAGE(page_count(page) <= 0, page);
281 } else {
282 put_page(page);
283 }
284}
285
257/** 286/**
258 * delete_from_page_cache - delete page from page cache 287 * delete_from_page_cache - delete page from page cache
259 * @page: the page which the kernel is trying to remove from page cache 288 * @page: the page which the kernel is trying to remove from page cache
@@ -266,27 +295,98 @@ void delete_from_page_cache(struct page *page)
266{ 295{
267 struct address_space *mapping = page_mapping(page); 296 struct address_space *mapping = page_mapping(page);
268 unsigned long flags; 297 unsigned long flags;
269 void (*freepage)(struct page *);
270 298
271 BUG_ON(!PageLocked(page)); 299 BUG_ON(!PageLocked(page));
272
273 freepage = mapping->a_ops->freepage;
274
275 spin_lock_irqsave(&mapping->tree_lock, flags); 300 spin_lock_irqsave(&mapping->tree_lock, flags);
276 __delete_from_page_cache(page, NULL); 301 __delete_from_page_cache(page, NULL);
277 spin_unlock_irqrestore(&mapping->tree_lock, flags); 302 spin_unlock_irqrestore(&mapping->tree_lock, flags);
278 303
279 if (freepage) 304 page_cache_free_page(mapping, page);
280 freepage(page); 305}
306EXPORT_SYMBOL(delete_from_page_cache);
281 307
282 if (PageTransHuge(page) && !PageHuge(page)) { 308/*
283 page_ref_sub(page, HPAGE_PMD_NR); 309 * page_cache_tree_delete_batch - delete several pages from page cache
284 VM_BUG_ON_PAGE(page_count(page) <= 0, page); 310 * @mapping: the mapping to which pages belong
285 } else { 311 * @pvec: pagevec with pages to delete
286 put_page(page); 312 *
313 * The function walks over mapping->page_tree and removes pages passed in @pvec
314 * from the radix tree. The function expects @pvec to be sorted by page index.
315 * It tolerates holes in @pvec (radix tree entries at those indices are not
316 * modified). The function expects only THP head pages to be present in the
317 * @pvec and takes care to delete all corresponding tail pages from the radix
318 * tree as well.
319 *
320 * The function expects mapping->tree_lock to be held.
321 */
322static void
323page_cache_tree_delete_batch(struct address_space *mapping,
324 struct pagevec *pvec)
325{
326 struct radix_tree_iter iter;
327 void **slot;
328 int total_pages = 0;
329 int i = 0, tail_pages = 0;
330 struct page *page;
331 pgoff_t start;
332
333 start = pvec->pages[0]->index;
334 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
335 if (i >= pagevec_count(pvec) && !tail_pages)
336 break;
337 page = radix_tree_deref_slot_protected(slot,
338 &mapping->tree_lock);
339 if (radix_tree_exceptional_entry(page))
340 continue;
341 if (!tail_pages) {
342 /*
343 * Some page got inserted in our range? Skip it. We
344 * have our pages locked so they are protected from
345 * being removed.
346 */
347 if (page != pvec->pages[i])
348 continue;
349 WARN_ON_ONCE(!PageLocked(page));
350 if (PageTransHuge(page) && !PageHuge(page))
351 tail_pages = HPAGE_PMD_NR - 1;
352 page->mapping = NULL;
353 /*
354 * Leave page->index set: truncation lookup relies
355 * upon it
356 */
357 i++;
358 } else {
359 tail_pages--;
360 }
361 radix_tree_clear_tags(&mapping->page_tree, iter.node, slot);
362 __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL,
363 workingset_lookup_update(mapping));
364 total_pages++;
287 } 365 }
366 mapping->nrpages -= total_pages;
367}
368
369void delete_from_page_cache_batch(struct address_space *mapping,
370 struct pagevec *pvec)
371{
372 int i;
373 unsigned long flags;
374
375 if (!pagevec_count(pvec))
376 return;
377
378 spin_lock_irqsave(&mapping->tree_lock, flags);
379 for (i = 0; i < pagevec_count(pvec); i++) {
380 trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
381
382 unaccount_page_cache_page(mapping, pvec->pages[i]);
383 }
384 page_cache_tree_delete_batch(mapping, pvec);
385 spin_unlock_irqrestore(&mapping->tree_lock, flags);
386
387 for (i = 0; i < pagevec_count(pvec); i++)
388 page_cache_free_page(mapping, pvec->pages[i]);
288} 389}
289EXPORT_SYMBOL(delete_from_page_cache);
290 390
291int filemap_check_errors(struct address_space *mapping) 391int filemap_check_errors(struct address_space *mapping)
292{ 392{
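
delete_from_page_cache_batch() gives truncate-style callers a way to drop many pages of one mapping while taking tree_lock only once; page_cache_tree_delete_batch() then clears all the corresponding radix-tree slots, including THP tail entries, in a single walk. A hypothetical caller that already holds locks on a sorted run of head pages could feed them in through a pagevec as below (the unlock/cleanup done by real callers such as truncate is omitted):

        /* Hypothetical helper, not part of this series. */
        static void drop_mapping_pages(struct address_space *mapping,
                                       struct page **pages, int nr)
        {
                struct pagevec pvec;
                int i;

                pagevec_init(&pvec);
                for (i = 0; i < nr; i++) {
                        /* pages must be locked, sorted by index, THP heads only */
                        VM_BUG_ON_PAGE(!PageLocked(pages[i]), pages[i]);
                        if (!pagevec_add(&pvec, pages[i]) || i == nr - 1) {
                                delete_from_page_cache_batch(mapping, &pvec);
                                pagevec_reinit(&pvec);
                        }
                }
        }

Compared with calling delete_from_page_cache() once per page, the win is one spin_lock_irqsave(&mapping->tree_lock, ...) and one radix-tree walk per pagevec instead of per page; unaccounting and the final page_cache_free_page() puts still happen for every page.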
@@ -419,20 +519,18 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
419 if (end_byte < start_byte) 519 if (end_byte < start_byte)
420 return; 520 return;
421 521
422 pagevec_init(&pvec, 0); 522 pagevec_init(&pvec);
423 while ((index <= end) && 523 while (index <= end) {
424 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
425 PAGECACHE_TAG_WRITEBACK,
426 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
427 unsigned i; 524 unsigned i;
428 525
526 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
527 end, PAGECACHE_TAG_WRITEBACK);
528 if (!nr_pages)
529 break;
530
429 for (i = 0; i < nr_pages; i++) { 531 for (i = 0; i < nr_pages; i++) {
430 struct page *page = pvec.pages[i]; 532 struct page *page = pvec.pages[i];
431 533
432 /* until radix tree lookup accepts end_index */
433 if (page->index > end)
434 continue;
435
436 wait_on_page_writeback(page); 534 wait_on_page_writeback(page);
437 ClearPageError(page); 535 ClearPageError(page);
438 } 536 }
@@ -1754,9 +1852,10 @@ repeat:
1754EXPORT_SYMBOL(find_get_pages_contig); 1852EXPORT_SYMBOL(find_get_pages_contig);
1755 1853
1756/** 1854/**
1757 * find_get_pages_tag - find and return pages that match @tag 1855 * find_get_pages_range_tag - find and return pages in given range matching @tag
1758 * @mapping: the address_space to search 1856 * @mapping: the address_space to search
1759 * @index: the starting page index 1857 * @index: the starting page index
1858 * @end: The final page index (inclusive)
1760 * @tag: the tag index 1859 * @tag: the tag index
1761 * @nr_pages: the maximum number of pages 1860 * @nr_pages: the maximum number of pages
1762 * @pages: where the resulting pages are placed 1861 * @pages: where the resulting pages are placed
@@ -1764,8 +1863,9 @@ EXPORT_SYMBOL(find_get_pages_contig);
1764 * Like find_get_pages, except we only return pages which are tagged with 1863 * Like find_get_pages, except we only return pages which are tagged with
1765 * @tag. We update @index to index the next page for the traversal. 1864 * @tag. We update @index to index the next page for the traversal.
1766 */ 1865 */
1767unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 1866unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
1768 int tag, unsigned int nr_pages, struct page **pages) 1867 pgoff_t end, int tag, unsigned int nr_pages,
1868 struct page **pages)
1769{ 1869{
1770 struct radix_tree_iter iter; 1870 struct radix_tree_iter iter;
1771 void **slot; 1871 void **slot;
@@ -1778,6 +1878,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
1778 radix_tree_for_each_tagged(slot, &mapping->page_tree, 1878 radix_tree_for_each_tagged(slot, &mapping->page_tree,
1779 &iter, *index, tag) { 1879 &iter, *index, tag) {
1780 struct page *head, *page; 1880 struct page *head, *page;
1881
1882 if (iter.index > end)
1883 break;
1781repeat: 1884repeat:
1782 page = radix_tree_deref_slot(slot); 1885 page = radix_tree_deref_slot(slot);
1783 if (unlikely(!page)) 1886 if (unlikely(!page))
@@ -1819,18 +1922,28 @@ repeat:
1819 } 1922 }
1820 1923
1821 pages[ret] = page; 1924 pages[ret] = page;
1822 if (++ret == nr_pages) 1925 if (++ret == nr_pages) {
1823 break; 1926 *index = pages[ret - 1]->index + 1;
1927 goto out;
1928 }
1824 } 1929 }
1825 1930
1931 /*
1932 * We come here when we got at @end. We take care to not overflow the
1933 * index @index as it confuses some of the callers. This breaks the
1934 * iteration when there is page at index -1 but that is already broken
1935 * anyway.
1936 */
1937 if (end == (pgoff_t)-1)
1938 *index = (pgoff_t)-1;
1939 else
1940 *index = end + 1;
1941out:
1826 rcu_read_unlock(); 1942 rcu_read_unlock();
1827 1943
1828 if (ret)
1829 *index = pages[ret - 1]->index + 1;
1830
1831 return ret; 1944 return ret;
1832} 1945}
1833EXPORT_SYMBOL(find_get_pages_tag); 1946EXPORT_SYMBOL(find_get_pages_range_tag);
1834 1947
1835/** 1948/**
1836 * find_get_entries_tag - find and return entries that match @tag 1949 * find_get_entries_tag - find and return entries that match @tag
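
find_get_pages_range_tag() now takes an inclusive @end and guarantees that on return *index points just past the last page returned, or to end + 1 (saturating at (pgoff_t)-1) once the range is exhausted, so callers no longer need the "page->index > end" recheck that __filemap_fdatawait_range() dropped above. A hypothetical direct user of the range API could look like this:

        /*
         * Hypothetical example: count pages tagged dirty in [start, end].
         * The lookup takes a reference on each returned page, so they are
         * dropped again with put_page().
         */
        static unsigned long count_range_dirty(struct address_space *mapping,
                                               pgoff_t start, pgoff_t end)
        {
                struct page *pages[PAGEVEC_SIZE];
                pgoff_t index = start;
                unsigned long count = 0;
                unsigned nr;

                while ((nr = find_get_pages_range_tag(mapping, &index, end,
                                                      PAGECACHE_TAG_DIRTY,
                                                      PAGEVEC_SIZE, pages))) {
                        unsigned i;

                        count += nr;
                        for (i = 0; i < nr; i++)
                                put_page(pages[i]);
                }
                return count;
        }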
@@ -2159,7 +2272,7 @@ no_cached_page:
2159 * Ok, it wasn't cached, so we need to create a new 2272 * Ok, it wasn't cached, so we need to create a new
2160 * page.. 2273 * page..
2161 */ 2274 */
2162 page = page_cache_alloc_cold(mapping); 2275 page = page_cache_alloc(mapping);
2163 if (!page) { 2276 if (!page) {
2164 error = -ENOMEM; 2277 error = -ENOMEM;
2165 goto out; 2278 goto out;
@@ -2271,7 +2384,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
2271 int ret; 2384 int ret;
2272 2385
2273 do { 2386 do {
2274 page = __page_cache_alloc(gfp_mask|__GFP_COLD); 2387 page = __page_cache_alloc(gfp_mask);
2275 if (!page) 2388 if (!page)
2276 return -ENOMEM; 2389 return -ENOMEM;
2277 2390
@@ -2675,7 +2788,7 @@ static struct page *do_read_cache_page(struct address_space *mapping,
2675repeat: 2788repeat:
2676 page = find_get_page(mapping, index); 2789 page = find_get_page(mapping, index);
2677 if (!page) { 2790 if (!page) {
2678 page = __page_cache_alloc(gfp | __GFP_COLD); 2791 page = __page_cache_alloc(gfp);
2679 if (!page) 2792 if (!page)
2680 return ERR_PTR(-ENOMEM); 2793 return ERR_PTR(-ENOMEM);
2681 err = add_to_page_cache_lru(page, mapping, index, gfp); 2794 err = add_to_page_cache_lru(page, mapping, index, gfp);
diff --git a/mm/hmm.c b/mm/hmm.c
index a88a847bccba..ea19742a5d60 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -803,11 +803,10 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
803 803
804static void hmm_devmem_radix_release(struct resource *resource) 804static void hmm_devmem_radix_release(struct resource *resource)
805{ 805{
806 resource_size_t key, align_start, align_size, align_end; 806 resource_size_t key, align_start, align_size;
807 807
808 align_start = resource->start & ~(PA_SECTION_SIZE - 1); 808 align_start = resource->start & ~(PA_SECTION_SIZE - 1);
809 align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); 809 align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
810 align_end = align_start + align_size - 1;
811 810
812 mutex_lock(&hmm_devmem_lock); 811 mutex_lock(&hmm_devmem_lock);
813 for (key = resource->start; 812 for (key = resource->start;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 003f7bcd0952..86fe697e8bfb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
606 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); 606 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
607 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); 607 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
608 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 608 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
609 atomic_long_inc(&vma->vm_mm->nr_ptes); 609 mm_inc_nr_ptes(vma->vm_mm);
610 spin_unlock(vmf->ptl); 610 spin_unlock(vmf->ptl);
611 count_vm_event(THP_FAULT_ALLOC); 611 count_vm_event(THP_FAULT_ALLOC);
612 } 612 }
@@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
662 if (pgtable) 662 if (pgtable)
663 pgtable_trans_huge_deposit(mm, pmd, pgtable); 663 pgtable_trans_huge_deposit(mm, pmd, pgtable);
664 set_pmd_at(mm, haddr, pmd, entry); 664 set_pmd_at(mm, haddr, pmd, entry);
665 atomic_long_inc(&mm->nr_ptes); 665 mm_inc_nr_ptes(mm);
666 return true; 666 return true;
667} 667}
668 668
@@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
747 747
748 if (pgtable) { 748 if (pgtable) {
749 pgtable_trans_huge_deposit(mm, pmd, pgtable); 749 pgtable_trans_huge_deposit(mm, pmd, pgtable);
750 atomic_long_inc(&mm->nr_ptes); 750 mm_inc_nr_ptes(mm);
751 } 751 }
752 752
753 set_pmd_at(mm, addr, pmd, entry); 753 set_pmd_at(mm, addr, pmd, entry);
@@ -942,7 +942,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
942 set_pmd_at(src_mm, addr, src_pmd, pmd); 942 set_pmd_at(src_mm, addr, src_pmd, pmd);
943 } 943 }
944 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 944 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
945 atomic_long_inc(&dst_mm->nr_ptes); 945 mm_inc_nr_ptes(dst_mm);
946 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 946 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
947 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 947 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
948 ret = 0; 948 ret = 0;
@@ -978,7 +978,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
978 get_page(src_page); 978 get_page(src_page);
979 page_dup_rmap(src_page, true); 979 page_dup_rmap(src_page, true);
980 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 980 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
981 atomic_long_inc(&dst_mm->nr_ptes); 981 mm_inc_nr_ptes(dst_mm);
982 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 982 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
983 983
984 pmdp_set_wrprotect(src_mm, addr, src_pmd); 984 pmdp_set_wrprotect(src_mm, addr, src_pmd);
@@ -1189,8 +1189,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
1189 goto out_free_pages; 1189 goto out_free_pages;
1190 VM_BUG_ON_PAGE(!PageHead(page), page); 1190 VM_BUG_ON_PAGE(!PageHead(page), page);
1191 1191
1192 /*
1193 * Leave pmd empty until pte is filled note we must notify here as
1194 * concurrent CPU thread might write to new page before the call to
1195 * mmu_notifier_invalidate_range_end() happens which can lead to a
1196 * device seeing memory write in different order than CPU.
1197 *
1198 * See Documentation/vm/mmu_notifier.txt
1199 */
1192 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); 1200 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
1193 /* leave pmd empty until pte is filled */
1194 1201
1195 pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); 1202 pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
1196 pmd_populate(vma->vm_mm, &_pmd, pgtable); 1203 pmd_populate(vma->vm_mm, &_pmd, pgtable);
@@ -1216,7 +1223,12 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
1216 page_remove_rmap(page, true); 1223 page_remove_rmap(page, true);
1217 spin_unlock(vmf->ptl); 1224 spin_unlock(vmf->ptl);
1218 1225
1219 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 1226 /*
1227 * No need to double call mmu_notifier->invalidate_range() callback as
1228 * the above pmdp_huge_clear_flush_notify() did already call it.
1229 */
1230 mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
1231 mmun_end);
1220 1232
1221 ret |= VM_FAULT_WRITE; 1233 ret |= VM_FAULT_WRITE;
1222 put_page(page); 1234 put_page(page);
@@ -1365,7 +1377,12 @@ alloc:
1365 } 1377 }
1366 spin_unlock(vmf->ptl); 1378 spin_unlock(vmf->ptl);
1367out_mn: 1379out_mn:
1368 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 1380 /*
1381 * No need to double call mmu_notifier->invalidate_range() callback as
1382 * the above pmdp_huge_clear_flush_notify() did already call it.
1383 */
1384 mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
1385 mmun_end);
1369out: 1386out:
1370 return ret; 1387 return ret;
1371out_unlock: 1388out_unlock:
@@ -1678,7 +1695,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
1678 1695
1679 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1696 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1680 pte_free(mm, pgtable); 1697 pte_free(mm, pgtable);
1681 atomic_long_dec(&mm->nr_ptes); 1698 mm_dec_nr_ptes(mm);
1682} 1699}
1683 1700
1684int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1701int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
@@ -2017,7 +2034,12 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2017 2034
2018out: 2035out:
2019 spin_unlock(ptl); 2036 spin_unlock(ptl);
2020 mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE); 2037 /*
2038 * No need to double call mmu_notifier->invalidate_range() callback as
2039 * the above pudp_huge_clear_flush_notify() did already call it.
2040 */
2041 mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
2042 HPAGE_PUD_SIZE);
2021} 2043}
2022#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2044#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2023 2045
@@ -2029,8 +2051,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2029 pmd_t _pmd; 2051 pmd_t _pmd;
2030 int i; 2052 int i;
2031 2053
2032 /* leave pmd empty until pte is filled */ 2054 /*
2033 pmdp_huge_clear_flush_notify(vma, haddr, pmd); 2055 * Leave pmd empty until pte is filled note that it is fine to delay
2056 * notification until mmu_notifier_invalidate_range_end() as we are
2057 * replacing a zero pmd write protected page with a zero pte write
2058 * protected page.
2059 *
2060 * See Documentation/vm/mmu_notifier.txt
2061 */
2062 pmdp_huge_clear_flush(vma, haddr, pmd);
2034 2063
2035 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2064 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2036 pmd_populate(mm, &_pmd, pgtable); 2065 pmd_populate(mm, &_pmd, pgtable);
@@ -2085,6 +2114,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2085 add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); 2114 add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
2086 return; 2115 return;
2087 } else if (is_huge_zero_pmd(*pmd)) { 2116 } else if (is_huge_zero_pmd(*pmd)) {
2117 /*
2118 * FIXME: Do we want to invalidate secondary mmu by calling
2119 * mmu_notifier_invalidate_range() see comments below inside
2120 * __split_huge_pmd() ?
2121 *
2122 * We are going from a zero huge page write protected to zero
2123 * small page also write protected so it does not seems useful
2124 * to invalidate secondary mmu at this time.
2125 */
2088 return __split_huge_zero_page_pmd(vma, haddr, pmd); 2126 return __split_huge_zero_page_pmd(vma, haddr, pmd);
2089 } 2127 }
2090 2128
@@ -2220,7 +2258,21 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2220 __split_huge_pmd_locked(vma, pmd, haddr, freeze); 2258 __split_huge_pmd_locked(vma, pmd, haddr, freeze);
2221out: 2259out:
2222 spin_unlock(ptl); 2260 spin_unlock(ptl);
2223 mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); 2261 /*
2262 * No need to double call mmu_notifier->invalidate_range() callback.
2263 * They are 3 cases to consider inside __split_huge_pmd_locked():
2264 * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
2265 * 2) __split_huge_zero_page_pmd() read only zero page and any write
2266 * fault will trigger a flush_notify before pointing to a new page
2267 * (it is fine if the secondary mmu keeps pointing to the old zero
2268 * page in the meantime)
2269 * 3) Split a huge pmd into pte pointing to the same page. No need
2270 * to invalidate secondary tlb entry they are all still valid.
2271 * any further changes to individual pte will notify. So no need
2272 * to call mmu_notifier->invalidate_range()
2273 */
2274 mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
2275 HPAGE_PMD_SIZE);
2224} 2276}
2225 2277
2226void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 2278void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
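
The huge_memory.c hunks above all follow the same mmu_notifier rule: when the pmd/pud was torn down with a *_clear_flush_notify() helper, the secondary MMU has already been told through ->invalidate_range(), so the closing call becomes mmu_notifier_invalidate_range_only_end() to avoid a second, redundant device-TLB flush. Stripped of the surrounding locking and error handling, the calling convention is roughly as follows (hypothetical helper, for illustration only):

        static void replace_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                                     unsigned long haddr, pmd_t newpmd)
        {
                struct mm_struct *mm = vma->vm_mm;
                unsigned long start = haddr;
                unsigned long end = haddr + HPAGE_PMD_SIZE;

                mmu_notifier_invalidate_range_start(mm, start, end);
                /* this already calls mmu_notifier_invalidate_range() */
                pmdp_huge_clear_flush_notify(vma, haddr, pmd);
                set_pmd_at(mm, haddr, pmd, newpmd);
                /* end the sequence without invalidating a second time */
                mmu_notifier_invalidate_range_only_end(mm, start, end);
        }

Where the entry is only being replaced by an equivalent mapping, as in __split_huge_zero_page_pmd(), the notify variant is dropped altogether and the invalidation is left to the eventual ..._end() call, exactly as the added comments explain.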
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2d2ff5e8bf2b..681b300185c0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3256,9 +3256,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3256 set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); 3256 set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
3257 } else { 3257 } else {
3258 if (cow) { 3258 if (cow) {
3259 /*
3260 * No need to notify as we are downgrading page
3261 * table protection not changing it to point
3262 * to a new page.
3263 *
3264 * See Documentation/vm/mmu_notifier.txt
3265 */
3259 huge_ptep_set_wrprotect(src, addr, src_pte); 3266 huge_ptep_set_wrprotect(src, addr, src_pte);
3260 mmu_notifier_invalidate_range(src, mmun_start,
3261 mmun_end);
3262 } 3267 }
3263 entry = huge_ptep_get(src_pte); 3268 entry = huge_ptep_get(src_pte);
3264 ptepage = pte_page(entry); 3269 ptepage = pte_page(entry);
@@ -4318,7 +4323,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
4318 * and that page table be reused and filled with junk. 4323 * and that page table be reused and filled with junk.
4319 */ 4324 */
4320 flush_hugetlb_tlb_range(vma, start, end); 4325 flush_hugetlb_tlb_range(vma, start, end);
4321 mmu_notifier_invalidate_range(mm, start, end); 4326 /*
4327 * No need to call mmu_notifier_invalidate_range() we are downgrading
4328 * page table protection not changing it to point to a new page.
4329 *
4330 * See Documentation/vm/mmu_notifier.txt
4331 */
4322 i_mmap_unlock_write(vma->vm_file->f_mapping); 4332 i_mmap_unlock_write(vma->vm_file->f_mapping);
4323 mmu_notifier_invalidate_range_end(mm, start, end); 4333 mmu_notifier_invalidate_range_end(mm, start, end);
4324 4334
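
copy_hugetlb_page_range() and hugetlb_change_protection() drop their explicit mmu_notifier_invalidate_range() calls because they only downgrade protection and never repoint the PTE at a different page, so deferring the secondary-MMU invalidation to mmu_notifier_invalidate_range_end() is safe. The general shape of that "downgrade only" pattern is roughly this (hypothetical single-pte helper, locking omitted):

        static void wrprotect_one_pte(struct vm_area_struct *vma,
                                      unsigned long addr, pte_t *ptep)
        {
                struct mm_struct *mm = vma->vm_mm;

                mmu_notifier_invalidate_range_start(mm, addr, addr + PAGE_SIZE);
                ptep_set_wrprotect(mm, addr, ptep);
                flush_tlb_page(vma, addr);      /* CPU TLB must go now */
                /* no mmu_notifier_invalidate_range() here, see comments above */
                mmu_notifier_invalidate_range_end(mm, addr, addr + PAGE_SIZE);
        }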
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 6f319fb81718..405bba487df5 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -337,7 +337,7 @@ static size_t optimal_redzone(size_t object_size)
337} 337}
338 338
339void kasan_cache_create(struct kmem_cache *cache, size_t *size, 339void kasan_cache_create(struct kmem_cache *cache, size_t *size,
340 unsigned long *flags) 340 slab_flags_t *flags)
341{ 341{
342 int redzone_adjust; 342 int redzone_adjust;
343 int orig_size = *size; 343 int orig_size = *size;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 43cb3043311b..ea4ff259b671 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1270,7 +1270,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1270 _pmd = pmdp_collapse_flush(vma, addr, pmd); 1270 _pmd = pmdp_collapse_flush(vma, addr, pmd);
1271 spin_unlock(ptl); 1271 spin_unlock(ptl);
1272 up_write(&vma->vm_mm->mmap_sem); 1272 up_write(&vma->vm_mm->mmap_sem);
1273 atomic_long_dec(&vma->vm_mm->nr_ptes); 1273 mm_dec_nr_ptes(vma->vm_mm);
1274 pte_free(vma->vm_mm, pmd_pgtable(_pmd)); 1274 pte_free(vma->vm_mm, pmd_pgtable(_pmd));
1275 } 1275 }
1276 } 1276 }
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index 800d64b854ea..cec594032515 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -1,126 +1 @@
1// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
2#include <linux/gfp.h>
3#include <linux/mm_types.h>
4#include <linux/mm.h>
5#include <linux/slab.h>
6#include "slab.h"
7#include <linux/kmemcheck.h>
8
9void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
10{
11 struct page *shadow;
12 int pages;
13 int i;
14
15 pages = 1 << order;
16
17 /*
18 * With kmemcheck enabled, we need to allocate a memory area for the
19 * shadow bits as well.
20 */
21 shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
22 if (!shadow) {
23 if (printk_ratelimit())
24 pr_err("kmemcheck: failed to allocate shadow bitmap\n");
25 return;
26 }
27
28 for(i = 0; i < pages; ++i)
29 page[i].shadow = page_address(&shadow[i]);
30
31 /*
32 * Mark it as non-present for the MMU so that our accesses to
33 * this memory will trigger a page fault and let us analyze
34 * the memory accesses.
35 */
36 kmemcheck_hide_pages(page, pages);
37}
38
39void kmemcheck_free_shadow(struct page *page, int order)
40{
41 struct page *shadow;
42 int pages;
43 int i;
44
45 if (!kmemcheck_page_is_tracked(page))
46 return;
47
48 pages = 1 << order;
49
50 kmemcheck_show_pages(page, pages);
51
52 shadow = virt_to_page(page[0].shadow);
53
54 for(i = 0; i < pages; ++i)
55 page[i].shadow = NULL;
56
57 __free_pages(shadow, order);
58}
59
60void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
61 size_t size)
62{
63 if (unlikely(!object)) /* Skip object if allocation failed */
64 return;
65
66 /*
67 * Has already been memset(), which initializes the shadow for us
68 * as well.
69 */
70 if (gfpflags & __GFP_ZERO)
71 return;
72
73 /* No need to initialize the shadow of a non-tracked slab. */
74 if (s->flags & SLAB_NOTRACK)
75 return;
76
77 if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
78 /*
79 * Allow notracked objects to be allocated from
80 * tracked caches. Note however that these objects
81 * will still get page faults on access, they just
82 * won't ever be flagged as uninitialized. If page
83 * faults are not acceptable, the slab cache itself
84 * should be marked NOTRACK.
85 */
86 kmemcheck_mark_initialized(object, size);
87 } else if (!s->ctor) {
88 /*
89 * New objects should be marked uninitialized before
90 * they're returned to the called.
91 */
92 kmemcheck_mark_uninitialized(object, size);
93 }
94}
95
96void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
97{
98 /* TODO: RCU freeing is unsupported for now; hide false positives. */
99 if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU))
100 kmemcheck_mark_freed(object, size);
101}
102
103void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
104 gfp_t gfpflags)
105{
106 int pages;
107
108 if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
109 return;
110
111 pages = 1 << order;
112
113 /*
114 * NOTE: We choose to track GFP_ZERO pages too; in fact, they
115 * can become uninitialized by copying uninitialized memory
116 * into them.
117 */
118
119 /* XXX: Can use zone->node for node? */
120 kmemcheck_alloc_shadow(page, order, gfpflags, -1);
121
122 if (gfpflags & __GFP_ZERO)
123 kmemcheck_mark_initialized_pages(page, pages);
124 else
125 kmemcheck_mark_uninitialized_pages(page, pages);
126}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 7780cd83a495..e4738d5e9b8c 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -110,7 +110,6 @@
110#include <linux/atomic.h> 110#include <linux/atomic.h>
111 111
112#include <linux/kasan.h> 112#include <linux/kasan.h>
113#include <linux/kmemcheck.h>
114#include <linux/kmemleak.h> 113#include <linux/kmemleak.h>
115#include <linux/memory_hotplug.h> 114#include <linux/memory_hotplug.h>
116 115
@@ -1238,9 +1237,6 @@ static bool update_checksum(struct kmemleak_object *object)
1238{ 1237{
1239 u32 old_csum = object->checksum; 1238 u32 old_csum = object->checksum;
1240 1239
1241 if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
1242 return false;
1243
1244 kasan_disable_current(); 1240 kasan_disable_current();
1245 object->checksum = crc32(0, (void *)object->pointer, object->size); 1241 object->checksum = crc32(0, (void *)object->pointer, object->size);
1246 kasan_enable_current(); 1242 kasan_enable_current();
@@ -1314,11 +1310,6 @@ static void scan_block(void *_start, void *_end,
1314 if (scan_should_stop()) 1310 if (scan_should_stop())
1315 break; 1311 break;
1316 1312
1317 /* don't scan uninitialized memory */
1318 if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
1319 BYTES_PER_POINTER))
1320 continue;
1321
1322 kasan_disable_current(); 1313 kasan_disable_current();
1323 pointer = *ptr; 1314 pointer = *ptr;
1324 kasan_enable_current(); 1315 kasan_enable_current();
@@ -2104,7 +2095,7 @@ static int __init kmemleak_late_init(void)
2104 return -ENOMEM; 2095 return -ENOMEM;
2105 } 2096 }
2106 2097
2107 dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, 2098 dentry = debugfs_create_file("kmemleak", 0644, NULL, NULL,
2108 &kmemleak_fops); 2099 &kmemleak_fops);
2109 if (!dentry) 2100 if (!dentry)
2110 pr_warn("Failed to create the debugfs kmemleak file\n"); 2101 pr_warn("Failed to create the debugfs kmemleak file\n");
diff --git a/mm/ksm.c b/mm/ksm.c
index 6cb60f46cce5..be8f4576f842 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1052,8 +1052,13 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1052 * So we clear the pte and flush the tlb before the check 1052 * So we clear the pte and flush the tlb before the check
1053 * this assure us that no O_DIRECT can happen after the check 1053 * this assure us that no O_DIRECT can happen after the check
1054 * or in the middle of the check. 1054 * or in the middle of the check.
1055 *
1056 * No need to notify as we are downgrading page table to read
1057 * only not changing it to point to a new page.
1058 *
1059 * See Documentation/vm/mmu_notifier.txt
1055 */ 1060 */
1056 entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); 1061 entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
1057 /* 1062 /*
1058 * Check that no O_DIRECT or similar I/O is in progress on the 1063 * Check that no O_DIRECT or similar I/O is in progress on the
1059 * page 1064 * page
@@ -1136,7 +1141,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
1136 } 1141 }
1137 1142
1138 flush_cache_page(vma, addr, pte_pfn(*ptep)); 1143 flush_cache_page(vma, addr, pte_pfn(*ptep));
1139 ptep_clear_flush_notify(vma, addr, ptep); 1144 /*
1145 * No need to notify as we are replacing a read only page with another
1146 * read only page with the same content.
1147 *
1148 * See Documentation/vm/mmu_notifier.txt
1149 */
1150 ptep_clear_flush(vma, addr, ptep);
1140 set_pte_at_notify(mm, addr, ptep, newpte); 1151 set_pte_at_notify(mm, addr, ptep, newpte);
1141 1152
1142 page_remove_rmap(page, false); 1153 page_remove_rmap(page, false);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f141f0c80ff3..fd41e969ede5 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -221,6 +221,7 @@ restart:
221 switch (ret) { 221 switch (ret) {
222 case LRU_REMOVED_RETRY: 222 case LRU_REMOVED_RETRY:
223 assert_spin_locked(&nlru->lock); 223 assert_spin_locked(&nlru->lock);
224 /* fall through */
224 case LRU_REMOVED: 225 case LRU_REMOVED:
225 isolated++; 226 isolated++;
226 nlru->nr_items--; 227 nlru->nr_items--;
diff --git a/mm/memblock.c b/mm/memblock.c
index 91205780e6b1..46aacdfa4f4d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -533,7 +533,7 @@ repeat:
533 base = obase; 533 base = obase;
534 nr_new = 0; 534 nr_new = 0;
535 535
536 for_each_memblock_type(type, rgn) { 536 for_each_memblock_type(idx, type, rgn) {
537 phys_addr_t rbase = rgn->base; 537 phys_addr_t rbase = rgn->base;
538 phys_addr_t rend = rbase + rgn->size; 538 phys_addr_t rend = rbase + rgn->size;
539 539
@@ -637,7 +637,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
637 if (memblock_double_array(type, base, size) < 0) 637 if (memblock_double_array(type, base, size) < 0)
638 return -ENOMEM; 638 return -ENOMEM;
639 639
640 for_each_memblock_type(type, rgn) { 640 for_each_memblock_type(idx, type, rgn) {
641 phys_addr_t rbase = rgn->base; 641 phys_addr_t rbase = rgn->base;
642 phys_addr_t rend = rbase + rgn->size; 642 phys_addr_t rend = rbase + rgn->size;
643 643
@@ -1327,7 +1327,6 @@ again:
1327 return NULL; 1327 return NULL;
1328done: 1328done:
1329 ptr = phys_to_virt(alloc); 1329 ptr = phys_to_virt(alloc);
1330 memset(ptr, 0, size);
1331 1330
1332 /* 1331 /*
1333 * The min_count is set to 0 so that bootmem allocated blocks 1332 * The min_count is set to 0 so that bootmem allocated blocks
@@ -1341,6 +1340,45 @@ done:
1341} 1340}
1342 1341
1343/** 1342/**
1343 * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing
1344 * memory and without panicking
1345 * @size: size of memory block to be allocated in bytes
1346 * @align: alignment of the region and block's size
1347 * @min_addr: the lower bound of the memory region from where the allocation
1348 * is preferred (phys address)
1349 * @max_addr: the upper bound of the memory region from where the allocation
1350 * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
1351 * allocate only from memory limited by memblock.current_limit value
1352 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1353 *
1354 * Public function, provides additional debug information (including caller
1355 * info), if enabled. Does not zero allocated memory, does not panic if request
1356 * cannot be satisfied.
1357 *
1358 * RETURNS:
1359 * Virtual address of allocated memory block on success, NULL on failure.
1360 */
1361void * __init memblock_virt_alloc_try_nid_raw(
1362 phys_addr_t size, phys_addr_t align,
1363 phys_addr_t min_addr, phys_addr_t max_addr,
1364 int nid)
1365{
1366 void *ptr;
1367
1368 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
1369 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1370 (u64)max_addr, (void *)_RET_IP_);
1371
1372 ptr = memblock_virt_alloc_internal(size, align,
1373 min_addr, max_addr, nid);
1374#ifdef CONFIG_DEBUG_VM
1375 if (ptr && size > 0)
1376 memset(ptr, 0xff, size);
1377#endif
1378 return ptr;
1379}
1380
1381/**
1344 * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block 1382 * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
1345 * @size: size of memory block to be allocated in bytes 1383 * @size: size of memory block to be allocated in bytes
1346 * @align: alignment of the region and block's size 1384 * @align: alignment of the region and block's size
@@ -1351,8 +1389,8 @@ done:
1351 * allocate only from memory limited by memblock.current_limit value 1389 * allocate only from memory limited by memblock.current_limit value
1352 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 1390 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1353 * 1391 *
1354 * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides 1392 * Public function, provides additional debug information (including caller
1355 * additional debug information (including caller info), if enabled. 1393 * info), if enabled. This function zeroes the allocated memory.
1356 * 1394 *
1357 * RETURNS: 1395 * RETURNS:
1358 * Virtual address of allocated memory block on success, NULL on failure. 1396 * Virtual address of allocated memory block on success, NULL on failure.
@@ -1362,11 +1400,17 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
1362 phys_addr_t min_addr, phys_addr_t max_addr, 1400 phys_addr_t min_addr, phys_addr_t max_addr,
1363 int nid) 1401 int nid)
1364{ 1402{
1403 void *ptr;
1404
1365 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", 1405 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
1366 __func__, (u64)size, (u64)align, nid, (u64)min_addr, 1406 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1367 (u64)max_addr, (void *)_RET_IP_); 1407 (u64)max_addr, (void *)_RET_IP_);
1368 return memblock_virt_alloc_internal(size, align, min_addr, 1408
1369 max_addr, nid); 1409 ptr = memblock_virt_alloc_internal(size, align,
1410 min_addr, max_addr, nid);
1411 if (ptr)
1412 memset(ptr, 0, size);
1413 return ptr;
1370} 1414}
1371 1415
1372/** 1416/**
@@ -1380,7 +1424,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
1380 * allocate only from memory limited by memblock.current_limit value 1424 * allocate only from memory limited by memblock.current_limit value
1381 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 1425 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1382 * 1426 *
1383 * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() 1427 * Public panicking version of memblock_virt_alloc_try_nid_nopanic()
1384 * which provides debug information (including caller info), if enabled, 1428 * which provides debug information (including caller info), if enabled,
1385 * and panics if the request can not be satisfied. 1429 * and panics if the request can not be satisfied.
1386 * 1430 *
@@ -1399,8 +1443,10 @@ void * __init memblock_virt_alloc_try_nid(
1399 (u64)max_addr, (void *)_RET_IP_); 1443 (u64)max_addr, (void *)_RET_IP_);
1400 ptr = memblock_virt_alloc_internal(size, align, 1444 ptr = memblock_virt_alloc_internal(size, align,
1401 min_addr, max_addr, nid); 1445 min_addr, max_addr, nid);
1402 if (ptr) 1446 if (ptr) {
1447 memset(ptr, 0, size);
1403 return ptr; 1448 return ptr;
1449 }
1404 1450
1405 panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", 1451 panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
1406 __func__, (u64)size, (u64)align, nid, (u64)min_addr, 1452 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
@@ -1715,7 +1761,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
1715 1761
1716 pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); 1762 pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt);
1717 1763
1718 for_each_memblock_type(type, rgn) { 1764 for_each_memblock_type(idx, type, rgn) {
1719 char nid_buf[32] = ""; 1765 char nid_buf[32] = "";
1720 1766
1721 base = rgn->base; 1767 base = rgn->base;
@@ -1739,7 +1785,7 @@ memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr)
1739 unsigned long size = 0; 1785 unsigned long size = 0;
1740 int idx; 1786 int idx;
1741 1787
1742 for_each_memblock_type((&memblock.reserved), rgn) { 1788 for_each_memblock_type(idx, (&memblock.reserved), rgn) {
1743 phys_addr_t start, end; 1789 phys_addr_t start, end;
1744 1790
1745 if (rgn->base + rgn->size < start_addr) 1791 if (rgn->base + rgn->size < start_addr)
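
memblock_virt_alloc_try_nid_raw() hands back boot memory without the memset(0) that the _nopanic and panicking variants now perform explicitly, and under CONFIG_DEBUG_VM it poisons the block with 0xff so a missed initialization shows up quickly; it presumably feeds the sparse-vmemmap changes listed in the diffstat. A hypothetical caller that intends to overwrite every byte anyway would use it like this:

        /* Hypothetical caller, not part of this series. */
        static void * __init alloc_node_table(phys_addr_t size, int nid)
        {
                void *buf;

                buf = memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE,
                                                      __pa(MAX_DMA_ADDRESS),
                                                      BOOTMEM_ALLOC_ACCESSIBLE,
                                                      nid);
                if (!buf)
                        return NULL;

                /* caller must initialize every byte of buf itself */
                return buf;
        }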
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 661f046ad318..50e6906314f8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4049,7 +4049,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
4049 .write = mem_cgroup_reset, 4049 .write = mem_cgroup_reset,
4050 .read_u64 = mem_cgroup_read_u64, 4050 .read_u64 = mem_cgroup_read_u64,
4051 }, 4051 },
4052#ifdef CONFIG_SLABINFO 4052#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
4053 { 4053 {
4054 .name = "kmem.slabinfo", 4054 .name = "kmem.slabinfo",
4055 .seq_start = memcg_slab_start, 4055 .seq_start = memcg_slab_start,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 88366626c0b7..4acdf393a801 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1587,7 +1587,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1587 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, 1587 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1588 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1588 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1589 if (ret) { 1589 if (ret) {
1590 pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", 1590 pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
1591 pfn, ret, page->flags, &page->flags); 1591 pfn, ret, page->flags, &page->flags);
1592 if (!list_empty(&pagelist)) 1592 if (!list_empty(&pagelist))
1593 putback_movable_pages(&pagelist); 1593 putback_movable_pages(&pagelist);
diff --git a/mm/memory.c b/mm/memory.c
index cae514e7dcfc..85e7a87da79f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -438,7 +438,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
438 pgtable_t token = pmd_pgtable(*pmd); 438 pgtable_t token = pmd_pgtable(*pmd);
439 pmd_clear(pmd); 439 pmd_clear(pmd);
440 pte_free_tlb(tlb, token, addr); 440 pte_free_tlb(tlb, token, addr);
441 atomic_long_dec(&tlb->mm->nr_ptes); 441 mm_dec_nr_ptes(tlb->mm);
442} 442}
443 443
444static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 444static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
506 pud = pud_offset(p4d, start); 506 pud = pud_offset(p4d, start);
507 p4d_clear(p4d); 507 p4d_clear(p4d);
508 pud_free_tlb(tlb, pud, start); 508 pud_free_tlb(tlb, pud, start);
509 mm_dec_nr_puds(tlb->mm);
509} 510}
510 511
511static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, 512static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -665,7 +666,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
665 666
666 ptl = pmd_lock(mm, pmd); 667 ptl = pmd_lock(mm, pmd);
667 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 668 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
668 atomic_long_inc(&mm->nr_ptes); 669 mm_inc_nr_ptes(mm);
669 pmd_populate(mm, pmd, new); 670 pmd_populate(mm, pmd, new);
670 new = NULL; 671 new = NULL;
671 } 672 }
@@ -2554,7 +2555,11 @@ static int wp_page_copy(struct vm_fault *vmf)
2554 put_page(new_page); 2555 put_page(new_page);
2555 2556
2556 pte_unmap_unlock(vmf->pte, vmf->ptl); 2557 pte_unmap_unlock(vmf->pte, vmf->ptl);
2557 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2558 /*
2559 * No need to double call mmu_notifier->invalidate_range() callback as
2560 * the above ptep_clear_flush_notify() did already call it.
2561 */
2562 mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
2558 if (old_page) { 2563 if (old_page) {
2559 /* 2564 /*
2560 * Don't let another task, with possibly unlocked vma, 2565 * Don't let another task, with possibly unlocked vma,
@@ -2842,7 +2847,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
2842int do_swap_page(struct vm_fault *vmf) 2847int do_swap_page(struct vm_fault *vmf)
2843{ 2848{
2844 struct vm_area_struct *vma = vmf->vma; 2849 struct vm_area_struct *vma = vmf->vma;
2845 struct page *page = NULL, *swapcache; 2850 struct page *page = NULL, *swapcache = NULL;
2846 struct mem_cgroup *memcg; 2851 struct mem_cgroup *memcg;
2847 struct vma_swap_readahead swap_ra; 2852 struct vma_swap_readahead swap_ra;
2848 swp_entry_t entry; 2853 swp_entry_t entry;
@@ -2881,17 +2886,36 @@ int do_swap_page(struct vm_fault *vmf)
2881 } 2886 }
2882 goto out; 2887 goto out;
2883 } 2888 }
2889
2890
2884 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2891 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2885 if (!page) 2892 if (!page)
2886 page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, 2893 page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
2887 vmf->address); 2894 vmf->address);
2888 if (!page) { 2895 if (!page) {
2889 if (vma_readahead) 2896 struct swap_info_struct *si = swp_swap_info(entry);
2890 page = do_swap_page_readahead(entry, 2897
2891 GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); 2898 if (si->flags & SWP_SYNCHRONOUS_IO &&
2892 else 2899 __swap_count(si, entry) == 1) {
2893 page = swapin_readahead(entry, 2900 /* skip swapcache */
2894 GFP_HIGHUSER_MOVABLE, vma, vmf->address); 2901 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
2902 if (page) {
2903 __SetPageLocked(page);
2904 __SetPageSwapBacked(page);
2905 set_page_private(page, entry.val);
2906 lru_cache_add_anon(page);
2907 swap_readpage(page, true);
2908 }
2909 } else {
2910 if (vma_readahead)
2911 page = do_swap_page_readahead(entry,
2912 GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
2913 else
2914 page = swapin_readahead(entry,
2915 GFP_HIGHUSER_MOVABLE, vma, vmf->address);
2916 swapcache = page;
2917 }
2918
2895 if (!page) { 2919 if (!page) {
2896 /* 2920 /*
2897 * Back out if somebody else faulted in this pte 2921 * Back out if somebody else faulted in this pte
@@ -2920,7 +2944,6 @@ int do_swap_page(struct vm_fault *vmf)
2920 goto out_release; 2944 goto out_release;
2921 } 2945 }
2922 2946
2923 swapcache = page;
2924 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); 2947 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
2925 2948
2926 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2949 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2935,7 +2958,8 @@ int do_swap_page(struct vm_fault *vmf)
2935 * test below, are not enough to exclude that. Even if it is still 2958 * test below, are not enough to exclude that. Even if it is still
2936 * swapcache, we need to check that the page's swap has not changed. 2959 * swapcache, we need to check that the page's swap has not changed.
2937 */ 2960 */
2938 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 2961 if (unlikely((!PageSwapCache(page) ||
2962 page_private(page) != entry.val)) && swapcache)
2939 goto out_page; 2963 goto out_page;
2940 2964
2941 page = ksm_might_need_to_copy(page, vma, vmf->address); 2965 page = ksm_might_need_to_copy(page, vma, vmf->address);
@@ -2988,14 +3012,16 @@ int do_swap_page(struct vm_fault *vmf)
2988 pte = pte_mksoft_dirty(pte); 3012 pte = pte_mksoft_dirty(pte);
2989 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); 3013 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
2990 vmf->orig_pte = pte; 3014 vmf->orig_pte = pte;
2991 if (page == swapcache) { 3015
2992 do_page_add_anon_rmap(page, vma, vmf->address, exclusive); 3016 /* ksm created a completely new copy */
2993 mem_cgroup_commit_charge(page, memcg, true, false); 3017 if (unlikely(page != swapcache && swapcache)) {
2994 activate_page(page);
2995 } else { /* ksm created a completely new copy */
2996 page_add_new_anon_rmap(page, vma, vmf->address, false); 3018 page_add_new_anon_rmap(page, vma, vmf->address, false);
2997 mem_cgroup_commit_charge(page, memcg, false, false); 3019 mem_cgroup_commit_charge(page, memcg, false, false);
2998 lru_cache_add_active_or_unevictable(page, vma); 3020 lru_cache_add_active_or_unevictable(page, vma);
3021 } else {
3022 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
3023 mem_cgroup_commit_charge(page, memcg, true, false);
3024 activate_page(page);
2999 } 3025 }
3000 3026
3001 swap_free(entry); 3027 swap_free(entry);
@@ -3003,7 +3029,7 @@ int do_swap_page(struct vm_fault *vmf)
3003 (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 3029 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3004 try_to_free_swap(page); 3030 try_to_free_swap(page);
3005 unlock_page(page); 3031 unlock_page(page);
3006 if (page != swapcache) { 3032 if (page != swapcache && swapcache) {
3007 /* 3033 /*
3008 * Hold the lock to avoid the swap entry to be reused 3034 * Hold the lock to avoid the swap entry to be reused
3009 * until we take the PT lock for the pte_same() check 3035 * until we take the PT lock for the pte_same() check
@@ -3036,7 +3062,7 @@ out_page:
3036 unlock_page(page); 3062 unlock_page(page);
3037out_release: 3063out_release:
3038 put_page(page); 3064 put_page(page);
3039 if (page != swapcache) { 3065 if (page != swapcache && swapcache) {
3040 unlock_page(swapcache); 3066 unlock_page(swapcache);
3041 put_page(swapcache); 3067 put_page(swapcache);
3042 } 3068 }
@@ -3212,7 +3238,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf)
3212 goto map_pte; 3238 goto map_pte;
3213 } 3239 }
3214 3240
3215 atomic_long_inc(&vma->vm_mm->nr_ptes); 3241 mm_inc_nr_ptes(vma->vm_mm);
3216 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); 3242 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3217 spin_unlock(vmf->ptl); 3243 spin_unlock(vmf->ptl);
3218 vmf->prealloc_pte = NULL; 3244 vmf->prealloc_pte = NULL;
@@ -3271,7 +3297,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
3271 * We are going to consume the prealloc table, 3297 * We are going to consume the prealloc table,
3272 * count that as nr_ptes. 3298 * count that as nr_ptes.
3273 */ 3299 */
3274 atomic_long_inc(&vma->vm_mm->nr_ptes); 3300 mm_inc_nr_ptes(vma->vm_mm);
3275 vmf->prealloc_pte = NULL; 3301 vmf->prealloc_pte = NULL;
3276} 3302}
3277 3303
@@ -4124,15 +4150,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
4124 4150
4125 spin_lock(&mm->page_table_lock); 4151 spin_lock(&mm->page_table_lock);
4126#ifndef __ARCH_HAS_5LEVEL_HACK 4152#ifndef __ARCH_HAS_5LEVEL_HACK
4127 if (p4d_present(*p4d)) /* Another has populated it */ 4153 if (!p4d_present(*p4d)) {
4128 pud_free(mm, new); 4154 mm_inc_nr_puds(mm);
4129 else
4130 p4d_populate(mm, p4d, new); 4155 p4d_populate(mm, p4d, new);
4131#else 4156 } else /* Another has populated it */
4132 if (pgd_present(*p4d)) /* Another has populated it */
4133 pud_free(mm, new); 4157 pud_free(mm, new);
4134 else 4158#else
4159 if (!pgd_present(*p4d)) {
4160 mm_inc_nr_puds(mm);
4135 pgd_populate(mm, p4d, new); 4161 pgd_populate(mm, p4d, new);
4162 } else /* Another has populated it */
4163 pud_free(mm, new);
4136#endif /* __ARCH_HAS_5LEVEL_HACK */ 4164#endif /* __ARCH_HAS_5LEVEL_HACK */
4137 spin_unlock(&mm->page_table_lock); 4165 spin_unlock(&mm->page_table_lock);
4138 return 0; 4166 return 0;
@@ -4457,17 +4485,15 @@ void print_vma_addr(char *prefix, unsigned long ip)
4457 struct vm_area_struct *vma; 4485 struct vm_area_struct *vma;
4458 4486
4459 /* 4487 /*
4460 * Do not print if we are in atomic 4488 * we might be running from an atomic context so we cannot sleep
4461 * contexts (in exception stacks, etc.):
4462 */ 4489 */
4463 if (preempt_count()) 4490 if (!down_read_trylock(&mm->mmap_sem))
4464 return; 4491 return;
4465 4492
4466 down_read(&mm->mmap_sem);
4467 vma = find_vma(mm, ip); 4493 vma = find_vma(mm, ip);
4468 if (vma && vma->vm_file) { 4494 if (vma && vma->vm_file) {
4469 struct file *f = vma->vm_file; 4495 struct file *f = vma->vm_file;
4470 char *buf = (char *)__get_free_page(GFP_KERNEL); 4496 char *buf = (char *)__get_free_page(GFP_NOWAIT);
4471 if (buf) { 4497 if (buf) {
4472 char *p; 4498 char *p;
4473 4499
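
The print_vma_addr() hunk above replaces the preempt_count() guard with a non-blocking down_read_trylock() and a GFP_NOWAIT allocation, so the helper simply bails out instead of sleeping when called from atomic context. A minimal userspace sketch of that trylock-and-bail pattern (the rwlock and the lookup are illustrative stand-ins, not kernel APIs):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Analog of print_vma_addr(): never block, never do a sleeping allocation. */
static void print_addr_info(unsigned long ip)
{
        if (pthread_rwlock_tryrdlock(&map_lock) != 0)
                return;         /* lock contended: silently skip, as the kernel now does */

        printf("ip=%#lx\n", ip);        /* the real code looks up and prints the mapping here */

        pthread_rwlock_unlock(&map_lock);
}

int main(void)
{
        print_addr_info(0xdeadbeef);
        return 0;
}
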
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d4b5f29906b9..c52aa05b106c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -265,7 +265,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
265 /* 265 /*
266 * Make all the pages reserved so that nobody will stumble over half 266 * Make all the pages reserved so that nobody will stumble over half
267 * initialized state. 267 * initialized state.
268 * FIXME: We also have to associate it with a node because pfn_to_node 268 * FIXME: We also have to associate it with a node because page_to_nid
269 * relies on having page with the proper node. 269 * relies on having page with the proper node.
270 */ 270 */
271 for (i = 0; i < PAGES_PER_SECTION; i++) { 271 for (i = 0; i < PAGES_PER_SECTION; i++) {
@@ -1590,11 +1590,11 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
1590} 1590}
1591 1591
1592static int __ref __offline_pages(unsigned long start_pfn, 1592static int __ref __offline_pages(unsigned long start_pfn,
1593 unsigned long end_pfn, unsigned long timeout) 1593 unsigned long end_pfn)
1594{ 1594{
1595 unsigned long pfn, nr_pages, expire; 1595 unsigned long pfn, nr_pages;
1596 long offlined_pages; 1596 long offlined_pages;
1597 int ret, drain, retry_max, node; 1597 int ret, node;
1598 unsigned long flags; 1598 unsigned long flags;
1599 unsigned long valid_start, valid_end; 1599 unsigned long valid_start, valid_end;
1600 struct zone *zone; 1600 struct zone *zone;
@@ -1630,44 +1630,22 @@ static int __ref __offline_pages(unsigned long start_pfn,
1630 goto failed_removal; 1630 goto failed_removal;
1631 1631
1632 pfn = start_pfn; 1632 pfn = start_pfn;
1633 expire = jiffies + timeout;
1634 drain = 0;
1635 retry_max = 5;
1636repeat: 1633repeat:
1637 /* start memory hot removal */ 1634 /* start memory hot removal */
1638 ret = -EAGAIN;
1639 if (time_after(jiffies, expire))
1640 goto failed_removal;
1641 ret = -EINTR; 1635 ret = -EINTR;
1642 if (signal_pending(current)) 1636 if (signal_pending(current))
1643 goto failed_removal; 1637 goto failed_removal;
1644 ret = 0; 1638
1645 if (drain) { 1639 cond_resched();
1646 lru_add_drain_all_cpuslocked(); 1640 lru_add_drain_all_cpuslocked();
1647 cond_resched(); 1641 drain_all_pages(zone);
1648 drain_all_pages(zone);
1649 }
1650 1642
1651 pfn = scan_movable_pages(start_pfn, end_pfn); 1643 pfn = scan_movable_pages(start_pfn, end_pfn);
1652 if (pfn) { /* We have movable pages */ 1644 if (pfn) { /* We have movable pages */
1653 ret = do_migrate_range(pfn, end_pfn); 1645 ret = do_migrate_range(pfn, end_pfn);
1654 if (!ret) { 1646 goto repeat;
1655 drain = 1;
1656 goto repeat;
1657 } else {
1658 if (ret < 0)
1659 if (--retry_max == 0)
1660 goto failed_removal;
1661 yield();
1662 drain = 1;
1663 goto repeat;
1664 }
1665 } 1647 }
1666 /* drain all zone's lru pagevec, this is asynchronous... */ 1648
1667 lru_add_drain_all_cpuslocked();
1668 yield();
1669 /* drain pcp pages, this is synchronous. */
1670 drain_all_pages(zone);
1671 /* 1649 /*
1672 * dissolve free hugepages in the memory block before doing offlining 1650 * dissolve free hugepages in the memory block before doing offlining
1673 * actually in order to make hugetlbfs's object counting consistent. 1651 * actually in order to make hugetlbfs's object counting consistent.
@@ -1677,10 +1655,8 @@ repeat:
1677 goto failed_removal; 1655 goto failed_removal;
1678 /* check again */ 1656 /* check again */
1679 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1657 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1680 if (offlined_pages < 0) { 1658 if (offlined_pages < 0)
1681 ret = -EBUSY; 1659 goto repeat;
1682 goto failed_removal;
1683 }
1684 pr_info("Offlined Pages %ld\n", offlined_pages); 1660 pr_info("Offlined Pages %ld\n", offlined_pages);
1685 /* Ok, all of our target is isolated. 1661 /* Ok, all of our target is isolated.
1686 We cannot do rollback at this point. */ 1662 We cannot do rollback at this point. */
@@ -1728,7 +1704,7 @@ failed_removal:
1728/* Must be protected by mem_hotplug_begin() or a device_lock */ 1704/* Must be protected by mem_hotplug_begin() or a device_lock */
1729int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1705int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1730{ 1706{
1731 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1707 return __offline_pages(start_pfn, start_pfn + nr_pages);
1732} 1708}
1733#endif /* CONFIG_MEMORY_HOTREMOVE */ 1709#endif /* CONFIG_MEMORY_HOTREMOVE */
1734 1710
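
__offline_pages() above drops the jiffies timeout, the retry counter and the "drain" flag: it now keeps draining and migrating until no movable pages remain, giving up only on a pending signal. A hedged userspace sketch of that retry policy, with migrate_chunk() as a made-up stand-in for the scan_movable_pages()/do_migrate_range() step:

#include <signal.h>
#include <stdbool.h>
#include <sched.h>

static volatile sig_atomic_t interrupted;

static void on_int(int sig) { (void)sig; interrupted = 1; }

/* Stand-in for the migration step: pretend it finishes after a few passes. */
static bool migrate_chunk(void)
{
        static int passes_left = 5;
        return --passes_left <= 0;      /* true once nothing movable remains */
}

static int offline_range(void)
{
        for (;;) {
                if (interrupted)
                        return -1;      /* like the -EINTR return on signal_pending() */
                if (migrate_chunk())
                        return 0;       /* all movable pages handled */
                sched_yield();          /* give other work a chance, like cond_resched() */
        }
}

int main(void)
{
        signal(SIGINT, on_int);
        return offline_range() ? 1 : 0;
}
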
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a2af6d58a68f..4ce44d3ff03d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -85,6 +85,7 @@
85#include <linux/interrupt.h> 85#include <linux/interrupt.h>
86#include <linux/init.h> 86#include <linux/init.h>
87#include <linux/compat.h> 87#include <linux/compat.h>
88#include <linux/ptrace.h>
88#include <linux/swap.h> 89#include <linux/swap.h>
89#include <linux/seq_file.h> 90#include <linux/seq_file.h>
90#include <linux/proc_fs.h> 91#include <linux/proc_fs.h>
@@ -1365,7 +1366,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1365 const unsigned long __user *, old_nodes, 1366 const unsigned long __user *, old_nodes,
1366 const unsigned long __user *, new_nodes) 1367 const unsigned long __user *, new_nodes)
1367{ 1368{
1368 const struct cred *cred = current_cred(), *tcred;
1369 struct mm_struct *mm = NULL; 1369 struct mm_struct *mm = NULL;
1370 struct task_struct *task; 1370 struct task_struct *task;
1371 nodemask_t task_nodes; 1371 nodemask_t task_nodes;
@@ -1401,15 +1401,10 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1401 err = -EINVAL; 1401 err = -EINVAL;
1402 1402
1403 /* 1403 /*
1404 * Check if this process has the right to modify the specified 1404 * Check if this process has the right to modify the specified process.
1405 * process. The right exists if the process has administrative 1405 * Use the regular "ptrace_may_access()" checks.
1406 * capabilities, superuser privileges or the same
1407 * userid as the target process.
1408 */ 1406 */
1409 tcred = __task_cred(task); 1407 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1410 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1411 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1412 !capable(CAP_SYS_NICE)) {
1413 rcu_read_unlock(); 1408 rcu_read_unlock();
1414 err = -EPERM; 1409 err = -EPERM;
1415 goto out_put; 1410 goto out_put;
@@ -1920,6 +1915,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1920 struct page *page; 1915 struct page *page;
1921 1916
1922 page = __alloc_pages(gfp, order, nid); 1917 page = __alloc_pages(gfp, order, nid);
1918 /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
1919 if (!static_branch_likely(&vm_numa_stat_key))
1920 return page;
1923 if (page && page_to_nid(page) == nid) { 1921 if (page && page_to_nid(page) == nid) {
1924 preempt_disable(); 1922 preempt_disable();
1925 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); 1923 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
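
The migrate_pages() hunk above replaces the open-coded credential comparison with ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS), so user namespaces, LSMs and ptrace policy are consulted in one place. For reference, a simplified userspace restatement of roughly what the removed check did (ignoring namespaces and the CAP_SYS_NICE override, which the real code also honored):

#include <stdbool.h>
#include <sys/types.h>

struct creds {
        uid_t uid, euid, suid;
};

/* Old-style check: caller may act on target if its uid/euid matches the target's uid or suid. */
static bool may_modify_target(const struct creds *caller, const struct creds *target)
{
        return caller->euid == target->suid || caller->euid == target->uid ||
               caller->uid  == target->suid || caller->uid  == target->uid;
}

int main(void)
{
        struct creds me    = { .uid = 1000, .euid = 1000, .suid = 1000 };
        struct creds other = { .uid = 1001, .euid = 1001, .suid = 1001 };

        return may_modify_target(&me, &other) ? 0 : 1;
}
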
diff --git a/mm/mempool.c b/mm/mempool.c
index c4a23cdae3f0..7d8c5a0010a2 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -189,7 +189,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
189 pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); 189 pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
190 if (!pool) 190 if (!pool)
191 return NULL; 191 return NULL;
192 pool->elements = kmalloc_node(min_nr * sizeof(void *), 192 pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
193 gfp_mask, node_id); 193 gfp_mask, node_id);
194 if (!pool->elements) { 194 if (!pool->elements) {
195 kfree(pool); 195 kfree(pool);
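
The mempool hunk swaps an open-coded min_nr * sizeof(void *) multiplication for kmalloc_array_node(), which fails the allocation instead of silently wrapping when the product overflows. A userspace sketch of the same guard (malloc_array() is a made-up helper, not a libc function):

#include <stdint.h>
#include <stdlib.h>

static void *malloc_array(size_t n, size_t size)
{
        if (size != 0 && n > SIZE_MAX / size)
                return NULL;    /* n * size would overflow: refuse, don't wrap */
        return malloc(n * size);
}

int main(void)
{
        void **elements = malloc_array(128, sizeof(void *));

        free(elements);
        return elements ? 0 : 1;
}
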
diff --git a/mm/migrate.c b/mm/migrate.c
index 1236449b4777..4d0be47a322a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2089,7 +2089,11 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
2089 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); 2089 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
2090 2090
2091 spin_unlock(ptl); 2091 spin_unlock(ptl);
2092 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2092 /*
2093 * No need to double call mmu_notifier->invalidate_range() callback as
2094 * the above pmdp_huge_clear_flush_notify() did already call it.
2095 */
2096 mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
2093 2097
2094 /* Take an "isolate" reference and put new page on the LRU. */ 2098 /* Take an "isolate" reference and put new page on the LRU. */
2095 get_page(new_page); 2099 get_page(new_page);
@@ -2805,9 +2809,14 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
2805 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2809 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2806 } 2810 }
2807 2811
2812 /*
2813 * No need to double call mmu_notifier->invalidate_range() callback as
2814 * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
2815 * did already call it.
2816 */
2808 if (notified) 2817 if (notified)
2809 mmu_notifier_invalidate_range_end(mm, mmu_start, 2818 mmu_notifier_invalidate_range_only_end(mm, mmu_start,
2810 migrate->end); 2819 migrate->end);
2811} 2820}
2812 2821
2813/* 2822/*
diff --git a/mm/mlock.c b/mm/mlock.c
index 46af369c13e5..30472d438794 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
289 struct pagevec pvec_putback; 289 struct pagevec pvec_putback;
290 int pgrescued = 0; 290 int pgrescued = 0;
291 291
292 pagevec_init(&pvec_putback, 0); 292 pagevec_init(&pvec_putback);
293 293
294 /* Phase 1: page isolation */ 294 /* Phase 1: page isolation */
295 spin_lock_irq(zone_lru_lock(zone)); 295 spin_lock_irq(zone_lru_lock(zone));
@@ -448,7 +448,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
448 struct pagevec pvec; 448 struct pagevec pvec;
449 struct zone *zone; 449 struct zone *zone;
450 450
451 pagevec_init(&pvec, 0); 451 pagevec_init(&pvec);
452 /* 452 /*
453 * Although FOLL_DUMP is intended for get_dump_page(), 453 * Although FOLL_DUMP is intended for get_dump_page(),
454 * it just so happens that its special treatment of the 454 * it just so happens that its special treatment of the
@@ -670,8 +670,6 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
670 if (!can_do_mlock()) 670 if (!can_do_mlock())
671 return -EPERM; 671 return -EPERM;
672 672
673 lru_add_drain_all(); /* flush pagevec */
674
675 len = PAGE_ALIGN(len + (offset_in_page(start))); 673 len = PAGE_ALIGN(len + (offset_in_page(start)));
676 start &= PAGE_MASK; 674 start &= PAGE_MASK;
677 675
@@ -798,9 +796,6 @@ SYSCALL_DEFINE1(mlockall, int, flags)
798 if (!can_do_mlock()) 796 if (!can_do_mlock())
799 return -EPERM; 797 return -EPERM;
800 798
801 if (flags & MCL_CURRENT)
802 lru_add_drain_all(); /* flush pagevec */
803
804 lock_limit = rlimit(RLIMIT_MEMLOCK); 799 lock_limit = rlimit(RLIMIT_MEMLOCK);
805 lock_limit >>= PAGE_SHIFT; 800 lock_limit >>= PAGE_SHIFT;
806 801
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 314285284e6e..96edb33fd09a 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -190,7 +190,9 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
190EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); 190EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
191 191
192void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 192void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
193 unsigned long start, unsigned long end) 193 unsigned long start,
194 unsigned long end,
195 bool only_end)
194{ 196{
195 struct mmu_notifier *mn; 197 struct mmu_notifier *mn;
196 int id; 198 int id;
@@ -204,8 +206,13 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
204 * subsystem registers either invalidate_range_start()/end() or 206 * subsystem registers either invalidate_range_start()/end() or
205 * invalidate_range(), so this will be no additional overhead 207 * invalidate_range(), so this will be no additional overhead
206 * (besides the pointer check). 208 * (besides the pointer check).
209 *
 210 * We skip the call to invalidate_range() if we know it is safe, i.e.
 211 * the call site used mmu_notifier_invalidate_range_only_end(), which
 212 * is safe to do when we know that a call to invalidate_range()
 213 * already happened under the page table lock.
207 */ 214 */
208 if (mn->ops->invalidate_range) 215 if (!only_end && mn->ops->invalidate_range)
209 mn->ops->invalidate_range(mn, mm, start, end); 216 mn->ops->invalidate_range(mn, mm, start, end);
210 if (mn->ops->invalidate_range_end) 217 if (mn->ops->invalidate_range_end)
211 mn->ops->invalidate_range_end(mn, mm, start, end); 218 mn->ops->invalidate_range_end(mn, mm, start, end);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dee0f75c3013..c86fbd1b590e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,6 +44,7 @@
44 44
45#include <asm/tlb.h> 45#include <asm/tlb.h>
46#include "internal.h" 46#include "internal.h"
47#include "slab.h"
47 48
48#define CREATE_TRACE_POINTS 49#define CREATE_TRACE_POINTS
49#include <trace/events/oom.h> 50#include <trace/events/oom.h>
@@ -161,6 +162,25 @@ static bool oom_unkillable_task(struct task_struct *p,
161 return false; 162 return false;
162} 163}
163 164
165/*
 166 * Print out unreclaimable slab info when the amount of unreclaimable slab memory is greater
167 * than all user memory (LRU pages)
168 */
169static bool is_dump_unreclaim_slabs(void)
170{
171 unsigned long nr_lru;
172
173 nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
174 global_node_page_state(NR_INACTIVE_ANON) +
175 global_node_page_state(NR_ACTIVE_FILE) +
176 global_node_page_state(NR_INACTIVE_FILE) +
177 global_node_page_state(NR_ISOLATED_ANON) +
178 global_node_page_state(NR_ISOLATED_FILE) +
179 global_node_page_state(NR_UNEVICTABLE);
180
181 return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
182}
183
164/** 184/**
165 * oom_badness - heuristic function to determine which candidate task to kill 185 * oom_badness - heuristic function to determine which candidate task to kill
166 * @p: task struct of which task we should calculate 186 * @p: task struct of which task we should calculate
@@ -201,7 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
201 * task's rss, pagetable and swap space use. 221 * task's rss, pagetable and swap space use.
202 */ 222 */
203 points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + 223 points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
204 atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); 224 mm_pgtables_bytes(p->mm) / PAGE_SIZE;
205 task_unlock(p); 225 task_unlock(p);
206 226
207 /* 227 /*
@@ -369,15 +389,15 @@ static void select_bad_process(struct oom_control *oc)
369 * Dumps the current memory state of all eligible tasks. Tasks not in the same 389 * Dumps the current memory state of all eligible tasks. Tasks not in the same
370 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes 390 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
371 * are not shown. 391 * are not shown.
372 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, 392 * State information includes task's pid, uid, tgid, vm size, rss,
373 * swapents, oom_score_adj value, and name. 393 * pgtables_bytes, swapents, oom_score_adj value, and name.
374 */ 394 */
375static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) 395static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
376{ 396{
377 struct task_struct *p; 397 struct task_struct *p;
378 struct task_struct *task; 398 struct task_struct *task;
379 399
380 pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); 400 pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
381 rcu_read_lock(); 401 rcu_read_lock();
382 for_each_process(p) { 402 for_each_process(p) {
383 if (oom_unkillable_task(p, memcg, nodemask)) 403 if (oom_unkillable_task(p, memcg, nodemask))
@@ -393,11 +413,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
393 continue; 413 continue;
394 } 414 }
395 415
396 pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", 416 pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
397 task->pid, from_kuid(&init_user_ns, task_uid(task)), 417 task->pid, from_kuid(&init_user_ns, task_uid(task)),
398 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 418 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
399 atomic_long_read(&task->mm->nr_ptes), 419 mm_pgtables_bytes(task->mm),
400 mm_nr_pmds(task->mm),
401 get_mm_counter(task->mm, MM_SWAPENTS), 420 get_mm_counter(task->mm, MM_SWAPENTS),
402 task->signal->oom_score_adj, task->comm); 421 task->signal->oom_score_adj, task->comm);
403 task_unlock(task); 422 task_unlock(task);
@@ -407,23 +426,22 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
407 426
408static void dump_header(struct oom_control *oc, struct task_struct *p) 427static void dump_header(struct oom_control *oc, struct task_struct *p)
409{ 428{
410 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=", 429 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
411 current->comm, oc->gfp_mask, &oc->gfp_mask); 430 current->comm, oc->gfp_mask, &oc->gfp_mask,
412 if (oc->nodemask) 431 nodemask_pr_args(oc->nodemask), oc->order,
413 pr_cont("%*pbl", nodemask_pr_args(oc->nodemask)); 432 current->signal->oom_score_adj);
414 else
415 pr_cont("(null)");
416 pr_cont(", order=%d, oom_score_adj=%hd\n",
417 oc->order, current->signal->oom_score_adj);
418 if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) 433 if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
419 pr_warn("COMPACTION is disabled!!!\n"); 434 pr_warn("COMPACTION is disabled!!!\n");
420 435
421 cpuset_print_current_mems_allowed(); 436 cpuset_print_current_mems_allowed();
422 dump_stack(); 437 dump_stack();
423 if (oc->memcg) 438 if (is_memcg_oom(oc))
424 mem_cgroup_print_oom_info(oc->memcg, p); 439 mem_cgroup_print_oom_info(oc->memcg, p);
425 else 440 else {
426 show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); 441 show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
442 if (is_dump_unreclaim_slabs())
443 dump_unreclaimable_slab();
444 }
427 if (sysctl_oom_dump_tasks) 445 if (sysctl_oom_dump_tasks)
428 dump_tasks(oc->memcg, oc->nodemask); 446 dump_tasks(oc->memcg, oc->nodemask);
429} 447}
@@ -618,9 +636,6 @@ static int oom_reaper(void *unused)
618 636
619static void wake_oom_reaper(struct task_struct *tsk) 637static void wake_oom_reaper(struct task_struct *tsk)
620{ 638{
621 if (!oom_reaper_th)
622 return;
623
624 /* tsk is already queued? */ 639 /* tsk is already queued? */
625 if (tsk == oom_reaper_list || tsk->oom_reaper_list) 640 if (tsk == oom_reaper_list || tsk->oom_reaper_list)
626 return; 641 return;
@@ -638,11 +653,6 @@ static void wake_oom_reaper(struct task_struct *tsk)
638static int __init oom_init(void) 653static int __init oom_init(void)
639{ 654{
640 oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); 655 oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
641 if (IS_ERR(oom_reaper_th)) {
642 pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
643 PTR_ERR(oom_reaper_th));
644 oom_reaper_th = NULL;
645 }
646 return 0; 656 return 0;
647} 657}
648subsys_initcall(oom_init) 658subsys_initcall(oom_init)
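
is_dump_unreclaim_slabs() above only triggers the extra slab dump when unreclaimable slab memory outweighs everything on the LRU lists, i.e. when user pages are clearly not where the memory went. A toy restatement of that comparison with made-up page counts standing in for global_node_page_state():

#include <stdbool.h>
#include <stdio.h>

struct vm_counters {            /* all values in pages */
        unsigned long active_anon, inactive_anon;
        unsigned long active_file, inactive_file;
        unsigned long isolated_anon, isolated_file;
        unsigned long unevictable;
        unsigned long slab_unreclaimable;
};

static bool should_dump_unreclaimable_slab(const struct vm_counters *c)
{
        unsigned long nr_lru = c->active_anon + c->inactive_anon +
                               c->active_file + c->inactive_file +
                               c->isolated_anon + c->isolated_file +
                               c->unevictable;

        return c->slab_unreclaimable > nr_lru;
}

int main(void)
{
        struct vm_counters c = {
                .active_anon = 1000, .inactive_anon = 500,
                .active_file = 200,  .inactive_file = 100,
                .slab_unreclaimable = 4000,     /* dwarfs the LRU totals */
        };

        printf("dump slab info: %s\n", should_dump_unreclaimable_slab(&c) ? "yes" : "no");
        return 0;
}
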
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c518c845f202..8a1551154285 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -433,8 +433,11 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
433 else 433 else
434 bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; 434 bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
435 435
436 if (bg_thresh >= thresh) 436 if (unlikely(bg_thresh >= thresh)) {
437 pr_warn("vm direct limit must be set greater than background limit.\n");
437 bg_thresh = thresh / 2; 438 bg_thresh = thresh / 2;
439 }
440
438 tsk = current; 441 tsk = current;
439 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { 442 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
440 bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; 443 bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
@@ -625,9 +628,9 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc);
625 * On idle system, we can be called long after we scheduled because we use 628 * On idle system, we can be called long after we scheduled because we use
626 * deferred timers so count with missed periods. 629 * deferred timers so count with missed periods.
627 */ 630 */
628static void writeout_period(unsigned long t) 631static void writeout_period(struct timer_list *t)
629{ 632{
630 struct wb_domain *dom = (void *)t; 633 struct wb_domain *dom = from_timer(dom, t, period_timer);
631 int miss_periods = (jiffies - dom->period_time) / 634 int miss_periods = (jiffies - dom->period_time) /
632 VM_COMPLETIONS_PERIOD_LEN; 635 VM_COMPLETIONS_PERIOD_LEN;
633 636
@@ -650,8 +653,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
650 653
651 spin_lock_init(&dom->lock); 654 spin_lock_init(&dom->lock);
652 655
653 setup_deferrable_timer(&dom->period_timer, writeout_period, 656 timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);
654 (unsigned long)dom);
655 657
656 dom->dirty_limit_tstamp = jiffies; 658 dom->dirty_limit_tstamp = jiffies;
657 659
@@ -1543,7 +1545,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
1543 * actually dirty; with m+n sitting in the percpu 1545 * actually dirty; with m+n sitting in the percpu
1544 * deltas. 1546 * deltas.
1545 */ 1547 */
1546 if (dtc->wb_thresh < 2 * wb_stat_error(wb)) { 1548 if (dtc->wb_thresh < 2 * wb_stat_error()) {
1547 wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); 1549 wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
1548 dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); 1550 dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
1549 } else { 1551 } else {
@@ -1559,8 +1561,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
1559 * If we're over `background_thresh' then the writeback threads are woken to 1561 * If we're over `background_thresh' then the writeback threads are woken to
1560 * perform some writeout. 1562 * perform some writeout.
1561 */ 1563 */
1562static void balance_dirty_pages(struct address_space *mapping, 1564static void balance_dirty_pages(struct bdi_writeback *wb,
1563 struct bdi_writeback *wb,
1564 unsigned long pages_dirtied) 1565 unsigned long pages_dirtied)
1565{ 1566{
1566 struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; 1567 struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
@@ -1802,7 +1803,7 @@ pause:
1802 * more page. However wb_dirty has accounting errors. So use 1803 * more page. However wb_dirty has accounting errors. So use
1803 * the larger and more IO friendly wb_stat_error. 1804 * the larger and more IO friendly wb_stat_error.
1804 */ 1805 */
1805 if (sdtc->wb_dirty <= wb_stat_error(wb)) 1806 if (sdtc->wb_dirty <= wb_stat_error())
1806 break; 1807 break;
1807 1808
1808 if (fatal_signal_pending(current)) 1809 if (fatal_signal_pending(current))
@@ -1910,7 +1911,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
1910 preempt_enable(); 1911 preempt_enable();
1911 1912
1912 if (unlikely(current->nr_dirtied >= ratelimit)) 1913 if (unlikely(current->nr_dirtied >= ratelimit))
1913 balance_dirty_pages(mapping, wb, current->nr_dirtied); 1914 balance_dirty_pages(wb, current->nr_dirtied);
1914 1915
1915 wb_put(wb); 1916 wb_put(wb);
1916} 1917}
@@ -2167,7 +2168,7 @@ int write_cache_pages(struct address_space *mapping,
2167 int range_whole = 0; 2168 int range_whole = 0;
2168 int tag; 2169 int tag;
2169 2170
2170 pagevec_init(&pvec, 0); 2171 pagevec_init(&pvec);
2171 if (wbc->range_cyclic) { 2172 if (wbc->range_cyclic) {
2172 writeback_index = mapping->writeback_index; /* prev offset */ 2173 writeback_index = mapping->writeback_index; /* prev offset */
2173 index = writeback_index; 2174 index = writeback_index;
@@ -2194,30 +2195,14 @@ retry:
2194 while (!done && (index <= end)) { 2195 while (!done && (index <= end)) {
2195 int i; 2196 int i;
2196 2197
2197 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2198 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
2198 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2199 tag);
2199 if (nr_pages == 0) 2200 if (nr_pages == 0)
2200 break; 2201 break;
2201 2202
2202 for (i = 0; i < nr_pages; i++) { 2203 for (i = 0; i < nr_pages; i++) {
2203 struct page *page = pvec.pages[i]; 2204 struct page *page = pvec.pages[i];
2204 2205
2205 /*
2206 * At this point, the page may be truncated or
2207 * invalidated (changing page->mapping to NULL), or
2208 * even swizzled back from swapper_space to tmpfs file
2209 * mapping. However, page->index will not change
2210 * because we have a reference on the page.
2211 */
2212 if (page->index > end) {
2213 /*
2214 * can't be range_cyclic (1st pass) because
2215 * end == -1 in that case.
2216 */
2217 done = 1;
2218 break;
2219 }
2220
2221 done_index = page->index; 2206 done_index = page->index;
2222 2207
2223 lock_page(page); 2208 lock_page(page);
@@ -2623,7 +2608,7 @@ EXPORT_SYMBOL(set_page_dirty_lock);
2623 * page without actually doing it through the VM. Can you say "ext3 is 2608 * page without actually doing it through the VM. Can you say "ext3 is
2624 * horribly ugly"? Thought you could. 2609 * horribly ugly"? Thought you could.
2625 */ 2610 */
2626void cancel_dirty_page(struct page *page) 2611void __cancel_dirty_page(struct page *page)
2627{ 2612{
2628 struct address_space *mapping = page_mapping(page); 2613 struct address_space *mapping = page_mapping(page);
2629 2614
@@ -2644,7 +2629,7 @@ void cancel_dirty_page(struct page *page)
2644 ClearPageDirty(page); 2629 ClearPageDirty(page);
2645 } 2630 }
2646} 2631}
2647EXPORT_SYMBOL(cancel_dirty_page); 2632EXPORT_SYMBOL(__cancel_dirty_page);
2648 2633
2649/* 2634/*
2650 * Clear a page's dirty flag, while caring for dirty memory accounting. 2635 * Clear a page's dirty flag, while caring for dirty memory accounting.
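
The writeout_period() conversion above follows the new timer API: the callback receives the struct timer_list pointer and recovers its enclosing wb_domain with from_timer(), which is just container_of() on the embedded member. A self-contained userspace illustration of that recovery step (the timer struct here is a dummy, not the kernel's):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct timer { int pending; };

struct wb_domain {
        unsigned long dirty_limit;
        struct timer period_timer;      /* embedded, like the real period_timer */
};

/* New-style callback: gets the timer, derives the domain it lives in. */
static void writeout_period(struct timer *t)
{
        struct wb_domain *dom = container_of(t, struct wb_domain, period_timer);

        printf("dirty_limit=%lu\n", dom->dirty_limit);
}

int main(void)
{
        struct wb_domain dom = { .dirty_limit = 42 };

        writeout_period(&dom.period_timer);     /* prints 42 via container_of */
        return 0;
}
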
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 77e4d3c5c57b..55ded92f9809 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -24,7 +24,6 @@
24#include <linux/memblock.h> 24#include <linux/memblock.h>
25#include <linux/compiler.h> 25#include <linux/compiler.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/kmemcheck.h>
28#include <linux/kasan.h> 27#include <linux/kasan.h>
29#include <linux/module.h> 28#include <linux/module.h>
30#include <linux/suspend.h> 29#include <linux/suspend.h>
@@ -83,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node);
83EXPORT_PER_CPU_SYMBOL(numa_node); 82EXPORT_PER_CPU_SYMBOL(numa_node);
84#endif 83#endif
85 84
85DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
86
86#ifdef CONFIG_HAVE_MEMORYLESS_NODES 87#ifdef CONFIG_HAVE_MEMORYLESS_NODES
87/* 88/*
88 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 89 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
@@ -290,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes);
290int page_group_by_mobility_disabled __read_mostly; 291int page_group_by_mobility_disabled __read_mostly;
291 292
292#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 293#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
294
295/*
 296 * Determine how many pages need to be initialized during early boot
 297 * (non-deferred initialization).
 298 * The value of first_deferred_pfn will be set later, once non-deferred pages
 299 * are initialized, but for now set it to ULONG_MAX.
300 */
293static inline void reset_deferred_meminit(pg_data_t *pgdat) 301static inline void reset_deferred_meminit(pg_data_t *pgdat)
294{ 302{
295 unsigned long max_initialise; 303 phys_addr_t start_addr, end_addr;
296 unsigned long reserved_lowmem; 304 unsigned long max_pgcnt;
305 unsigned long reserved;
297 306
298 /* 307 /*
299 * Initialise at least 2G of a node but also take into account that 308 * Initialise at least 2G of a node but also take into account that
300 * two large system hashes that can take up 1GB for 0.25TB/node. 309 * two large system hashes that can take up 1GB for 0.25TB/node.
301 */ 310 */
302 max_initialise = max(2UL << (30 - PAGE_SHIFT), 311 max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
303 (pgdat->node_spanned_pages >> 8)); 312 (pgdat->node_spanned_pages >> 8));
304 313
305 /* 314 /*
 306 * Compensate for all the memblock reservations (e.g. crash kernel) 315 * Compensate for all the memblock reservations (e.g. crash kernel)
307 * from the initial estimation to make sure we will initialize enough 316 * from the initial estimation to make sure we will initialize enough
308 * memory to boot. 317 * memory to boot.
309 */ 318 */
310 reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, 319 start_addr = PFN_PHYS(pgdat->node_start_pfn);
311 pgdat->node_start_pfn + max_initialise); 320 end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
312 max_initialise += reserved_lowmem; 321 reserved = memblock_reserved_memory_within(start_addr, end_addr);
322 max_pgcnt += PHYS_PFN(reserved);
313 323
314 pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); 324 pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
315 pgdat->first_deferred_pfn = ULONG_MAX; 325 pgdat->first_deferred_pfn = ULONG_MAX;
316} 326}
317 327
@@ -338,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat,
338 if (zone_end < pgdat_end_pfn(pgdat)) 348 if (zone_end < pgdat_end_pfn(pgdat))
339 return true; 349 return true;
340 (*nr_initialised)++; 350 (*nr_initialised)++;
341 if ((*nr_initialised > pgdat->static_init_size) && 351 if ((*nr_initialised > pgdat->static_init_pgcnt) &&
342 (pfn & (PAGES_PER_SECTION - 1)) == 0) { 352 (pfn & (PAGES_PER_SECTION - 1)) == 0) {
343 pgdat->first_deferred_pfn = pfn; 353 pgdat->first_deferred_pfn = pfn;
344 return false; 354 return false;
@@ -1013,7 +1023,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
1013 VM_BUG_ON_PAGE(PageTail(page), page); 1023 VM_BUG_ON_PAGE(PageTail(page), page);
1014 1024
1015 trace_mm_page_free(page, order); 1025 trace_mm_page_free(page, order);
1016 kmemcheck_free_shadow(page, order);
1017 1026
1018 /* 1027 /*
1019 * Check tail pages before head page information is cleared to 1028 * Check tail pages before head page information is cleared to
@@ -1170,6 +1179,7 @@ static void free_one_page(struct zone *zone,
1170static void __meminit __init_single_page(struct page *page, unsigned long pfn, 1179static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1171 unsigned long zone, int nid) 1180 unsigned long zone, int nid)
1172{ 1181{
1182 mm_zero_struct_page(page);
1173 set_page_links(page, zone, nid, pfn); 1183 set_page_links(page, zone, nid, pfn);
1174 init_page_count(page); 1184 init_page_count(page);
1175 page_mapcount_reset(page); 1185 page_mapcount_reset(page);
@@ -1410,14 +1420,17 @@ void clear_zone_contiguous(struct zone *zone)
1410} 1420}
1411 1421
1412#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1422#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1413static void __init deferred_free_range(struct page *page, 1423static void __init deferred_free_range(unsigned long pfn,
1414 unsigned long pfn, int nr_pages) 1424 unsigned long nr_pages)
1415{ 1425{
1416 int i; 1426 struct page *page;
1427 unsigned long i;
1417 1428
1418 if (!page) 1429 if (!nr_pages)
1419 return; 1430 return;
1420 1431
1432 page = pfn_to_page(pfn);
1433
1421 /* Free a large naturally-aligned chunk if possible */ 1434 /* Free a large naturally-aligned chunk if possible */
1422 if (nr_pages == pageblock_nr_pages && 1435 if (nr_pages == pageblock_nr_pages &&
1423 (pfn & (pageblock_nr_pages - 1)) == 0) { 1436 (pfn & (pageblock_nr_pages - 1)) == 0) {
@@ -1443,19 +1456,109 @@ static inline void __init pgdat_init_report_one_done(void)
1443 complete(&pgdat_init_all_done_comp); 1456 complete(&pgdat_init_all_done_comp);
1444} 1457}
1445 1458
1459/*
1460 * Helper for deferred_init_range: free the given range, reset the counters, and
1461 * return the number of pages freed.
1462 */
1463static inline unsigned long __init __def_free(unsigned long *nr_free,
1464 unsigned long *free_base_pfn,
1465 struct page **page)
1466{
1467 unsigned long nr = *nr_free;
1468
1469 deferred_free_range(*free_base_pfn, nr);
1470 *free_base_pfn = 0;
1471 *nr_free = 0;
1472 *page = NULL;
1473
1474 return nr;
1475}
1476
1477static unsigned long __init deferred_init_range(int nid, int zid,
1478 unsigned long start_pfn,
1479 unsigned long end_pfn)
1480{
1481 struct mminit_pfnnid_cache nid_init_state = { };
1482 unsigned long nr_pgmask = pageblock_nr_pages - 1;
1483 unsigned long free_base_pfn = 0;
1484 unsigned long nr_pages = 0;
1485 unsigned long nr_free = 0;
1486 struct page *page = NULL;
1487 unsigned long pfn;
1488
1489 /*
1490 * First we check if pfn is valid on architectures where it is possible
1491 * to have holes within pageblock_nr_pages. On systems where it is not
1492 * possible, this function is optimized out.
1493 *
1494 * Then, we check if a current large page is valid by only checking the
1495 * validity of the head pfn.
1496 *
1497 * meminit_pfn_in_nid is checked on systems where pfns can interleave
1498 * within a node: a pfn is between start and end of a node, but does not
1499 * belong to this memory node.
1500 *
1501 * Finally, we minimize pfn page lookups and scheduler checks by
1502 * performing them only once every pageblock_nr_pages.
1503 *
1504 * We do it in two loops: first we initialize struct page, then free to the
1505 * buddy allocator, because while we are freeing pages we can access
1506 * pages that are ahead (computing buddy page in __free_one_page()).
1507 */
1508 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1509 if (!pfn_valid_within(pfn))
1510 continue;
1511 if ((pfn & nr_pgmask) || pfn_valid(pfn)) {
1512 if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1513 if (page && (pfn & nr_pgmask))
1514 page++;
1515 else
1516 page = pfn_to_page(pfn);
1517 __init_single_page(page, pfn, zid, nid);
1518 cond_resched();
1519 }
1520 }
1521 }
1522
1523 page = NULL;
1524 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1525 if (!pfn_valid_within(pfn)) {
1526 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1527 } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) {
1528 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1529 } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1530 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1531 } else if (page && (pfn & nr_pgmask)) {
1532 page++;
1533 nr_free++;
1534 } else {
1535 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1536 page = pfn_to_page(pfn);
1537 free_base_pfn = pfn;
1538 nr_free = 1;
1539 cond_resched();
1540 }
1541 }
1542 /* Free the last block of pages to allocator */
1543 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1544
1545 return nr_pages;
1546}
1547
1446/* Initialise remaining memory on a node */ 1548/* Initialise remaining memory on a node */
1447static int __init deferred_init_memmap(void *data) 1549static int __init deferred_init_memmap(void *data)
1448{ 1550{
1449 pg_data_t *pgdat = data; 1551 pg_data_t *pgdat = data;
1450 int nid = pgdat->node_id; 1552 int nid = pgdat->node_id;
1451 struct mminit_pfnnid_cache nid_init_state = { };
1452 unsigned long start = jiffies; 1553 unsigned long start = jiffies;
1453 unsigned long nr_pages = 0; 1554 unsigned long nr_pages = 0;
1454 unsigned long walk_start, walk_end; 1555 unsigned long spfn, epfn;
1455 int i, zid; 1556 phys_addr_t spa, epa;
1557 int zid;
1456 struct zone *zone; 1558 struct zone *zone;
1457 unsigned long first_init_pfn = pgdat->first_deferred_pfn; 1559 unsigned long first_init_pfn = pgdat->first_deferred_pfn;
1458 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 1560 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1561 u64 i;
1459 1562
1460 if (first_init_pfn == ULONG_MAX) { 1563 if (first_init_pfn == ULONG_MAX) {
1461 pgdat_init_report_one_done(); 1564 pgdat_init_report_one_done();
@@ -1477,83 +1580,12 @@ static int __init deferred_init_memmap(void *data)
1477 if (first_init_pfn < zone_end_pfn(zone)) 1580 if (first_init_pfn < zone_end_pfn(zone))
1478 break; 1581 break;
1479 } 1582 }
1583 first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
1480 1584
1481 for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { 1585 for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1482 unsigned long pfn, end_pfn; 1586 spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1483 struct page *page = NULL; 1587 epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1484 struct page *free_base_page = NULL; 1588 nr_pages += deferred_init_range(nid, zid, spfn, epfn);
1485 unsigned long free_base_pfn = 0;
1486 int nr_to_free = 0;
1487
1488 end_pfn = min(walk_end, zone_end_pfn(zone));
1489 pfn = first_init_pfn;
1490 if (pfn < walk_start)
1491 pfn = walk_start;
1492 if (pfn < zone->zone_start_pfn)
1493 pfn = zone->zone_start_pfn;
1494
1495 for (; pfn < end_pfn; pfn++) {
1496 if (!pfn_valid_within(pfn))
1497 goto free_range;
1498
1499 /*
1500 * Ensure pfn_valid is checked every
1501 * pageblock_nr_pages for memory holes
1502 */
1503 if ((pfn & (pageblock_nr_pages - 1)) == 0) {
1504 if (!pfn_valid(pfn)) {
1505 page = NULL;
1506 goto free_range;
1507 }
1508 }
1509
1510 if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1511 page = NULL;
1512 goto free_range;
1513 }
1514
1515 /* Minimise pfn page lookups and scheduler checks */
1516 if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
1517 page++;
1518 } else {
1519 nr_pages += nr_to_free;
1520 deferred_free_range(free_base_page,
1521 free_base_pfn, nr_to_free);
1522 free_base_page = NULL;
1523 free_base_pfn = nr_to_free = 0;
1524
1525 page = pfn_to_page(pfn);
1526 cond_resched();
1527 }
1528
1529 if (page->flags) {
1530 VM_BUG_ON(page_zone(page) != zone);
1531 goto free_range;
1532 }
1533
1534 __init_single_page(page, pfn, zid, nid);
1535 if (!free_base_page) {
1536 free_base_page = page;
1537 free_base_pfn = pfn;
1538 nr_to_free = 0;
1539 }
1540 nr_to_free++;
1541
1542 /* Where possible, batch up pages for a single free */
1543 continue;
1544free_range:
1545 /* Free the current block of pages to allocator */
1546 nr_pages += nr_to_free;
1547 deferred_free_range(free_base_page, free_base_pfn,
1548 nr_to_free);
1549 free_base_page = NULL;
1550 free_base_pfn = nr_to_free = 0;
1551 }
1552 /* Free the last block of pages to allocator */
1553 nr_pages += nr_to_free;
1554 deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
1555
1556 first_init_pfn = max(end_pfn, first_init_pfn);
1557 } 1589 }
1558 1590
1559 /* Sanity check that the next zone really is unpopulated */ 1591 /* Sanity check that the next zone really is unpopulated */
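
deferred_init_range() above deliberately makes two passes over the pfn range: it first initialises every struct page, and only then frees them to the buddy allocator in pageblock-sized batches, because freeing one page may touch a buddy page further ahead that must already be initialised. A toy userspace sketch of that init-first, then batch-and-flush pattern over a plain array (all names invented):

#include <stdio.h>

#define NR_ITEMS   32
#define BATCH_SIZE  8

static int items[NR_ITEMS];

static void flush_batch(int start, int count)
{
        if (count)
                printf("freeing items %d-%d\n", start, start + count - 1);
}

int main(void)
{
        int i, batch_start = 0, batch_len = 0;

        /* Pass 1: initialise everything first. */
        for (i = 0; i < NR_ITEMS; i++)
                items[i] = i;

        /* Pass 2: hand items back in batches; earlier items may be re-read here. */
        for (i = 0; i < NR_ITEMS; i++) {
                if (batch_len == 0)
                        batch_start = i;
                if (++batch_len == BATCH_SIZE) {
                        flush_batch(batch_start, batch_len);
                        batch_len = 0;
                }
        }
        flush_batch(batch_start, batch_len);    /* flush the final partial batch */
        return 0;
}
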
@@ -1792,7 +1824,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
1792 * Go through the free lists for the given migratetype and remove 1824 * Go through the free lists for the given migratetype and remove
1793 * the smallest available page from the freelists 1825 * the smallest available page from the freelists
1794 */ 1826 */
1795static inline 1827static __always_inline
1796struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 1828struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1797 int migratetype) 1829 int migratetype)
1798{ 1830{
@@ -1836,7 +1868,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
1836}; 1868};
1837 1869
1838#ifdef CONFIG_CMA 1870#ifdef CONFIG_CMA
1839static struct page *__rmqueue_cma_fallback(struct zone *zone, 1871static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1840 unsigned int order) 1872 unsigned int order)
1841{ 1873{
1842 return __rmqueue_smallest(zone, order, MIGRATE_CMA); 1874 return __rmqueue_smallest(zone, order, MIGRATE_CMA);
@@ -2217,7 +2249,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2217 * deviation from the rest of this file, to make the for loop 2249 * deviation from the rest of this file, to make the for loop
2218 * condition simpler. 2250 * condition simpler.
2219 */ 2251 */
2220static inline bool 2252static __always_inline bool
2221__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 2253__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2222{ 2254{
2223 struct free_area *area; 2255 struct free_area *area;
@@ -2289,8 +2321,8 @@ do_steal:
2289 * Do the hard work of removing an element from the buddy allocator. 2321 * Do the hard work of removing an element from the buddy allocator.
2290 * Call me with the zone->lock already held. 2322 * Call me with the zone->lock already held.
2291 */ 2323 */
2292static struct page *__rmqueue(struct zone *zone, unsigned int order, 2324static __always_inline struct page *
2293 int migratetype) 2325__rmqueue(struct zone *zone, unsigned int order, int migratetype)
2294{ 2326{
2295 struct page *page; 2327 struct page *page;
2296 2328
@@ -2315,7 +2347,7 @@ retry:
2315 */ 2347 */
2316static int rmqueue_bulk(struct zone *zone, unsigned int order, 2348static int rmqueue_bulk(struct zone *zone, unsigned int order,
2317 unsigned long count, struct list_head *list, 2349 unsigned long count, struct list_head *list,
2318 int migratetype, bool cold) 2350 int migratetype)
2319{ 2351{
2320 int i, alloced = 0; 2352 int i, alloced = 0;
2321 2353
@@ -2329,19 +2361,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2329 continue; 2361 continue;
2330 2362
2331 /* 2363 /*
2332 * Split buddy pages returned by expand() are received here 2364 * Split buddy pages returned by expand() are received here in
2333 * in physical page order. The page is added to the callers and 2365 * physical page order. The page is added to the tail of
2334 * list and the list head then moves forward. From the callers 2366 * caller's list. From the callers perspective, the linked list
2335 * perspective, the linked list is ordered by page number in 2367 * is ordered by page number under some conditions. This is
2336 * some conditions. This is useful for IO devices that can 2368 * useful for IO devices that can forward direction from the
2337 * merge IO requests if the physical pages are ordered 2369 * head, thus also in the physical page order. This is useful
2338 * properly. 2370 * for IO devices that can merge IO requests if the physical
2371 * pages are ordered properly.
2339 */ 2372 */
2340 if (likely(!cold)) 2373 list_add_tail(&page->lru, list);
2341 list_add(&page->lru, list);
2342 else
2343 list_add_tail(&page->lru, list);
2344 list = &page->lru;
2345 alloced++; 2374 alloced++;
2346 if (is_migrate_cma(get_pcppage_migratetype(page))) 2375 if (is_migrate_cma(get_pcppage_migratetype(page)))
2347 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 2376 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -2590,24 +2619,25 @@ void mark_free_pages(struct zone *zone)
2590} 2619}
2591#endif /* CONFIG_PM */ 2620#endif /* CONFIG_PM */
2592 2621
2593/* 2622static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
2594 * Free a 0-order page
2595 * cold == true ? free a cold page : free a hot page
2596 */
2597void free_hot_cold_page(struct page *page, bool cold)
2598{ 2623{
2599 struct zone *zone = page_zone(page);
2600 struct per_cpu_pages *pcp;
2601 unsigned long flags;
2602 unsigned long pfn = page_to_pfn(page);
2603 int migratetype; 2624 int migratetype;
2604 2625
2605 if (!free_pcp_prepare(page)) 2626 if (!free_pcp_prepare(page))
2606 return; 2627 return false;
2607 2628
2608 migratetype = get_pfnblock_migratetype(page, pfn); 2629 migratetype = get_pfnblock_migratetype(page, pfn);
2609 set_pcppage_migratetype(page, migratetype); 2630 set_pcppage_migratetype(page, migratetype);
2610 local_irq_save(flags); 2631 return true;
2632}
2633
2634static void free_unref_page_commit(struct page *page, unsigned long pfn)
2635{
2636 struct zone *zone = page_zone(page);
2637 struct per_cpu_pages *pcp;
2638 int migratetype;
2639
2640 migratetype = get_pcppage_migratetype(page);
2611 __count_vm_event(PGFREE); 2641 __count_vm_event(PGFREE);
2612 2642
2613 /* 2643 /*
@@ -2620,38 +2650,62 @@ void free_hot_cold_page(struct page *page, bool cold)
2620 if (migratetype >= MIGRATE_PCPTYPES) { 2650 if (migratetype >= MIGRATE_PCPTYPES) {
2621 if (unlikely(is_migrate_isolate(migratetype))) { 2651 if (unlikely(is_migrate_isolate(migratetype))) {
2622 free_one_page(zone, page, pfn, 0, migratetype); 2652 free_one_page(zone, page, pfn, 0, migratetype);
2623 goto out; 2653 return;
2624 } 2654 }
2625 migratetype = MIGRATE_MOVABLE; 2655 migratetype = MIGRATE_MOVABLE;
2626 } 2656 }
2627 2657
2628 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2658 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2629 if (!cold) 2659 list_add(&page->lru, &pcp->lists[migratetype]);
2630 list_add(&page->lru, &pcp->lists[migratetype]);
2631 else
2632 list_add_tail(&page->lru, &pcp->lists[migratetype]);
2633 pcp->count++; 2660 pcp->count++;
2634 if (pcp->count >= pcp->high) { 2661 if (pcp->count >= pcp->high) {
2635 unsigned long batch = READ_ONCE(pcp->batch); 2662 unsigned long batch = READ_ONCE(pcp->batch);
2636 free_pcppages_bulk(zone, batch, pcp); 2663 free_pcppages_bulk(zone, batch, pcp);
2637 pcp->count -= batch; 2664 pcp->count -= batch;
2638 } 2665 }
2666}
2639 2667
2640out: 2668/*
2669 * Free a 0-order page
2670 */
2671void free_unref_page(struct page *page)
2672{
2673 unsigned long flags;
2674 unsigned long pfn = page_to_pfn(page);
2675
2676 if (!free_unref_page_prepare(page, pfn))
2677 return;
2678
2679 local_irq_save(flags);
2680 free_unref_page_commit(page, pfn);
2641 local_irq_restore(flags); 2681 local_irq_restore(flags);
2642} 2682}
2643 2683
2644/* 2684/*
2645 * Free a list of 0-order pages 2685 * Free a list of 0-order pages
2646 */ 2686 */
2647void free_hot_cold_page_list(struct list_head *list, bool cold) 2687void free_unref_page_list(struct list_head *list)
2648{ 2688{
2649 struct page *page, *next; 2689 struct page *page, *next;
2690 unsigned long flags, pfn;
2691
2692 /* Prepare pages for freeing */
2693 list_for_each_entry_safe(page, next, list, lru) {
2694 pfn = page_to_pfn(page);
2695 if (!free_unref_page_prepare(page, pfn))
2696 list_del(&page->lru);
2697 set_page_private(page, pfn);
2698 }
2650 2699
2700 local_irq_save(flags);
2651 list_for_each_entry_safe(page, next, list, lru) { 2701 list_for_each_entry_safe(page, next, list, lru) {
2652 trace_mm_page_free_batched(page, cold); 2702 unsigned long pfn = page_private(page);
2653 free_hot_cold_page(page, cold); 2703
2704 set_page_private(page, 0);
2705 trace_mm_page_free_batched(page);
2706 free_unref_page_commit(page, pfn);
2654 } 2707 }
2708 local_irq_restore(flags);
2655} 2709}
2656 2710
2657/* 2711/*
@@ -2669,15 +2723,6 @@ void split_page(struct page *page, unsigned int order)
2669 VM_BUG_ON_PAGE(PageCompound(page), page); 2723 VM_BUG_ON_PAGE(PageCompound(page), page);
2670 VM_BUG_ON_PAGE(!page_count(page), page); 2724 VM_BUG_ON_PAGE(!page_count(page), page);
2671 2725
2672#ifdef CONFIG_KMEMCHECK
2673 /*
2674 * Split shadow pages too, because free(page[0]) would
2675 * otherwise free the whole shadow.
2676 */
2677 if (kmemcheck_page_is_tracked(page))
2678 split_page(virt_to_page(page[0].shadow), order);
2679#endif
2680
2681 for (i = 1; i < (1 << order); i++) 2726 for (i = 1; i < (1 << order); i++)
2682 set_page_refcounted(page + i); 2727 set_page_refcounted(page + i);
2683 split_page_owner(page, order); 2728 split_page_owner(page, order);
@@ -2743,6 +2788,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2743#ifdef CONFIG_NUMA 2788#ifdef CONFIG_NUMA
2744 enum numa_stat_item local_stat = NUMA_LOCAL; 2789 enum numa_stat_item local_stat = NUMA_LOCAL;
2745 2790
2791 /* skip numa counters update if numa stats is disabled */
2792 if (!static_branch_likely(&vm_numa_stat_key))
2793 return;
2794
2746 if (z->node != numa_node_id()) 2795 if (z->node != numa_node_id())
2747 local_stat = NUMA_OTHER; 2796 local_stat = NUMA_OTHER;
2748 2797
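
zone_statistics() (and alloc_page_interleave() earlier in this diff) now bail out early behind static_branch_likely(&vm_numa_stat_key), so the NUMA counter updates cost essentially nothing once the sysctl disables them; the kernel patches the branch out at runtime. A plain userspace flag with a branch hint only approximates a static key, but it shows the shape of the fast-path check:

#include <stdbool.h>
#include <stdio.h>

#define unlikely(x) __builtin_expect(!!(x), 0)

static bool numa_stats_enabled = true;  /* stand-in for the static key */
static unsigned long numa_hit;

static void zone_statistics(int preferred_node, int page_node)
{
        if (unlikely(!numa_stats_enabled))
                return;                 /* stats disabled: skip all counter work */

        if (page_node == preferred_node)
                numa_hit++;
}

int main(void)
{
        zone_statistics(0, 0);
        printf("numa_hit=%lu\n", numa_hit);
        return 0;
}
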
@@ -2758,7 +2807,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2758 2807
2759/* Remove page from the per-cpu list, caller must protect the list */ 2808/* Remove page from the per-cpu list, caller must protect the list */
2760static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, 2809static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2761 bool cold, struct per_cpu_pages *pcp, 2810 struct per_cpu_pages *pcp,
2762 struct list_head *list) 2811 struct list_head *list)
2763{ 2812{
2764 struct page *page; 2813 struct page *page;
@@ -2767,16 +2816,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2767 if (list_empty(list)) { 2816 if (list_empty(list)) {
2768 pcp->count += rmqueue_bulk(zone, 0, 2817 pcp->count += rmqueue_bulk(zone, 0,
2769 pcp->batch, list, 2818 pcp->batch, list,
2770 migratetype, cold); 2819 migratetype);
2771 if (unlikely(list_empty(list))) 2820 if (unlikely(list_empty(list)))
2772 return NULL; 2821 return NULL;
2773 } 2822 }
2774 2823
2775 if (cold) 2824 page = list_first_entry(list, struct page, lru);
2776 page = list_last_entry(list, struct page, lru);
2777 else
2778 page = list_first_entry(list, struct page, lru);
2779
2780 list_del(&page->lru); 2825 list_del(&page->lru);
2781 pcp->count--; 2826 pcp->count--;
2782 } while (check_new_pcp(page)); 2827 } while (check_new_pcp(page));
@@ -2791,14 +2836,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2791{ 2836{
2792 struct per_cpu_pages *pcp; 2837 struct per_cpu_pages *pcp;
2793 struct list_head *list; 2838 struct list_head *list;
2794 bool cold = ((gfp_flags & __GFP_COLD) != 0);
2795 struct page *page; 2839 struct page *page;
2796 unsigned long flags; 2840 unsigned long flags;
2797 2841
2798 local_irq_save(flags); 2842 local_irq_save(flags);
2799 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2843 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2800 list = &pcp->lists[migratetype]; 2844 list = &pcp->lists[migratetype];
2801 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); 2845 page = __rmqueue_pcplist(zone, migratetype, pcp, list);
2802 if (page) { 2846 if (page) {
2803 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2847 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2804 zone_statistics(preferred_zone, zone); 2848 zone_statistics(preferred_zone, zone);
@@ -3006,9 +3050,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3006 if (!area->nr_free) 3050 if (!area->nr_free)
3007 continue; 3051 continue;
3008 3052
3009 if (alloc_harder)
3010 return true;
3011
3012 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { 3053 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
3013 if (!list_empty(&area->free_list[mt])) 3054 if (!list_empty(&area->free_list[mt]))
3014 return true; 3055 return true;
@@ -3020,6 +3061,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3020 return true; 3061 return true;
3021 } 3062 }
3022#endif 3063#endif
3064 if (alloc_harder &&
3065 !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3066 return true;
3023 } 3067 }
3024 return false; 3068 return false;
3025} 3069}
@@ -3235,20 +3279,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3235 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 3279 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
3236 return; 3280 return;
3237 3281
3238 pr_warn("%s: ", current->comm);
3239
3240 va_start(args, fmt); 3282 va_start(args, fmt);
3241 vaf.fmt = fmt; 3283 vaf.fmt = fmt;
3242 vaf.va = &args; 3284 vaf.va = &args;
3243 pr_cont("%pV", &vaf); 3285 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
3286 current->comm, &vaf, gfp_mask, &gfp_mask,
3287 nodemask_pr_args(nodemask));
3244 va_end(args); 3288 va_end(args);
3245 3289
3246 pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
3247 if (nodemask)
3248 pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
3249 else
3250 pr_cont("(null)\n");
3251
3252 cpuset_print_current_mems_allowed(); 3290 cpuset_print_current_mems_allowed();
3253 3291
3254 dump_stack(); 3292 dump_stack();
@@ -3868,8 +3906,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3868 enum compact_result compact_result; 3906 enum compact_result compact_result;
3869 int compaction_retries; 3907 int compaction_retries;
3870 int no_progress_loops; 3908 int no_progress_loops;
3871 unsigned long alloc_start = jiffies;
3872 unsigned int stall_timeout = 10 * HZ;
3873 unsigned int cpuset_mems_cookie; 3909 unsigned int cpuset_mems_cookie;
3874 int reserve_flags; 3910 int reserve_flags;
3875 3911
@@ -4001,14 +4037,6 @@ retry:
4001 if (!can_direct_reclaim) 4037 if (!can_direct_reclaim)
4002 goto nopage; 4038 goto nopage;
4003 4039
4004 /* Make sure we know about allocations which stall for too long */
4005 if (time_after(jiffies, alloc_start + stall_timeout)) {
4006 warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
4007 "page allocation stalls for %ums, order:%u",
4008 jiffies_to_msecs(jiffies-alloc_start), order);
4009 stall_timeout += 10 * HZ;
4010 }
4011
4012 /* Avoid recursion of direct reclaim */ 4040 /* Avoid recursion of direct reclaim */
4013 if (current->flags & PF_MEMALLOC) 4041 if (current->flags & PF_MEMALLOC)
4014 goto nopage; 4042 goto nopage;
@@ -4223,9 +4251,6 @@ out:
4223 page = NULL; 4251 page = NULL;
4224 } 4252 }
4225 4253
4226 if (kmemcheck_enabled && page)
4227 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
4228
4229 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); 4254 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
4230 4255
4231 return page; 4256 return page;
@@ -4262,7 +4287,7 @@ void __free_pages(struct page *page, unsigned int order)
4262{ 4287{
4263 if (put_page_testzero(page)) { 4288 if (put_page_testzero(page)) {
4264 if (order == 0) 4289 if (order == 0)
4265 free_hot_cold_page(page, false); 4290 free_unref_page(page);
4266 else 4291 else
4267 __free_pages_ok(page, order); 4292 __free_pages_ok(page, order);
4268 } 4293 }
@@ -4320,7 +4345,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
4320 unsigned int order = compound_order(page); 4345 unsigned int order = compound_order(page);
4321 4346
4322 if (order == 0) 4347 if (order == 0)
4323 free_hot_cold_page(page, false); 4348 free_unref_page(page);
4324 else 4349 else
4325 __free_pages_ok(page, order); 4350 __free_pages_ok(page, order);
4326 } 4351 }
@@ -6126,6 +6151,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
6126 } 6151 }
6127} 6152}
6128 6153
6154#ifdef CONFIG_FLAT_NODE_MEM_MAP
6129static void __ref alloc_node_mem_map(struct pglist_data *pgdat) 6155static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6130{ 6156{
6131 unsigned long __maybe_unused start = 0; 6157 unsigned long __maybe_unused start = 0;
@@ -6135,7 +6161,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6135 if (!pgdat->node_spanned_pages) 6161 if (!pgdat->node_spanned_pages)
6136 return; 6162 return;
6137 6163
6138#ifdef CONFIG_FLAT_NODE_MEM_MAP
6139 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 6164 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
6140 offset = pgdat->node_start_pfn - start; 6165 offset = pgdat->node_start_pfn - start;
6141 /* ia64 gets its own node_mem_map, before this, without bootmem */ 6166 /* ia64 gets its own node_mem_map, before this, without bootmem */
@@ -6157,6 +6182,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6157 pgdat->node_id); 6182 pgdat->node_id);
6158 pgdat->node_mem_map = map + offset; 6183 pgdat->node_mem_map = map + offset;
6159 } 6184 }
6185 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
6186 __func__, pgdat->node_id, (unsigned long)pgdat,
6187 (unsigned long)pgdat->node_mem_map);
6160#ifndef CONFIG_NEED_MULTIPLE_NODES 6188#ifndef CONFIG_NEED_MULTIPLE_NODES
6161 /* 6189 /*
6162 * With no DISCONTIG, the global mem_map is just set as node 0's 6190 * With no DISCONTIG, the global mem_map is just set as node 0's
@@ -6169,8 +6197,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6169#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6197#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6170 } 6198 }
6171#endif 6199#endif
6172#endif /* CONFIG_FLAT_NODE_MEM_MAP */
6173} 6200}
6201#else
6202static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
6203#endif /* CONFIG_FLAT_NODE_MEM_MAP */
6174 6204
6175void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 6205void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6176 unsigned long node_start_pfn, unsigned long *zholes_size) 6206 unsigned long node_start_pfn, unsigned long *zholes_size)
@@ -6197,16 +6227,49 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6197 zones_size, zholes_size); 6227 zones_size, zholes_size);
6198 6228
6199 alloc_node_mem_map(pgdat); 6229 alloc_node_mem_map(pgdat);
6200#ifdef CONFIG_FLAT_NODE_MEM_MAP
6201 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
6202 nid, (unsigned long)pgdat,
6203 (unsigned long)pgdat->node_mem_map);
6204#endif
6205 6230
6206 reset_deferred_meminit(pgdat); 6231 reset_deferred_meminit(pgdat);
6207 free_area_init_core(pgdat); 6232 free_area_init_core(pgdat);
6208} 6233}
6209 6234
6235#ifdef CONFIG_HAVE_MEMBLOCK
6236/*
6237 * Only struct pages that are backed by physical memory are zeroed and
6238 * initialized by going through __init_single_page(). But, there are some
6239 * struct pages which are reserved in memblock allocator and their fields
6240 * may be accessed (for example, page_to_pfn() on some configurations accesses
6241 * flags). We must explicitly zero those struct pages.
6242 */
6243void __paginginit zero_resv_unavail(void)
6244{
6245 phys_addr_t start, end;
6246 unsigned long pfn;
6247 u64 i, pgcnt;
6248
6249 /*
6250 * Loop through ranges that are reserved, but do not have reported
6251 * physical memory backing.
6252 */
6253 pgcnt = 0;
6254 for_each_resv_unavail_range(i, &start, &end) {
6255 for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
6256 mm_zero_struct_page(pfn_to_page(pfn));
6257 pgcnt++;
6258 }
6259 }
6260
6261 /*
6262 * Struct pages that do not have backing memory. This could be because
6263 * firmware is using some of this memory, or for some other reason.
6264 * Once memblock is changed so that such behaviour is no longer allowed,
6265 * i.e. the list of "reserved" memory is a subset of the list of "memory",
6266 * this code can be removed.
6267 */
6268 if (pgcnt)
6269 		pr_info("Reserved but unavailable: %lld pages\n", pgcnt);
6270}
6271#endif /* CONFIG_HAVE_MEMBLOCK */
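
zero_resv_unavail() relies on mm_zero_struct_page(), which is introduced elsewhere in this series, so the loop above simply clears every struct page in a reserved range that has no reported memory behind it. Assuming the generic fallback is the obvious memset (architectures may override it), it would read roughly:

/* Assumed generic fallback in <linux/mm.h>; not part of this diff. */
#ifndef mm_zero_struct_page
#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
#endif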
6272
6210#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 6273#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6211 6274
6212#if MAX_NUMNODES > 1 6275#if MAX_NUMNODES > 1
@@ -6630,6 +6693,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
6630 node_set_state(nid, N_MEMORY); 6693 node_set_state(nid, N_MEMORY);
6631 check_for_memory(pgdat, nid); 6694 check_for_memory(pgdat, nid);
6632 } 6695 }
6696 zero_resv_unavail();
6633} 6697}
6634 6698
6635static int __init cmdline_parse_core(char *p, unsigned long *core) 6699static int __init cmdline_parse_core(char *p, unsigned long *core)
@@ -6793,6 +6857,7 @@ void __init free_area_init(unsigned long *zones_size)
6793{ 6857{
6794 free_area_init_node(0, zones_size, 6858 free_area_init_node(0, zones_size,
6795 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 6859 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
6860 zero_resv_unavail();
6796} 6861}
6797 6862
6798static int page_alloc_cpu_dead(unsigned int cpu) 6863static int page_alloc_cpu_dead(unsigned int cpu)
@@ -7305,18 +7370,17 @@ void *__init alloc_large_system_hash(const char *tablename,
7305 7370
7306 log2qty = ilog2(numentries); 7371 log2qty = ilog2(numentries);
7307 7372
7308 /*
7309 * memblock allocator returns zeroed memory already, so HASH_ZERO is
7310 * currently not used when HASH_EARLY is specified.
7311 */
7312 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; 7373 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
7313 do { 7374 do {
7314 size = bucketsize << log2qty; 7375 size = bucketsize << log2qty;
7315 if (flags & HASH_EARLY) 7376 if (flags & HASH_EARLY) {
7316 table = memblock_virt_alloc_nopanic(size, 0); 7377 if (flags & HASH_ZERO)
7317 else if (hashdist) 7378 table = memblock_virt_alloc_nopanic(size, 0);
7379 else
7380 table = memblock_virt_alloc_raw(size, 0);
7381 } else if (hashdist) {
7318 table = __vmalloc(size, gfp_flags, PAGE_KERNEL); 7382 table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
7319 else { 7383 } else {
7320 /* 7384 /*
7321 * If bucketsize is not a power-of-two, we may free 7385 * If bucketsize is not a power-of-two, we may free
7322 * some pages at the end of hash table which 7386 * some pages at the end of hash table which
@@ -7353,10 +7417,10 @@ void *__init alloc_large_system_hash(const char *tablename,
7353 * race condition. So you can't expect this function to be exact. 7417
7354 */ 7418 */
7355bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 7419bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7420 int migratetype,
7356 bool skip_hwpoisoned_pages) 7421 bool skip_hwpoisoned_pages)
7357{ 7422{
7358 unsigned long pfn, iter, found; 7423 unsigned long pfn, iter, found;
7359 int mt;
7360 7424
7361 /* 7425 /*
7362 * For avoiding noise data, lru_add_drain_all() should be called 7426 * For avoiding noise data, lru_add_drain_all() should be called
@@ -7364,8 +7428,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7364 */ 7428 */
7365 if (zone_idx(zone) == ZONE_MOVABLE) 7429 if (zone_idx(zone) == ZONE_MOVABLE)
7366 return false; 7430 return false;
7367 mt = get_pageblock_migratetype(page); 7431
7368 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 7432 /*
7433 * CMA allocations (alloc_contig_range) really need to mark CMA
7434 * pageblocks as isolated even when they are not in fact movable, so
7435 * consider them movable here.
7436 */
7437 if (is_migrate_cma(migratetype) &&
7438 is_migrate_cma(get_pageblock_migratetype(page)))
7369 return false; 7439 return false;
7370 7440
7371 pfn = page_to_pfn(page); 7441 pfn = page_to_pfn(page);
@@ -7377,6 +7447,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7377 7447
7378 page = pfn_to_page(check); 7448 page = pfn_to_page(check);
7379 7449
7450 if (PageReserved(page))
7451 return true;
7452
7380 /* 7453 /*
7381 * Hugepages are not in LRU lists, but they're movable. 7454 * Hugepages are not in LRU lists, but they're movable.
7382 * We need not scan over tail pages because we don't 7455
@@ -7450,7 +7523,7 @@ bool is_pageblock_removable_nolock(struct page *page)
7450 if (!zone_spans_pfn(zone, pfn)) 7523 if (!zone_spans_pfn(zone, pfn))
7451 return false; 7524 return false;
7452 7525
7453 return !has_unmovable_pages(zone, page, 0, true); 7526 return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
7454} 7527}
7455 7528
7456#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) 7529#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
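
has_unmovable_pages() now takes the migratetype the caller is isolating for, so a CMA pageblock is only treated as movable when the isolation is done on behalf of a CMA allocation, and any reserved page immediately makes the range unmovable. A hedged sketch of the two known users and the migratetype each is expected to pass; the call sites are assumptions based on callers elsewhere in the tree, with pfn_start/pfn_end standing in for the real range:

/* CMA, via alloc_contig_range(): CMA pageblocks remain "movable enough". */
start_isolate_page_range(pfn_start, pfn_end, MIGRATE_CMA, false);

/* Memory offlining: a CMA pageblock is no longer blindly considered movable. */
start_isolate_page_range(pfn_start, pfn_end, MIGRATE_MOVABLE, true);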
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 4f0367d472c4..2c16216c29b6 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -125,7 +125,6 @@ struct page_ext *lookup_page_ext(struct page *page)
125 struct page_ext *base; 125 struct page_ext *base;
126 126
127 base = NODE_DATA(page_to_nid(page))->node_page_ext; 127 base = NODE_DATA(page_to_nid(page))->node_page_ext;
128#if defined(CONFIG_DEBUG_VM)
129 /* 128 /*
130 * The sanity checks the page allocator does upon freeing a 129 * The sanity checks the page allocator does upon freeing a
131 * page can reach here before the page_ext arrays are 130 * page can reach here before the page_ext arrays are
@@ -134,7 +133,6 @@ struct page_ext *lookup_page_ext(struct page *page)
134 */ 133 */
135 if (unlikely(!base)) 134 if (unlikely(!base))
136 return NULL; 135 return NULL;
137#endif
138 index = pfn - round_down(node_start_pfn(page_to_nid(page)), 136 index = pfn - round_down(node_start_pfn(page_to_nid(page)),
139 MAX_ORDER_NR_PAGES); 137 MAX_ORDER_NR_PAGES);
140 return get_entry(base, index); 138 return get_entry(base, index);
@@ -199,7 +197,6 @@ struct page_ext *lookup_page_ext(struct page *page)
199{ 197{
200 unsigned long pfn = page_to_pfn(page); 198 unsigned long pfn = page_to_pfn(page);
201 struct mem_section *section = __pfn_to_section(pfn); 199 struct mem_section *section = __pfn_to_section(pfn);
202#if defined(CONFIG_DEBUG_VM)
203 /* 200 /*
204 * The sanity checks the page allocator does upon freeing a 201 * The sanity checks the page allocator does upon freeing a
205 * page can reach here before the page_ext arrays are 202 * page can reach here before the page_ext arrays are
@@ -208,7 +205,6 @@ struct page_ext *lookup_page_ext(struct page *page)
208 */ 205 */
209 if (!section->page_ext) 206 if (!section->page_ext)
210 return NULL; 207 return NULL;
211#endif
212 return get_entry(section->page_ext, pfn); 208 return get_entry(section->page_ext, pfn);
213} 209}
214 210
diff --git a/mm/page_io.c b/mm/page_io.c
index cd52b9cc169b..e93f1a4cacd7 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -347,7 +347,7 @@ out:
347 return ret; 347 return ret;
348} 348}
349 349
350int swap_readpage(struct page *page, bool do_poll) 350int swap_readpage(struct page *page, bool synchronous)
351{ 351{
352 struct bio *bio; 352 struct bio *bio;
353 int ret = 0; 353 int ret = 0;
@@ -355,7 +355,7 @@ int swap_readpage(struct page *page, bool do_poll)
355 blk_qc_t qc; 355 blk_qc_t qc;
356 struct gendisk *disk; 356 struct gendisk *disk;
357 357
358 VM_BUG_ON_PAGE(!PageSwapCache(page), page); 358 VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
359 VM_BUG_ON_PAGE(!PageLocked(page), page); 359 VM_BUG_ON_PAGE(!PageLocked(page), page);
360 VM_BUG_ON_PAGE(PageUptodate(page), page); 360 VM_BUG_ON_PAGE(PageUptodate(page), page);
361 if (frontswap_load(page) == 0) { 361 if (frontswap_load(page) == 0) {
@@ -403,7 +403,7 @@ int swap_readpage(struct page *page, bool do_poll)
403 count_vm_event(PSWPIN); 403 count_vm_event(PSWPIN);
404 bio_get(bio); 404 bio_get(bio);
405 qc = submit_bio(bio); 405 qc = submit_bio(bio);
406 while (do_poll) { 406 while (synchronous) {
407 set_current_state(TASK_UNINTERRUPTIBLE); 407 set_current_state(TASK_UNINTERRUPTIBLE);
408 if (!READ_ONCE(bio->bi_private)) 408 if (!READ_ONCE(bio->bi_private))
409 break; 409 break;
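
Renaming do_poll to synchronous and relaxing the swap-cache assertion lets a synchronous swap-in read into a page that was never added to the swap cache. A hedged, hypothetical caller sketch (simplified; entry, vma and vmf are assumed to come from the fault context, and the real call site with its device condition is not part of this diff):

page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
if (page) {
	__SetPageLocked(page);
	set_page_private(page, entry.val);	/* swap_readpage() reads the entry from here */
	swap_readpage(page, true);		/* synchronous: poll until the bio completes */
	set_page_private(page, 0);
}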
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 44f213935bf6..165ed8117bd1 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -15,7 +15,7 @@
15#define CREATE_TRACE_POINTS 15#define CREATE_TRACE_POINTS
16#include <trace/events/page_isolation.h> 16#include <trace/events/page_isolation.h>
17 17
18static int set_migratetype_isolate(struct page *page, 18static int set_migratetype_isolate(struct page *page, int migratetype,
19 bool skip_hwpoisoned_pages) 19 bool skip_hwpoisoned_pages)
20{ 20{
21 struct zone *zone; 21 struct zone *zone;
@@ -52,7 +52,7 @@ static int set_migratetype_isolate(struct page *page,
52 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 52 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
53 * We just check MOVABLE pages. 53 * We just check MOVABLE pages.
54 */ 54 */
55 if (!has_unmovable_pages(zone, page, arg.pages_found, 55 if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype,
56 skip_hwpoisoned_pages)) 56 skip_hwpoisoned_pages))
57 ret = 0; 57 ret = 0;
58 58
@@ -64,14 +64,14 @@ static int set_migratetype_isolate(struct page *page,
64out: 64out:
65 if (!ret) { 65 if (!ret) {
66 unsigned long nr_pages; 66 unsigned long nr_pages;
67 int migratetype = get_pageblock_migratetype(page); 67 int mt = get_pageblock_migratetype(page);
68 68
69 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 69 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
70 zone->nr_isolate_pageblock++; 70 zone->nr_isolate_pageblock++;
71 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, 71 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
72 NULL); 72 NULL);
73 73
74 __mod_zone_freepage_state(zone, -nr_pages, migratetype); 74 __mod_zone_freepage_state(zone, -nr_pages, mt);
75 } 75 }
76 76
77 spin_unlock_irqrestore(&zone->lock, flags); 77 spin_unlock_irqrestore(&zone->lock, flags);
@@ -183,7 +183,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
183 pfn += pageblock_nr_pages) { 183 pfn += pageblock_nr_pages) {
184 page = __first_valid_page(pfn, pageblock_nr_pages); 184 page = __first_valid_page(pfn, pageblock_nr_pages);
185 if (page && 185 if (page &&
186 set_migratetype_isolate(page, skip_hwpoisoned_pages)) { 186 set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) {
187 undo_pfn = pfn; 187 undo_pfn = pfn;
188 goto undo; 188 goto undo;
189 } 189 }
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 4f44b95b9d1e..8592543a0f15 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -20,9 +20,9 @@
20#define PAGE_OWNER_STACK_DEPTH (16) 20#define PAGE_OWNER_STACK_DEPTH (16)
21 21
22struct page_owner { 22struct page_owner {
23 unsigned int order; 23 unsigned short order;
24 short last_migrate_reason;
24 gfp_t gfp_mask; 25 gfp_t gfp_mask;
25 int last_migrate_reason;
26 depot_stack_handle_t handle; 26 depot_stack_handle_t handle;
27}; 27};
28 28
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 15dab691ea70..9158e5a81391 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -81,7 +81,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
81static int pcpu_alloc_pages(struct pcpu_chunk *chunk, 81static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
82 struct page **pages, int page_start, int page_end) 82 struct page **pages, int page_start, int page_end)
83{ 83{
84 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; 84 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
85 unsigned int cpu, tcpu; 85 unsigned int cpu, tcpu;
86 int i; 86 int i;
87 87
diff --git a/mm/rmap.c b/mm/rmap.c
index b874c4761e84..47db27f8049e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -899,7 +899,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
899 mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); 899 mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
900 900
901 while (page_vma_mapped_walk(&pvmw)) { 901 while (page_vma_mapped_walk(&pvmw)) {
902 unsigned long cstart, cend; 902 unsigned long cstart;
903 int ret = 0; 903 int ret = 0;
904 904
905 cstart = address = pvmw.address; 905 cstart = address = pvmw.address;
@@ -915,7 +915,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
915 entry = pte_wrprotect(entry); 915 entry = pte_wrprotect(entry);
916 entry = pte_mkclean(entry); 916 entry = pte_mkclean(entry);
917 set_pte_at(vma->vm_mm, address, pte, entry); 917 set_pte_at(vma->vm_mm, address, pte, entry);
918 cend = cstart + PAGE_SIZE;
919 ret = 1; 918 ret = 1;
920 } else { 919 } else {
921#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 920#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
@@ -931,7 +930,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
931 entry = pmd_mkclean(entry); 930 entry = pmd_mkclean(entry);
932 set_pmd_at(vma->vm_mm, address, pmd, entry); 931 set_pmd_at(vma->vm_mm, address, pmd, entry);
933 cstart &= PMD_MASK; 932 cstart &= PMD_MASK;
934 cend = cstart + PMD_SIZE;
935 ret = 1; 933 ret = 1;
936#else 934#else
937 /* unexpected pmd-mapped page? */ 935 /* unexpected pmd-mapped page? */
@@ -939,10 +937,15 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
939#endif 937#endif
940 } 938 }
941 939
942 if (ret) { 940 /*
943 mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend); 941 * No need to call mmu_notifier_invalidate_range() as we are
942 * downgrading page table protection, not changing it to point
943 * to a new page.
944 *
945 * See Documentation/vm/mmu_notifier.txt
946 */
947 if (ret)
944 (*cleaned)++; 948 (*cleaned)++;
945 }
946 } 949 }
947 950
948 mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); 951 mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
@@ -1318,7 +1321,7 @@ void page_remove_rmap(struct page *page, bool compound)
1318 * It would be tidy to reset the PageAnon mapping here, 1321 * It would be tidy to reset the PageAnon mapping here,
1319 * but that might overwrite a racing page_add_anon_rmap 1322 * but that might overwrite a racing page_add_anon_rmap
1320 * which increments mapcount after us but sets mapping 1323 * which increments mapcount after us but sets mapping
1321 * before us: so leave the reset to free_hot_cold_page, 1324 * before us: so leave the reset to free_unref_page,
1322 * and remember that it's only reliable while mapped. 1325 * and remember that it's only reliable while mapped.
1323 * Leaving it set also helps swapoff to reinstate ptes 1326 * Leaving it set also helps swapoff to reinstate ptes
1324 * faster for those pages still in swapcache. 1327 * faster for those pages still in swapcache.
@@ -1426,6 +1429,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1426 if (pte_soft_dirty(pteval)) 1429 if (pte_soft_dirty(pteval))
1427 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1430 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1428 set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); 1431 set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
1432 /*
1433 * No need to invalidate here, it will synchronize
1434 * against the special swap migration pte.
1435 */
1429 goto discard; 1436 goto discard;
1430 } 1437 }
1431 1438
@@ -1483,6 +1490,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1483 * will take care of the rest. 1490 * will take care of the rest.
1484 */ 1491 */
1485 dec_mm_counter(mm, mm_counter(page)); 1492 dec_mm_counter(mm, mm_counter(page));
1493 /* We have to invalidate as we cleared the pte */
1494 mmu_notifier_invalidate_range(mm, address,
1495 address + PAGE_SIZE);
1486 } else if (IS_ENABLED(CONFIG_MIGRATION) && 1496 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
1487 (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { 1497 (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
1488 swp_entry_t entry; 1498 swp_entry_t entry;
@@ -1498,6 +1508,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1498 if (pte_soft_dirty(pteval)) 1508 if (pte_soft_dirty(pteval))
1499 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1509 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1500 set_pte_at(mm, address, pvmw.pte, swp_pte); 1510 set_pte_at(mm, address, pvmw.pte, swp_pte);
1511 /*
1512 * No need to invalidate here, it will synchronize
1513 * against the special swap migration pte.
1514 */
1501 } else if (PageAnon(page)) { 1515 } else if (PageAnon(page)) {
1502 swp_entry_t entry = { .val = page_private(subpage) }; 1516 swp_entry_t entry = { .val = page_private(subpage) };
1503 pte_t swp_pte; 1517 pte_t swp_pte;
@@ -1509,6 +1523,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1509 WARN_ON_ONCE(1); 1523 WARN_ON_ONCE(1);
1510 ret = false; 1524 ret = false;
1511 /* We have to invalidate as we cleared the pte */ 1525 /* We have to invalidate as we cleared the pte */
1526 mmu_notifier_invalidate_range(mm, address,
1527 address + PAGE_SIZE);
1512 page_vma_mapped_walk_done(&pvmw); 1528 page_vma_mapped_walk_done(&pvmw);
1513 break; 1529 break;
1514 } 1530 }
@@ -1516,6 +1532,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1516 /* MADV_FREE page check */ 1532 /* MADV_FREE page check */
1517 if (!PageSwapBacked(page)) { 1533 if (!PageSwapBacked(page)) {
1518 if (!PageDirty(page)) { 1534 if (!PageDirty(page)) {
1535 /* Invalidate as we cleared the pte */
1536 mmu_notifier_invalidate_range(mm,
1537 address, address + PAGE_SIZE);
1519 dec_mm_counter(mm, MM_ANONPAGES); 1538 dec_mm_counter(mm, MM_ANONPAGES);
1520 goto discard; 1539 goto discard;
1521 } 1540 }
@@ -1549,13 +1568,39 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1549 if (pte_soft_dirty(pteval)) 1568 if (pte_soft_dirty(pteval))
1550 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1569 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1551 set_pte_at(mm, address, pvmw.pte, swp_pte); 1570 set_pte_at(mm, address, pvmw.pte, swp_pte);
1552 } else 1571 /* Invalidate as we cleared the pte */
1572 mmu_notifier_invalidate_range(mm, address,
1573 address + PAGE_SIZE);
1574 } else {
1575 /*
1576 * We should not need to notify here: we only reach this
1577 * case from freeze_page(), which itself is only called from
1578 * split_huge_page_to_list(), so everything below must
1579 * be true:
1580 * - page is not anonymous
1581 * - page is locked
1582 *
1583 * Since it is a locked file-backed page, it cannot be
1584 * removed from the page cache and replaced by a new
1585 * page before mmu_notifier_invalidate_range_end, so no
1586 * concurrent thread can update its page table to
1587 * point at a new page while a device is still using
1588 * this page.
1589 *
1590 * See Documentation/vm/mmu_notifier.txt
1591 */
1553 dec_mm_counter(mm, mm_counter_file(page)); 1592 dec_mm_counter(mm, mm_counter_file(page));
1593 }
1554discard: 1594discard:
1595 /*
1596 * No need to call mmu_notifier_invalidate_range(): it has been
1597 * done above for all cases requiring it to happen under the page
1598 * table lock before mmu_notifier_invalidate_range_end().
1599 *
1600 * See Documentation/vm/mmu_notifier.txt
1601 */
1555 page_remove_rmap(subpage, PageHuge(page)); 1602 page_remove_rmap(subpage, PageHuge(page));
1556 put_page(page); 1603 put_page(page);
1557 mmu_notifier_invalidate_range(mm, address,
1558 address + PAGE_SIZE);
1559 } 1604 }
1560 1605
1561 mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); 1606 mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
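
The rmap changes above all follow the rule spelled out in the new comments: mmu_notifier_invalidate_range() must be issued under the page table lock whenever the cleared pte lets the page be freed or replaced before mmu_notifier_invalidate_range_end() runs; a pure protection downgrade, or a migration/swap entry that secondary MMUs will synchronize against, can wait for the deferred invalidate_range_end(). A schematic sketch of the pattern (not actual rmap code; the branch condition is a placeholder):

mmu_notifier_invalidate_range_start(mm, start, end);
spin_lock(ptl);

ptep_clear_flush(vma, address, pte);
if (page_may_be_freed_or_replaced_before_range_end) {	/* placeholder condition */
	/* Secondary mappings must be shot down before the page goes away. */
	mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE);
} else {
	/*
	 * Protection downgrade or special migration/swap pte: devices
	 * synchronize against it, so invalidate_range_end() below suffices.
	 */
}

spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, start, end);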
diff --git a/mm/shmem.c b/mm/shmem.c
index 07a1d22807be..ab22eaa2412e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -338,7 +338,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
338 if (item != expected) 338 if (item != expected)
339 return -ENOENT; 339 return -ENOENT;
340 __radix_tree_replace(&mapping->page_tree, node, pslot, 340 __radix_tree_replace(&mapping->page_tree, node, pslot,
341 replacement, NULL, NULL); 341 replacement, NULL);
342 return 0; 342 return 0;
343} 343}
344 344
@@ -747,7 +747,7 @@ void shmem_unlock_mapping(struct address_space *mapping)
747 pgoff_t indices[PAGEVEC_SIZE]; 747 pgoff_t indices[PAGEVEC_SIZE];
748 pgoff_t index = 0; 748 pgoff_t index = 0;
749 749
750 pagevec_init(&pvec, 0); 750 pagevec_init(&pvec);
751 /* 751 /*
752 * Minor point, but we might as well stop if someone else SHM_LOCKs it. 752 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
753 */ 753 */
@@ -790,7 +790,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
790 if (lend == -1) 790 if (lend == -1)
791 end = -1; /* unsigned, so actually very big */ 791 end = -1; /* unsigned, so actually very big */
792 792
793 pagevec_init(&pvec, 0); 793 pagevec_init(&pvec);
794 index = start; 794 index = start;
795 while (index < end) { 795 while (index < end) {
796 pvec.nr = find_get_entries(mapping, index, 796 pvec.nr = find_get_entries(mapping, index,
@@ -2528,7 +2528,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
2528 bool done = false; 2528 bool done = false;
2529 int i; 2529 int i;
2530 2530
2531 pagevec_init(&pvec, 0); 2531 pagevec_init(&pvec);
2532 pvec.nr = 1; /* start small: we may be there already */ 2532 pvec.nr = 1; /* start small: we may be there already */
2533 while (!done) { 2533 while (!done) {
2534 pvec.nr = find_get_entries(mapping, index, 2534 pvec.nr = find_get_entries(mapping, index,
@@ -3862,12 +3862,11 @@ static void shmem_init_inode(void *foo)
3862 inode_init_once(&info->vfs_inode); 3862 inode_init_once(&info->vfs_inode);
3863} 3863}
3864 3864
3865static int shmem_init_inodecache(void) 3865static void shmem_init_inodecache(void)
3866{ 3866{
3867 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 3867 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
3868 sizeof(struct shmem_inode_info), 3868 sizeof(struct shmem_inode_info),
3869 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); 3869 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
3870 return 0;
3871} 3870}
3872 3871
3873static void shmem_destroy_inodecache(void) 3872static void shmem_destroy_inodecache(void)
@@ -3991,9 +3990,7 @@ int __init shmem_init(void)
3991 if (shmem_inode_cachep) 3990 if (shmem_inode_cachep)
3992 return 0; 3991 return 0;
3993 3992
3994 error = shmem_init_inodecache(); 3993 shmem_init_inodecache();
3995 if (error)
3996 goto out3;
3997 3994
3998 error = register_filesystem(&shmem_fs_type); 3995 error = register_filesystem(&shmem_fs_type);
3999 if (error) { 3996 if (error) {
@@ -4020,7 +4017,6 @@ out1:
4020 unregister_filesystem(&shmem_fs_type); 4017 unregister_filesystem(&shmem_fs_type);
4021out2: 4018out2:
4022 shmem_destroy_inodecache(); 4019 shmem_destroy_inodecache();
4023out3:
4024 shm_mnt = ERR_PTR(error); 4020 shm_mnt = ERR_PTR(error);
4025 return error; 4021 return error;
4026} 4022}
@@ -4102,6 +4098,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
4102 if (i_size >= HPAGE_PMD_SIZE && 4098 if (i_size >= HPAGE_PMD_SIZE &&
4103 i_size >> PAGE_SHIFT >= off) 4099 i_size >> PAGE_SHIFT >= off)
4104 return true; 4100 return true;
4101 /* fall through */
4105 case SHMEM_HUGE_ADVISE: 4102 case SHMEM_HUGE_ADVISE:
4106 /* TODO: implement fadvise() hints */ 4103 /* TODO: implement fadvise() hints */
4107 return (vma->vm_flags & VM_HUGEPAGE); 4104 return (vma->vm_flags & VM_HUGEPAGE);
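
shmem_init_inodecache() can return void because kmem_cache_create() with SLAB_PANIC never returns NULL: on failure the kernel panics, so the "out3" error path it used to feed was dead code. The same pattern, sketched for a hypothetical cache:

struct foo { int bar; };			/* hypothetical */
static struct kmem_cache *foo_cachep;		/* hypothetical */

static void foo_init_cache(void)
{
	/* SLAB_PANIC: failure panics, so no NULL check or error return is needed. */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
}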
diff --git a/mm/slab.c b/mm/slab.c
index b7095884fd93..183e996dde5f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -114,7 +114,6 @@
114#include <linux/rtmutex.h> 114#include <linux/rtmutex.h>
115#include <linux/reciprocal_div.h> 115#include <linux/reciprocal_div.h>
116#include <linux/debugobjects.h> 116#include <linux/debugobjects.h>
117#include <linux/kmemcheck.h>
118#include <linux/memory.h> 117#include <linux/memory.h>
119#include <linux/prefetch.h> 118#include <linux/prefetch.h>
120#include <linux/sched/task_stack.h> 119#include <linux/sched/task_stack.h>
@@ -252,8 +251,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
252 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 251 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
253 } while (0) 252 } while (0)
254 253
255#define CFLGS_OBJFREELIST_SLAB (0x40000000UL) 254#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U)
256#define CFLGS_OFF_SLAB (0x80000000UL) 255#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U)
257#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) 256#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
258#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 257#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
259 258
@@ -441,7 +440,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
441 * Calculate the number of objects and left-over bytes for a given buffer size. 440 * Calculate the number of objects and left-over bytes for a given buffer size.
442 */ 441 */
443static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, 442static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
444 unsigned long flags, size_t *left_over) 443 slab_flags_t flags, size_t *left_over)
445{ 444{
446 unsigned int num; 445 unsigned int num;
447 size_t slab_size = PAGE_SIZE << gfporder; 446 size_t slab_size = PAGE_SIZE << gfporder;
@@ -1410,10 +1409,8 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1410 int nr_pages; 1409 int nr_pages;
1411 1410
1412 flags |= cachep->allocflags; 1411 flags |= cachep->allocflags;
1413 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1414 flags |= __GFP_RECLAIMABLE;
1415 1412
1416 page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1413 page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
1417 if (!page) { 1414 if (!page) {
1418 slab_out_of_memory(cachep, flags, nodeid); 1415 slab_out_of_memory(cachep, flags, nodeid);
1419 return NULL; 1416 return NULL;
@@ -1435,15 +1432,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1435 if (sk_memalloc_socks() && page_is_pfmemalloc(page)) 1432 if (sk_memalloc_socks() && page_is_pfmemalloc(page))
1436 SetPageSlabPfmemalloc(page); 1433 SetPageSlabPfmemalloc(page);
1437 1434
1438 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1439 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1440
1441 if (cachep->ctor)
1442 kmemcheck_mark_uninitialized_pages(page, nr_pages);
1443 else
1444 kmemcheck_mark_unallocated_pages(page, nr_pages);
1445 }
1446
1447 return page; 1435 return page;
1448} 1436}
1449 1437
@@ -1455,8 +1443,6 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
1455 int order = cachep->gfporder; 1443 int order = cachep->gfporder;
1456 unsigned long nr_freed = (1 << order); 1444 unsigned long nr_freed = (1 << order);
1457 1445
1458 kmemcheck_free_shadow(page, order);
1459
1460 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1446 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1461 mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); 1447 mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed);
1462 else 1448 else
@@ -1761,7 +1747,7 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1761 * towards high-order requests, this should be changed. 1747 * towards high-order requests, this should be changed.
1762 */ 1748 */
1763static size_t calculate_slab_order(struct kmem_cache *cachep, 1749static size_t calculate_slab_order(struct kmem_cache *cachep,
1764 size_t size, unsigned long flags) 1750 size_t size, slab_flags_t flags)
1765{ 1751{
1766 size_t left_over = 0; 1752 size_t left_over = 0;
1767 int gfporder; 1753 int gfporder;
@@ -1888,8 +1874,8 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
1888 return 0; 1874 return 0;
1889} 1875}
1890 1876
1891unsigned long kmem_cache_flags(unsigned long object_size, 1877slab_flags_t kmem_cache_flags(unsigned long object_size,
1892 unsigned long flags, const char *name, 1878 slab_flags_t flags, const char *name,
1893 void (*ctor)(void *)) 1879 void (*ctor)(void *))
1894{ 1880{
1895 return flags; 1881 return flags;
@@ -1897,7 +1883,7 @@ unsigned long kmem_cache_flags(unsigned long object_size,
1897 1883
1898struct kmem_cache * 1884struct kmem_cache *
1899__kmem_cache_alias(const char *name, size_t size, size_t align, 1885__kmem_cache_alias(const char *name, size_t size, size_t align,
1900 unsigned long flags, void (*ctor)(void *)) 1886 slab_flags_t flags, void (*ctor)(void *))
1901{ 1887{
1902 struct kmem_cache *cachep; 1888 struct kmem_cache *cachep;
1903 1889
@@ -1915,7 +1901,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
1915} 1901}
1916 1902
1917static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, 1903static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
1918 size_t size, unsigned long flags) 1904 size_t size, slab_flags_t flags)
1919{ 1905{
1920 size_t left; 1906 size_t left;
1921 1907
@@ -1938,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
1938} 1924}
1939 1925
1940static bool set_off_slab_cache(struct kmem_cache *cachep, 1926static bool set_off_slab_cache(struct kmem_cache *cachep,
1941 size_t size, unsigned long flags) 1927 size_t size, slab_flags_t flags)
1942{ 1928{
1943 size_t left; 1929 size_t left;
1944 1930
@@ -1972,7 +1958,7 @@ static bool set_off_slab_cache(struct kmem_cache *cachep,
1972} 1958}
1973 1959
1974static bool set_on_slab_cache(struct kmem_cache *cachep, 1960static bool set_on_slab_cache(struct kmem_cache *cachep,
1975 size_t size, unsigned long flags) 1961 size_t size, slab_flags_t flags)
1976{ 1962{
1977 size_t left; 1963 size_t left;
1978 1964
@@ -2008,8 +1994,7 @@ static bool set_on_slab_cache(struct kmem_cache *cachep,
2008 * cacheline. This can be beneficial if you're counting cycles as closely 1994 * cacheline. This can be beneficial if you're counting cycles as closely
2009 * as davem. 1995 * as davem.
2010 */ 1996 */
2011int 1997int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
2012__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2013{ 1998{
2014 size_t ralign = BYTES_PER_WORD; 1999 size_t ralign = BYTES_PER_WORD;
2015 gfp_t gfp; 2000 gfp_t gfp;
@@ -2144,6 +2129,8 @@ done:
2144 cachep->allocflags = __GFP_COMP; 2129 cachep->allocflags = __GFP_COMP;
2145 if (flags & SLAB_CACHE_DMA) 2130 if (flags & SLAB_CACHE_DMA)
2146 cachep->allocflags |= GFP_DMA; 2131 cachep->allocflags |= GFP_DMA;
2132 if (flags & SLAB_RECLAIM_ACCOUNT)
2133 cachep->allocflags |= __GFP_RECLAIMABLE;
2147 cachep->size = size; 2134 cachep->size = size;
2148 cachep->reciprocal_buffer_size = reciprocal_value(size); 2135 cachep->reciprocal_buffer_size = reciprocal_value(size);
2149 2136
@@ -3516,8 +3503,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
3516 kmemleak_free_recursive(objp, cachep->flags); 3503 kmemleak_free_recursive(objp, cachep->flags);
3517 objp = cache_free_debugcheck(cachep, objp, caller); 3504 objp = cache_free_debugcheck(cachep, objp, caller);
3518 3505
3519 kmemcheck_slab_free(cachep, objp, cachep->object_size);
3520
3521 /* 3506 /*
3522 * Skip calling cache_free_alien() when the platform is not numa. 3507 * Skip calling cache_free_alien() when the platform is not numa.
3523 * This will avoid cache misses that happen while accessing slabp (which 3508 * This will avoid cache misses that happen while accessing slabp (which
@@ -4097,7 +4082,6 @@ out:
4097 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); 4082 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
4098} 4083}
4099 4084
4100#ifdef CONFIG_SLABINFO
4101void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) 4085void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4102{ 4086{
4103 unsigned long active_objs, num_objs, active_slabs; 4087 unsigned long active_objs, num_objs, active_slabs;
@@ -4405,7 +4389,6 @@ static int __init slab_proc_init(void)
4405 return 0; 4389 return 0;
4406} 4390}
4407module_init(slab_proc_init); 4391module_init(slab_proc_init);
4408#endif
4409 4392
4410#ifdef CONFIG_HARDENED_USERCOPY 4393#ifdef CONFIG_HARDENED_USERCOPY
4411/* 4394/*
diff --git a/mm/slab.h b/mm/slab.h
index 86d7c7d860f9..ad657ffa44e5 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -21,7 +21,7 @@ struct kmem_cache {
21 unsigned int object_size;/* The original size of the object */ 21 unsigned int object_size;/* The original size of the object */
22 unsigned int size; /* The aligned/padded/added on size */ 22 unsigned int size; /* The aligned/padded/added on size */
23 unsigned int align; /* Alignment as calculated */ 23 unsigned int align; /* Alignment as calculated */
24 unsigned long flags; /* Active flags on the slab */ 24 slab_flags_t flags; /* Active flags on the slab */
25 const char *name; /* Slab name for sysfs */ 25 const char *name; /* Slab name for sysfs */
26 int refcount; /* Use counter */ 26 int refcount; /* Use counter */
27 void (*ctor)(void *); /* Called on object slot creation */ 27 void (*ctor)(void *); /* Called on object slot creation */
@@ -40,7 +40,6 @@ struct kmem_cache {
40 40
41#include <linux/memcontrol.h> 41#include <linux/memcontrol.h>
42#include <linux/fault-inject.h> 42#include <linux/fault-inject.h>
43#include <linux/kmemcheck.h>
44#include <linux/kasan.h> 43#include <linux/kasan.h>
45#include <linux/kmemleak.h> 44#include <linux/kmemleak.h>
46#include <linux/random.h> 45#include <linux/random.h>
@@ -79,13 +78,13 @@ extern const struct kmalloc_info_struct {
79 unsigned long size; 78 unsigned long size;
80} kmalloc_info[]; 79} kmalloc_info[];
81 80
82unsigned long calculate_alignment(unsigned long flags, 81unsigned long calculate_alignment(slab_flags_t flags,
83 unsigned long align, unsigned long size); 82 unsigned long align, unsigned long size);
84 83
85#ifndef CONFIG_SLOB 84#ifndef CONFIG_SLOB
86/* Kmalloc array related functions */ 85/* Kmalloc array related functions */
87void setup_kmalloc_cache_index_table(void); 86void setup_kmalloc_cache_index_table(void);
88void create_kmalloc_caches(unsigned long); 87void create_kmalloc_caches(slab_flags_t);
89 88
90/* Find the kmalloc slab corresponding for a certain size */ 89/* Find the kmalloc slab corresponding for a certain size */
91struct kmem_cache *kmalloc_slab(size_t, gfp_t); 90struct kmem_cache *kmalloc_slab(size_t, gfp_t);
@@ -93,32 +92,32 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t);
93 92
94 93
95/* Functions provided by the slab allocators */ 94/* Functions provided by the slab allocators */
96extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); 95int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
97 96
98extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, 97extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
99 unsigned long flags); 98 slab_flags_t flags);
100extern void create_boot_cache(struct kmem_cache *, const char *name, 99extern void create_boot_cache(struct kmem_cache *, const char *name,
101 size_t size, unsigned long flags); 100 size_t size, slab_flags_t flags);
102 101
103int slab_unmergeable(struct kmem_cache *s); 102int slab_unmergeable(struct kmem_cache *s);
104struct kmem_cache *find_mergeable(size_t size, size_t align, 103struct kmem_cache *find_mergeable(size_t size, size_t align,
105 unsigned long flags, const char *name, void (*ctor)(void *)); 104 slab_flags_t flags, const char *name, void (*ctor)(void *));
106#ifndef CONFIG_SLOB 105#ifndef CONFIG_SLOB
107struct kmem_cache * 106struct kmem_cache *
108__kmem_cache_alias(const char *name, size_t size, size_t align, 107__kmem_cache_alias(const char *name, size_t size, size_t align,
109 unsigned long flags, void (*ctor)(void *)); 108 slab_flags_t flags, void (*ctor)(void *));
110 109
111unsigned long kmem_cache_flags(unsigned long object_size, 110slab_flags_t kmem_cache_flags(unsigned long object_size,
112 unsigned long flags, const char *name, 111 slab_flags_t flags, const char *name,
113 void (*ctor)(void *)); 112 void (*ctor)(void *));
114#else 113#else
115static inline struct kmem_cache * 114static inline struct kmem_cache *
116__kmem_cache_alias(const char *name, size_t size, size_t align, 115__kmem_cache_alias(const char *name, size_t size, size_t align,
117 unsigned long flags, void (*ctor)(void *)) 116 slab_flags_t flags, void (*ctor)(void *))
118{ return NULL; } 117{ return NULL; }
119 118
120static inline unsigned long kmem_cache_flags(unsigned long object_size, 119static inline slab_flags_t kmem_cache_flags(unsigned long object_size,
121 unsigned long flags, const char *name, 120 slab_flags_t flags, const char *name,
122 void (*ctor)(void *)) 121 void (*ctor)(void *))
123{ 122{
124 return flags; 123 return flags;
@@ -142,10 +141,10 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
142#if defined(CONFIG_SLAB) 141#if defined(CONFIG_SLAB)
143#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ 142#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
144 SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ 143 SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
145 SLAB_NOTRACK | SLAB_ACCOUNT) 144 SLAB_ACCOUNT)
146#elif defined(CONFIG_SLUB) 145#elif defined(CONFIG_SLUB)
147#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ 146#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
148 SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT) 147 SLAB_TEMPORARY | SLAB_ACCOUNT)
149#else 148#else
150#define SLAB_CACHE_FLAGS (0) 149#define SLAB_CACHE_FLAGS (0)
151#endif 150#endif
@@ -164,7 +163,6 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
164 SLAB_NOLEAKTRACE | \ 163 SLAB_NOLEAKTRACE | \
165 SLAB_RECLAIM_ACCOUNT | \ 164 SLAB_RECLAIM_ACCOUNT | \
166 SLAB_TEMPORARY | \ 165 SLAB_TEMPORARY | \
167 SLAB_NOTRACK | \
168 SLAB_ACCOUNT) 166 SLAB_ACCOUNT)
169 167
170int __kmem_cache_shutdown(struct kmem_cache *); 168int __kmem_cache_shutdown(struct kmem_cache *);
@@ -439,7 +437,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
439 for (i = 0; i < size; i++) { 437 for (i = 0; i < size; i++) {
440 void *object = p[i]; 438 void *object = p[i];
441 439
442 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
443 kmemleak_alloc_recursive(object, s->object_size, 1, 440 kmemleak_alloc_recursive(object, s->object_size, 1,
444 s->flags, flags); 441 s->flags, flags);
445 kasan_slab_alloc(s, object, flags); 442 kasan_slab_alloc(s, object, flags);
@@ -506,6 +503,14 @@ void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos);
506void memcg_slab_stop(struct seq_file *m, void *p); 503void memcg_slab_stop(struct seq_file *m, void *p);
507int memcg_slab_show(struct seq_file *m, void *p); 504int memcg_slab_show(struct seq_file *m, void *p);
508 505
506#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
507void dump_unreclaimable_slab(void);
508#else
509static inline void dump_unreclaimable_slab(void)
510{
511}
512#endif
513
509void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); 514void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
510 515
511#ifdef CONFIG_SLAB_FREELIST_RANDOM 516#ifdef CONFIG_SLAB_FREELIST_RANDOM
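
Switching the slab flag parameters from unsigned long to slab_flags_t lets sparse catch flag mixups (for example passing gfp_t where slab flags belong); the __force casts on the CFLGS_* and internal SLUB flag values mark the deliberate conversions. Assuming the typedef added elsewhere in this series follows the usual __bitwise pattern, it would read roughly:

/* Assumed typedef in <linux/types.h>; not part of this diff. */
typedef unsigned int __bitwise slab_flags_t;

/* Hypothetical flag definition showing the __force cast convention. */
#define SLAB_EXAMPLE_FLAG	((slab_flags_t __force)0x00000100U)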
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0d7fe71ff5e4..c8cb36774ba1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -44,7 +44,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
44 SLAB_FAILSLAB | SLAB_KASAN) 44 SLAB_FAILSLAB | SLAB_KASAN)
45 45
46#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ 46#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
47 SLAB_NOTRACK | SLAB_ACCOUNT) 47 SLAB_ACCOUNT)
48 48
49/* 49/*
50 * Merge control. If this is set then no merging of slab caches will occur. 50 * Merge control. If this is set then no merging of slab caches will occur.
@@ -291,7 +291,7 @@ int slab_unmergeable(struct kmem_cache *s)
291} 291}
292 292
293struct kmem_cache *find_mergeable(size_t size, size_t align, 293struct kmem_cache *find_mergeable(size_t size, size_t align,
294 unsigned long flags, const char *name, void (*ctor)(void *)) 294 slab_flags_t flags, const char *name, void (*ctor)(void *))
295{ 295{
296 struct kmem_cache *s; 296 struct kmem_cache *s;
297 297
@@ -341,7 +341,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
341 * Figure out what the alignment of the objects will be given a set of 341 * Figure out what the alignment of the objects will be given a set of
342 * flags, a user specified alignment and the size of the objects. 342 * flags, a user specified alignment and the size of the objects.
343 */ 343 */
344unsigned long calculate_alignment(unsigned long flags, 344unsigned long calculate_alignment(slab_flags_t flags,
345 unsigned long align, unsigned long size) 345 unsigned long align, unsigned long size)
346{ 346{
347 /* 347 /*
@@ -366,7 +366,7 @@ unsigned long calculate_alignment(unsigned long flags,
366 366
367static struct kmem_cache *create_cache(const char *name, 367static struct kmem_cache *create_cache(const char *name,
368 size_t object_size, size_t size, size_t align, 368 size_t object_size, size_t size, size_t align,
369 unsigned long flags, void (*ctor)(void *), 369 slab_flags_t flags, void (*ctor)(void *),
370 struct mem_cgroup *memcg, struct kmem_cache *root_cache) 370 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
371{ 371{
372 struct kmem_cache *s; 372 struct kmem_cache *s;
@@ -431,7 +431,7 @@ out_free_cache:
431 */ 431 */
432struct kmem_cache * 432struct kmem_cache *
433kmem_cache_create(const char *name, size_t size, size_t align, 433kmem_cache_create(const char *name, size_t size, size_t align,
434 unsigned long flags, void (*ctor)(void *)) 434 slab_flags_t flags, void (*ctor)(void *))
435{ 435{
436 struct kmem_cache *s = NULL; 436 struct kmem_cache *s = NULL;
437 const char *cache_name; 437 const char *cache_name;
@@ -879,7 +879,7 @@ bool slab_is_available(void)
879#ifndef CONFIG_SLOB 879#ifndef CONFIG_SLOB
880/* Create a cache during boot when no slab services are available yet */ 880/* Create a cache during boot when no slab services are available yet */
881void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, 881void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
882 unsigned long flags) 882 slab_flags_t flags)
883{ 883{
884 int err; 884 int err;
885 885
@@ -899,7 +899,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
899} 899}
900 900
901struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, 901struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
902 unsigned long flags) 902 slab_flags_t flags)
903{ 903{
904 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 904 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
905 905
@@ -1057,7 +1057,7 @@ void __init setup_kmalloc_cache_index_table(void)
1057 } 1057 }
1058} 1058}
1059 1059
1060static void __init new_kmalloc_cache(int idx, unsigned long flags) 1060static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
1061{ 1061{
1062 kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, 1062 kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
1063 kmalloc_info[idx].size, flags); 1063 kmalloc_info[idx].size, flags);
@@ -1068,7 +1068,7 @@ static void __init new_kmalloc_cache(int idx, unsigned long flags)
1068 * may already have been created because they were needed to 1068 * may already have been created because they were needed to
1069 * enable allocations for slab creation. 1069 * enable allocations for slab creation.
1070 */ 1070 */
1071void __init create_kmalloc_caches(unsigned long flags) 1071void __init create_kmalloc_caches(slab_flags_t flags)
1072{ 1072{
1073 int i; 1073 int i;
1074 1074
@@ -1184,8 +1184,7 @@ void cache_random_seq_destroy(struct kmem_cache *cachep)
1184} 1184}
1185#endif /* CONFIG_SLAB_FREELIST_RANDOM */ 1185#endif /* CONFIG_SLAB_FREELIST_RANDOM */
1186 1186
1187#ifdef CONFIG_SLABINFO 1187#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
1188
1189#ifdef CONFIG_SLAB 1188#ifdef CONFIG_SLAB
1190#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR) 1189#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
1191#else 1190#else
@@ -1281,7 +1280,41 @@ static int slab_show(struct seq_file *m, void *p)
1281 return 0; 1280 return 0;
1282} 1281}
1283 1282
1284#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) 1283void dump_unreclaimable_slab(void)
1284{
1285 struct kmem_cache *s, *s2;
1286 struct slabinfo sinfo;
1287
1288 /*
1289 * Acquiring slab_mutex here is risky since we do not want to
1290 * sleep in the OOM path, but traversing the list without holding
1291 * the mutex risks a crash.
1292 * Use mutex_trylock to protect the traversal, and dump nothing
1293 * if the mutex cannot be acquired.
1294 */
1295 if (!mutex_trylock(&slab_mutex)) {
1296 pr_warn("excessive unreclaimable slab but cannot dump stats\n");
1297 return;
1298 }
1299
1300 pr_info("Unreclaimable slab info:\n");
1301 pr_info("Name Used Total\n");
1302
1303 list_for_each_entry_safe(s, s2, &slab_caches, list) {
1304 if (!is_root_cache(s) || (s->flags & SLAB_RECLAIM_ACCOUNT))
1305 continue;
1306
1307 get_slabinfo(s, &sinfo);
1308
1309 if (sinfo.num_objs > 0)
1310 pr_info("%-17s %10luKB %10luKB\n", cache_name(s),
1311 (sinfo.active_objs * s->size) / 1024,
1312 (sinfo.num_objs * s->size) / 1024);
1313 }
1314 mutex_unlock(&slab_mutex);
1315}
1316
1317#if defined(CONFIG_MEMCG)
1285void *memcg_slab_start(struct seq_file *m, loff_t *pos) 1318void *memcg_slab_start(struct seq_file *m, loff_t *pos)
1286{ 1319{
1287 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 1320 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -1355,7 +1388,7 @@ static int __init slab_proc_init(void)
1355 return 0; 1388 return 0;
1356} 1389}
1357module_init(slab_proc_init); 1390module_init(slab_proc_init);
1358#endif /* CONFIG_SLABINFO */ 1391#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
1359 1392
1360static __always_inline void *__do_krealloc(const void *p, size_t new_size, 1393static __always_inline void *__do_krealloc(const void *p, size_t new_size,
1361 gfp_t flags) 1394 gfp_t flags)
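
dump_unreclaimable_slab() exists so the OOM report can show where unreclaimable slab memory went; the mutex_trylock above makes the dump best-effort rather than another way to block in the OOM path. A hypothetical sketch of the kind of call site this enables (the real condition and its location in mm/oom_kill.c are not part of this hunk):

/* Hypothetical OOM-report call site; the condition is a placeholder. */
if (unreclaimable_slab_looks_excessive)
	dump_unreclaimable_slab();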
diff --git a/mm/slob.c b/mm/slob.c
index 10249160b693..623e8a5c46ce 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -330,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
330 BUG_ON(!b); 330 BUG_ON(!b);
331 spin_unlock_irqrestore(&slob_lock, flags); 331 spin_unlock_irqrestore(&slob_lock, flags);
332 } 332 }
333 if (unlikely((gfp & __GFP_ZERO) && b)) 333 if (unlikely(gfp & __GFP_ZERO))
334 memset(b, 0, size); 334 memset(b, 0, size);
335 return b; 335 return b;
336} 336}
@@ -524,7 +524,7 @@ size_t ksize(const void *block)
524} 524}
525EXPORT_SYMBOL(ksize); 525EXPORT_SYMBOL(ksize);
526 526
527int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) 527int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
528{ 528{
529 if (flags & SLAB_TYPESAFE_BY_RCU) { 529 if (flags & SLAB_TYPESAFE_BY_RCU) {
530 /* leave room for rcu footer at the end of object */ 530 /* leave room for rcu footer at the end of object */
diff --git a/mm/slub.c b/mm/slub.c
index 1efbb8123037..cfd56e5a35fb 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -22,7 +22,6 @@
22#include <linux/notifier.h> 22#include <linux/notifier.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24#include <linux/kasan.h> 24#include <linux/kasan.h>
25#include <linux/kmemcheck.h>
26#include <linux/cpu.h> 25#include <linux/cpu.h>
27#include <linux/cpuset.h> 26#include <linux/cpuset.h>
28#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
@@ -193,8 +192,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
193#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ 192#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
194 193
195/* Internal SLUB flags */ 194/* Internal SLUB flags */
196#define __OBJECT_POISON 0x80000000UL /* Poison object */ 195/* Poison object */
197#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ 196#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U)
197/* Use cmpxchg_double */
198#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U)
198 199
199/* 200/*
200 * Tracking user of a slab. 201 * Tracking user of a slab.
@@ -485,9 +486,9 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p)
485 * Debug settings: 486 * Debug settings:
486 */ 487 */
487#if defined(CONFIG_SLUB_DEBUG_ON) 488#if defined(CONFIG_SLUB_DEBUG_ON)
488static int slub_debug = DEBUG_DEFAULT_FLAGS; 489static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
489#else 490#else
490static int slub_debug; 491static slab_flags_t slub_debug;
491#endif 492#endif
492 493
493static char *slub_debug_slabs; 494static char *slub_debug_slabs;
@@ -1289,8 +1290,8 @@ out:
1289 1290
1290__setup("slub_debug", setup_slub_debug); 1291__setup("slub_debug", setup_slub_debug);
1291 1292
1292unsigned long kmem_cache_flags(unsigned long object_size, 1293slab_flags_t kmem_cache_flags(unsigned long object_size,
1293 unsigned long flags, const char *name, 1294 slab_flags_t flags, const char *name,
1294 void (*ctor)(void *)) 1295 void (*ctor)(void *))
1295{ 1296{
1296 /* 1297 /*
@@ -1322,8 +1323,8 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1322 struct page *page) {} 1323 struct page *page) {}
1323static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, 1324static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1324 struct page *page) {} 1325 struct page *page) {}
1325unsigned long kmem_cache_flags(unsigned long object_size, 1326slab_flags_t kmem_cache_flags(unsigned long object_size,
1326 unsigned long flags, const char *name, 1327 slab_flags_t flags, const char *name,
1327 void (*ctor)(void *)) 1328 void (*ctor)(void *))
1328{ 1329{
1329 return flags; 1330 return flags;
@@ -1370,12 +1371,11 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x)
1370 * So in order to make the debug calls that expect irqs to be 1371 * So in order to make the debug calls that expect irqs to be
1371 * disabled we need to disable interrupts temporarily. 1372 * disabled we need to disable interrupts temporarily.
1372 */ 1373 */
1373#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) 1374#ifdef CONFIG_LOCKDEP
1374 { 1375 {
1375 unsigned long flags; 1376 unsigned long flags;
1376 1377
1377 local_irq_save(flags); 1378 local_irq_save(flags);
1378 kmemcheck_slab_free(s, x, s->object_size);
1379 debug_check_no_locks_freed(x, s->object_size); 1379 debug_check_no_locks_freed(x, s->object_size);
1380 local_irq_restore(flags); 1380 local_irq_restore(flags);
1381 } 1381 }
@@ -1399,8 +1399,7 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
1399 * Compiler cannot detect this function can be removed if slab_free_hook() 1399 * Compiler cannot detect this function can be removed if slab_free_hook()
1400 * evaluates to nothing. Thus, catch all relevant config debug options here. 1400 * evaluates to nothing. Thus, catch all relevant config debug options here.
1401 */ 1401 */
1402#if defined(CONFIG_KMEMCHECK) || \ 1402#if defined(CONFIG_LOCKDEP) || \
1403 defined(CONFIG_LOCKDEP) || \
1404 defined(CONFIG_DEBUG_KMEMLEAK) || \ 1403 defined(CONFIG_DEBUG_KMEMLEAK) || \
1405 defined(CONFIG_DEBUG_OBJECTS_FREE) || \ 1404 defined(CONFIG_DEBUG_OBJECTS_FREE) || \
1406 defined(CONFIG_KASAN) 1405 defined(CONFIG_KASAN)
@@ -1436,8 +1435,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
1436 struct page *page; 1435 struct page *page;
1437 int order = oo_order(oo); 1436 int order = oo_order(oo);
1438 1437
1439 flags |= __GFP_NOTRACK;
1440
1441 if (node == NUMA_NO_NODE) 1438 if (node == NUMA_NO_NODE)
1442 page = alloc_pages(flags, order); 1439 page = alloc_pages(flags, order);
1443 else 1440 else
@@ -1596,22 +1593,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1596 stat(s, ORDER_FALLBACK); 1593 stat(s, ORDER_FALLBACK);
1597 } 1594 }
1598 1595
1599 if (kmemcheck_enabled &&
1600 !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1601 int pages = 1 << oo_order(oo);
1602
1603 kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
1604
1605 /*
1606 * Objects from caches that have a constructor don't get
1607 * cleared when they're allocated, so we need to do it here.
1608 */
1609 if (s->ctor)
1610 kmemcheck_mark_uninitialized_pages(page, pages);
1611 else
1612 kmemcheck_mark_unallocated_pages(page, pages);
1613 }
1614
1615 page->objects = oo_objects(oo); 1596 page->objects = oo_objects(oo);
1616 1597
1617 order = compound_order(page); 1598 order = compound_order(page);
@@ -1687,8 +1668,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1687 check_object(s, page, p, SLUB_RED_INACTIVE); 1668 check_object(s, page, p, SLUB_RED_INACTIVE);
1688 } 1669 }
1689 1670
1690 kmemcheck_free_shadow(page, compound_order(page));
1691
1692 mod_lruvec_page_state(page, 1671 mod_lruvec_page_state(page,
1693 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1672 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1694 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1673 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
@@ -3477,7 +3456,7 @@ static void set_cpu_partial(struct kmem_cache *s)
3477 */ 3456 */
3478static int calculate_sizes(struct kmem_cache *s, int forced_order) 3457static int calculate_sizes(struct kmem_cache *s, int forced_order)
3479{ 3458{
3480 unsigned long flags = s->flags; 3459 slab_flags_t flags = s->flags;
3481 size_t size = s->object_size; 3460 size_t size = s->object_size;
3482 int order; 3461 int order;
3483 3462
@@ -3593,7 +3572,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3593 return !!oo_objects(s->oo); 3572 return !!oo_objects(s->oo);
3594} 3573}
3595 3574
3596static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) 3575static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
3597{ 3576{
3598 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); 3577 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3599 s->reserved = 0; 3578 s->reserved = 0;
@@ -3655,7 +3634,7 @@ error:
3655 if (flags & SLAB_PANIC) 3634 if (flags & SLAB_PANIC)
3656 panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", 3635 panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n",
3657 s->name, (unsigned long)s->size, s->size, 3636 s->name, (unsigned long)s->size, s->size,
3658 oo_order(s->oo), s->offset, flags); 3637 oo_order(s->oo), s->offset, (unsigned long)flags);
3659 return -EINVAL; 3638 return -EINVAL;
3660} 3639}
3661 3640
@@ -3792,7 +3771,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3792 struct page *page; 3771 struct page *page;
3793 void *ptr = NULL; 3772 void *ptr = NULL;
3794 3773
3795 flags |= __GFP_COMP | __GFP_NOTRACK; 3774 flags |= __GFP_COMP;
3796 page = alloc_pages_node(node, flags, get_order(size)); 3775 page = alloc_pages_node(node, flags, get_order(size));
3797 if (page) 3776 if (page)
3798 ptr = page_address(page); 3777 ptr = page_address(page);
@@ -4245,7 +4224,7 @@ void __init kmem_cache_init_late(void)
4245 4224
4246struct kmem_cache * 4225struct kmem_cache *
4247__kmem_cache_alias(const char *name, size_t size, size_t align, 4226__kmem_cache_alias(const char *name, size_t size, size_t align,
4248 unsigned long flags, void (*ctor)(void *)) 4227 slab_flags_t flags, void (*ctor)(void *))
4249{ 4228{
4250 struct kmem_cache *s, *c; 4229 struct kmem_cache *s, *c;
4251 4230
@@ -4275,7 +4254,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
4275 return s; 4254 return s;
4276} 4255}
4277 4256
4278int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) 4257int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
4279{ 4258{
4280 int err; 4259 int err;
4281 4260
@@ -5655,8 +5634,6 @@ static char *create_unique_id(struct kmem_cache *s)
5655 *p++ = 'a'; 5634 *p++ = 'a';
5656 if (s->flags & SLAB_CONSISTENCY_CHECKS) 5635 if (s->flags & SLAB_CONSISTENCY_CHECKS)
5657 *p++ = 'F'; 5636 *p++ = 'F';
5658 if (!(s->flags & SLAB_NOTRACK))
5659 *p++ = 't';
5660 if (s->flags & SLAB_ACCOUNT) 5637 if (s->flags & SLAB_ACCOUNT)
5661 *p++ = 'A'; 5638 *p++ = 'A';
5662 if (p != name + 1) 5639 if (p != name + 1)
@@ -5704,6 +5681,10 @@ static int sysfs_slab_add(struct kmem_cache *s)
5704 return 0; 5681 return 0;
5705 } 5682 }
5706 5683
5684 if (!unmergeable && disable_higher_order_debug &&
5685 (slub_debug & DEBUG_METADATA_FLAGS))
5686 unmergeable = 1;
5687
5707 if (unmergeable) { 5688 if (unmergeable) {
5708 /* 5689 /*
5709 * Slabcache can never be merged so we can use the name proper. 5690 * Slabcache can never be merged so we can use the name proper.
@@ -5852,7 +5833,7 @@ __initcall(slab_sysfs_init);
5852/* 5833/*
5853 * The /proc/slabinfo ABI 5834 * The /proc/slabinfo ABI
5854 */ 5835 */
5855#ifdef CONFIG_SLABINFO 5836#ifdef CONFIG_SLUB_DEBUG
5856void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) 5837void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5857{ 5838{
5858 unsigned long nr_slabs = 0; 5839 unsigned long nr_slabs = 0;
@@ -5884,4 +5865,4 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
5884{ 5865{
5885 return -EIO; 5866 return -EIO;
5886} 5867}
5887#endif /* CONFIG_SLABINFO */ 5868#endif /* CONFIG_SLUB_DEBUG */
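
The mm/slub.c hunks above convert every flags parameter from unsigned long to the dedicated slab_flags_t type and give the internal flags (__OBJECT_POISON, __CMPXCHG_DOUBLE) explicit __force casts. A minimal sketch of that pattern follows, assuming slab_flags_t is declared elsewhere in the series as a __bitwise unsigned int; the EXAMPLE_* names are hypothetical.

#include <linux/slab.h>

/* Hypothetical flags, for illustration only; the real SLUB flags are above. */
#define EXAMPLE_FLAG_A	((slab_flags_t __force)0x00000001U)
#define EXAMPLE_FLAG_B	((slab_flags_t __force)0x00000002U)

static slab_flags_t example_combine(slab_flags_t flags)
{
	/*
	 * Bitwise ops stay within the typed flag space; mixing in a plain
	 * integer would now trigger a sparse warning, which is what the
	 * __force casts above are for.
	 */
	return flags | EXAMPLE_FLAG_A | EXAMPLE_FLAG_B;
}

Note also the panic() hunk: since slab_flags_t is no longer an unsigned long, printing it with %lx needs an explicit (unsigned long) cast.
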
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 478ce6d4a2c4..17acf01791fa 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -42,7 +42,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
42 unsigned long align, 42 unsigned long align,
43 unsigned long goal) 43 unsigned long goal)
44{ 44{
45 return memblock_virt_alloc_try_nid(size, align, goal, 45 return memblock_virt_alloc_try_nid_raw(size, align, goal,
46 BOOTMEM_ALLOC_ACCESSIBLE, node); 46 BOOTMEM_ALLOC_ACCESSIBLE, node);
47} 47}
48 48
@@ -53,13 +53,20 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
53{ 53{
54 /* If the main allocator is up use that, fallback to bootmem. */ 54 /* If the main allocator is up use that, fallback to bootmem. */
55 if (slab_is_available()) { 55 if (slab_is_available()) {
56 gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
57 int order = get_order(size);
58 static bool warned;
56 struct page *page; 59 struct page *page;
57 60
58 page = alloc_pages_node(node, 61 page = alloc_pages_node(node, gfp_mask, order);
59 GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
60 get_order(size));
61 if (page) 62 if (page)
62 return page_address(page); 63 return page_address(page);
64
65 if (!warned) {
66 warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
67 "vmemmap alloc failure: order:%u", order);
68 warned = true;
69 }
63 return NULL; 70 return NULL;
64 } else 71 } else
65 return __earlyonly_bootmem_alloc(node, size, size, 72 return __earlyonly_bootmem_alloc(node, size, size,
@@ -180,11 +187,22 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
180 return pte; 187 return pte;
181} 188}
182 189
190static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
191{
192 void *p = vmemmap_alloc_block(size, node);
193
194 if (!p)
195 return NULL;
196 memset(p, 0, size);
197
198 return p;
199}
200
183pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) 201pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
184{ 202{
185 pmd_t *pmd = pmd_offset(pud, addr); 203 pmd_t *pmd = pmd_offset(pud, addr);
186 if (pmd_none(*pmd)) { 204 if (pmd_none(*pmd)) {
187 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 205 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
188 if (!p) 206 if (!p)
189 return NULL; 207 return NULL;
190 pmd_populate_kernel(&init_mm, pmd, p); 208 pmd_populate_kernel(&init_mm, pmd, p);
@@ -196,7 +214,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
196{ 214{
197 pud_t *pud = pud_offset(p4d, addr); 215 pud_t *pud = pud_offset(p4d, addr);
198 if (pud_none(*pud)) { 216 if (pud_none(*pud)) {
199 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 217 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
200 if (!p) 218 if (!p)
201 return NULL; 219 return NULL;
202 pud_populate(&init_mm, pud, p); 220 pud_populate(&init_mm, pud, p);
@@ -208,7 +226,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
208{ 226{
209 p4d_t *p4d = p4d_offset(pgd, addr); 227 p4d_t *p4d = p4d_offset(pgd, addr);
210 if (p4d_none(*p4d)) { 228 if (p4d_none(*p4d)) {
211 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 229 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
212 if (!p) 230 if (!p)
213 return NULL; 231 return NULL;
214 p4d_populate(&init_mm, p4d, p); 232 p4d_populate(&init_mm, p4d, p);
@@ -220,7 +238,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
220{ 238{
221 pgd_t *pgd = pgd_offset_k(addr); 239 pgd_t *pgd = pgd_offset_k(addr);
222 if (pgd_none(*pgd)) { 240 if (pgd_none(*pgd)) {
223 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 241 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
224 if (!p) 242 if (!p)
225 return NULL; 243 return NULL;
226 pgd_populate(&init_mm, pgd, p); 244 pgd_populate(&init_mm, pgd, p);
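
The sparse-vmemmap changes above switch the early allocation to memblock_virt_alloc_try_nid_raw(), which is presumably allowed to return non-zeroed memory since the struct page array is fully initialized later anyway, and route all page-table allocations through the new vmemmap_alloc_block_zero() so that only table pages pay for an explicit clear. A minimal sketch of that split, with an illustrative name:

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/string.h>

static void * __meminit example_table_page(int node)
{
	/* May now come back non-zeroed (raw memblock or plain alloc_pages). */
	void *p = vmemmap_alloc_block(PAGE_SIZE, node);

	if (!p)
		return NULL;
	memset(p, 0, PAGE_SIZE);	/* page-table pages must start out clear */
	return p;
}
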
diff --git a/mm/sparse.c b/mm/sparse.c
index 60805abf98af..7a5dacaa06e3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -453,9 +453,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
453 } 453 }
454 454
455 size = PAGE_ALIGN(size); 455 size = PAGE_ALIGN(size);
456 map = memblock_virt_alloc_try_nid(size * map_count, 456 map = memblock_virt_alloc_try_nid_raw(size * map_count,
457 PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 457 PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
458 BOOTMEM_ALLOC_ACCESSIBLE, nodeid); 458 BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
459 if (map) { 459 if (map) {
460 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 460 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
461 if (!present_section_nr(pnum)) 461 if (!present_section_nr(pnum))
diff --git a/mm/swap.c b/mm/swap.c
index a77d68f2c1b6..38e1b6374a97 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -76,7 +76,7 @@ static void __page_cache_release(struct page *page)
76static void __put_single_page(struct page *page) 76static void __put_single_page(struct page *page)
77{ 77{
78 __page_cache_release(page); 78 __page_cache_release(page);
79 free_hot_cold_page(page, false); 79 free_unref_page(page);
80} 80}
81 81
82static void __put_compound_page(struct page *page) 82static void __put_compound_page(struct page *page)
@@ -210,7 +210,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
210 } 210 }
211 if (pgdat) 211 if (pgdat)
212 spin_unlock_irqrestore(&pgdat->lru_lock, flags); 212 spin_unlock_irqrestore(&pgdat->lru_lock, flags);
213 release_pages(pvec->pages, pvec->nr, pvec->cold); 213 release_pages(pvec->pages, pvec->nr);
214 pagevec_reinit(pvec); 214 pagevec_reinit(pvec);
215} 215}
216 216
@@ -740,7 +740,7 @@ void lru_add_drain_all(void)
740 * Decrement the reference count on all the pages in @pages. If it 740 * Decrement the reference count on all the pages in @pages. If it
741 * fell to zero, remove the page from the LRU and free it. 741 * fell to zero, remove the page from the LRU and free it.
742 */ 742 */
743void release_pages(struct page **pages, int nr, bool cold) 743void release_pages(struct page **pages, int nr)
744{ 744{
745 int i; 745 int i;
746 LIST_HEAD(pages_to_free); 746 LIST_HEAD(pages_to_free);
@@ -817,7 +817,7 @@ void release_pages(struct page **pages, int nr, bool cold)
817 spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); 817 spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
818 818
819 mem_cgroup_uncharge_list(&pages_to_free); 819 mem_cgroup_uncharge_list(&pages_to_free);
820 free_hot_cold_page_list(&pages_to_free, cold); 820 free_unref_page_list(&pages_to_free);
821} 821}
822EXPORT_SYMBOL(release_pages); 822EXPORT_SYMBOL(release_pages);
823 823
@@ -833,8 +833,11 @@ EXPORT_SYMBOL(release_pages);
833 */ 833 */
834void __pagevec_release(struct pagevec *pvec) 834void __pagevec_release(struct pagevec *pvec)
835{ 835{
836 lru_add_drain(); 836 if (!pvec->percpu_pvec_drained) {
837 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); 837 lru_add_drain();
838 pvec->percpu_pvec_drained = true;
839 }
840 release_pages(pvec->pages, pagevec_count(pvec));
838 pagevec_reinit(pvec); 841 pagevec_reinit(pvec);
839} 842}
840EXPORT_SYMBOL(__pagevec_release); 843EXPORT_SYMBOL(__pagevec_release);
@@ -986,15 +989,25 @@ unsigned pagevec_lookup_range(struct pagevec *pvec,
986} 989}
987EXPORT_SYMBOL(pagevec_lookup_range); 990EXPORT_SYMBOL(pagevec_lookup_range);
988 991
989unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, 992unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
990 pgoff_t *index, int tag, unsigned nr_pages) 993 struct address_space *mapping, pgoff_t *index, pgoff_t end,
994 int tag)
991{ 995{
992 pvec->nr = find_get_pages_tag(mapping, index, tag, 996 pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
993 nr_pages, pvec->pages); 997 PAGEVEC_SIZE, pvec->pages);
994 return pagevec_count(pvec); 998 return pagevec_count(pvec);
995} 999}
996EXPORT_SYMBOL(pagevec_lookup_tag); 1000EXPORT_SYMBOL(pagevec_lookup_range_tag);
997 1001
1002unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
1003 struct address_space *mapping, pgoff_t *index, pgoff_t end,
1004 int tag, unsigned max_pages)
1005{
1006 pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
1007 min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
1008 return pagevec_count(pvec);
1009}
1010EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
998/* 1011/*
999 * Perform any setup for the swap system 1012 * Perform any setup for the swap system
1000 */ 1013 */
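
With the hunks above, release_pages() and pagevec_init() lose their cold argument and tagged pagevec lookups take an explicit end index. A hedged usage sketch of the new calling convention; the function name and the dirty-tag walk are illustrative only:

#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/sched.h>

static void example_walk_dirty_range(struct address_space *mapping,
				     pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t index = start;
	unsigned int i;

	pagevec_init(&pvec);		/* no "cold" argument any more */
	while (pagevec_lookup_range_tag(&pvec, mapping, &index, end,
					PAGECACHE_TAG_DIRTY)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			/* operate on pvec.pages[i] here */
		}
		pagevec_release(&pvec);	/* ends up in release_pages(pages, nr) */
		cond_resched();
	}
}
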
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index d81cfc5a43d5..bebc19292018 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -149,6 +149,13 @@ static int alloc_swap_slot_cache(unsigned int cpu)
149 cache->nr = 0; 149 cache->nr = 0;
150 cache->cur = 0; 150 cache->cur = 0;
151 cache->n_ret = 0; 151 cache->n_ret = 0;
152 /*
153 * We initialized alloc_lock and free_lock earlier. We use
154 * !cache->slots or !cache->slots_ret to know if it is safe to acquire
155 * the corresponding lock and use the cache. Memory barrier below
156 * ensures the assumption.
157 */
158 mb();
152 cache->slots = slots; 159 cache->slots = slots;
153 slots = NULL; 160 slots = NULL;
154 cache->slots_ret = slots_ret; 161 cache->slots_ret = slots_ret;
@@ -275,7 +282,7 @@ int free_swap_slot(swp_entry_t entry)
275 struct swap_slots_cache *cache; 282 struct swap_slots_cache *cache;
276 283
277 cache = raw_cpu_ptr(&swp_slots); 284 cache = raw_cpu_ptr(&swp_slots);
278 if (use_swap_slot_cache && cache->slots_ret) { 285 if (likely(use_swap_slot_cache && cache->slots_ret)) {
279 spin_lock_irq(&cache->free_lock); 286 spin_lock_irq(&cache->free_lock);
280 /* Swap slots cache may be deactivated before acquiring lock */ 287 /* Swap slots cache may be deactivated before acquiring lock */
281 if (!use_swap_slot_cache || !cache->slots_ret) { 288 if (!use_swap_slot_cache || !cache->slots_ret) {
@@ -326,7 +333,7 @@ swp_entry_t get_swap_page(struct page *page)
326 */ 333 */
327 cache = raw_cpu_ptr(&swp_slots); 334 cache = raw_cpu_ptr(&swp_slots);
328 335
329 if (check_cache_active()) { 336 if (likely(check_cache_active() && cache->slots)) {
330 mutex_lock(&cache->alloc_lock); 337 mutex_lock(&cache->alloc_lock);
331 if (cache->slots) { 338 if (cache->slots) {
332repeat: 339repeat:
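
The mb() added in alloc_swap_slot_cache() above pairs the lock/field initialization with the cache->slots and cache->slots_ret pointers that free_swap_slot() and get_swap_page() now test (under likely()) before touching the cache. A minimal sketch of that publish-then-use ordering, with illustrative names:

#include <linux/spinlock.h>
#include <asm/barrier.h>

struct example_cache {
	spinlock_t	free_lock;
	unsigned int	nr;
	void		*slots_ret;	/* readers test this before locking */
};

static void example_publish(struct example_cache *cache, void *slots_ret)
{
	spin_lock_init(&cache->free_lock);
	cache->nr = 0;
	/*
	 * Order the initialization above against the store that makes the
	 * cache visible; readers re-check the pointer after taking the lock,
	 * as the swap-slots code does.
	 */
	mb();
	cache->slots_ret = slots_ret;
}
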
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 326439428daf..39ae7cfad90f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -36,9 +36,9 @@ static const struct address_space_operations swap_aops = {
36#endif 36#endif
37}; 37};
38 38
39struct address_space *swapper_spaces[MAX_SWAPFILES]; 39struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
40static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; 40static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
41bool swap_vma_readahead = true; 41bool swap_vma_readahead __read_mostly = true;
42 42
43#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) 43#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
44#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) 44#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
@@ -319,7 +319,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
319 lru_add_drain(); 319 lru_add_drain();
320 for (i = 0; i < nr; i++) 320 for (i = 0; i < nr; i++)
321 free_swap_cache(pagep[i]); 321 free_swap_cache(pagep[i]);
322 release_pages(pagep, nr, false); 322 release_pages(pagep, nr);
323} 323}
324 324
325/* 325/*
@@ -559,6 +559,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
559 unsigned long offset = entry_offset; 559 unsigned long offset = entry_offset;
560 unsigned long start_offset, end_offset; 560 unsigned long start_offset, end_offset;
561 unsigned long mask; 561 unsigned long mask;
562 struct swap_info_struct *si = swp_swap_info(entry);
562 struct blk_plug plug; 563 struct blk_plug plug;
563 bool do_poll = true, page_allocated; 564 bool do_poll = true, page_allocated;
564 565
@@ -572,6 +573,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
572 end_offset = offset | mask; 573 end_offset = offset | mask;
573 if (!start_offset) /* First page is swap header. */ 574 if (!start_offset) /* First page is swap header. */
574 start_offset++; 575 start_offset++;
576 if (end_offset >= si->max)
577 end_offset = si->max - 1;
575 578
576 blk_start_plug(&plug); 579 blk_start_plug(&plug);
577 for (offset = start_offset; offset <= end_offset ; offset++) { 580 for (offset = start_offset; offset <= end_offset ; offset++) {
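
The swapin_readahead() hunk above looks up the swap_info_struct for the faulting entry and clamps the readahead window so it never runs past the last valid slot of the device. A small sketch of that clamp, assuming the same mask-based window computation (names illustrative):

#include <linux/swap.h>

static void example_clamp_ra_window(struct swap_info_struct *si,
				    unsigned long offset, unsigned long mask,
				    unsigned long *start, unsigned long *end)
{
	*start = offset & ~mask;
	*end = offset | mask;
	if (!*start)			/* first slot holds the swap header */
		(*start)++;
	if (*end >= si->max)		/* never read beyond the device */
		*end = si->max - 1;
}
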
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e47a21e64764..3074b02eaa09 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1328,6 +1328,13 @@ int page_swapcount(struct page *page)
1328 return count; 1328 return count;
1329} 1329}
1330 1330
1331int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
1332{
1333 pgoff_t offset = swp_offset(entry);
1334
1335 return swap_count(si->swap_map[offset]);
1336}
1337
1331static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) 1338static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
1332{ 1339{
1333 int count = 0; 1340 int count = 0;
@@ -3169,6 +3176,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
3169 if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) 3176 if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
3170 p->flags |= SWP_STABLE_WRITES; 3177 p->flags |= SWP_STABLE_WRITES;
3171 3178
3179 if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
3180 p->flags |= SWP_SYNCHRONOUS_IO;
3181
3172 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { 3182 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
3173 int cpu; 3183 int cpu;
3174 unsigned long ci, nr_cluster; 3184 unsigned long ci, nr_cluster;
@@ -3452,10 +3462,15 @@ int swapcache_prepare(swp_entry_t entry)
3452 return __swap_duplicate(entry, SWAP_HAS_CACHE); 3462 return __swap_duplicate(entry, SWAP_HAS_CACHE);
3453} 3463}
3454 3464
3465struct swap_info_struct *swp_swap_info(swp_entry_t entry)
3466{
3467 return swap_info[swp_type(entry)];
3468}
3469
3455struct swap_info_struct *page_swap_info(struct page *page) 3470struct swap_info_struct *page_swap_info(struct page *page)
3456{ 3471{
3457 swp_entry_t swap = { .val = page_private(page) }; 3472 swp_entry_t entry = { .val = page_private(page) };
3458 return swap_info[swp_type(swap)]; 3473 return swp_swap_info(entry);
3459} 3474}
3460 3475
3461/* 3476/*
@@ -3463,7 +3478,6 @@ struct swap_info_struct *page_swap_info(struct page *page)
3463 */ 3478 */
3464struct address_space *__page_file_mapping(struct page *page) 3479struct address_space *__page_file_mapping(struct page *page)
3465{ 3480{
3466 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
3467 return page_swap_info(page)->swap_file->f_mapping; 3481 return page_swap_info(page)->swap_file->f_mapping;
3468} 3482}
3469EXPORT_SYMBOL_GPL(__page_file_mapping); 3483EXPORT_SYMBOL_GPL(__page_file_mapping);
@@ -3471,7 +3485,6 @@ EXPORT_SYMBOL_GPL(__page_file_mapping);
3471pgoff_t __page_file_index(struct page *page) 3485pgoff_t __page_file_index(struct page *page)
3472{ 3486{
3473 swp_entry_t swap = { .val = page_private(page) }; 3487 swp_entry_t swap = { .val = page_private(page) };
3474 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
3475 return swp_offset(swap); 3488 return swp_offset(swap);
3476} 3489}
3477EXPORT_SYMBOL_GPL(__page_file_index); 3490EXPORT_SYMBOL_GPL(__page_file_index);
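
mm/swapfile.c gains swp_swap_info() and __swap_count() above, and page_swap_info() is rewritten on top of the former. Assuming both helpers are exposed through the usual swap headers, a caller could look up the map count for an entry roughly like this (illustrative function name):

#include <linux/swap.h>

static int example_entry_count(swp_entry_t entry)
{
	struct swap_info_struct *si = swp_swap_info(entry);

	return __swap_count(si, entry);
}
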
diff --git a/mm/truncate.c b/mm/truncate.c
index 2330223841fb..e4b4cf0f4070 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -25,44 +25,85 @@
25#include <linux/rmap.h> 25#include <linux/rmap.h>
26#include "internal.h" 26#include "internal.h"
27 27
28static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, 28/*
29 void *entry) 29 * Regular page slots are stabilized by the page lock even without the tree
30 * itself locked. These unlocked entries need verification under the tree
31 * lock.
32 */
33static inline void __clear_shadow_entry(struct address_space *mapping,
34 pgoff_t index, void *entry)
30{ 35{
31 struct radix_tree_node *node; 36 struct radix_tree_node *node;
32 void **slot; 37 void **slot;
33 38
34 spin_lock_irq(&mapping->tree_lock);
35 /*
36 * Regular page slots are stabilized by the page lock even
37 * without the tree itself locked. These unlocked entries
38 * need verification under the tree lock.
39 */
40 if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) 39 if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
41 goto unlock; 40 return;
42 if (*slot != entry) 41 if (*slot != entry)
43 goto unlock; 42 return;
44 __radix_tree_replace(&mapping->page_tree, node, slot, NULL, 43 __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
45 workingset_update_node, mapping); 44 workingset_update_node);
46 mapping->nrexceptional--; 45 mapping->nrexceptional--;
47unlock: 46}
47
48static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
49 void *entry)
50{
51 spin_lock_irq(&mapping->tree_lock);
52 __clear_shadow_entry(mapping, index, entry);
48 spin_unlock_irq(&mapping->tree_lock); 53 spin_unlock_irq(&mapping->tree_lock);
49} 54}
50 55
51/* 56/*
52 * Unconditionally remove exceptional entry. Usually called from truncate path. 57 * Unconditionally remove exceptional entries. Usually called from truncate
58 * path. Note that the pagevec may be altered by this function by removing
59 * exceptional entries similar to what pagevec_remove_exceptionals does.
53 */ 60 */
54static void truncate_exceptional_entry(struct address_space *mapping, 61static void truncate_exceptional_pvec_entries(struct address_space *mapping,
55 pgoff_t index, void *entry) 62 struct pagevec *pvec, pgoff_t *indices,
63 pgoff_t end)
56{ 64{
65 int i, j;
66 bool dax, lock;
67
57 /* Handled by shmem itself */ 68 /* Handled by shmem itself */
58 if (shmem_mapping(mapping)) 69 if (shmem_mapping(mapping))
59 return; 70 return;
60 71
61 if (dax_mapping(mapping)) { 72 for (j = 0; j < pagevec_count(pvec); j++)
62 dax_delete_mapping_entry(mapping, index); 73 if (radix_tree_exceptional_entry(pvec->pages[j]))
74 break;
75
76 if (j == pagevec_count(pvec))
63 return; 77 return;
78
79 dax = dax_mapping(mapping);
80 lock = !dax && indices[j] < end;
81 if (lock)
82 spin_lock_irq(&mapping->tree_lock);
83
84 for (i = j; i < pagevec_count(pvec); i++) {
85 struct page *page = pvec->pages[i];
86 pgoff_t index = indices[i];
87
88 if (!radix_tree_exceptional_entry(page)) {
89 pvec->pages[j++] = page;
90 continue;
91 }
92
93 if (index >= end)
94 continue;
95
96 if (unlikely(dax)) {
97 dax_delete_mapping_entry(mapping, index);
98 continue;
99 }
100
101 __clear_shadow_entry(mapping, index, page);
64 } 102 }
65 clear_shadow_entry(mapping, index, entry); 103
104 if (lock)
105 spin_unlock_irq(&mapping->tree_lock);
106 pvec->nr = j;
66} 107}
67 108
68/* 109/*
@@ -134,11 +175,17 @@ void do_invalidatepage(struct page *page, unsigned int offset,
134 * its lock, b) when a concurrent invalidate_mapping_pages got there first and 175 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
135 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 176 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
136 */ 177 */
137static int 178static void
138truncate_complete_page(struct address_space *mapping, struct page *page) 179truncate_cleanup_page(struct address_space *mapping, struct page *page)
139{ 180{
140 if (page->mapping != mapping) 181 if (page_mapped(page)) {
141 return -EIO; 182 loff_t holelen;
183
184 holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE;
185 unmap_mapping_range(mapping,
186 (loff_t)page->index << PAGE_SHIFT,
187 holelen, 0);
188 }
142 189
143 if (page_has_private(page)) 190 if (page_has_private(page))
144 do_invalidatepage(page, 0, PAGE_SIZE); 191 do_invalidatepage(page, 0, PAGE_SIZE);
@@ -150,8 +197,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
150 */ 197 */
151 cancel_dirty_page(page); 198 cancel_dirty_page(page);
152 ClearPageMappedToDisk(page); 199 ClearPageMappedToDisk(page);
153 delete_from_page_cache(page);
154 return 0;
155} 200}
156 201
157/* 202/*
@@ -180,16 +225,14 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
180 225
181int truncate_inode_page(struct address_space *mapping, struct page *page) 226int truncate_inode_page(struct address_space *mapping, struct page *page)
182{ 227{
183 loff_t holelen;
184 VM_BUG_ON_PAGE(PageTail(page), page); 228 VM_BUG_ON_PAGE(PageTail(page), page);
185 229
186 holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; 230 if (page->mapping != mapping)
187 if (page_mapped(page)) { 231 return -EIO;
188 unmap_mapping_range(mapping, 232
189 (loff_t)page->index << PAGE_SHIFT, 233 truncate_cleanup_page(mapping, page);
190 holelen, 0); 234 delete_from_page_cache(page);
191 } 235 return 0;
192 return truncate_complete_page(mapping, page);
193} 236}
194 237
195/* 238/*
@@ -287,11 +330,19 @@ void truncate_inode_pages_range(struct address_space *mapping,
287 else 330 else
288 end = (lend + 1) >> PAGE_SHIFT; 331 end = (lend + 1) >> PAGE_SHIFT;
289 332
290 pagevec_init(&pvec, 0); 333 pagevec_init(&pvec);
291 index = start; 334 index = start;
292 while (index < end && pagevec_lookup_entries(&pvec, mapping, index, 335 while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
293 min(end - index, (pgoff_t)PAGEVEC_SIZE), 336 min(end - index, (pgoff_t)PAGEVEC_SIZE),
294 indices)) { 337 indices)) {
338 /*
339 * Pagevec array has exceptional entries and we may also fail
340 * to lock some pages. So we store pages that can be deleted
341 * in a new pagevec.
342 */
343 struct pagevec locked_pvec;
344
345 pagevec_init(&locked_pvec);
295 for (i = 0; i < pagevec_count(&pvec); i++) { 346 for (i = 0; i < pagevec_count(&pvec); i++) {
296 struct page *page = pvec.pages[i]; 347 struct page *page = pvec.pages[i];
297 348
@@ -300,11 +351,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
300 if (index >= end) 351 if (index >= end)
301 break; 352 break;
302 353
303 if (radix_tree_exceptional_entry(page)) { 354 if (radix_tree_exceptional_entry(page))
304 truncate_exceptional_entry(mapping, index,
305 page);
306 continue; 355 continue;
307 }
308 356
309 if (!trylock_page(page)) 357 if (!trylock_page(page))
310 continue; 358 continue;
@@ -313,15 +361,22 @@ void truncate_inode_pages_range(struct address_space *mapping,
313 unlock_page(page); 361 unlock_page(page);
314 continue; 362 continue;
315 } 363 }
316 truncate_inode_page(mapping, page); 364 if (page->mapping != mapping) {
317 unlock_page(page); 365 unlock_page(page);
366 continue;
367 }
368 pagevec_add(&locked_pvec, page);
318 } 369 }
319 pagevec_remove_exceptionals(&pvec); 370 for (i = 0; i < pagevec_count(&locked_pvec); i++)
371 truncate_cleanup_page(mapping, locked_pvec.pages[i]);
372 delete_from_page_cache_batch(mapping, &locked_pvec);
373 for (i = 0; i < pagevec_count(&locked_pvec); i++)
374 unlock_page(locked_pvec.pages[i]);
375 truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
320 pagevec_release(&pvec); 376 pagevec_release(&pvec);
321 cond_resched(); 377 cond_resched();
322 index++; 378 index++;
323 } 379 }
324
325 if (partial_start) { 380 if (partial_start) {
326 struct page *page = find_lock_page(mapping, start - 1); 381 struct page *page = find_lock_page(mapping, start - 1);
327 if (page) { 382 if (page) {
@@ -379,6 +434,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
379 pagevec_release(&pvec); 434 pagevec_release(&pvec);
380 break; 435 break;
381 } 436 }
437
382 for (i = 0; i < pagevec_count(&pvec); i++) { 438 for (i = 0; i < pagevec_count(&pvec); i++) {
383 struct page *page = pvec.pages[i]; 439 struct page *page = pvec.pages[i];
384 440
@@ -390,11 +446,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
390 break; 446 break;
391 } 447 }
392 448
393 if (radix_tree_exceptional_entry(page)) { 449 if (radix_tree_exceptional_entry(page))
394 truncate_exceptional_entry(mapping, index,
395 page);
396 continue; 450 continue;
397 }
398 451
399 lock_page(page); 452 lock_page(page);
400 WARN_ON(page_to_index(page) != index); 453 WARN_ON(page_to_index(page) != index);
@@ -402,7 +455,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
402 truncate_inode_page(mapping, page); 455 truncate_inode_page(mapping, page);
403 unlock_page(page); 456 unlock_page(page);
404 } 457 }
405 pagevec_remove_exceptionals(&pvec); 458 truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
406 pagevec_release(&pvec); 459 pagevec_release(&pvec);
407 index++; 460 index++;
408 } 461 }
@@ -500,7 +553,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
500 unsigned long count = 0; 553 unsigned long count = 0;
501 int i; 554 int i;
502 555
503 pagevec_init(&pvec, 0); 556 pagevec_init(&pvec);
504 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, 557 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
505 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 558 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
506 indices)) { 559 indices)) {
@@ -630,7 +683,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
630 if (mapping->nrpages == 0 && mapping->nrexceptional == 0) 683 if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
631 goto out; 684 goto out;
632 685
633 pagevec_init(&pvec, 0); 686 pagevec_init(&pvec);
634 index = start; 687 index = start;
635 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, 688 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
636 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 689 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 15b483ef6440..c02c850ea349 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1349,7 +1349,7 @@ keep:
1349 1349
1350 mem_cgroup_uncharge_list(&free_pages); 1350 mem_cgroup_uncharge_list(&free_pages);
1351 try_to_unmap_flush(); 1351 try_to_unmap_flush();
1352 free_hot_cold_page_list(&free_pages, true); 1352 free_unref_page_list(&free_pages);
1353 1353
1354 list_splice(&ret_pages, page_list); 1354 list_splice(&ret_pages, page_list);
1355 count_vm_events(PGACTIVATE, pgactivate); 1355 count_vm_events(PGACTIVATE, pgactivate);
@@ -1824,7 +1824,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1824 spin_unlock_irq(&pgdat->lru_lock); 1824 spin_unlock_irq(&pgdat->lru_lock);
1825 1825
1826 mem_cgroup_uncharge_list(&page_list); 1826 mem_cgroup_uncharge_list(&page_list);
1827 free_hot_cold_page_list(&page_list, true); 1827 free_unref_page_list(&page_list);
1828 1828
1829 /* 1829 /*
1830 * If reclaim is isolating dirty pages under writeback, it implies 1830 * If reclaim is isolating dirty pages under writeback, it implies
@@ -2063,7 +2063,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
2063 spin_unlock_irq(&pgdat->lru_lock); 2063 spin_unlock_irq(&pgdat->lru_lock);
2064 2064
2065 mem_cgroup_uncharge_list(&l_hold); 2065 mem_cgroup_uncharge_list(&l_hold);
2066 free_hot_cold_page_list(&l_hold, true); 2066 free_unref_page_list(&l_hold);
2067 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, 2067 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2068 nr_deactivate, nr_rotated, sc->priority, file); 2068 nr_deactivate, nr_rotated, sc->priority, file);
2069} 2069}
@@ -2082,7 +2082,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
2082 * If that fails and refaulting is observed, the inactive list grows. 2082 * If that fails and refaulting is observed, the inactive list grows.
2083 * 2083 *
2084 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages 2084 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
2085 * on this LRU, maintained by the pageout code. A zone->inactive_ratio 2085 * on this LRU, maintained by the pageout code. An inactive_ratio
2086 * of 3 means 3:1 or 25% of the pages are kept on the inactive list. 2086 * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
2087 * 2087 *
2088 * total target max 2088 * total target max
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4bb13e72ac97..40b2db6db6b1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -32,6 +32,77 @@
32 32
33#define NUMA_STATS_THRESHOLD (U16_MAX - 2) 33#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
34 34
35#ifdef CONFIG_NUMA
36int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
37
38/* zero numa counters within a zone */
39static void zero_zone_numa_counters(struct zone *zone)
40{
41 int item, cpu;
42
43 for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
44 atomic_long_set(&zone->vm_numa_stat[item], 0);
45 for_each_online_cpu(cpu)
46 per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
47 = 0;
48 }
49}
50
51/* zero numa counters of all the populated zones */
52static void zero_zones_numa_counters(void)
53{
54 struct zone *zone;
55
56 for_each_populated_zone(zone)
57 zero_zone_numa_counters(zone);
58}
59
60/* zero global numa counters */
61static void zero_global_numa_counters(void)
62{
63 int item;
64
65 for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
66 atomic_long_set(&vm_numa_stat[item], 0);
67}
68
69static void invalid_numa_statistics(void)
70{
71 zero_zones_numa_counters();
72 zero_global_numa_counters();
73}
74
75static DEFINE_MUTEX(vm_numa_stat_lock);
76
77int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
78 void __user *buffer, size_t *length, loff_t *ppos)
79{
80 int ret, oldval;
81
82 mutex_lock(&vm_numa_stat_lock);
83 if (write)
84 oldval = sysctl_vm_numa_stat;
85 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
86 if (ret || !write)
87 goto out;
88
89 if (oldval == sysctl_vm_numa_stat)
90 goto out;
91 else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
92 static_branch_enable(&vm_numa_stat_key);
93 pr_info("enable numa statistics\n");
94 } else {
95 static_branch_disable(&vm_numa_stat_key);
96 invalid_numa_statistics();
97 pr_info("disable numa statistics, and clear numa counters\n");
98 }
99
100out:
101 mutex_unlock(&vm_numa_stat_lock);
102 return ret;
103}
104#endif
105
35#ifdef CONFIG_VM_EVENT_COUNTERS 106#ifdef CONFIG_VM_EVENT_COUNTERS
36DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 107DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
37EXPORT_PER_CPU_SYMBOL(vm_event_states); 108EXPORT_PER_CPU_SYMBOL(vm_event_states);
@@ -1564,11 +1635,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1564 } 1635 }
1565 seq_printf(m, 1636 seq_printf(m,
1566 "\n node_unreclaimable: %u" 1637 "\n node_unreclaimable: %u"
1567 "\n start_pfn: %lu" 1638 "\n start_pfn: %lu",
1568 "\n node_inactive_ratio: %u",
1569 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, 1639 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
1570 zone->zone_start_pfn, 1640 zone->zone_start_pfn);
1571 zone->zone_pgdat->inactive_ratio);
1572 seq_putc(m, '\n'); 1641 seq_putc(m, '\n');
1573} 1642}
1574 1643
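
The new sysctl_vm_numa_stat_handler() above serializes writes with a private mutex, lets proc_dointvec_minmax() do the parsing, and only flips the vm_numa_stat_key static branch (clearing the NUMA counters on disable) when the value actually changed. A minimal sketch of that sysctl-toggled static-branch pattern, with hypothetical names and assuming the ctl_table's .data points at example_enabled:

#include <linux/sysctl.h>
#include <linux/jump_label.h>
#include <linux/mutex.h>
#include <linux/printk.h>

static DEFINE_STATIC_KEY_TRUE(example_key);
static DEFINE_MUTEX(example_lock);
static int example_enabled = 1;		/* wired up via ctl_table.data */

static int example_sysctl_handler(struct ctl_table *table, int write,
				  void __user *buffer, size_t *length,
				  loff_t *ppos)
{
	int ret, oldval;

	mutex_lock(&example_lock);
	oldval = example_enabled;
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (!ret && write && oldval != example_enabled) {
		if (example_enabled)
			static_branch_enable(&example_key);
		else
			static_branch_disable(&example_key);
		pr_info("example toggle is now %d\n", example_enabled);
	}
	mutex_unlock(&example_lock);
	return ret;
}
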
diff --git a/mm/workingset.c b/mm/workingset.c
index b997c9de28f6..b7d616a3bbbe 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -340,14 +340,8 @@ out:
340 340
341static struct list_lru shadow_nodes; 341static struct list_lru shadow_nodes;
342 342
343void workingset_update_node(struct radix_tree_node *node, void *private) 343void workingset_update_node(struct radix_tree_node *node)
344{ 344{
345 struct address_space *mapping = private;
346
347 /* Only regular page cache has shadow entries */
348 if (dax_mapping(mapping) || shmem_mapping(mapping))
349 return;
350
351 /* 345 /*
352 * Track non-empty nodes that contain only shadow entries; 346 * Track non-empty nodes that contain only shadow entries;
353 * unlink those that contain pages or are being freed. 347 * unlink those that contain pages or are being freed.
@@ -475,7 +469,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
475 goto out_invalid; 469 goto out_invalid;
476 inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); 470 inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
477 __radix_tree_delete_node(&mapping->page_tree, node, 471 __radix_tree_delete_node(&mapping->page_tree, node,
478 workingset_update_node, mapping); 472 workingset_lookup_update(mapping));
479 473
480out_invalid: 474out_invalid:
481 spin_unlock(&mapping->tree_lock); 475 spin_unlock(&mapping->tree_lock);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 7c38e850a8fc..685049a9048d 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1349,7 +1349,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1349 * pools/users, we can't allow mapping in interrupt context 1349 * pools/users, we can't allow mapping in interrupt context
1350 * because it can corrupt another users mappings. 1350 * because it can corrupt another users mappings.
1351 */ 1351 */
1352 WARN_ON_ONCE(in_interrupt()); 1352 BUG_ON(in_interrupt());
1353 1353
1354 /* From now on, migration cannot move the object */ 1354 /* From now on, migration cannot move the object */
1355 pin_tag(handle); 1355 pin_tag(handle);