Diffstat (limited to 'mm')
50 files changed, 1196 insertions, 813 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 5b0adf1435de..e5e606ee5f71 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -11,7 +11,6 @@ config DEBUG_PAGEALLOC
11 | bool "Debug page memory allocations" | 11 | bool "Debug page memory allocations" |
12 | depends on DEBUG_KERNEL | 12 | depends on DEBUG_KERNEL |
13 | depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC | 13 | depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC |
14 | depends on !KMEMCHECK | ||
15 | select PAGE_EXTENSION | 14 | select PAGE_EXTENSION |
16 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC | 15 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC |
17 | ---help--- | 16 | ---help--- |
diff --git a/mm/Makefile b/mm/Makefile
index 4659b93cba43..e7ebd176fb93 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,6 @@ KCOV_INSTRUMENT_slub.o := n
17 | KCOV_INSTRUMENT_page_alloc.o := n | 17 | KCOV_INSTRUMENT_page_alloc.o := n |
18 | KCOV_INSTRUMENT_debug-pagealloc.o := n | 18 | KCOV_INSTRUMENT_debug-pagealloc.o := n |
19 | KCOV_INSTRUMENT_kmemleak.o := n | 19 | KCOV_INSTRUMENT_kmemleak.o := n |
20 | KCOV_INSTRUMENT_kmemcheck.o := n | ||
21 | KCOV_INSTRUMENT_memcontrol.o := n | 20 | KCOV_INSTRUMENT_memcontrol.o := n |
22 | KCOV_INSTRUMENT_mmzone.o := n | 21 | KCOV_INSTRUMENT_mmzone.o := n |
23 | KCOV_INSTRUMENT_vmstat.o := n | 22 | KCOV_INSTRUMENT_vmstat.o := n |
@@ -70,7 +69,6 @@ obj-$(CONFIG_KSM) += ksm.o
70 | obj-$(CONFIG_PAGE_POISONING) += page_poison.o | 69 | obj-$(CONFIG_PAGE_POISONING) += page_poison.o |
71 | obj-$(CONFIG_SLAB) += slab.o | 70 | obj-$(CONFIG_SLAB) += slab.o |
72 | obj-$(CONFIG_SLUB) += slub.o | 71 | obj-$(CONFIG_SLUB) += slub.o |
73 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o | ||
74 | obj-$(CONFIG_KASAN) += kasan/ | 72 | obj-$(CONFIG_KASAN) += kasan/ |
75 | obj-$(CONFIG_FAILSLAB) += failslab.o | 73 | obj-$(CONFIG_FAILSLAB) += failslab.o |
76 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 74 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
diff --git a/mm/cma.c b/mm/cma.c
@@ -461,7 +461,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
461 | trace_cma_alloc(pfn, page, count, align); | 461 | trace_cma_alloc(pfn, page, count, align); |
462 | 462 | ||
463 | if (ret && !(gfp_mask & __GFP_NOWARN)) { | 463 | if (ret && !(gfp_mask & __GFP_NOWARN)) { |
464 | pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n", | 464 | pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", |
465 | __func__, count, ret); | 465 | __func__, count, ret); |
466 | cma_debug_show_areas(cma); | 466 | cma_debug_show_areas(cma); |
467 | } | 467 | } |
diff --git a/mm/debug.c b/mm/debug.c
index 6726bec731c9..d947f3e03b0d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -105,7 +105,7 @@ void dump_mm(const struct mm_struct *mm)
105 | "get_unmapped_area %p\n" | 105 | "get_unmapped_area %p\n" |
106 | #endif | 106 | #endif |
107 | "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" | 107 | "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" |
108 | "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" | 108 | "pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" |
109 | "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" | 109 | "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" |
110 | "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" | 110 | "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" |
111 | "start_code %lx end_code %lx start_data %lx end_data %lx\n" | 111 | "start_code %lx end_code %lx start_data %lx end_data %lx\n" |
@@ -135,8 +135,7 @@ void dump_mm(const struct mm_struct *mm)
135 | mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, | 135 | mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, |
136 | mm->pgd, atomic_read(&mm->mm_users), | 136 | mm->pgd, atomic_read(&mm->mm_users), |
137 | atomic_read(&mm->mm_count), | 137 | atomic_read(&mm->mm_count), |
138 | atomic_long_read((atomic_long_t *)&mm->nr_ptes), | 138 | mm_pgtables_bytes(mm), |
139 | mm_nr_pmds((struct mm_struct *)mm), | ||
140 | mm->map_count, | 139 | mm->map_count, |
141 | mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, | 140 | mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, |
142 | mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, | 141 | mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, |
diff --git a/mm/filemap.c b/mm/filemap.c
index 594d73fef8b4..923fc2ebd74a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -35,6 +35,7 @@
35 | #include <linux/hugetlb.h> | 35 | #include <linux/hugetlb.h> |
36 | #include <linux/memcontrol.h> | 36 | #include <linux/memcontrol.h> |
37 | #include <linux/cleancache.h> | 37 | #include <linux/cleancache.h> |
38 | #include <linux/shmem_fs.h> | ||
38 | #include <linux/rmap.h> | 39 | #include <linux/rmap.h> |
39 | #include "internal.h" | 40 | #include "internal.h" |
40 | 41 | ||
@@ -134,7 +135,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
134 | *shadowp = p; | 135 | *shadowp = p; |
135 | } | 136 | } |
136 | __radix_tree_replace(&mapping->page_tree, node, slot, page, | 137 | __radix_tree_replace(&mapping->page_tree, node, slot, page, |
137 | workingset_update_node, mapping); | 138 | workingset_lookup_update(mapping)); |
138 | mapping->nrpages++; | 139 | mapping->nrpages++; |
139 | return 0; | 140 | return 0; |
140 | } | 141 | } |
@@ -162,9 +163,12 @@ static void page_cache_tree_delete(struct address_space *mapping,
162 | 163 | ||
163 | radix_tree_clear_tags(&mapping->page_tree, node, slot); | 164 | radix_tree_clear_tags(&mapping->page_tree, node, slot); |
164 | __radix_tree_replace(&mapping->page_tree, node, slot, shadow, | 165 | __radix_tree_replace(&mapping->page_tree, node, slot, shadow, |
165 | workingset_update_node, mapping); | 166 | workingset_lookup_update(mapping)); |
166 | } | 167 | } |
167 | 168 | ||
169 | page->mapping = NULL; | ||
170 | /* Leave page->index set: truncation lookup relies upon it */ | ||
171 | |||
168 | if (shadow) { | 172 | if (shadow) { |
169 | mapping->nrexceptional += nr; | 173 | mapping->nrexceptional += nr; |
170 | /* | 174 | /* |
@@ -178,17 +182,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
178 | mapping->nrpages -= nr; | 182 | mapping->nrpages -= nr; |
179 | } | 183 | } |
180 | 184 | ||
181 | /* | 185 | static void unaccount_page_cache_page(struct address_space *mapping, |
182 | * Delete a page from the page cache and free it. Caller has to make | 186 | struct page *page) |
183 | * sure the page is locked and that nobody else uses it - or that usage | ||
184 | * is safe. The caller must hold the mapping's tree_lock. | ||
185 | */ | ||
186 | void __delete_from_page_cache(struct page *page, void *shadow) | ||
187 | { | 187 | { |
188 | struct address_space *mapping = page->mapping; | 188 | int nr; |
189 | int nr = hpage_nr_pages(page); | ||
190 | 189 | ||
191 | trace_mm_filemap_delete_from_page_cache(page); | ||
192 | /* | 190 | /* |
193 | * if we're uptodate, flush out into the cleancache, otherwise | 191 | * if we're uptodate, flush out into the cleancache, otherwise |
194 | * invalidate any existing cleancache entries. We can't leave | 192 | * invalidate any existing cleancache entries. We can't leave |
@@ -224,15 +222,12 @@ void __delete_from_page_cache(struct page *page, void *shadow)
224 | } | 222 | } |
225 | } | 223 | } |
226 | 224 | ||
227 | page_cache_tree_delete(mapping, page, shadow); | ||
228 | |||
229 | page->mapping = NULL; | ||
230 | /* Leave page->index set: truncation lookup relies upon it */ | ||
231 | |||
232 | /* hugetlb pages do not participate in page cache accounting. */ | 225 | /* hugetlb pages do not participate in page cache accounting. */ |
233 | if (PageHuge(page)) | 226 | if (PageHuge(page)) |
234 | return; | 227 | return; |
235 | 228 | ||
229 | nr = hpage_nr_pages(page); | ||
230 | |||
236 | __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); | 231 | __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); |
237 | if (PageSwapBacked(page)) { | 232 | if (PageSwapBacked(page)) { |
238 | __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); | 233 | __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); |
@@ -243,17 +238,51 @@ void __delete_from_page_cache(struct page *page, void *shadow)
243 | } | 238 | } |
244 | 239 | ||
245 | /* | 240 | /* |
246 | * At this point page must be either written or cleaned by truncate. | 241 | * At this point page must be either written or cleaned by |
247 | * Dirty page here signals a bug and loss of unwritten data. | 242 | * truncate. Dirty page here signals a bug and loss of |
243 | * unwritten data. | ||
248 | * | 244 | * |
249 | * This fixes dirty accounting after removing the page entirely but | 245 | * This fixes dirty accounting after removing the page entirely |
250 | * leaves PageDirty set: it has no effect for truncated page and | 246 | * but leaves PageDirty set: it has no effect for truncated |
251 | * anyway will be cleared before returning page into buddy allocator. | 247 | * page and anyway will be cleared before returning page into |
248 | * buddy allocator. | ||
252 | */ | 249 | */ |
253 | if (WARN_ON_ONCE(PageDirty(page))) | 250 | if (WARN_ON_ONCE(PageDirty(page))) |
254 | account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); | 251 | account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); |
255 | } | 252 | } |
256 | 253 | ||
254 | /* | ||
255 | * Delete a page from the page cache and free it. Caller has to make | ||
256 | * sure the page is locked and that nobody else uses it - or that usage | ||
257 | * is safe. The caller must hold the mapping's tree_lock. | ||
258 | */ | ||
259 | void __delete_from_page_cache(struct page *page, void *shadow) | ||
260 | { | ||
261 | struct address_space *mapping = page->mapping; | ||
262 | |||
263 | trace_mm_filemap_delete_from_page_cache(page); | ||
264 | |||
265 | unaccount_page_cache_page(mapping, page); | ||
266 | page_cache_tree_delete(mapping, page, shadow); | ||
267 | } | ||
268 | |||
269 | static void page_cache_free_page(struct address_space *mapping, | ||
270 | struct page *page) | ||
271 | { | ||
272 | void (*freepage)(struct page *); | ||
273 | |||
274 | freepage = mapping->a_ops->freepage; | ||
275 | if (freepage) | ||
276 | freepage(page); | ||
277 | |||
278 | if (PageTransHuge(page) && !PageHuge(page)) { | ||
279 | page_ref_sub(page, HPAGE_PMD_NR); | ||
280 | VM_BUG_ON_PAGE(page_count(page) <= 0, page); | ||
281 | } else { | ||
282 | put_page(page); | ||
283 | } | ||
284 | } | ||
285 | |||
257 | /** | 286 | /** |
258 | * delete_from_page_cache - delete page from page cache | 287 | * delete_from_page_cache - delete page from page cache |
259 | * @page: the page which the kernel is trying to remove from page cache | 288 | * @page: the page which the kernel is trying to remove from page cache |
@@ -266,27 +295,98 @@ void delete_from_page_cache(struct page *page)
266 | { | 295 | { |
267 | struct address_space *mapping = page_mapping(page); | 296 | struct address_space *mapping = page_mapping(page); |
268 | unsigned long flags; | 297 | unsigned long flags; |
269 | void (*freepage)(struct page *); | ||
270 | 298 | ||
271 | BUG_ON(!PageLocked(page)); | 299 | BUG_ON(!PageLocked(page)); |
272 | |||
273 | freepage = mapping->a_ops->freepage; | ||
274 | |||
275 | spin_lock_irqsave(&mapping->tree_lock, flags); | 300 | spin_lock_irqsave(&mapping->tree_lock, flags); |
276 | __delete_from_page_cache(page, NULL); | 301 | __delete_from_page_cache(page, NULL); |
277 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 302 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
278 | 303 | ||
279 | if (freepage) | 304 | page_cache_free_page(mapping, page); |
280 | freepage(page); | 305 | } |
306 | EXPORT_SYMBOL(delete_from_page_cache); | ||
281 | 307 | ||
282 | if (PageTransHuge(page) && !PageHuge(page)) { | 308 | /* |
283 | page_ref_sub(page, HPAGE_PMD_NR); | 309 | * page_cache_tree_delete_batch - delete several pages from page cache |
284 | VM_BUG_ON_PAGE(page_count(page) <= 0, page); | 310 | * @mapping: the mapping to which pages belong |
285 | } else { | 311 | * @pvec: pagevec with pages to delete |
286 | put_page(page); | 312 | * |
313 | * The function walks over mapping->page_tree and removes pages passed in @pvec | ||
314 | * from the radix tree. The function expects @pvec to be sorted by page index. | ||
315 | * It tolerates holes in @pvec (radix tree entries at those indices are not | ||
316 | * modified). The function expects only THP head pages to be present in the | ||
317 | * @pvec and takes care to delete all corresponding tail pages from the radix | ||
318 | * tree as well. | ||
319 | * | ||
320 | * The function expects mapping->tree_lock to be held. | ||
321 | */ | ||
322 | static void | ||
323 | page_cache_tree_delete_batch(struct address_space *mapping, | ||
324 | struct pagevec *pvec) | ||
325 | { | ||
326 | struct radix_tree_iter iter; | ||
327 | void **slot; | ||
328 | int total_pages = 0; | ||
329 | int i = 0, tail_pages = 0; | ||
330 | struct page *page; | ||
331 | pgoff_t start; | ||
332 | |||
333 | start = pvec->pages[0]->index; | ||
334 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { | ||
335 | if (i >= pagevec_count(pvec) && !tail_pages) | ||
336 | break; | ||
337 | page = radix_tree_deref_slot_protected(slot, | ||
338 | &mapping->tree_lock); | ||
339 | if (radix_tree_exceptional_entry(page)) | ||
340 | continue; | ||
341 | if (!tail_pages) { | ||
342 | /* | ||
343 | * Some page got inserted in our range? Skip it. We | ||
344 | * have our pages locked so they are protected from | ||
345 | * being removed. | ||
346 | */ | ||
347 | if (page != pvec->pages[i]) | ||
348 | continue; | ||
349 | WARN_ON_ONCE(!PageLocked(page)); | ||
350 | if (PageTransHuge(page) && !PageHuge(page)) | ||
351 | tail_pages = HPAGE_PMD_NR - 1; | ||
352 | page->mapping = NULL; | ||
353 | /* | ||
354 | * Leave page->index set: truncation lookup relies | ||
355 | * upon it | ||
356 | */ | ||
357 | i++; | ||
358 | } else { | ||
359 | tail_pages--; | ||
360 | } | ||
361 | radix_tree_clear_tags(&mapping->page_tree, iter.node, slot); | ||
362 | __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL, | ||
363 | workingset_lookup_update(mapping)); | ||
364 | total_pages++; | ||
287 | } | 365 | } |
366 | mapping->nrpages -= total_pages; | ||
367 | } | ||
368 | |||
369 | void delete_from_page_cache_batch(struct address_space *mapping, | ||
370 | struct pagevec *pvec) | ||
371 | { | ||
372 | int i; | ||
373 | unsigned long flags; | ||
374 | |||
375 | if (!pagevec_count(pvec)) | ||
376 | return; | ||
377 | |||
378 | spin_lock_irqsave(&mapping->tree_lock, flags); | ||
379 | for (i = 0; i < pagevec_count(pvec); i++) { | ||
380 | trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); | ||
381 | |||
382 | unaccount_page_cache_page(mapping, pvec->pages[i]); | ||
383 | } | ||
384 | page_cache_tree_delete_batch(mapping, pvec); | ||
385 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | ||
386 | |||
387 | for (i = 0; i < pagevec_count(pvec); i++) | ||
388 | page_cache_free_page(mapping, pvec->pages[i]); | ||
288 | } | 389 | } |
289 | EXPORT_SYMBOL(delete_from_page_cache); | ||
290 | 390 | ||
291 | int filemap_check_errors(struct address_space *mapping) | 391 | int filemap_check_errors(struct address_space *mapping) |
292 | { | 392 | { |
@@ -419,20 +519,18 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
419 | if (end_byte < start_byte) | 519 | if (end_byte < start_byte) |
420 | return; | 520 | return; |
421 | 521 | ||
422 | pagevec_init(&pvec, 0); | 522 | pagevec_init(&pvec); |
423 | while ((index <= end) && | 523 | while (index <= end) { |
424 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | ||
425 | PAGECACHE_TAG_WRITEBACK, | ||
426 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { | ||
427 | unsigned i; | 524 | unsigned i; |
428 | 525 | ||
526 | nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, | ||
527 | end, PAGECACHE_TAG_WRITEBACK); | ||
528 | if (!nr_pages) | ||
529 | break; | ||
530 | |||
429 | for (i = 0; i < nr_pages; i++) { | 531 | for (i = 0; i < nr_pages; i++) { |
430 | struct page *page = pvec.pages[i]; | 532 | struct page *page = pvec.pages[i]; |
431 | 533 | ||
432 | /* until radix tree lookup accepts end_index */ | ||
433 | if (page->index > end) | ||
434 | continue; | ||
435 | |||
436 | wait_on_page_writeback(page); | 534 | wait_on_page_writeback(page); |
437 | ClearPageError(page); | 535 | ClearPageError(page); |
438 | } | 536 | } |
@@ -1754,9 +1852,10 @@ repeat:
1754 | EXPORT_SYMBOL(find_get_pages_contig); | 1852 | EXPORT_SYMBOL(find_get_pages_contig); |
1755 | 1853 | ||
1756 | /** | 1854 | /** |
1757 | * find_get_pages_tag - find and return pages that match @tag | 1855 | * find_get_pages_range_tag - find and return pages in given range matching @tag |
1758 | * @mapping: the address_space to search | 1856 | * @mapping: the address_space to search |
1759 | * @index: the starting page index | 1857 | * @index: the starting page index |
1858 | * @end: The final page index (inclusive) | ||
1760 | * @tag: the tag index | 1859 | * @tag: the tag index |
1761 | * @nr_pages: the maximum number of pages | 1860 | * @nr_pages: the maximum number of pages |
1762 | * @pages: where the resulting pages are placed | 1861 | * @pages: where the resulting pages are placed |
@@ -1764,8 +1863,9 @@ EXPORT_SYMBOL(find_get_pages_contig);
1764 | * Like find_get_pages, except we only return pages which are tagged with | 1863 | * Like find_get_pages, except we only return pages which are tagged with |
1765 | * @tag. We update @index to index the next page for the traversal. | 1864 | * @tag. We update @index to index the next page for the traversal. |
1766 | */ | 1865 | */ |
1767 | unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | 1866 | unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, |
1768 | int tag, unsigned int nr_pages, struct page **pages) | 1867 | pgoff_t end, int tag, unsigned int nr_pages, |
1868 | struct page **pages) | ||
1769 | { | 1869 | { |
1770 | struct radix_tree_iter iter; | 1870 | struct radix_tree_iter iter; |
1771 | void **slot; | 1871 | void **slot; |
@@ -1778,6 +1878,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
1778 | radix_tree_for_each_tagged(slot, &mapping->page_tree, | 1878 | radix_tree_for_each_tagged(slot, &mapping->page_tree, |
1779 | &iter, *index, tag) { | 1879 | &iter, *index, tag) { |
1780 | struct page *head, *page; | 1880 | struct page *head, *page; |
1881 | |||
1882 | if (iter.index > end) | ||
1883 | break; | ||
1781 | repeat: | 1884 | repeat: |
1782 | page = radix_tree_deref_slot(slot); | 1885 | page = radix_tree_deref_slot(slot); |
1783 | if (unlikely(!page)) | 1886 | if (unlikely(!page)) |
@@ -1819,18 +1922,28 @@ repeat:
1819 | } | 1922 | } |
1820 | 1923 | ||
1821 | pages[ret] = page; | 1924 | pages[ret] = page; |
1822 | if (++ret == nr_pages) | 1925 | if (++ret == nr_pages) { |
1823 | break; | 1926 | *index = pages[ret - 1]->index + 1; |
1927 | goto out; | ||
1928 | } | ||
1824 | } | 1929 | } |
1825 | 1930 | ||
1931 | /* | ||
1932 | * We come here when we got at @end. We take care to not overflow the | ||
1933 | * index @index as it confuses some of the callers. This breaks the | ||
1934 | * iteration when there is page at index -1 but that is already broken | ||
1935 | * anyway. | ||
1936 | */ | ||
1937 | if (end == (pgoff_t)-1) | ||
1938 | *index = (pgoff_t)-1; | ||
1939 | else | ||
1940 | *index = end + 1; | ||
1941 | out: | ||
1826 | rcu_read_unlock(); | 1942 | rcu_read_unlock(); |
1827 | 1943 | ||
1828 | if (ret) | ||
1829 | *index = pages[ret - 1]->index + 1; | ||
1830 | |||
1831 | return ret; | 1944 | return ret; |
1832 | } | 1945 | } |
1833 | EXPORT_SYMBOL(find_get_pages_tag); | 1946 | EXPORT_SYMBOL(find_get_pages_range_tag); |
1834 | 1947 | ||
1835 | /** | 1948 | /** |
1836 | * find_get_entries_tag - find and return entries that match @tag | 1949 | * find_get_entries_tag - find and return entries that match @tag |
@@ -2159,7 +2272,7 @@ no_cached_page:
2159 | * Ok, it wasn't cached, so we need to create a new | 2272 | * Ok, it wasn't cached, so we need to create a new |
2160 | * page.. | 2273 | * page.. |
2161 | */ | 2274 | */ |
2162 | page = page_cache_alloc_cold(mapping); | 2275 | page = page_cache_alloc(mapping); |
2163 | if (!page) { | 2276 | if (!page) { |
2164 | error = -ENOMEM; | 2277 | error = -ENOMEM; |
2165 | goto out; | 2278 | goto out; |
@@ -2271,7 +2384,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
2271 | int ret; | 2384 | int ret; |
2272 | 2385 | ||
2273 | do { | 2386 | do { |
2274 | page = __page_cache_alloc(gfp_mask|__GFP_COLD); | 2387 | page = __page_cache_alloc(gfp_mask); |
2275 | if (!page) | 2388 | if (!page) |
2276 | return -ENOMEM; | 2389 | return -ENOMEM; |
2277 | 2390 | ||
@@ -2675,7 +2788,7 @@ static struct page *do_read_cache_page(struct address_space *mapping,
2675 | repeat: | 2788 | repeat: |
2676 | page = find_get_page(mapping, index); | 2789 | page = find_get_page(mapping, index); |
2677 | if (!page) { | 2790 | if (!page) { |
2678 | page = __page_cache_alloc(gfp | __GFP_COLD); | 2791 | page = __page_cache_alloc(gfp); |
2679 | if (!page) | 2792 | if (!page) |
2680 | return ERR_PTR(-ENOMEM); | 2793 | return ERR_PTR(-ENOMEM); |
2681 | err = add_to_page_cache_lru(page, mapping, index, gfp); | 2794 | err = add_to_page_cache_lru(page, mapping, index, gfp); |
diff --git a/mm/hmm.c b/mm/hmm.c
@@ -803,11 +803,10 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
803 | 803 | ||
804 | static void hmm_devmem_radix_release(struct resource *resource) | 804 | static void hmm_devmem_radix_release(struct resource *resource) |
805 | { | 805 | { |
806 | resource_size_t key, align_start, align_size, align_end; | 806 | resource_size_t key, align_start, align_size; |
807 | 807 | ||
808 | align_start = resource->start & ~(PA_SECTION_SIZE - 1); | 808 | align_start = resource->start & ~(PA_SECTION_SIZE - 1); |
809 | align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); | 809 | align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); |
810 | align_end = align_start + align_size - 1; | ||
811 | 810 | ||
812 | mutex_lock(&hmm_devmem_lock); | 811 | mutex_lock(&hmm_devmem_lock); |
813 | for (key = resource->start; | 812 | for (key = resource->start; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 003f7bcd0952..86fe697e8bfb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
606 | pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); | 606 | pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); |
607 | set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); | 607 | set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); |
608 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); | 608 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
609 | atomic_long_inc(&vma->vm_mm->nr_ptes); | 609 | mm_inc_nr_ptes(vma->vm_mm); |
610 | spin_unlock(vmf->ptl); | 610 | spin_unlock(vmf->ptl); |
611 | count_vm_event(THP_FAULT_ALLOC); | 611 | count_vm_event(THP_FAULT_ALLOC); |
612 | } | 612 | } |
@@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
662 | if (pgtable) | 662 | if (pgtable) |
663 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 663 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
664 | set_pmd_at(mm, haddr, pmd, entry); | 664 | set_pmd_at(mm, haddr, pmd, entry); |
665 | atomic_long_inc(&mm->nr_ptes); | 665 | mm_inc_nr_ptes(mm); |
666 | return true; | 666 | return true; |
667 | } | 667 | } |
668 | 668 | ||
@@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
747 | 747 | ||
748 | if (pgtable) { | 748 | if (pgtable) { |
749 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 749 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
750 | atomic_long_inc(&mm->nr_ptes); | 750 | mm_inc_nr_ptes(mm); |
751 | } | 751 | } |
752 | 752 | ||
753 | set_pmd_at(mm, addr, pmd, entry); | 753 | set_pmd_at(mm, addr, pmd, entry); |
@@ -942,7 +942,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
942 | set_pmd_at(src_mm, addr, src_pmd, pmd); | 942 | set_pmd_at(src_mm, addr, src_pmd, pmd); |
943 | } | 943 | } |
944 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); | 944 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
945 | atomic_long_inc(&dst_mm->nr_ptes); | 945 | mm_inc_nr_ptes(dst_mm); |
946 | pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); | 946 | pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); |
947 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | 947 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); |
948 | ret = 0; | 948 | ret = 0; |
@@ -978,7 +978,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
978 | get_page(src_page); | 978 | get_page(src_page); |
979 | page_dup_rmap(src_page, true); | 979 | page_dup_rmap(src_page, true); |
980 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); | 980 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
981 | atomic_long_inc(&dst_mm->nr_ptes); | 981 | mm_inc_nr_ptes(dst_mm); |
982 | pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); | 982 | pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); |
983 | 983 | ||
984 | pmdp_set_wrprotect(src_mm, addr, src_pmd); | 984 | pmdp_set_wrprotect(src_mm, addr, src_pmd); |
@@ -1189,8 +1189,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
1189 | goto out_free_pages; | 1189 | goto out_free_pages; |
1190 | VM_BUG_ON_PAGE(!PageHead(page), page); | 1190 | VM_BUG_ON_PAGE(!PageHead(page), page); |
1191 | 1191 | ||
1192 | /* | ||
1193 | * Leave pmd empty until pte is filled note we must notify here as | ||
1194 | * concurrent CPU thread might write to new page before the call to | ||
1195 | * mmu_notifier_invalidate_range_end() happens which can lead to a | ||
1196 | * device seeing memory write in different order than CPU. | ||
1197 | * | ||
1198 | * See Documentation/vm/mmu_notifier.txt | ||
1199 | */ | ||
1192 | pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); | 1200 | pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); |
1193 | /* leave pmd empty until pte is filled */ | ||
1194 | 1201 | ||
1195 | pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); | 1202 | pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); |
1196 | pmd_populate(vma->vm_mm, &_pmd, pgtable); | 1203 | pmd_populate(vma->vm_mm, &_pmd, pgtable); |
@@ -1216,7 +1223,12 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
1216 | page_remove_rmap(page, true); | 1223 | page_remove_rmap(page, true); |
1217 | spin_unlock(vmf->ptl); | 1224 | spin_unlock(vmf->ptl); |
1218 | 1225 | ||
1219 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); | 1226 | /* |
1227 | * No need to double call mmu_notifier->invalidate_range() callback as | ||
1228 | * the above pmdp_huge_clear_flush_notify() did already call it. | ||
1229 | */ | ||
1230 | mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, | ||
1231 | mmun_end); | ||
1220 | 1232 | ||
1221 | ret |= VM_FAULT_WRITE; | 1233 | ret |= VM_FAULT_WRITE; |
1222 | put_page(page); | 1234 | put_page(page); |
@@ -1365,7 +1377,12 @@ alloc:
1365 | } | 1377 | } |
1366 | spin_unlock(vmf->ptl); | 1378 | spin_unlock(vmf->ptl); |
1367 | out_mn: | 1379 | out_mn: |
1368 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); | 1380 | /* |
1381 | * No need to double call mmu_notifier->invalidate_range() callback as | ||
1382 | * the above pmdp_huge_clear_flush_notify() did already call it. | ||
1383 | */ | ||
1384 | mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, | ||
1385 | mmun_end); | ||
1369 | out: | 1386 | out: |
1370 | return ret; | 1387 | return ret; |
1371 | out_unlock: | 1388 | out_unlock: |
@@ -1678,7 +1695,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
1678 | 1695 | ||
1679 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | 1696 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
1680 | pte_free(mm, pgtable); | 1697 | pte_free(mm, pgtable); |
1681 | atomic_long_dec(&mm->nr_ptes); | 1698 | mm_dec_nr_ptes(mm); |
1682 | } | 1699 | } |
1683 | 1700 | ||
1684 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1701 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
@@ -2017,7 +2034,12 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2017 | 2034 | ||
2018 | out: | 2035 | out: |
2019 | spin_unlock(ptl); | 2036 | spin_unlock(ptl); |
2020 | mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE); | 2037 | /* |
2038 | * No need to double call mmu_notifier->invalidate_range() callback as | ||
2039 | * the above pudp_huge_clear_flush_notify() did already call it. | ||
2040 | */ | ||
2041 | mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + | ||
2042 | HPAGE_PUD_SIZE); | ||
2021 | } | 2043 | } |
2022 | #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ | 2044 | #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ |
2023 | 2045 | ||
@@ -2029,8 +2051,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2029 | pmd_t _pmd; | 2051 | pmd_t _pmd; |
2030 | int i; | 2052 | int i; |
2031 | 2053 | ||
2032 | /* leave pmd empty until pte is filled */ | 2054 | /* |
2033 | pmdp_huge_clear_flush_notify(vma, haddr, pmd); | 2055 | * Leave pmd empty until pte is filled note that it is fine to delay |
2056 | * notification until mmu_notifier_invalidate_range_end() as we are | ||
2057 | * replacing a zero pmd write protected page with a zero pte write | ||
2058 | * protected page. | ||
2059 | * | ||
2060 | * See Documentation/vm/mmu_notifier.txt | ||
2061 | */ | ||
2062 | pmdp_huge_clear_flush(vma, haddr, pmd); | ||
2034 | 2063 | ||
2035 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | 2064 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
2036 | pmd_populate(mm, &_pmd, pgtable); | 2065 | pmd_populate(mm, &_pmd, pgtable); |
@@ -2085,6 +2114,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2085 | add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); | 2114 | add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); |
2086 | return; | 2115 | return; |
2087 | } else if (is_huge_zero_pmd(*pmd)) { | 2116 | } else if (is_huge_zero_pmd(*pmd)) { |
2117 | /* | ||
2118 | * FIXME: Do we want to invalidate secondary mmu by calling | ||
2119 | * mmu_notifier_invalidate_range() see comments below inside | ||
2120 | * __split_huge_pmd() ? | ||
2121 | * | ||
2122 | * We are going from a zero huge page write protected to zero | ||
2123 | * small page also write protected so it does not seems useful | ||
2124 | * to invalidate secondary mmu at this time. | ||
2125 | */ | ||
2088 | return __split_huge_zero_page_pmd(vma, haddr, pmd); | 2126 | return __split_huge_zero_page_pmd(vma, haddr, pmd); |
2089 | } | 2127 | } |
2090 | 2128 | ||
@@ -2220,7 +2258,21 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2220 | __split_huge_pmd_locked(vma, pmd, haddr, freeze); | 2258 | __split_huge_pmd_locked(vma, pmd, haddr, freeze); |
2221 | out: | 2259 | out: |
2222 | spin_unlock(ptl); | 2260 | spin_unlock(ptl); |
2223 | mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); | 2261 | /* |
2262 | * No need to double call mmu_notifier->invalidate_range() callback. | ||
2263 | * They are 3 cases to consider inside __split_huge_pmd_locked(): | ||
2264 | * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious | ||
2265 | * 2) __split_huge_zero_page_pmd() read only zero page and any write | ||
2266 | * fault will trigger a flush_notify before pointing to a new page | ||
2267 | * (it is fine if the secondary mmu keeps pointing to the old zero | ||
2268 | * page in the meantime) | ||
2269 | * 3) Split a huge pmd into pte pointing to the same page. No need | ||
2270 | * to invalidate secondary tlb entry they are all still valid. | ||
2271 | * any further changes to individual pte will notify. So no need | ||
2272 | * to call mmu_notifier->invalidate_range() | ||
2273 | */ | ||
2274 | mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + | ||
2275 | HPAGE_PMD_SIZE); | ||
2224 | } | 2276 | } |
2225 | 2277 | ||
2226 | void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, | 2278 | void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2d2ff5e8bf2b..681b300185c0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3256,9 +3256,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3256 | set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); | 3256 | set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); |
3257 | } else { | 3257 | } else { |
3258 | if (cow) { | 3258 | if (cow) { |
3259 | /* | ||
3260 | * No need to notify as we are downgrading page | ||
3261 | * table protection not changing it to point | ||
3262 | * to a new page. | ||
3263 | * | ||
3264 | * See Documentation/vm/mmu_notifier.txt | ||
3265 | */ | ||
3259 | huge_ptep_set_wrprotect(src, addr, src_pte); | 3266 | huge_ptep_set_wrprotect(src, addr, src_pte); |
3260 | mmu_notifier_invalidate_range(src, mmun_start, | ||
3261 | mmun_end); | ||
3262 | } | 3267 | } |
3263 | entry = huge_ptep_get(src_pte); | 3268 | entry = huge_ptep_get(src_pte); |
3264 | ptepage = pte_page(entry); | 3269 | ptepage = pte_page(entry); |
@@ -4318,7 +4323,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
4318 | * and that page table be reused and filled with junk. | 4323 | * and that page table be reused and filled with junk. |
4319 | */ | 4324 | */ |
4320 | flush_hugetlb_tlb_range(vma, start, end); | 4325 | flush_hugetlb_tlb_range(vma, start, end); |
4321 | mmu_notifier_invalidate_range(mm, start, end); | 4326 | /* |
4327 | * No need to call mmu_notifier_invalidate_range() we are downgrading | ||
4328 | * page table protection not changing it to point to a new page. | ||
4329 | * | ||
4330 | * See Documentation/vm/mmu_notifier.txt | ||
4331 | */ | ||
4322 | i_mmap_unlock_write(vma->vm_file->f_mapping); | 4332 | i_mmap_unlock_write(vma->vm_file->f_mapping); |
4323 | mmu_notifier_invalidate_range_end(mm, start, end); | 4333 | mmu_notifier_invalidate_range_end(mm, start, end); |
4324 | 4334 | ||
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 6f319fb81718..405bba487df5 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -337,7 +337,7 @@ static size_t optimal_redzone(size_t object_size)
337 | } | 337 | } |
338 | 338 | ||
339 | void kasan_cache_create(struct kmem_cache *cache, size_t *size, | 339 | void kasan_cache_create(struct kmem_cache *cache, size_t *size, |
340 | unsigned long *flags) | 340 | slab_flags_t *flags) |
341 | { | 341 | { |
342 | int redzone_adjust; | 342 | int redzone_adjust; |
343 | int orig_size = *size; | 343 | int orig_size = *size; |
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 43cb3043311b..ea4ff259b671 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1270,7 +1270,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1270 | _pmd = pmdp_collapse_flush(vma, addr, pmd); | 1270 | _pmd = pmdp_collapse_flush(vma, addr, pmd); |
1271 | spin_unlock(ptl); | 1271 | spin_unlock(ptl); |
1272 | up_write(&vma->vm_mm->mmap_sem); | 1272 | up_write(&vma->vm_mm->mmap_sem); |
1273 | atomic_long_dec(&vma->vm_mm->nr_ptes); | 1273 | mm_dec_nr_ptes(vma->vm_mm); |
1274 | pte_free(vma->vm_mm, pmd_pgtable(_pmd)); | 1274 | pte_free(vma->vm_mm, pmd_pgtable(_pmd)); |
1275 | } | 1275 | } |
1276 | } | 1276 | } |
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index 800d64b854ea..cec594032515 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -1,126 +1 @@
1 | // SPDX-License-Identifier: GPL-2.0 | // SPDX-License-Identifier: GPL-2.0 | |
2 | #include <linux/gfp.h> | ||
3 | #include <linux/mm_types.h> | ||
4 | #include <linux/mm.h> | ||
5 | #include <linux/slab.h> | ||
6 | #include "slab.h" | ||
7 | #include <linux/kmemcheck.h> | ||
8 | |||
9 | void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) | ||
10 | { | ||
11 | struct page *shadow; | ||
12 | int pages; | ||
13 | int i; | ||
14 | |||
15 | pages = 1 << order; | ||
16 | |||
17 | /* | ||
18 | * With kmemcheck enabled, we need to allocate a memory area for the | ||
19 | * shadow bits as well. | ||
20 | */ | ||
21 | shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); | ||
22 | if (!shadow) { | ||
23 | if (printk_ratelimit()) | ||
24 | pr_err("kmemcheck: failed to allocate shadow bitmap\n"); | ||
25 | return; | ||
26 | } | ||
27 | |||
28 | for(i = 0; i < pages; ++i) | ||
29 | page[i].shadow = page_address(&shadow[i]); | ||
30 | |||
31 | /* | ||
32 | * Mark it as non-present for the MMU so that our accesses to | ||
33 | * this memory will trigger a page fault and let us analyze | ||
34 | * the memory accesses. | ||
35 | */ | ||
36 | kmemcheck_hide_pages(page, pages); | ||
37 | } | ||
38 | |||
39 | void kmemcheck_free_shadow(struct page *page, int order) | ||
40 | { | ||
41 | struct page *shadow; | ||
42 | int pages; | ||
43 | int i; | ||
44 | |||
45 | if (!kmemcheck_page_is_tracked(page)) | ||
46 | return; | ||
47 | |||
48 | pages = 1 << order; | ||
49 | |||
50 | kmemcheck_show_pages(page, pages); | ||
51 | |||
52 | shadow = virt_to_page(page[0].shadow); | ||
53 | |||
54 | for(i = 0; i < pages; ++i) | ||
55 | page[i].shadow = NULL; | ||
56 | |||
57 | __free_pages(shadow, order); | ||
58 | } | ||
59 | |||
60 | void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, | ||
61 | size_t size) | ||
62 | { | ||
63 | if (unlikely(!object)) /* Skip object if allocation failed */ | ||
64 | return; | ||
65 | |||
66 | /* | ||
67 | * Has already been memset(), which initializes the shadow for us | ||
68 | * as well. | ||
69 | */ | ||
70 | if (gfpflags & __GFP_ZERO) | ||
71 | return; | ||
72 | |||
73 | /* No need to initialize the shadow of a non-tracked slab. */ | ||
74 | if (s->flags & SLAB_NOTRACK) | ||
75 | return; | ||
76 | |||
77 | if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { | ||
78 | /* | ||
79 | * Allow notracked objects to be allocated from | ||
80 | * tracked caches. Note however that these objects | ||
81 | * will still get page faults on access, they just | ||
82 | * won't ever be flagged as uninitialized. If page | ||
83 | * faults are not acceptable, the slab cache itself | ||
84 | * should be marked NOTRACK. | ||
85 | */ | ||
86 | kmemcheck_mark_initialized(object, size); | ||
87 | } else if (!s->ctor) { | ||
88 | /* | ||
89 | * New objects should be marked uninitialized before | ||
90 | * they're returned to the called. | ||
91 | */ | ||
92 | kmemcheck_mark_uninitialized(object, size); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) | ||
97 | { | ||
98 | /* TODO: RCU freeing is unsupported for now; hide false positives. */ | ||
99 | if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU)) | ||
100 | kmemcheck_mark_freed(object, size); | ||
101 | } | ||
102 | |||
103 | void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, | ||
104 | gfp_t gfpflags) | ||
105 | { | ||
106 | int pages; | ||
107 | |||
108 | if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) | ||
109 | return; | ||
110 | |||
111 | pages = 1 << order; | ||
112 | |||
113 | /* | ||
114 | * NOTE: We choose to track GFP_ZERO pages too; in fact, they | ||
115 | * can become uninitialized by copying uninitialized memory | ||
116 | * into them. | ||
117 | */ | ||
118 | |||
119 | /* XXX: Can use zone->node for node? */ | ||
120 | kmemcheck_alloc_shadow(page, order, gfpflags, -1); | ||
121 | |||
122 | if (gfpflags & __GFP_ZERO) | ||
123 | kmemcheck_mark_initialized_pages(page, pages); | ||
124 | else | ||
125 | kmemcheck_mark_uninitialized_pages(page, pages); | ||
126 | } | ||
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 7780cd83a495..e4738d5e9b8c 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -110,7 +110,6 @@
110 | #include <linux/atomic.h> | 110 | #include <linux/atomic.h> |
111 | 111 | ||
112 | #include <linux/kasan.h> | 112 | #include <linux/kasan.h> |
113 | #include <linux/kmemcheck.h> | ||
114 | #include <linux/kmemleak.h> | 113 | #include <linux/kmemleak.h> |
115 | #include <linux/memory_hotplug.h> | 114 | #include <linux/memory_hotplug.h> |
116 | 115 | ||
@@ -1238,9 +1237,6 @@ static bool update_checksum(struct kmemleak_object *object)
1238 | { | 1237 | { |
1239 | u32 old_csum = object->checksum; | 1238 | u32 old_csum = object->checksum; |
1240 | 1239 | ||
1241 | if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) | ||
1242 | return false; | ||
1243 | |||
1244 | kasan_disable_current(); | 1240 | kasan_disable_current(); |
1245 | object->checksum = crc32(0, (void *)object->pointer, object->size); | 1241 | object->checksum = crc32(0, (void *)object->pointer, object->size); |
1246 | kasan_enable_current(); | 1242 | kasan_enable_current(); |
@@ -1314,11 +1310,6 @@ static void scan_block(void *_start, void *_end,
1314 | if (scan_should_stop()) | 1310 | if (scan_should_stop()) |
1315 | break; | 1311 | break; |
1316 | 1312 | ||
1317 | /* don't scan uninitialized memory */ | ||
1318 | if (!kmemcheck_is_obj_initialized((unsigned long)ptr, | ||
1319 | BYTES_PER_POINTER)) | ||
1320 | continue; | ||
1321 | |||
1322 | kasan_disable_current(); | 1313 | kasan_disable_current(); |
1323 | pointer = *ptr; | 1314 | pointer = *ptr; |
1324 | kasan_enable_current(); | 1315 | kasan_enable_current(); |
@@ -2104,7 +2095,7 @@ static int __init kmemleak_late_init(void)
2104 | return -ENOMEM; | 2095 | return -ENOMEM; |
2105 | } | 2096 | } |
2106 | 2097 | ||
2107 | dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, | 2098 | dentry = debugfs_create_file("kmemleak", 0644, NULL, NULL, |
2108 | &kmemleak_fops); | 2099 | &kmemleak_fops); |
2109 | if (!dentry) | 2100 | if (!dentry) |
2110 | pr_warn("Failed to create the debugfs kmemleak file\n"); | 2101 | pr_warn("Failed to create the debugfs kmemleak file\n"); |
diff --git a/mm/ksm.c b/mm/ksm.c
@@ -1052,8 +1052,13 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1052 | * So we clear the pte and flush the tlb before the check | 1052 | * So we clear the pte and flush the tlb before the check |
1053 | * this assure us that no O_DIRECT can happen after the check | 1053 | * this assure us that no O_DIRECT can happen after the check |
1054 | * or in the middle of the check. | 1054 | * or in the middle of the check. |
1055 | * | ||
1056 | * No need to notify as we are downgrading page table to read | ||
1057 | * only not changing it to point to a new page. | ||
1058 | * | ||
1059 | * See Documentation/vm/mmu_notifier.txt | ||
1055 | */ | 1060 | */ |
1056 | entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); | 1061 | entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); |
1057 | /* | 1062 | /* |
1058 | * Check that no O_DIRECT or similar I/O is in progress on the | 1063 | * Check that no O_DIRECT or similar I/O is in progress on the |
1059 | * page | 1064 | * page |
@@ -1136,7 +1141,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
1136 | } | 1141 | } |
1137 | 1142 | ||
1138 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | 1143 | flush_cache_page(vma, addr, pte_pfn(*ptep)); |
1139 | ptep_clear_flush_notify(vma, addr, ptep); | 1144 | /* |
1145 | * No need to notify as we are replacing a read only page with another | ||
1146 | * read only page with the same content. | ||
1147 | * | ||
1148 | * See Documentation/vm/mmu_notifier.txt | ||
1149 | */ | ||
1150 | ptep_clear_flush(vma, addr, ptep); | ||
1140 | set_pte_at_notify(mm, addr, ptep, newpte); | 1151 | set_pte_at_notify(mm, addr, ptep, newpte); |
1141 | 1152 | ||
1142 | page_remove_rmap(page, false); | 1153 | page_remove_rmap(page, false); |
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f141f0c80ff3..fd41e969ede5 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -221,6 +221,7 @@ restart:
221 | switch (ret) { | 221 | switch (ret) { |
222 | case LRU_REMOVED_RETRY: | 222 | case LRU_REMOVED_RETRY: |
223 | assert_spin_locked(&nlru->lock); | 223 | assert_spin_locked(&nlru->lock); |
224 | /* fall through */ | ||
224 | case LRU_REMOVED: | 225 | case LRU_REMOVED: |
225 | isolated++; | 226 | isolated++; |
226 | nlru->nr_items--; | 227 | nlru->nr_items--; |
diff --git a/mm/memblock.c b/mm/memblock.c
index 91205780e6b1..46aacdfa4f4d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -533,7 +533,7 @@ repeat:
533 | base = obase; | 533 | base = obase; |
534 | nr_new = 0; | 534 | nr_new = 0; |
535 | 535 | ||
536 | for_each_memblock_type(type, rgn) { | 536 | for_each_memblock_type(idx, type, rgn) { |
537 | phys_addr_t rbase = rgn->base; | 537 | phys_addr_t rbase = rgn->base; |
538 | phys_addr_t rend = rbase + rgn->size; | 538 | phys_addr_t rend = rbase + rgn->size; |
539 | 539 | ||
@@ -637,7 +637,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
637 | if (memblock_double_array(type, base, size) < 0) | 637 | if (memblock_double_array(type, base, size) < 0) |
638 | return -ENOMEM; | 638 | return -ENOMEM; |
639 | 639 | ||
640 | for_each_memblock_type(type, rgn) { | 640 | for_each_memblock_type(idx, type, rgn) { |
641 | phys_addr_t rbase = rgn->base; | 641 | phys_addr_t rbase = rgn->base; |
642 | phys_addr_t rend = rbase + rgn->size; | 642 | phys_addr_t rend = rbase + rgn->size; |
643 | 643 | ||
@@ -1327,7 +1327,6 @@ again:
1327 | return NULL; | 1327 | return NULL; |
1328 | done: | 1328 | done: |
1329 | ptr = phys_to_virt(alloc); | 1329 | ptr = phys_to_virt(alloc); |
1330 | memset(ptr, 0, size); | ||
1331 | 1330 | ||
1332 | /* | 1331 | /* |
1333 | * The min_count is set to 0 so that bootmem allocated blocks | 1332 | * The min_count is set to 0 so that bootmem allocated blocks |
@@ -1341,6 +1340,45 @@ done:
1341 | } | 1340 | } |
1342 | 1341 | ||
1343 | /** | 1342 | /** |
1343 | * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing | ||
1344 | * memory and without panicking | ||
1345 | * @size: size of memory block to be allocated in bytes | ||
1346 | * @align: alignment of the region and block's size | ||
1347 | * @min_addr: the lower bound of the memory region from where the allocation | ||
1348 | * is preferred (phys address) | ||
1349 | * @max_addr: the upper bound of the memory region from where the allocation | ||
1350 | * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to | ||
1351 | * allocate only from memory limited by memblock.current_limit value | ||
1352 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
1353 | * | ||
1354 | * Public function, provides additional debug information (including caller | ||
1355 | * info), if enabled. Does not zero allocated memory, does not panic if request | ||
1356 | * cannot be satisfied. | ||
1357 | * | ||
1358 | * RETURNS: | ||
1359 | * Virtual address of allocated memory block on success, NULL on failure. | ||
1360 | */ | ||
1361 | void * __init memblock_virt_alloc_try_nid_raw( | ||
1362 | phys_addr_t size, phys_addr_t align, | ||
1363 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
1364 | int nid) | ||
1365 | { | ||
1366 | void *ptr; | ||
1367 | |||
1368 | memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", | ||
1369 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
1370 | (u64)max_addr, (void *)_RET_IP_); | ||
1371 | |||
1372 | ptr = memblock_virt_alloc_internal(size, align, | ||
1373 | min_addr, max_addr, nid); | ||
1374 | #ifdef CONFIG_DEBUG_VM | ||
1375 | if (ptr && size > 0) | ||
1376 | memset(ptr, 0xff, size); | ||
1377 | #endif | ||
1378 | return ptr; | ||
1379 | } | ||
1380 | |||
1381 | /** | ||
1344 | * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block | 1382 | * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block |
1345 | * @size: size of memory block to be allocated in bytes | 1383 | * @size: size of memory block to be allocated in bytes |
1346 | * @align: alignment of the region and block's size | 1384 | * @align: alignment of the region and block's size |
@@ -1351,8 +1389,8 @@ done:
1351 | * allocate only from memory limited by memblock.current_limit value | 1389 | * allocate only from memory limited by memblock.current_limit value |
1352 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | 1390 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node |
1353 | * | 1391 | * |
1354 | * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides | 1392 | * Public function, provides additional debug information (including caller |
1355 | * additional debug information (including caller info), if enabled. | 1393 | * info), if enabled. This function zeroes the allocated memory. |
1356 | * | 1394 | * |
1357 | * RETURNS: | 1395 | * RETURNS: |
1358 | * Virtual address of allocated memory block on success, NULL on failure. | 1396 | * Virtual address of allocated memory block on success, NULL on failure. |
@@ -1362,11 +1400,17 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
1362 | phys_addr_t min_addr, phys_addr_t max_addr, | 1400 | phys_addr_t min_addr, phys_addr_t max_addr, |
1363 | int nid) | 1401 | int nid) |
1364 | { | 1402 | { |
1403 | void *ptr; | ||
1404 | |||
1365 | memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", | 1405 | memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", |
1366 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | 1406 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, |
1367 | (u64)max_addr, (void *)_RET_IP_); | 1407 | (u64)max_addr, (void *)_RET_IP_); |
1368 | return memblock_virt_alloc_internal(size, align, min_addr, | 1408 | |
1369 | max_addr, nid); | 1409 | ptr = memblock_virt_alloc_internal(size, align, |
1410 | min_addr, max_addr, nid); | ||
1411 | if (ptr) | ||
1412 | memset(ptr, 0, size); | ||
1413 | return ptr; | ||
1370 | } | 1414 | } |
1371 | 1415 | ||
1372 | /** | 1416 | /** |
@@ -1380,7 +1424,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
1380 | * allocate only from memory limited by memblock.current_limit value | 1424 | * allocate only from memory limited by memblock.current_limit value |
1381 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | 1425 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node |
1382 | * | 1426 | * |
1383 | * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() | 1427 | * Public panicking version of memblock_virt_alloc_try_nid_nopanic() |
1384 | * which provides debug information (including caller info), if enabled, | 1428 | * which provides debug information (including caller info), if enabled, |
1385 | * and panics if the request can not be satisfied. | 1429 | * and panics if the request can not be satisfied. |
1386 | * | 1430 | * |
@@ -1399,8 +1443,10 @@ void * __init memblock_virt_alloc_try_nid(
1399 | (u64)max_addr, (void *)_RET_IP_); | 1443 | (u64)max_addr, (void *)_RET_IP_); |
1400 | ptr = memblock_virt_alloc_internal(size, align, | 1444 | ptr = memblock_virt_alloc_internal(size, align, |
1401 | min_addr, max_addr, nid); | 1445 | min_addr, max_addr, nid); |
1402 | if (ptr) | 1446 | if (ptr) { |
1447 | memset(ptr, 0, size); | ||
1403 | return ptr; | 1448 | return ptr; |
1449 | } | ||
1404 | 1450 | ||
1405 | panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", | 1451 | panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", |
1406 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | 1452 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, |
@@ -1715,7 +1761,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
1715 | 1761 | ||
1716 | pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); | 1762 | pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); |
1717 | 1763 | ||
1718 | for_each_memblock_type(type, rgn) { | 1764 | for_each_memblock_type(idx, type, rgn) { |
1719 | char nid_buf[32] = ""; | 1765 | char nid_buf[32] = ""; |
1720 | 1766 | ||
1721 | base = rgn->base; | 1767 | base = rgn->base; |
@@ -1739,7 +1785,7 @@ memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr)
1739 | unsigned long size = 0; | 1785 | unsigned long size = 0; |
1740 | int idx; | 1786 | int idx; |
1741 | 1787 | ||
1742 | for_each_memblock_type((&memblock.reserved), rgn) { | 1788 | for_each_memblock_type(idx, (&memblock.reserved), rgn) { |
1743 | phys_addr_t start, end; | 1789 | phys_addr_t start, end; |
1744 | 1790 | ||
1745 | if (rgn->base + rgn->size < start_addr) | 1791 | if (rgn->base + rgn->size < start_addr) |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 661f046ad318..50e6906314f8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4049,7 +4049,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
4049 | .write = mem_cgroup_reset, | 4049 | .write = mem_cgroup_reset, |
4050 | .read_u64 = mem_cgroup_read_u64, | 4050 | .read_u64 = mem_cgroup_read_u64, |
4051 | }, | 4051 | }, |
4052 | #ifdef CONFIG_SLABINFO | 4052 | #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) |
4053 | { | 4053 | { |
4054 | .name = "kmem.slabinfo", | 4054 | .name = "kmem.slabinfo", |
4055 | .seq_start = memcg_slab_start, | 4055 | .seq_start = memcg_slab_start, |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 88366626c0b7..4acdf393a801 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1587,7 +1587,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1587 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, | 1587 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
1588 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1588 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1589 | if (ret) { | 1589 | if (ret) { |
1590 | pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", | 1590 | pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n", |
1591 | pfn, ret, page->flags, &page->flags); | 1591 | pfn, ret, page->flags, &page->flags); |
1592 | if (!list_empty(&pagelist)) | 1592 | if (!list_empty(&pagelist)) |
1593 | putback_movable_pages(&pagelist); | 1593 | putback_movable_pages(&pagelist); |
diff --git a/mm/memory.c b/mm/memory.c
index cae514e7dcfc..85e7a87da79f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -438,7 +438,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
438 | pgtable_t token = pmd_pgtable(*pmd); | 438 | pgtable_t token = pmd_pgtable(*pmd); |
439 | pmd_clear(pmd); | 439 | pmd_clear(pmd); |
440 | pte_free_tlb(tlb, token, addr); | 440 | pte_free_tlb(tlb, token, addr); |
441 | atomic_long_dec(&tlb->mm->nr_ptes); | 441 | mm_dec_nr_ptes(tlb->mm); |
442 | } | 442 | } |
443 | 443 | ||
444 | static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | 444 | static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, |
@@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
506 | pud = pud_offset(p4d, start); | 506 | pud = pud_offset(p4d, start); |
507 | p4d_clear(p4d); | 507 | p4d_clear(p4d); |
508 | pud_free_tlb(tlb, pud, start); | 508 | pud_free_tlb(tlb, pud, start); |
509 | mm_dec_nr_puds(tlb->mm); | ||
509 | } | 510 | } |
510 | 511 | ||
511 | static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, | 512 | static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, |
@@ -665,7 +666,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
665 | 666 | ||
666 | ptl = pmd_lock(mm, pmd); | 667 | ptl = pmd_lock(mm, pmd); |
667 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ | 668 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ |
668 | atomic_long_inc(&mm->nr_ptes); | 669 | mm_inc_nr_ptes(mm); |
669 | pmd_populate(mm, pmd, new); | 670 | pmd_populate(mm, pmd, new); |
670 | new = NULL; | 671 | new = NULL; |
671 | } | 672 | } |
@@ -2554,7 +2555,11 @@ static int wp_page_copy(struct vm_fault *vmf)
2554 | put_page(new_page); | 2555 | put_page(new_page); |
2555 | 2556 | ||
2556 | pte_unmap_unlock(vmf->pte, vmf->ptl); | 2557 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2557 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2558 | /* |
2559 | * No need to double call mmu_notifier->invalidate_range() callback as | ||
2560 | * the above ptep_clear_flush_notify() did already call it. | ||
2561 | */ | ||
2562 | mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); | ||
2558 | if (old_page) { | 2563 | if (old_page) { |
2559 | /* | 2564 | /* |
2560 | * Don't let another task, with possibly unlocked vma, | 2565 | * Don't let another task, with possibly unlocked vma, |
@@ -2842,7 +2847,7 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
2842 | int do_swap_page(struct vm_fault *vmf) | 2847 | int do_swap_page(struct vm_fault *vmf) |
2843 | { | 2848 | { |
2844 | struct vm_area_struct *vma = vmf->vma; | 2849 | struct vm_area_struct *vma = vmf->vma; |
2845 | struct page *page = NULL, *swapcache; | 2850 | struct page *page = NULL, *swapcache = NULL; |
2846 | struct mem_cgroup *memcg; | 2851 | struct mem_cgroup *memcg; |
2847 | struct vma_swap_readahead swap_ra; | 2852 | struct vma_swap_readahead swap_ra; |
2848 | swp_entry_t entry; | 2853 | swp_entry_t entry; |
@@ -2881,17 +2886,36 @@ int do_swap_page(struct vm_fault *vmf) | |||
2881 | } | 2886 | } |
2882 | goto out; | 2887 | goto out; |
2883 | } | 2888 | } |
2889 | |||
2890 | |||
2884 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2891 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
2885 | if (!page) | 2892 | if (!page) |
2886 | page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, | 2893 | page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, |
2887 | vmf->address); | 2894 | vmf->address); |
2888 | if (!page) { | 2895 | if (!page) { |
2889 | if (vma_readahead) | 2896 | struct swap_info_struct *si = swp_swap_info(entry); |
2890 | page = do_swap_page_readahead(entry, | 2897 | |
2891 | GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); | 2898 | if (si->flags & SWP_SYNCHRONOUS_IO && |
2892 | else | 2899 | __swap_count(si, entry) == 1) { |
2893 | page = swapin_readahead(entry, | 2900 | /* skip swapcache */ |
2894 | GFP_HIGHUSER_MOVABLE, vma, vmf->address); | 2901 | page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); |
2902 | if (page) { | ||
2903 | __SetPageLocked(page); | ||
2904 | __SetPageSwapBacked(page); | ||
2905 | set_page_private(page, entry.val); | ||
2906 | lru_cache_add_anon(page); | ||
2907 | swap_readpage(page, true); | ||
2908 | } | ||
2909 | } else { | ||
2910 | if (vma_readahead) | ||
2911 | page = do_swap_page_readahead(entry, | ||
2912 | GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); | ||
2913 | else | ||
2914 | page = swapin_readahead(entry, | ||
2915 | GFP_HIGHUSER_MOVABLE, vma, vmf->address); | ||
2916 | swapcache = page; | ||
2917 | } | ||
2918 | |||
2895 | if (!page) { | 2919 | if (!page) { |
2896 | /* | 2920 | /* |
2897 | * Back out if somebody else faulted in this pte | 2921 | * Back out if somebody else faulted in this pte |
@@ -2920,7 +2944,6 @@ int do_swap_page(struct vm_fault *vmf) | |||
2920 | goto out_release; | 2944 | goto out_release; |
2921 | } | 2945 | } |
2922 | 2946 | ||
2923 | swapcache = page; | ||
2924 | locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); | 2947 | locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); |
2925 | 2948 | ||
2926 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2949 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
@@ -2935,7 +2958,8 @@ int do_swap_page(struct vm_fault *vmf) | |||
2935 | * test below, are not enough to exclude that. Even if it is still | 2958 | * test below, are not enough to exclude that. Even if it is still |
2936 | * swapcache, we need to check that the page's swap has not changed. | 2959 | * swapcache, we need to check that the page's swap has not changed. |
2937 | */ | 2960 | */ |
2938 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) | 2961 | if (unlikely((!PageSwapCache(page) || |
2962 | page_private(page) != entry.val)) && swapcache) | ||
2939 | goto out_page; | 2963 | goto out_page; |
2940 | 2964 | ||
2941 | page = ksm_might_need_to_copy(page, vma, vmf->address); | 2965 | page = ksm_might_need_to_copy(page, vma, vmf->address); |
@@ -2988,14 +3012,16 @@ int do_swap_page(struct vm_fault *vmf) | |||
2988 | pte = pte_mksoft_dirty(pte); | 3012 | pte = pte_mksoft_dirty(pte); |
2989 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); | 3013 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); |
2990 | vmf->orig_pte = pte; | 3014 | vmf->orig_pte = pte; |
2991 | if (page == swapcache) { | 3015 | |
2992 | do_page_add_anon_rmap(page, vma, vmf->address, exclusive); | 3016 | /* ksm created a completely new copy */ |
2993 | mem_cgroup_commit_charge(page, memcg, true, false); | 3017 | if (unlikely(page != swapcache && swapcache)) { |
2994 | activate_page(page); | ||
2995 | } else { /* ksm created a completely new copy */ | ||
2996 | page_add_new_anon_rmap(page, vma, vmf->address, false); | 3018 | page_add_new_anon_rmap(page, vma, vmf->address, false); |
2997 | mem_cgroup_commit_charge(page, memcg, false, false); | 3019 | mem_cgroup_commit_charge(page, memcg, false, false); |
2998 | lru_cache_add_active_or_unevictable(page, vma); | 3020 | lru_cache_add_active_or_unevictable(page, vma); |
3021 | } else { | ||
3022 | do_page_add_anon_rmap(page, vma, vmf->address, exclusive); | ||
3023 | mem_cgroup_commit_charge(page, memcg, true, false); | ||
3024 | activate_page(page); | ||
2999 | } | 3025 | } |
3000 | 3026 | ||
3001 | swap_free(entry); | 3027 | swap_free(entry); |
@@ -3003,7 +3029,7 @@ int do_swap_page(struct vm_fault *vmf) | |||
3003 | (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) | 3029 | (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) |
3004 | try_to_free_swap(page); | 3030 | try_to_free_swap(page); |
3005 | unlock_page(page); | 3031 | unlock_page(page); |
3006 | if (page != swapcache) { | 3032 | if (page != swapcache && swapcache) { |
3007 | /* | 3033 | /* |
3008 | * Hold the lock to avoid the swap entry to be reused | 3034 | * Hold the lock to avoid the swap entry to be reused |
3009 | * until we take the PT lock for the pte_same() check | 3035 | * until we take the PT lock for the pte_same() check |
@@ -3036,7 +3062,7 @@ out_page: | |||
3036 | unlock_page(page); | 3062 | unlock_page(page); |
3037 | out_release: | 3063 | out_release: |
3038 | put_page(page); | 3064 | put_page(page); |
3039 | if (page != swapcache) { | 3065 | if (page != swapcache && swapcache) { |
3040 | unlock_page(swapcache); | 3066 | unlock_page(swapcache); |
3041 | put_page(swapcache); | 3067 | put_page(swapcache); |
3042 | } | 3068 | } |
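[Editor's note] Across the do_swap_page() hunks above, swapcache now starts out NULL and is only set on the readahead path, so it doubles as a flag for "this page went through the swap cache"; the SWP_SYNCHRONOUS_IO fast path leaves it NULL. The cleanup paths therefore all use the same pattern, condensed below for illustration only (hypothetical helper, not a literal excerpt):

/* Condensed illustration of the new invariant; not a literal excerpt. */
static void release_swapcache_if_distinct(struct page *page,
					  struct page *swapcache)
{
	/*
	 * swapcache == NULL means the SWP_SYNCHRONOUS_IO fast path was
	 * taken and there is no separate swap-cache page to drop.
	 */
	if (page != swapcache && swapcache) {
		/* KSM handed back a private copy; release the original. */
		unlock_page(swapcache);
		put_page(swapcache);
	}
}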
@@ -3212,7 +3238,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf) | |||
3212 | goto map_pte; | 3238 | goto map_pte; |
3213 | } | 3239 | } |
3214 | 3240 | ||
3215 | atomic_long_inc(&vma->vm_mm->nr_ptes); | 3241 | mm_inc_nr_ptes(vma->vm_mm); |
3216 | pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); | 3242 | pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); |
3217 | spin_unlock(vmf->ptl); | 3243 | spin_unlock(vmf->ptl); |
3218 | vmf->prealloc_pte = NULL; | 3244 | vmf->prealloc_pte = NULL; |
@@ -3271,7 +3297,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) | |||
3271 | * We are going to consume the prealloc table, | 3297 | * We are going to consume the prealloc table, |
3272 | * count that as nr_ptes. | 3298 | * count that as nr_ptes. |
3273 | */ | 3299 | */ |
3274 | atomic_long_inc(&vma->vm_mm->nr_ptes); | 3300 | mm_inc_nr_ptes(vma->vm_mm); |
3275 | vmf->prealloc_pte = NULL; | 3301 | vmf->prealloc_pte = NULL; |
3276 | } | 3302 | } |
3277 | 3303 | ||
@@ -4124,15 +4150,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) | |||
4124 | 4150 | ||
4125 | spin_lock(&mm->page_table_lock); | 4151 | spin_lock(&mm->page_table_lock); |
4126 | #ifndef __ARCH_HAS_5LEVEL_HACK | 4152 | #ifndef __ARCH_HAS_5LEVEL_HACK |
4127 | if (p4d_present(*p4d)) /* Another has populated it */ | 4153 | if (!p4d_present(*p4d)) { |
4128 | pud_free(mm, new); | 4154 | mm_inc_nr_puds(mm); |
4129 | else | ||
4130 | p4d_populate(mm, p4d, new); | 4155 | p4d_populate(mm, p4d, new); |
4131 | #else | 4156 | } else /* Another has populated it */ |
4132 | if (pgd_present(*p4d)) /* Another has populated it */ | ||
4133 | pud_free(mm, new); | 4157 | pud_free(mm, new); |
4134 | else | 4158 | #else |
4159 | if (!pgd_present(*p4d)) { | ||
4160 | mm_inc_nr_puds(mm); | ||
4135 | pgd_populate(mm, p4d, new); | 4161 | pgd_populate(mm, p4d, new); |
4162 | } else /* Another has populated it */ | ||
4163 | pud_free(mm, new); | ||
4136 | #endif /* __ARCH_HAS_5LEVEL_HACK */ | 4164 | #endif /* __ARCH_HAS_5LEVEL_HACK */ |
4137 | spin_unlock(&mm->page_table_lock); | 4165 | spin_unlock(&mm->page_table_lock); |
4138 | return 0; | 4166 | return 0; |
@@ -4457,17 +4485,15 @@ void print_vma_addr(char *prefix, unsigned long ip) | |||
4457 | struct vm_area_struct *vma; | 4485 | struct vm_area_struct *vma; |
4458 | 4486 | ||
4459 | /* | 4487 | /* |
4460 | * Do not print if we are in atomic | 4488 | * We might be running from an atomic context, so we cannot sleep |
4461 | * contexts (in exception stacks, etc.): | ||
4462 | */ | 4489 | */ |
4463 | if (preempt_count()) | 4490 | if (!down_read_trylock(&mm->mmap_sem)) |
4464 | return; | 4491 | return; |
4465 | 4492 | ||
4466 | down_read(&mm->mmap_sem); | ||
4467 | vma = find_vma(mm, ip); | 4493 | vma = find_vma(mm, ip); |
4468 | if (vma && vma->vm_file) { | 4494 | if (vma && vma->vm_file) { |
4469 | struct file *f = vma->vm_file; | 4495 | struct file *f = vma->vm_file; |
4470 | char *buf = (char *)__get_free_page(GFP_KERNEL); | 4496 | char *buf = (char *)__get_free_page(GFP_NOWAIT); |
4471 | if (buf) { | 4497 | if (buf) { |
4472 | char *p; | 4498 | char *p; |
4473 | 4499 | ||
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d4b5f29906b9..c52aa05b106c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -265,7 +265,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, | |||
265 | /* | 265 | /* |
266 | * Make all the pages reserved so that nobody will stumble over half | 266 | * Make all the pages reserved so that nobody will stumble over half |
267 | * initialized state. | 267 | * initialized state. |
268 | * FIXME: We also have to associate it with a node because pfn_to_node | 268 | * FIXME: We also have to associate it with a node because page_to_nid |
269 | * relies on having page with the proper node. | 269 | * relies on having page with the proper node. |
270 | */ | 270 | */ |
271 | for (i = 0; i < PAGES_PER_SECTION; i++) { | 271 | for (i = 0; i < PAGES_PER_SECTION; i++) { |
@@ -1590,11 +1590,11 @@ static void node_states_clear_node(int node, struct memory_notify *arg) | |||
1590 | } | 1590 | } |
1591 | 1591 | ||
1592 | static int __ref __offline_pages(unsigned long start_pfn, | 1592 | static int __ref __offline_pages(unsigned long start_pfn, |
1593 | unsigned long end_pfn, unsigned long timeout) | 1593 | unsigned long end_pfn) |
1594 | { | 1594 | { |
1595 | unsigned long pfn, nr_pages, expire; | 1595 | unsigned long pfn, nr_pages; |
1596 | long offlined_pages; | 1596 | long offlined_pages; |
1597 | int ret, drain, retry_max, node; | 1597 | int ret, node; |
1598 | unsigned long flags; | 1598 | unsigned long flags; |
1599 | unsigned long valid_start, valid_end; | 1599 | unsigned long valid_start, valid_end; |
1600 | struct zone *zone; | 1600 | struct zone *zone; |
@@ -1630,44 +1630,22 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
1630 | goto failed_removal; | 1630 | goto failed_removal; |
1631 | 1631 | ||
1632 | pfn = start_pfn; | 1632 | pfn = start_pfn; |
1633 | expire = jiffies + timeout; | ||
1634 | drain = 0; | ||
1635 | retry_max = 5; | ||
1636 | repeat: | 1633 | repeat: |
1637 | /* start memory hot removal */ | 1634 | /* start memory hot removal */ |
1638 | ret = -EAGAIN; | ||
1639 | if (time_after(jiffies, expire)) | ||
1640 | goto failed_removal; | ||
1641 | ret = -EINTR; | 1635 | ret = -EINTR; |
1642 | if (signal_pending(current)) | 1636 | if (signal_pending(current)) |
1643 | goto failed_removal; | 1637 | goto failed_removal; |
1644 | ret = 0; | 1638 | |
1645 | if (drain) { | 1639 | cond_resched(); |
1646 | lru_add_drain_all_cpuslocked(); | 1640 | lru_add_drain_all_cpuslocked(); |
1647 | cond_resched(); | 1641 | drain_all_pages(zone); |
1648 | drain_all_pages(zone); | ||
1649 | } | ||
1650 | 1642 | ||
1651 | pfn = scan_movable_pages(start_pfn, end_pfn); | 1643 | pfn = scan_movable_pages(start_pfn, end_pfn); |
1652 | if (pfn) { /* We have movable pages */ | 1644 | if (pfn) { /* We have movable pages */ |
1653 | ret = do_migrate_range(pfn, end_pfn); | 1645 | ret = do_migrate_range(pfn, end_pfn); |
1654 | if (!ret) { | 1646 | goto repeat; |
1655 | drain = 1; | ||
1656 | goto repeat; | ||
1657 | } else { | ||
1658 | if (ret < 0) | ||
1659 | if (--retry_max == 0) | ||
1660 | goto failed_removal; | ||
1661 | yield(); | ||
1662 | drain = 1; | ||
1663 | goto repeat; | ||
1664 | } | ||
1665 | } | 1647 | } |
1666 | /* drain all zone's lru pagevec, this is asynchronous... */ | 1648 | |
1667 | lru_add_drain_all_cpuslocked(); | ||
1668 | yield(); | ||
1669 | /* drain pcp pages, this is synchronous. */ | ||
1670 | drain_all_pages(zone); | ||
1671 | /* | 1649 | /* |
1672 | * dissolve free hugepages in the memory block before doing offlining | 1650 | * dissolve free hugepages in the memory block before doing offlining |
1673 | * actually in order to make hugetlbfs's object counting consistent. | 1651 | * actually in order to make hugetlbfs's object counting consistent. |
@@ -1677,10 +1655,8 @@ repeat: | |||
1677 | goto failed_removal; | 1655 | goto failed_removal; |
1678 | /* check again */ | 1656 | /* check again */ |
1679 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); | 1657 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); |
1680 | if (offlined_pages < 0) { | 1658 | if (offlined_pages < 0) |
1681 | ret = -EBUSY; | 1659 | goto repeat; |
1682 | goto failed_removal; | ||
1683 | } | ||
1684 | pr_info("Offlined Pages %ld\n", offlined_pages); | 1660 | pr_info("Offlined Pages %ld\n", offlined_pages); |
1685 | /* Ok, all of our target is isolated. | 1661 | /* Ok, all of our target is isolated. |
1686 | We cannot do rollback at this point. */ | 1662 | We cannot do rollback at this point. */ |
@@ -1728,7 +1704,7 @@ failed_removal: | |||
1728 | /* Must be protected by mem_hotplug_begin() or a device_lock */ | 1704 | /* Must be protected by mem_hotplug_begin() or a device_lock */ |
1729 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | 1705 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) |
1730 | { | 1706 | { |
1731 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); | 1707 | return __offline_pages(start_pfn, start_pfn + nr_pages); |
1732 | } | 1708 | } |
1733 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 1709 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
1734 | 1710 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a2af6d58a68f..4ce44d3ff03d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -85,6 +85,7 @@ | |||
85 | #include <linux/interrupt.h> | 85 | #include <linux/interrupt.h> |
86 | #include <linux/init.h> | 86 | #include <linux/init.h> |
87 | #include <linux/compat.h> | 87 | #include <linux/compat.h> |
88 | #include <linux/ptrace.h> | ||
88 | #include <linux/swap.h> | 89 | #include <linux/swap.h> |
89 | #include <linux/seq_file.h> | 90 | #include <linux/seq_file.h> |
90 | #include <linux/proc_fs.h> | 91 | #include <linux/proc_fs.h> |
@@ -1365,7 +1366,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1365 | const unsigned long __user *, old_nodes, | 1366 | const unsigned long __user *, old_nodes, |
1366 | const unsigned long __user *, new_nodes) | 1367 | const unsigned long __user *, new_nodes) |
1367 | { | 1368 | { |
1368 | const struct cred *cred = current_cred(), *tcred; | ||
1369 | struct mm_struct *mm = NULL; | 1369 | struct mm_struct *mm = NULL; |
1370 | struct task_struct *task; | 1370 | struct task_struct *task; |
1371 | nodemask_t task_nodes; | 1371 | nodemask_t task_nodes; |
@@ -1401,15 +1401,10 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1401 | err = -EINVAL; | 1401 | err = -EINVAL; |
1402 | 1402 | ||
1403 | /* | 1403 | /* |
1404 | * Check if this process has the right to modify the specified | 1404 | * Check if this process has the right to modify the specified process. |
1405 | * process. The right exists if the process has administrative | 1405 | * Use the regular "ptrace_may_access()" checks. |
1406 | * capabilities, superuser privileges or the same | ||
1407 | * userid as the target process. | ||
1408 | */ | 1406 | */ |
1409 | tcred = __task_cred(task); | 1407 | if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { |
1410 | if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && | ||
1411 | !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && | ||
1412 | !capable(CAP_SYS_NICE)) { | ||
1413 | rcu_read_unlock(); | 1408 | rcu_read_unlock(); |
1414 | err = -EPERM; | 1409 | err = -EPERM; |
1415 | goto out_put; | 1410 | goto out_put; |
@@ -1920,6 +1915,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
1920 | struct page *page; | 1915 | struct page *page; |
1921 | 1916 | ||
1922 | page = __alloc_pages(gfp, order, nid); | 1917 | page = __alloc_pages(gfp, order, nid); |
1918 | /* skip NUMA_INTERLEAVE_HIT counter update if NUMA stats are disabled */ | ||
1919 | if (!static_branch_likely(&vm_numa_stat_key)) | ||
1920 | return page; | ||
1923 | if (page && page_to_nid(page) == nid) { | 1921 | if (page && page_to_nid(page) == nid) { |
1924 | preempt_disable(); | 1922 | preempt_disable(); |
1925 | __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); | 1923 | __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); |
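[Editor's note] The NUMA_INTERLEAVE_HIT update is now gated behind the vm_numa_stat_key static key (defined in the page_alloc.c hunk later in this section). A generic, illustrative sketch of the static-key gating pattern, with made-up names:

/*
 * Generic static-key gating pattern (illustrative). The actual key and its
 * sysctl wiring are introduced elsewhere in this series.
 */
DEFINE_STATIC_KEY_TRUE(example_stat_key);

static void example_account_event(void)
{
	/* When disabled, this compiles down to a patched-out jump. */
	if (!static_branch_likely(&example_stat_key))
		return;
	/* ... expensive per-CPU counter update ... */
}

Toggling at runtime would use static_branch_disable()/static_branch_enable() on the key.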
diff --git a/mm/mempool.c b/mm/mempool.c index c4a23cdae3f0..7d8c5a0010a2 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -189,7 +189,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, | |||
189 | pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); | 189 | pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); |
190 | if (!pool) | 190 | if (!pool) |
191 | return NULL; | 191 | return NULL; |
192 | pool->elements = kmalloc_node(min_nr * sizeof(void *), | 192 | pool->elements = kmalloc_array_node(min_nr, sizeof(void *), |
193 | gfp_mask, node_id); | 193 | gfp_mask, node_id); |
194 | if (!pool->elements) { | 194 | if (!pool->elements) { |
195 | kfree(pool); | 195 | kfree(pool); |
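[Editor's note] The mempool change swaps an open-coded min_nr * sizeof(void *) for kmalloc_array_node(), which presumably mirrors kmalloc_array() in refusing requests whose size computation would overflow. An open-coded equivalent, as a hypothetical helper for illustration only:

/*
 * Open-coded equivalent of what the array helper is assumed to do: refuse
 * the request instead of letting n * size silently wrap.
 */
static inline void *checked_array_alloc_node(size_t n, size_t size,
					     gfp_t gfp, int node)
{
	if (size != 0 && n > SIZE_MAX / size)
		return NULL;	/* n * size would overflow */
	return kmalloc_node(n * size, gfp, node);
}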
diff --git a/mm/migrate.c b/mm/migrate.c index 1236449b4777..4d0be47a322a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -2089,7 +2089,11 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
2089 | set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); | 2089 | set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); |
2090 | 2090 | ||
2091 | spin_unlock(ptl); | 2091 | spin_unlock(ptl); |
2092 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2092 | /* |
2093 | * No need to double call mmu_notifier->invalidate_range() callback as | ||
2094 | * the above pmdp_huge_clear_flush_notify() did already call it. | ||
2095 | */ | ||
2096 | mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); | ||
2093 | 2097 | ||
2094 | /* Take an "isolate" reference and put new page on the LRU. */ | 2098 | /* Take an "isolate" reference and put new page on the LRU. */ |
2095 | get_page(new_page); | 2099 | get_page(new_page); |
@@ -2805,9 +2809,14 @@ static void migrate_vma_pages(struct migrate_vma *migrate) | |||
2805 | migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; | 2809 | migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; |
2806 | } | 2810 | } |
2807 | 2811 | ||
2812 | /* | ||
2813 | * No need to double call mmu_notifier->invalidate_range() callback as | ||
2814 | * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() | ||
2815 | * did already call it. | ||
2816 | */ | ||
2808 | if (notified) | 2817 | if (notified) |
2809 | mmu_notifier_invalidate_range_end(mm, mmu_start, | 2818 | mmu_notifier_invalidate_range_only_end(mm, mmu_start, |
2810 | migrate->end); | 2819 | migrate->end); |
2811 | } | 2820 | } |
2812 | 2821 | ||
2813 | /* | 2822 | /* |
diff --git a/mm/mlock.c b/mm/mlock.c index 46af369c13e5..30472d438794 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) | |||
289 | struct pagevec pvec_putback; | 289 | struct pagevec pvec_putback; |
290 | int pgrescued = 0; | 290 | int pgrescued = 0; |
291 | 291 | ||
292 | pagevec_init(&pvec_putback, 0); | 292 | pagevec_init(&pvec_putback); |
293 | 293 | ||
294 | /* Phase 1: page isolation */ | 294 | /* Phase 1: page isolation */ |
295 | spin_lock_irq(zone_lru_lock(zone)); | 295 | spin_lock_irq(zone_lru_lock(zone)); |
@@ -448,7 +448,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
448 | struct pagevec pvec; | 448 | struct pagevec pvec; |
449 | struct zone *zone; | 449 | struct zone *zone; |
450 | 450 | ||
451 | pagevec_init(&pvec, 0); | 451 | pagevec_init(&pvec); |
452 | /* | 452 | /* |
453 | * Although FOLL_DUMP is intended for get_dump_page(), | 453 | * Although FOLL_DUMP is intended for get_dump_page(), |
454 | * it just so happens that its special treatment of the | 454 | * it just so happens that its special treatment of the |
@@ -670,8 +670,6 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla | |||
670 | if (!can_do_mlock()) | 670 | if (!can_do_mlock()) |
671 | return -EPERM; | 671 | return -EPERM; |
672 | 672 | ||
673 | lru_add_drain_all(); /* flush pagevec */ | ||
674 | |||
675 | len = PAGE_ALIGN(len + (offset_in_page(start))); | 673 | len = PAGE_ALIGN(len + (offset_in_page(start))); |
676 | start &= PAGE_MASK; | 674 | start &= PAGE_MASK; |
677 | 675 | ||
@@ -798,9 +796,6 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
798 | if (!can_do_mlock()) | 796 | if (!can_do_mlock()) |
799 | return -EPERM; | 797 | return -EPERM; |
800 | 798 | ||
801 | if (flags & MCL_CURRENT) | ||
802 | lru_add_drain_all(); /* flush pagevec */ | ||
803 | |||
804 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 799 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
805 | lock_limit >>= PAGE_SHIFT; | 800 | lock_limit >>= PAGE_SHIFT; |
806 | 801 | ||
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 314285284e6e..96edb33fd09a 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -190,7 +190,9 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | |||
190 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); | 190 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); |
191 | 191 | ||
192 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | 192 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, |
193 | unsigned long start, unsigned long end) | 193 | unsigned long start, |
194 | unsigned long end, | ||
195 | bool only_end) | ||
194 | { | 196 | { |
195 | struct mmu_notifier *mn; | 197 | struct mmu_notifier *mn; |
196 | int id; | 198 | int id; |
@@ -204,8 +206,13 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | |||
204 | * subsystem registers either invalidate_range_start()/end() or | 206 | * subsystem registers either invalidate_range_start()/end() or |
205 | * invalidate_range(), so this will be no additional overhead | 207 | * invalidate_range(), so this will be no additional overhead |
206 | * (besides the pointer check). | 208 | * (besides the pointer check). |
209 | * | ||
210 | * We skip the call to invalidate_range() if we know it is safe, i.e. | ||
211 | * the call site used mmu_notifier_invalidate_range_only_end(), which | ||
212 | * is only safe when we know that the call to invalidate_range() has | ||
213 | * already happened under the page table lock. | ||
207 | */ | 214 | */ |
208 | if (mn->ops->invalidate_range) | 215 | if (!only_end && mn->ops->invalidate_range) |
209 | mn->ops->invalidate_range(mn, mm, start, end); | 216 | mn->ops->invalidate_range(mn, mm, start, end); |
210 | if (mn->ops->invalidate_range_end) | 217 | if (mn->ops->invalidate_range_end) |
211 | mn->ops->invalidate_range_end(mn, mm, start, end); | 218 | mn->ops->invalidate_range_end(mn, mm, start, end); |
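[Editor's note] __mmu_notifier_invalidate_range_end() grows an only_end parameter above; the two public wrappers used in the memory.c and migrate.c hunks presumably differ only in the flag they pass. A sketch of their presumed shape (the real ones live in include/linux/mmu_notifier.h and also bail out early when the mm has no notifiers registered):

static inline void
mmu_notifier_invalidate_range_end(struct mm_struct *mm,
				  unsigned long start, unsigned long end)
{
	__mmu_notifier_invalidate_range_end(mm, start, end, false);
}

static inline void
mmu_notifier_invalidate_range_only_end(struct mm_struct *mm,
				       unsigned long start, unsigned long end)
{
	__mmu_notifier_invalidate_range_end(mm, start, end, true);
}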
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dee0f75c3013..c86fbd1b590e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -44,6 +44,7 @@ | |||
44 | 44 | ||
45 | #include <asm/tlb.h> | 45 | #include <asm/tlb.h> |
46 | #include "internal.h" | 46 | #include "internal.h" |
47 | #include "slab.h" | ||
47 | 48 | ||
48 | #define CREATE_TRACE_POINTS | 49 | #define CREATE_TRACE_POINTS |
49 | #include <trace/events/oom.h> | 50 | #include <trace/events/oom.h> |
@@ -161,6 +162,25 @@ static bool oom_unkillable_task(struct task_struct *p, | |||
161 | return false; | 162 | return false; |
162 | } | 163 | } |
163 | 164 | ||
165 | /* | ||
166 | * Print out unreclaimable slab info when the amount of unreclaimable slab | ||
167 | * memory is greater than all user memory (LRU pages) | ||
168 | */ | ||
169 | static bool is_dump_unreclaim_slabs(void) | ||
170 | { | ||
171 | unsigned long nr_lru; | ||
172 | |||
173 | nr_lru = global_node_page_state(NR_ACTIVE_ANON) + | ||
174 | global_node_page_state(NR_INACTIVE_ANON) + | ||
175 | global_node_page_state(NR_ACTIVE_FILE) + | ||
176 | global_node_page_state(NR_INACTIVE_FILE) + | ||
177 | global_node_page_state(NR_ISOLATED_ANON) + | ||
178 | global_node_page_state(NR_ISOLATED_FILE) + | ||
179 | global_node_page_state(NR_UNEVICTABLE); | ||
180 | |||
181 | return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru); | ||
182 | } | ||
183 | |||
164 | /** | 184 | /** |
165 | * oom_badness - heuristic function to determine which candidate task to kill | 185 | * oom_badness - heuristic function to determine which candidate task to kill |
166 | * @p: task struct of which task we should calculate | 186 | * @p: task struct of which task we should calculate |
@@ -201,7 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
201 | * task's rss, pagetable and swap space use. | 221 | * task's rss, pagetable and swap space use. |
202 | */ | 222 | */ |
203 | points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + | 223 | points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + |
204 | atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); | 224 | mm_pgtables_bytes(p->mm) / PAGE_SIZE; |
205 | task_unlock(p); | 225 | task_unlock(p); |
206 | 226 | ||
207 | /* | 227 | /* |
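[Editor's note] The badness score above now counts page tables by size rather than by per-level entry counts. As a hypothetical worked example with made-up numbers, a task with 100000 resident pages, 2000 swap entries and 4 MiB of page tables on a 4 KiB-page system would contribute, before the oom_score_adj adjustment:

	points = rss    + swapents + pgtables_bytes / PAGE_SIZE
	       = 100000 + 2000     + (4 << 20) / 4096
	       = 100000 + 2000     + 1024
	       = 103024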
@@ -369,15 +389,15 @@ static void select_bad_process(struct oom_control *oc) | |||
369 | * Dumps the current memory state of all eligible tasks. Tasks not in the same | 389 | * Dumps the current memory state of all eligible tasks. Tasks not in the same |
370 | * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes | 390 | * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes |
371 | * are not shown. | 391 | * are not shown. |
372 | * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, | 392 | * State information includes task's pid, uid, tgid, vm size, rss, |
373 | * swapents, oom_score_adj value, and name. | 393 | * pgtables_bytes, swapents, oom_score_adj value, and name. |
374 | */ | 394 | */ |
375 | static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | 395 | static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) |
376 | { | 396 | { |
377 | struct task_struct *p; | 397 | struct task_struct *p; |
378 | struct task_struct *task; | 398 | struct task_struct *task; |
379 | 399 | ||
380 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); | 400 | pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); |
381 | rcu_read_lock(); | 401 | rcu_read_lock(); |
382 | for_each_process(p) { | 402 | for_each_process(p) { |
383 | if (oom_unkillable_task(p, memcg, nodemask)) | 403 | if (oom_unkillable_task(p, memcg, nodemask)) |
@@ -393,11 +413,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
393 | continue; | 413 | continue; |
394 | } | 414 | } |
395 | 415 | ||
396 | pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", | 416 | pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", |
397 | task->pid, from_kuid(&init_user_ns, task_uid(task)), | 417 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
398 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), | 418 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
399 | atomic_long_read(&task->mm->nr_ptes), | 419 | mm_pgtables_bytes(task->mm), |
400 | mm_nr_pmds(task->mm), | ||
401 | get_mm_counter(task->mm, MM_SWAPENTS), | 420 | get_mm_counter(task->mm, MM_SWAPENTS), |
402 | task->signal->oom_score_adj, task->comm); | 421 | task->signal->oom_score_adj, task->comm); |
403 | task_unlock(task); | 422 | task_unlock(task); |
@@ -407,23 +426,22 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
407 | 426 | ||
408 | static void dump_header(struct oom_control *oc, struct task_struct *p) | 427 | static void dump_header(struct oom_control *oc, struct task_struct *p) |
409 | { | 428 | { |
410 | pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=", | 429 | pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", |
411 | current->comm, oc->gfp_mask, &oc->gfp_mask); | 430 | current->comm, oc->gfp_mask, &oc->gfp_mask, |
412 | if (oc->nodemask) | 431 | nodemask_pr_args(oc->nodemask), oc->order, |
413 | pr_cont("%*pbl", nodemask_pr_args(oc->nodemask)); | 432 | current->signal->oom_score_adj); |
414 | else | ||
415 | pr_cont("(null)"); | ||
416 | pr_cont(", order=%d, oom_score_adj=%hd\n", | ||
417 | oc->order, current->signal->oom_score_adj); | ||
418 | if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) | 433 | if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) |
419 | pr_warn("COMPACTION is disabled!!!\n"); | 434 | pr_warn("COMPACTION is disabled!!!\n"); |
420 | 435 | ||
421 | cpuset_print_current_mems_allowed(); | 436 | cpuset_print_current_mems_allowed(); |
422 | dump_stack(); | 437 | dump_stack(); |
423 | if (oc->memcg) | 438 | if (is_memcg_oom(oc)) |
424 | mem_cgroup_print_oom_info(oc->memcg, p); | 439 | mem_cgroup_print_oom_info(oc->memcg, p); |
425 | else | 440 | else { |
426 | show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); | 441 | show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); |
442 | if (is_dump_unreclaim_slabs()) | ||
443 | dump_unreclaimable_slab(); | ||
444 | } | ||
427 | if (sysctl_oom_dump_tasks) | 445 | if (sysctl_oom_dump_tasks) |
428 | dump_tasks(oc->memcg, oc->nodemask); | 446 | dump_tasks(oc->memcg, oc->nodemask); |
429 | } | 447 | } |
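[Editor's note] The dump_header() change collapses the nodemask printing into the single pr_warn() by using the %*pbl bitmap format specifier together with nodemask_pr_args(). A one-line illustration of that printing idiom (made-up message, real mask):

	/* %*pbl prints a bitmap as a list of ranges, e.g. "allowed nodes: 0-1,4" */
	pr_info("allowed nodes: %*pbl\n", nodemask_pr_args(&node_states[N_MEMORY]));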
@@ -618,9 +636,6 @@ static int oom_reaper(void *unused) | |||
618 | 636 | ||
619 | static void wake_oom_reaper(struct task_struct *tsk) | 637 | static void wake_oom_reaper(struct task_struct *tsk) |
620 | { | 638 | { |
621 | if (!oom_reaper_th) | ||
622 | return; | ||
623 | |||
624 | /* tsk is already queued? */ | 639 | /* tsk is already queued? */ |
625 | if (tsk == oom_reaper_list || tsk->oom_reaper_list) | 640 | if (tsk == oom_reaper_list || tsk->oom_reaper_list) |
626 | return; | 641 | return; |
@@ -638,11 +653,6 @@ static void wake_oom_reaper(struct task_struct *tsk) | |||
638 | static int __init oom_init(void) | 653 | static int __init oom_init(void) |
639 | { | 654 | { |
640 | oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); | 655 | oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); |
641 | if (IS_ERR(oom_reaper_th)) { | ||
642 | pr_err("Unable to start OOM reaper %ld. Continuing regardless\n", | ||
643 | PTR_ERR(oom_reaper_th)); | ||
644 | oom_reaper_th = NULL; | ||
645 | } | ||
646 | return 0; | 656 | return 0; |
647 | } | 657 | } |
648 | subsys_initcall(oom_init) | 658 | subsys_initcall(oom_init) |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c518c845f202..8a1551154285 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -433,8 +433,11 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) | |||
433 | else | 433 | else |
434 | bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; | 434 | bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; |
435 | 435 | ||
436 | if (bg_thresh >= thresh) | 436 | if (unlikely(bg_thresh >= thresh)) { |
437 | pr_warn("vm direct limit must be set greater than background limit.\n"); | ||
437 | bg_thresh = thresh / 2; | 438 | bg_thresh = thresh / 2; |
439 | } | ||
440 | |||
438 | tsk = current; | 441 | tsk = current; |
439 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { | 442 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { |
440 | bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; | 443 | bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; |
@@ -625,9 +628,9 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc); | |||
625 | * On idle system, we can be called long after we scheduled because we use | 628 | * On idle system, we can be called long after we scheduled because we use |
626 | * deferred timers so count with missed periods. | 629 | * deferred timers so count with missed periods. |
627 | */ | 630 | */ |
628 | static void writeout_period(unsigned long t) | 631 | static void writeout_period(struct timer_list *t) |
629 | { | 632 | { |
630 | struct wb_domain *dom = (void *)t; | 633 | struct wb_domain *dom = from_timer(dom, t, period_timer); |
631 | int miss_periods = (jiffies - dom->period_time) / | 634 | int miss_periods = (jiffies - dom->period_time) / |
632 | VM_COMPLETIONS_PERIOD_LEN; | 635 | VM_COMPLETIONS_PERIOD_LEN; |
633 | 636 | ||
@@ -650,8 +653,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) | |||
650 | 653 | ||
651 | spin_lock_init(&dom->lock); | 654 | spin_lock_init(&dom->lock); |
652 | 655 | ||
653 | setup_deferrable_timer(&dom->period_timer, writeout_period, | 656 | timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); |
654 | (unsigned long)dom); | ||
655 | 657 | ||
656 | dom->dirty_limit_tstamp = jiffies; | 658 | dom->dirty_limit_tstamp = jiffies; |
657 | 659 | ||
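[Editor's note] writeout_period() is converted above to the timer_list-based callback API: timer_setup() registers the callback and from_timer() recovers the containing structure from the embedded timer. A generic sketch of the pattern, using a made-up structure for illustration:

struct example_domain {
	struct timer_list period_timer;
	/* ... */
};

static void example_timeout(struct timer_list *t)
{
	/* from_timer() is container_of(): timer pointer -> owning struct */
	struct example_domain *dom = from_timer(dom, t, period_timer);

	/* ... do periodic work, then re-arm ... */
	mod_timer(&dom->period_timer, jiffies + HZ);
}

static void example_domain_init(struct example_domain *dom)
{
	timer_setup(&dom->period_timer, example_timeout, TIMER_DEFERRABLE);
	mod_timer(&dom->period_timer, jiffies + HZ);
}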
@@ -1543,7 +1545,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) | |||
1543 | * actually dirty; with m+n sitting in the percpu | 1545 | * actually dirty; with m+n sitting in the percpu |
1544 | * deltas. | 1546 | * deltas. |
1545 | */ | 1547 | */ |
1546 | if (dtc->wb_thresh < 2 * wb_stat_error(wb)) { | 1548 | if (dtc->wb_thresh < 2 * wb_stat_error()) { |
1547 | wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); | 1549 | wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); |
1548 | dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); | 1550 | dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); |
1549 | } else { | 1551 | } else { |
@@ -1559,8 +1561,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) | |||
1559 | * If we're over `background_thresh' then the writeback threads are woken to | 1561 | * If we're over `background_thresh' then the writeback threads are woken to |
1560 | * perform some writeout. | 1562 | * perform some writeout. |
1561 | */ | 1563 | */ |
1562 | static void balance_dirty_pages(struct address_space *mapping, | 1564 | static void balance_dirty_pages(struct bdi_writeback *wb, |
1563 | struct bdi_writeback *wb, | ||
1564 | unsigned long pages_dirtied) | 1565 | unsigned long pages_dirtied) |
1565 | { | 1566 | { |
1566 | struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; | 1567 | struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; |
@@ -1802,7 +1803,7 @@ pause: | |||
1802 | * more page. However wb_dirty has accounting errors. So use | 1803 | * more page. However wb_dirty has accounting errors. So use |
1803 | * the larger and more IO friendly wb_stat_error. | 1804 | * the larger and more IO friendly wb_stat_error. |
1804 | */ | 1805 | */ |
1805 | if (sdtc->wb_dirty <= wb_stat_error(wb)) | 1806 | if (sdtc->wb_dirty <= wb_stat_error()) |
1806 | break; | 1807 | break; |
1807 | 1808 | ||
1808 | if (fatal_signal_pending(current)) | 1809 | if (fatal_signal_pending(current)) |
@@ -1910,7 +1911,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) | |||
1910 | preempt_enable(); | 1911 | preempt_enable(); |
1911 | 1912 | ||
1912 | if (unlikely(current->nr_dirtied >= ratelimit)) | 1913 | if (unlikely(current->nr_dirtied >= ratelimit)) |
1913 | balance_dirty_pages(mapping, wb, current->nr_dirtied); | 1914 | balance_dirty_pages(wb, current->nr_dirtied); |
1914 | 1915 | ||
1915 | wb_put(wb); | 1916 | wb_put(wb); |
1916 | } | 1917 | } |
@@ -2167,7 +2168,7 @@ int write_cache_pages(struct address_space *mapping, | |||
2167 | int range_whole = 0; | 2168 | int range_whole = 0; |
2168 | int tag; | 2169 | int tag; |
2169 | 2170 | ||
2170 | pagevec_init(&pvec, 0); | 2171 | pagevec_init(&pvec); |
2171 | if (wbc->range_cyclic) { | 2172 | if (wbc->range_cyclic) { |
2172 | writeback_index = mapping->writeback_index; /* prev offset */ | 2173 | writeback_index = mapping->writeback_index; /* prev offset */ |
2173 | index = writeback_index; | 2174 | index = writeback_index; |
@@ -2194,30 +2195,14 @@ retry: | |||
2194 | while (!done && (index <= end)) { | 2195 | while (!done && (index <= end)) { |
2195 | int i; | 2196 | int i; |
2196 | 2197 | ||
2197 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, | 2198 | nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, |
2198 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); | 2199 | tag); |
2199 | if (nr_pages == 0) | 2200 | if (nr_pages == 0) |
2200 | break; | 2201 | break; |
2201 | 2202 | ||
2202 | for (i = 0; i < nr_pages; i++) { | 2203 | for (i = 0; i < nr_pages; i++) { |
2203 | struct page *page = pvec.pages[i]; | 2204 | struct page *page = pvec.pages[i]; |
2204 | 2205 | ||
2205 | /* | ||
2206 | * At this point, the page may be truncated or | ||
2207 | * invalidated (changing page->mapping to NULL), or | ||
2208 | * even swizzled back from swapper_space to tmpfs file | ||
2209 | * mapping. However, page->index will not change | ||
2210 | * because we have a reference on the page. | ||
2211 | */ | ||
2212 | if (page->index > end) { | ||
2213 | /* | ||
2214 | * can't be range_cyclic (1st pass) because | ||
2215 | * end == -1 in that case. | ||
2216 | */ | ||
2217 | done = 1; | ||
2218 | break; | ||
2219 | } | ||
2220 | |||
2221 | done_index = page->index; | 2206 | done_index = page->index; |
2222 | 2207 | ||
2223 | lock_page(page); | 2208 | lock_page(page); |
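[Editor's note] Because pagevec_lookup_range_tag() takes the end index itself, the manual "page->index > end" termination check removed above is no longer needed. A minimal sketch of a tagged-lookup loop built on the signature used in this hunk; handle_page() is a hypothetical per-page callback:

static void example_walk_dirty_pages(struct address_space *mapping,
				     pgoff_t index, pgoff_t end)
{
	struct pagevec pvec;
	unsigned int i, nr;

	pagevec_init(&pvec);
	while (index <= end) {
		nr = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
					      PAGECACHE_TAG_DIRTY);
		if (!nr)		/* no more dirty pages up to 'end' */
			break;
		for (i = 0; i < nr; i++)
			handle_page(pvec.pages[i]);	/* hypothetical work */
		pagevec_release(&pvec);
		cond_resched();
	}
}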
@@ -2623,7 +2608,7 @@ EXPORT_SYMBOL(set_page_dirty_lock); | |||
2623 | * page without actually doing it through the VM. Can you say "ext3 is | 2608 | * page without actually doing it through the VM. Can you say "ext3 is |
2624 | * horribly ugly"? Thought you could. | 2609 | * horribly ugly"? Thought you could. |
2625 | */ | 2610 | */ |
2626 | void cancel_dirty_page(struct page *page) | 2611 | void __cancel_dirty_page(struct page *page) |
2627 | { | 2612 | { |
2628 | struct address_space *mapping = page_mapping(page); | 2613 | struct address_space *mapping = page_mapping(page); |
2629 | 2614 | ||
@@ -2644,7 +2629,7 @@ void cancel_dirty_page(struct page *page) | |||
2644 | ClearPageDirty(page); | 2629 | ClearPageDirty(page); |
2645 | } | 2630 | } |
2646 | } | 2631 | } |
2647 | EXPORT_SYMBOL(cancel_dirty_page); | 2632 | EXPORT_SYMBOL(__cancel_dirty_page); |
2648 | 2633 | ||
2649 | /* | 2634 | /* |
2650 | * Clear a page's dirty flag, while caring for dirty memory accounting. | 2635 | * Clear a page's dirty flag, while caring for dirty memory accounting. |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 77e4d3c5c57b..55ded92f9809 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -24,7 +24,6 @@ | |||
24 | #include <linux/memblock.h> | 24 | #include <linux/memblock.h> |
25 | #include <linux/compiler.h> | 25 | #include <linux/compiler.h> |
26 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
27 | #include <linux/kmemcheck.h> | ||
28 | #include <linux/kasan.h> | 27 | #include <linux/kasan.h> |
29 | #include <linux/module.h> | 28 | #include <linux/module.h> |
30 | #include <linux/suspend.h> | 29 | #include <linux/suspend.h> |
@@ -83,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node); | |||
83 | EXPORT_PER_CPU_SYMBOL(numa_node); | 82 | EXPORT_PER_CPU_SYMBOL(numa_node); |
84 | #endif | 83 | #endif |
85 | 84 | ||
85 | DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); | ||
86 | |||
86 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | 87 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
87 | /* | 88 | /* |
88 | * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. | 89 | * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. |
@@ -290,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes); | |||
290 | int page_group_by_mobility_disabled __read_mostly; | 291 | int page_group_by_mobility_disabled __read_mostly; |
291 | 292 | ||
292 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | 293 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
294 | |||
295 | /* | ||
296 | * Determine how many pages need to be initialized during early boot | ||
297 | * (non-deferred initialization). | ||
298 | * The value of first_deferred_pfn will be set later, once non-deferred pages | ||
299 | * are initialized, but for now set it to ULONG_MAX. | ||
300 | */ | ||
293 | static inline void reset_deferred_meminit(pg_data_t *pgdat) | 301 | static inline void reset_deferred_meminit(pg_data_t *pgdat) |
294 | { | 302 | { |
295 | unsigned long max_initialise; | 303 | phys_addr_t start_addr, end_addr; |
296 | unsigned long reserved_lowmem; | 304 | unsigned long max_pgcnt; |
305 | unsigned long reserved; | ||
297 | 306 | ||
298 | /* | 307 | /* |
299 | * Initialise at least 2G of a node but also take into account that | 308 | * Initialise at least 2G of a node but also take into account that |
300 | * two large system hashes that can take up 1GB for 0.25TB/node. | 309 | * two large system hashes that can take up 1GB for 0.25TB/node. |
301 | */ | 310 | */ |
302 | max_initialise = max(2UL << (30 - PAGE_SHIFT), | 311 | max_pgcnt = max(2UL << (30 - PAGE_SHIFT), |
303 | (pgdat->node_spanned_pages >> 8)); | 312 | (pgdat->node_spanned_pages >> 8)); |
304 | 313 | ||
305 | /* | 314 | /* |
306 | * Compensate the all the memblock reservations (e.g. crash kernel) | 315 | * Compensate the all the memblock reservations (e.g. crash kernel) |
307 | * from the initial estimation to make sure we will initialize enough | 316 | * from the initial estimation to make sure we will initialize enough |
308 | * memory to boot. | 317 | * memory to boot. |
309 | */ | 318 | */ |
310 | reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, | 319 | start_addr = PFN_PHYS(pgdat->node_start_pfn); |
311 | pgdat->node_start_pfn + max_initialise); | 320 | end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); |
312 | max_initialise += reserved_lowmem; | 321 | reserved = memblock_reserved_memory_within(start_addr, end_addr); |
322 | max_pgcnt += PHYS_PFN(reserved); | ||
313 | 323 | ||
314 | pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); | 324 | pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); |
315 | pgdat->first_deferred_pfn = ULONG_MAX; | 325 | pgdat->first_deferred_pfn = ULONG_MAX; |
316 | } | 326 | } |
317 | 327 | ||
@@ -338,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, | |||
338 | if (zone_end < pgdat_end_pfn(pgdat)) | 348 | if (zone_end < pgdat_end_pfn(pgdat)) |
339 | return true; | 349 | return true; |
340 | (*nr_initialised)++; | 350 | (*nr_initialised)++; |
341 | if ((*nr_initialised > pgdat->static_init_size) && | 351 | if ((*nr_initialised > pgdat->static_init_pgcnt) && |
342 | (pfn & (PAGES_PER_SECTION - 1)) == 0) { | 352 | (pfn & (PAGES_PER_SECTION - 1)) == 0) { |
343 | pgdat->first_deferred_pfn = pfn; | 353 | pgdat->first_deferred_pfn = pfn; |
344 | return false; | 354 | return false; |
@@ -1013,7 +1023,6 @@ static __always_inline bool free_pages_prepare(struct page *page, | |||
1013 | VM_BUG_ON_PAGE(PageTail(page), page); | 1023 | VM_BUG_ON_PAGE(PageTail(page), page); |
1014 | 1024 | ||
1015 | trace_mm_page_free(page, order); | 1025 | trace_mm_page_free(page, order); |
1016 | kmemcheck_free_shadow(page, order); | ||
1017 | 1026 | ||
1018 | /* | 1027 | /* |
1019 | * Check tail pages before head page information is cleared to | 1028 | * Check tail pages before head page information is cleared to |
@@ -1170,6 +1179,7 @@ static void free_one_page(struct zone *zone, | |||
1170 | static void __meminit __init_single_page(struct page *page, unsigned long pfn, | 1179 | static void __meminit __init_single_page(struct page *page, unsigned long pfn, |
1171 | unsigned long zone, int nid) | 1180 | unsigned long zone, int nid) |
1172 | { | 1181 | { |
1182 | mm_zero_struct_page(page); | ||
1173 | set_page_links(page, zone, nid, pfn); | 1183 | set_page_links(page, zone, nid, pfn); |
1174 | init_page_count(page); | 1184 | init_page_count(page); |
1175 | page_mapcount_reset(page); | 1185 | page_mapcount_reset(page); |
@@ -1410,14 +1420,17 @@ void clear_zone_contiguous(struct zone *zone) | |||
1410 | } | 1420 | } |
1411 | 1421 | ||
1412 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | 1422 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
1413 | static void __init deferred_free_range(struct page *page, | 1423 | static void __init deferred_free_range(unsigned long pfn, |
1414 | unsigned long pfn, int nr_pages) | 1424 | unsigned long nr_pages) |
1415 | { | 1425 | { |
1416 | int i; | 1426 | struct page *page; |
1427 | unsigned long i; | ||
1417 | 1428 | ||
1418 | if (!page) | 1429 | if (!nr_pages) |
1419 | return; | 1430 | return; |
1420 | 1431 | ||
1432 | page = pfn_to_page(pfn); | ||
1433 | |||
1421 | /* Free a large naturally-aligned chunk if possible */ | 1434 | /* Free a large naturally-aligned chunk if possible */ |
1422 | if (nr_pages == pageblock_nr_pages && | 1435 | if (nr_pages == pageblock_nr_pages && |
1423 | (pfn & (pageblock_nr_pages - 1)) == 0) { | 1436 | (pfn & (pageblock_nr_pages - 1)) == 0) { |
@@ -1443,19 +1456,109 @@ static inline void __init pgdat_init_report_one_done(void) | |||
1443 | complete(&pgdat_init_all_done_comp); | 1456 | complete(&pgdat_init_all_done_comp); |
1444 | } | 1457 | } |
1445 | 1458 | ||
1459 | /* | ||
1460 | * Helper for deferred_init_range(): free the given range, reset the counters, | ||
1461 | * and return the number of pages freed. | ||
1462 | */ | ||
1463 | static inline unsigned long __init __def_free(unsigned long *nr_free, | ||
1464 | unsigned long *free_base_pfn, | ||
1465 | struct page **page) | ||
1466 | { | ||
1467 | unsigned long nr = *nr_free; | ||
1468 | |||
1469 | deferred_free_range(*free_base_pfn, nr); | ||
1470 | *free_base_pfn = 0; | ||
1471 | *nr_free = 0; | ||
1472 | *page = NULL; | ||
1473 | |||
1474 | return nr; | ||
1475 | } | ||
1476 | |||
1477 | static unsigned long __init deferred_init_range(int nid, int zid, | ||
1478 | unsigned long start_pfn, | ||
1479 | unsigned long end_pfn) | ||
1480 | { | ||
1481 | struct mminit_pfnnid_cache nid_init_state = { }; | ||
1482 | unsigned long nr_pgmask = pageblock_nr_pages - 1; | ||
1483 | unsigned long free_base_pfn = 0; | ||
1484 | unsigned long nr_pages = 0; | ||
1485 | unsigned long nr_free = 0; | ||
1486 | struct page *page = NULL; | ||
1487 | unsigned long pfn; | ||
1488 | |||
1489 | /* | ||
1490 | * First we check if pfn is valid on architectures where it is possible | ||
1491 | * to have holes within pageblock_nr_pages. On systems where it is not | ||
1492 | * possible, this function is optimized out. | ||
1493 | * | ||
1494 | * Then, we check if a current large page is valid by only checking the | ||
1495 | * validity of the head pfn. | ||
1496 | * | ||
1497 | * meminit_pfn_in_nid is checked on systems where pfns can interleave | ||
1498 | * within a node: a pfn is between start and end of a node, but does not | ||
1499 | * belong to this memory node. | ||
1500 | * | ||
1501 | * Finally, we minimize pfn page lookups and scheduler checks by | ||
1502 | * performing it only once every pageblock_nr_pages. | ||
1503 | * | ||
1504 | * We do it in two loops: first we initialize struct page, then free to | ||
1505 | * buddy allocator, because while we are freeing pages we can access | ||
1506 | * pages that are ahead (computing buddy page in __free_one_page()). | ||
1507 | */ | ||
1508 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | ||
1509 | if (!pfn_valid_within(pfn)) | ||
1510 | continue; | ||
1511 | if ((pfn & nr_pgmask) || pfn_valid(pfn)) { | ||
1512 | if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { | ||
1513 | if (page && (pfn & nr_pgmask)) | ||
1514 | page++; | ||
1515 | else | ||
1516 | page = pfn_to_page(pfn); | ||
1517 | __init_single_page(page, pfn, zid, nid); | ||
1518 | cond_resched(); | ||
1519 | } | ||
1520 | } | ||
1521 | } | ||
1522 | |||
1523 | page = NULL; | ||
1524 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | ||
1525 | if (!pfn_valid_within(pfn)) { | ||
1526 | nr_pages += __def_free(&nr_free, &free_base_pfn, &page); | ||
1527 | } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) { | ||
1528 | nr_pages += __def_free(&nr_free, &free_base_pfn, &page); | ||
1529 | } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { | ||
1530 | nr_pages += __def_free(&nr_free, &free_base_pfn, &page); | ||
1531 | } else if (page && (pfn & nr_pgmask)) { | ||
1532 | page++; | ||
1533 | nr_free++; | ||
1534 | } else { | ||
1535 | nr_pages += __def_free(&nr_free, &free_base_pfn, &page); | ||
1536 | page = pfn_to_page(pfn); | ||
1537 | free_base_pfn = pfn; | ||
1538 | nr_free = 1; | ||
1539 | cond_resched(); | ||
1540 | } | ||
1541 | } | ||
1542 | /* Free the last block of pages to allocator */ | ||
1543 | nr_pages += __def_free(&nr_free, &free_base_pfn, &page); | ||
1544 | |||
1545 | return nr_pages; | ||
1546 | } | ||
1547 | |||
1446 | /* Initialise remaining memory on a node */ | 1548 | /* Initialise remaining memory on a node */ |
1447 | static int __init deferred_init_memmap(void *data) | 1549 | static int __init deferred_init_memmap(void *data) |
1448 | { | 1550 | { |
1449 | pg_data_t *pgdat = data; | 1551 | pg_data_t *pgdat = data; |
1450 | int nid = pgdat->node_id; | 1552 | int nid = pgdat->node_id; |
1451 | struct mminit_pfnnid_cache nid_init_state = { }; | ||
1452 | unsigned long start = jiffies; | 1553 | unsigned long start = jiffies; |
1453 | unsigned long nr_pages = 0; | 1554 | unsigned long nr_pages = 0; |
1454 | unsigned long walk_start, walk_end; | 1555 | unsigned long spfn, epfn; |
1455 | int i, zid; | 1556 | phys_addr_t spa, epa; |
1557 | int zid; | ||
1456 | struct zone *zone; | 1558 | struct zone *zone; |
1457 | unsigned long first_init_pfn = pgdat->first_deferred_pfn; | 1559 | unsigned long first_init_pfn = pgdat->first_deferred_pfn; |
1458 | const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); | 1560 | const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); |
1561 | u64 i; | ||
1459 | 1562 | ||
1460 | if (first_init_pfn == ULONG_MAX) { | 1563 | if (first_init_pfn == ULONG_MAX) { |
1461 | pgdat_init_report_one_done(); | 1564 | pgdat_init_report_one_done(); |
@@ -1477,83 +1580,12 @@ static int __init deferred_init_memmap(void *data) | |||
1477 | if (first_init_pfn < zone_end_pfn(zone)) | 1580 | if (first_init_pfn < zone_end_pfn(zone)) |
1478 | break; | 1581 | break; |
1479 | } | 1582 | } |
1583 | first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); | ||
1480 | 1584 | ||
1481 | for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { | 1585 | for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { |
1482 | unsigned long pfn, end_pfn; | 1586 | spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); |
1483 | struct page *page = NULL; | 1587 | epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); |
1484 | struct page *free_base_page = NULL; | 1588 | nr_pages += deferred_init_range(nid, zid, spfn, epfn); |
1485 | unsigned long free_base_pfn = 0; | ||
1486 | int nr_to_free = 0; | ||
1487 | |||
1488 | end_pfn = min(walk_end, zone_end_pfn(zone)); | ||
1489 | pfn = first_init_pfn; | ||
1490 | if (pfn < walk_start) | ||
1491 | pfn = walk_start; | ||
1492 | if (pfn < zone->zone_start_pfn) | ||
1493 | pfn = zone->zone_start_pfn; | ||
1494 | |||
1495 | for (; pfn < end_pfn; pfn++) { | ||
1496 | if (!pfn_valid_within(pfn)) | ||
1497 | goto free_range; | ||
1498 | |||
1499 | /* | ||
1500 | * Ensure pfn_valid is checked every | ||
1501 | * pageblock_nr_pages for memory holes | ||
1502 | */ | ||
1503 | if ((pfn & (pageblock_nr_pages - 1)) == 0) { | ||
1504 | if (!pfn_valid(pfn)) { | ||
1505 | page = NULL; | ||
1506 | goto free_range; | ||
1507 | } | ||
1508 | } | ||
1509 | |||
1510 | if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { | ||
1511 | page = NULL; | ||
1512 | goto free_range; | ||
1513 | } | ||
1514 | |||
1515 | /* Minimise pfn page lookups and scheduler checks */ | ||
1516 | if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { | ||
1517 | page++; | ||
1518 | } else { | ||
1519 | nr_pages += nr_to_free; | ||
1520 | deferred_free_range(free_base_page, | ||
1521 | free_base_pfn, nr_to_free); | ||
1522 | free_base_page = NULL; | ||
1523 | free_base_pfn = nr_to_free = 0; | ||
1524 | |||
1525 | page = pfn_to_page(pfn); | ||
1526 | cond_resched(); | ||
1527 | } | ||
1528 | |||
1529 | if (page->flags) { | ||
1530 | VM_BUG_ON(page_zone(page) != zone); | ||
1531 | goto free_range; | ||
1532 | } | ||
1533 | |||
1534 | __init_single_page(page, pfn, zid, nid); | ||
1535 | if (!free_base_page) { | ||
1536 | free_base_page = page; | ||
1537 | free_base_pfn = pfn; | ||
1538 | nr_to_free = 0; | ||
1539 | } | ||
1540 | nr_to_free++; | ||
1541 | |||
1542 | /* Where possible, batch up pages for a single free */ | ||
1543 | continue; | ||
1544 | free_range: | ||
1545 | /* Free the current block of pages to allocator */ | ||
1546 | nr_pages += nr_to_free; | ||
1547 | deferred_free_range(free_base_page, free_base_pfn, | ||
1548 | nr_to_free); | ||
1549 | free_base_page = NULL; | ||
1550 | free_base_pfn = nr_to_free = 0; | ||
1551 | } | ||
1552 | /* Free the last block of pages to allocator */ | ||
1553 | nr_pages += nr_to_free; | ||
1554 | deferred_free_range(free_base_page, free_base_pfn, nr_to_free); | ||
1555 | |||
1556 | first_init_pfn = max(end_pfn, first_init_pfn); | ||
1557 | } | 1589 | } |
1558 | 1590 | ||
1559 | /* Sanity check that the next zone really is unpopulated */ | 1591 | /* Sanity check that the next zone really is unpopulated */ |
@@ -1792,7 +1824,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags | |||
1792 | * Go through the free lists for the given migratetype and remove | 1824 | * Go through the free lists for the given migratetype and remove |
1793 | * the smallest available page from the freelists | 1825 | * the smallest available page from the freelists |
1794 | */ | 1826 | */ |
1795 | static inline | 1827 | static __always_inline |
1796 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | 1828 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
1797 | int migratetype) | 1829 | int migratetype) |
1798 | { | 1830 | { |
@@ -1836,7 +1868,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { | |||
1836 | }; | 1868 | }; |
1837 | 1869 | ||
1838 | #ifdef CONFIG_CMA | 1870 | #ifdef CONFIG_CMA |
1839 | static struct page *__rmqueue_cma_fallback(struct zone *zone, | 1871 | static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, |
1840 | unsigned int order) | 1872 | unsigned int order) |
1841 | { | 1873 | { |
1842 | return __rmqueue_smallest(zone, order, MIGRATE_CMA); | 1874 | return __rmqueue_smallest(zone, order, MIGRATE_CMA); |
@@ -2217,7 +2249,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, | |||
2217 | * deviation from the rest of this file, to make the for loop | 2249 | * deviation from the rest of this file, to make the for loop |
2218 | * condition simpler. | 2250 | * condition simpler. |
2219 | */ | 2251 | */ |
2220 | static inline bool | 2252 | static __always_inline bool |
2221 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | 2253 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) |
2222 | { | 2254 | { |
2223 | struct free_area *area; | 2255 | struct free_area *area; |
@@ -2289,8 +2321,8 @@ do_steal: | |||
2289 | * Do the hard work of removing an element from the buddy allocator. | 2321 | * Do the hard work of removing an element from the buddy allocator. |
2290 | * Call me with the zone->lock already held. | 2322 | * Call me with the zone->lock already held. |
2291 | */ | 2323 | */ |
2292 | static struct page *__rmqueue(struct zone *zone, unsigned int order, | 2324 | static __always_inline struct page * |
2293 | int migratetype) | 2325 | __rmqueue(struct zone *zone, unsigned int order, int migratetype) |
2294 | { | 2326 | { |
2295 | struct page *page; | 2327 | struct page *page; |
2296 | 2328 | ||
@@ -2315,7 +2347,7 @@ retry: | |||
2315 | */ | 2347 | */ |
2316 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 2348 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
2317 | unsigned long count, struct list_head *list, | 2349 | unsigned long count, struct list_head *list, |
2318 | int migratetype, bool cold) | 2350 | int migratetype) |
2319 | { | 2351 | { |
2320 | int i, alloced = 0; | 2352 | int i, alloced = 0; |
2321 | 2353 | ||
@@ -2329,19 +2361,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
2329 | continue; | 2361 | continue; |
2330 | 2362 | ||
2331 | /* | 2363 | /* |
2332 | * Split buddy pages returned by expand() are received here | 2364 | * Split buddy pages returned by expand() are received here in |
2333 | * in physical page order. The page is added to the callers and | 2365 | * physical page order. The page is added to the tail of |
2334 | * list and the list head then moves forward. From the callers | 2366 | * caller's list. From the callers perspective, the linked list |
2335 | * perspective, the linked list is ordered by page number in | 2367 | * is ordered by page number under some conditions. This is |
2336 | * some conditions. This is useful for IO devices that can | 2368 | * useful for IO devices that can forward direction from the |
2337 | * merge IO requests if the physical pages are ordered | 2369 | * head, thus also in the physical page order, and for IO |
2338 | * properly. | 2370 | * devices that can merge IO requests if the physical |
2371 | * pages are ordered properly. | ||
2339 | */ | 2372 | */ |
2340 | if (likely(!cold)) | 2373 | list_add_tail(&page->lru, list); |
2341 | list_add(&page->lru, list); | ||
2342 | else | ||
2343 | list_add_tail(&page->lru, list); | ||
2344 | list = &page->lru; | ||
2345 | alloced++; | 2374 | alloced++; |
2346 | if (is_migrate_cma(get_pcppage_migratetype(page))) | 2375 | if (is_migrate_cma(get_pcppage_migratetype(page))) |
2347 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, | 2376 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, |
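With the hot/cold hint gone, rmqueue_bulk() above always links pages at the tail of the caller's list, so a batch pulled from the buddy lists in ascending physical order stays in ascending order on the per-cpu list. A small userspace sketch of that property, using a cut-down copy of the kernel's list primitives; struct fake_page and the pfn values are invented for the demo:

#include <stddef.h>
#include <stdio.h>

/* Minimal circular doubly-linked list, modelled on <linux/list.h>. */
struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
    new->prev = head->prev;
    new->next = head;
    head->prev->next = new;
    head->prev = new;
}

struct fake_page {
    unsigned long pfn;
    struct list_head lru;
};

int main(void)
{
    struct fake_page pages[4] = { { 100 }, { 101 }, { 102 }, { 103 } };
    struct list_head pcp_list, *pos;
    int i;

    INIT_LIST_HEAD(&pcp_list);

    /*
     * rmqueue_bulk() hands pages over in ascending physical order;
     * adding each one at the tail keeps the caller's list in that
     * same order, which is what the reworked comment relies on.
     */
    for (i = 0; i < 4; i++)
        list_add_tail(&pages[i].lru, &pcp_list);

    for (pos = pcp_list.next; pos != &pcp_list; pos = pos->next) {
        struct fake_page *p = (struct fake_page *)((char *)pos -
                    offsetof(struct fake_page, lru));
        printf("pfn %lu\n", p->pfn);    /* prints 100..103 in order */
    }
    return 0;
}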
@@ -2590,24 +2619,25 @@ void mark_free_pages(struct zone *zone) | |||
2590 | } | 2619 | } |
2591 | #endif /* CONFIG_PM */ | 2620 | #endif /* CONFIG_PM */ |
2592 | 2621 | ||
2593 | /* | 2622 | static bool free_unref_page_prepare(struct page *page, unsigned long pfn) |
2594 | * Free a 0-order page | ||
2595 | * cold == true ? free a cold page : free a hot page | ||
2596 | */ | ||
2597 | void free_hot_cold_page(struct page *page, bool cold) | ||
2598 | { | 2623 | { |
2599 | struct zone *zone = page_zone(page); | ||
2600 | struct per_cpu_pages *pcp; | ||
2601 | unsigned long flags; | ||
2602 | unsigned long pfn = page_to_pfn(page); | ||
2603 | int migratetype; | 2624 | int migratetype; |
2604 | 2625 | ||
2605 | if (!free_pcp_prepare(page)) | 2626 | if (!free_pcp_prepare(page)) |
2606 | return; | 2627 | return false; |
2607 | 2628 | ||
2608 | migratetype = get_pfnblock_migratetype(page, pfn); | 2629 | migratetype = get_pfnblock_migratetype(page, pfn); |
2609 | set_pcppage_migratetype(page, migratetype); | 2630 | set_pcppage_migratetype(page, migratetype); |
2610 | local_irq_save(flags); | 2631 | return true; |
2632 | } | ||
2633 | |||
2634 | static void free_unref_page_commit(struct page *page, unsigned long pfn) | ||
2635 | { | ||
2636 | struct zone *zone = page_zone(page); | ||
2637 | struct per_cpu_pages *pcp; | ||
2638 | int migratetype; | ||
2639 | |||
2640 | migratetype = get_pcppage_migratetype(page); | ||
2611 | __count_vm_event(PGFREE); | 2641 | __count_vm_event(PGFREE); |
2612 | 2642 | ||
2613 | /* | 2643 | /* |
@@ -2620,38 +2650,62 @@ void free_hot_cold_page(struct page *page, bool cold) | |||
2620 | if (migratetype >= MIGRATE_PCPTYPES) { | 2650 | if (migratetype >= MIGRATE_PCPTYPES) { |
2621 | if (unlikely(is_migrate_isolate(migratetype))) { | 2651 | if (unlikely(is_migrate_isolate(migratetype))) { |
2622 | free_one_page(zone, page, pfn, 0, migratetype); | 2652 | free_one_page(zone, page, pfn, 0, migratetype); |
2623 | goto out; | 2653 | return; |
2624 | } | 2654 | } |
2625 | migratetype = MIGRATE_MOVABLE; | 2655 | migratetype = MIGRATE_MOVABLE; |
2626 | } | 2656 | } |
2627 | 2657 | ||
2628 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 2658 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
2629 | if (!cold) | 2659 | list_add(&page->lru, &pcp->lists[migratetype]); |
2630 | list_add(&page->lru, &pcp->lists[migratetype]); | ||
2631 | else | ||
2632 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | ||
2633 | pcp->count++; | 2660 | pcp->count++; |
2634 | if (pcp->count >= pcp->high) { | 2661 | if (pcp->count >= pcp->high) { |
2635 | unsigned long batch = READ_ONCE(pcp->batch); | 2662 | unsigned long batch = READ_ONCE(pcp->batch); |
2636 | free_pcppages_bulk(zone, batch, pcp); | 2663 | free_pcppages_bulk(zone, batch, pcp); |
2637 | pcp->count -= batch; | 2664 | pcp->count -= batch; |
2638 | } | 2665 | } |
2666 | } | ||
2639 | 2667 | ||
2640 | out: | 2668 | /* |
2669 | * Free a 0-order page | ||
2670 | */ | ||
2671 | void free_unref_page(struct page *page) | ||
2672 | { | ||
2673 | unsigned long flags; | ||
2674 | unsigned long pfn = page_to_pfn(page); | ||
2675 | |||
2676 | if (!free_unref_page_prepare(page, pfn)) | ||
2677 | return; | ||
2678 | |||
2679 | local_irq_save(flags); | ||
2680 | free_unref_page_commit(page, pfn); | ||
2641 | local_irq_restore(flags); | 2681 | local_irq_restore(flags); |
2642 | } | 2682 | } |
2643 | 2683 | ||
2644 | /* | 2684 | /* |
2645 | * Free a list of 0-order pages | 2685 | * Free a list of 0-order pages |
2646 | */ | 2686 | */ |
2647 | void free_hot_cold_page_list(struct list_head *list, bool cold) | 2687 | void free_unref_page_list(struct list_head *list) |
2648 | { | 2688 | { |
2649 | struct page *page, *next; | 2689 | struct page *page, *next; |
2690 | unsigned long flags, pfn; | ||
2691 | |||
2692 | /* Prepare pages for freeing */ | ||
2693 | list_for_each_entry_safe(page, next, list, lru) { | ||
2694 | pfn = page_to_pfn(page); | ||
2695 | if (!free_unref_page_prepare(page, pfn)) | ||
2696 | list_del(&page->lru); | ||
2697 | set_page_private(page, pfn); | ||
2698 | } | ||
2650 | 2699 | ||
2700 | local_irq_save(flags); | ||
2651 | list_for_each_entry_safe(page, next, list, lru) { | 2701 | list_for_each_entry_safe(page, next, list, lru) { |
2652 | trace_mm_page_free_batched(page, cold); | 2702 | unsigned long pfn = page_private(page); |
2653 | free_hot_cold_page(page, cold); | 2703 | |
2704 | set_page_private(page, 0); | ||
2705 | trace_mm_page_free_batched(page); | ||
2706 | free_unref_page_commit(page, pfn); | ||
2654 | } | 2707 | } |
2708 | local_irq_restore(flags); | ||
2655 | } | 2709 | } |
2656 | 2710 | ||
2657 | /* | 2711 | /* |
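free_hot_cold_page() is split above into free_unref_page_prepare(), which runs with interrupts enabled, and free_unref_page_commit(), which runs under a single local_irq_save()/restore() pair, so free_unref_page_list() pays one interrupt toggle per batch rather than one per page. A rough userspace analogue of the two-phase pattern, with a mutex standing in for the interrupt-disabled region and prepare()/commit() as made-up helpers:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define BATCH_MAX 16

/* Stand-in for the interrupt-disabled region around the per-cpu lists. */
static pthread_mutex_t pcp_lock = PTHREAD_MUTEX_INITIALIZER;
static int pcp_count;

/* Per-item sanity checks, done with "interrupts" still enabled
 * (cf. free_unref_page_prepare() above). */
static bool prepare(int item)
{
    return item >= 0;
}

/* The cheap bookkeeping done inside the protected region
 * (cf. free_unref_page_commit() above). */
static void commit(int item)
{
    (void)item;
    pcp_count++;
}

static void free_batch(const int *items, int n)
{
    bool ok[BATCH_MAX];
    int i;

    /* Phase 1: validate every item outside the critical section. */
    for (i = 0; i < n && i < BATCH_MAX; i++)
        ok[i] = prepare(items[i]);

    /* Phase 2: a single lock round-trip covers the whole batch,
     * mirroring the single local_irq_save()/restore() pair above. */
    pthread_mutex_lock(&pcp_lock);
    for (i = 0; i < n && i < BATCH_MAX; i++)
        if (ok[i])
            commit(items[i]);
    pthread_mutex_unlock(&pcp_lock);
}

int main(void)
{
    int items[] = { 1, -2, 3, 4 };

    free_batch(items, 4);
    printf("committed %d of 4 items\n", pcp_count);  /* 3: the -2 fails prepare() */
    return 0;
}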
@@ -2669,15 +2723,6 @@ void split_page(struct page *page, unsigned int order) | |||
2669 | VM_BUG_ON_PAGE(PageCompound(page), page); | 2723 | VM_BUG_ON_PAGE(PageCompound(page), page); |
2670 | VM_BUG_ON_PAGE(!page_count(page), page); | 2724 | VM_BUG_ON_PAGE(!page_count(page), page); |
2671 | 2725 | ||
2672 | #ifdef CONFIG_KMEMCHECK | ||
2673 | /* | ||
2674 | * Split shadow pages too, because free(page[0]) would | ||
2675 | * otherwise free the whole shadow. | ||
2676 | */ | ||
2677 | if (kmemcheck_page_is_tracked(page)) | ||
2678 | split_page(virt_to_page(page[0].shadow), order); | ||
2679 | #endif | ||
2680 | |||
2681 | for (i = 1; i < (1 << order); i++) | 2726 | for (i = 1; i < (1 << order); i++) |
2682 | set_page_refcounted(page + i); | 2727 | set_page_refcounted(page + i); |
2683 | split_page_owner(page, order); | 2728 | split_page_owner(page, order); |
@@ -2743,6 +2788,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) | |||
2743 | #ifdef CONFIG_NUMA | 2788 | #ifdef CONFIG_NUMA |
2744 | enum numa_stat_item local_stat = NUMA_LOCAL; | 2789 | enum numa_stat_item local_stat = NUMA_LOCAL; |
2745 | 2790 | ||
2791 | /* skip numa counters update if numa stats is disabled */ | ||
2792 | if (!static_branch_likely(&vm_numa_stat_key)) | ||
2793 | return; | ||
2794 | |||
2746 | if (z->node != numa_node_id()) | 2795 | if (z->node != numa_node_id()) |
2747 | local_stat = NUMA_OTHER; | 2796 | local_stat = NUMA_OTHER; |
2748 | 2797 | ||
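zone_statistics() above now returns immediately when NUMA stats are disabled via the vm_numa_stat_key static branch. Stripped of the jump-label machinery, the control flow is just a guarded early return, sketched below with a plain boolean; the counter names are illustrative only:

#include <stdbool.h>
#include <stdio.h>

/*
 * Plain-boolean stand-in for the vm_numa_stat_key static branch; the kernel
 * version patches the instruction stream so the disabled case costs only a
 * fall-through, but the control flow is identical.
 */
static bool numa_stats_enabled = true;

static unsigned long numa_local, numa_other;

static void zone_statistics_demo(int preferred_node, int node)
{
    /* Skip all NUMA counter updates when the feature is switched off. */
    if (!numa_stats_enabled)
        return;

    if (node == preferred_node)
        numa_local++;
    else
        numa_other++;
}

int main(void)
{
    zone_statistics_demo(0, 0);
    numa_stats_enabled = false;
    zone_statistics_demo(0, 1);     /* not counted */
    printf("local=%lu other=%lu\n", numa_local, numa_other);
    return 0;
}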
@@ -2758,7 +2807,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) | |||
2758 | 2807 | ||
2759 | /* Remove page from the per-cpu list, caller must protect the list */ | 2808 | /* Remove page from the per-cpu list, caller must protect the list */ |
2760 | static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, | 2809 | static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, |
2761 | bool cold, struct per_cpu_pages *pcp, | 2810 | struct per_cpu_pages *pcp, |
2762 | struct list_head *list) | 2811 | struct list_head *list) |
2763 | { | 2812 | { |
2764 | struct page *page; | 2813 | struct page *page; |
@@ -2767,16 +2816,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, | |||
2767 | if (list_empty(list)) { | 2816 | if (list_empty(list)) { |
2768 | pcp->count += rmqueue_bulk(zone, 0, | 2817 | pcp->count += rmqueue_bulk(zone, 0, |
2769 | pcp->batch, list, | 2818 | pcp->batch, list, |
2770 | migratetype, cold); | 2819 | migratetype); |
2771 | if (unlikely(list_empty(list))) | 2820 | if (unlikely(list_empty(list))) |
2772 | return NULL; | 2821 | return NULL; |
2773 | } | 2822 | } |
2774 | 2823 | ||
2775 | if (cold) | 2824 | page = list_first_entry(list, struct page, lru); |
2776 | page = list_last_entry(list, struct page, lru); | ||
2777 | else | ||
2778 | page = list_first_entry(list, struct page, lru); | ||
2779 | |||
2780 | list_del(&page->lru); | 2825 | list_del(&page->lru); |
2781 | pcp->count--; | 2826 | pcp->count--; |
2782 | } while (check_new_pcp(page)); | 2827 | } while (check_new_pcp(page)); |
@@ -2791,14 +2836,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, | |||
2791 | { | 2836 | { |
2792 | struct per_cpu_pages *pcp; | 2837 | struct per_cpu_pages *pcp; |
2793 | struct list_head *list; | 2838 | struct list_head *list; |
2794 | bool cold = ((gfp_flags & __GFP_COLD) != 0); | ||
2795 | struct page *page; | 2839 | struct page *page; |
2796 | unsigned long flags; | 2840 | unsigned long flags; |
2797 | 2841 | ||
2798 | local_irq_save(flags); | 2842 | local_irq_save(flags); |
2799 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 2843 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
2800 | list = &pcp->lists[migratetype]; | 2844 | list = &pcp->lists[migratetype]; |
2801 | page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); | 2845 | page = __rmqueue_pcplist(zone, migratetype, pcp, list); |
2802 | if (page) { | 2846 | if (page) { |
2803 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); | 2847 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
2804 | zone_statistics(preferred_zone, zone); | 2848 | zone_statistics(preferred_zone, zone); |
@@ -3006,9 +3050,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, | |||
3006 | if (!area->nr_free) | 3050 | if (!area->nr_free) |
3007 | continue; | 3051 | continue; |
3008 | 3052 | ||
3009 | if (alloc_harder) | ||
3010 | return true; | ||
3011 | |||
3012 | for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { | 3053 | for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { |
3013 | if (!list_empty(&area->free_list[mt])) | 3054 | if (!list_empty(&area->free_list[mt])) |
3014 | return true; | 3055 | return true; |
@@ -3020,6 +3061,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, | |||
3020 | return true; | 3061 | return true; |
3021 | } | 3062 | } |
3022 | #endif | 3063 | #endif |
3064 | if (alloc_harder && | ||
3065 | !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) | ||
3066 | return true; | ||
3023 | } | 3067 | } |
3024 | return false; | 3068 | return false; |
3025 | } | 3069 | } |
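The watermark hunk above stops declaring success for atomic/harder requests as soon as any free_area has pages; the reserve path now also requires a non-empty MIGRATE_HIGHATOMIC list at a suitable order. A toy model of the resulting loop, with made-up counts and simplified migratetype constants:

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 4
#define MIGRATE_PCPTYPES 3      /* unmovable, movable, reclaimable (simplified) */
#define MIGRATE_HIGHATOMIC 3
#define MIGRATE_TYPES 4

/* free_count[order][migratetype]: toy stand-in for zone->free_area[].free_list[]. */
static int free_count[MAX_ORDER][MIGRATE_TYPES];

static bool watermark_ok(unsigned int order, bool alloc_harder)
{
    unsigned int o, mt;

    for (o = order; o < MAX_ORDER; o++) {
        /* Normal requests may use any regular migratetype. */
        for (mt = 0; mt < MIGRATE_PCPTYPES; mt++)
            if (free_count[o][mt])
                return true;

        /*
         * Atomic/harder requests may additionally dip into the
         * highatomic reserve -- but only if that list really has
         * a block of a suitable order, which is the point of the
         * change above.
         */
        if (alloc_harder && free_count[o][MIGRATE_HIGHATOMIC])
            return true;
    }
    return false;
}

int main(void)
{
    free_count[2][MIGRATE_HIGHATOMIC] = 1;  /* only a highatomic order-2 block is free */

    printf("normal order-0: %d\n", watermark_ok(0, false)); /* 0: reserve is off-limits */
    printf("harder order-0: %d\n", watermark_ok(0, true));  /* 1: reserve may be used   */
    return 0;
}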
@@ -3235,20 +3279,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) | |||
3235 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) | 3279 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) |
3236 | return; | 3280 | return; |
3237 | 3281 | ||
3238 | pr_warn("%s: ", current->comm); | ||
3239 | |||
3240 | va_start(args, fmt); | 3282 | va_start(args, fmt); |
3241 | vaf.fmt = fmt; | 3283 | vaf.fmt = fmt; |
3242 | vaf.va = &args; | 3284 | vaf.va = &args; |
3243 | pr_cont("%pV", &vaf); | 3285 | pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", |
3286 | current->comm, &vaf, gfp_mask, &gfp_mask, | ||
3287 | nodemask_pr_args(nodemask)); | ||
3244 | va_end(args); | 3288 | va_end(args); |
3245 | 3289 | ||
3246 | pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); | ||
3247 | if (nodemask) | ||
3248 | pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); | ||
3249 | else | ||
3250 | pr_cont("(null)\n"); | ||
3251 | |||
3252 | cpuset_print_current_mems_allowed(); | 3290 | cpuset_print_current_mems_allowed(); |
3253 | 3291 | ||
3254 | dump_stack(); | 3292 | dump_stack(); |
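Folding the chained pr_cont() calls into a single pr_warn() with %pV keeps each allocation-failure line atomic with respect to concurrent printk users. A userspace approximation that renders everything into one buffer with vsnprintf() before a single write (the kernel avoids the temporary buffer via %pV and struct va_format); warn_alloc_demo() and the sample gfp value are invented for illustration:

#include <stdarg.h>
#include <stdio.h>

/*
 * Render the caller's format into one buffer and emit it with a single
 * write so concurrent failures cannot interleave their output the way
 * chained pr_cont() calls can.
 */
static void warn_alloc_demo(unsigned int gfp_mask, const char *fmt, ...)
{
    char msg[256];
    va_list args;

    va_start(args, fmt);
    vsnprintf(msg, sizeof(msg), fmt, args);
    va_end(args);

    fprintf(stderr, "demo-task: %s, mode:%#x\n", msg, gfp_mask);
}

int main(void)
{
    warn_alloc_demo(0x14200c2u, "page allocation failure: order:%u", 3u);
    return 0;
}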
@@ -3868,8 +3906,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
3868 | enum compact_result compact_result; | 3906 | enum compact_result compact_result; |
3869 | int compaction_retries; | 3907 | int compaction_retries; |
3870 | int no_progress_loops; | 3908 | int no_progress_loops; |
3871 | unsigned long alloc_start = jiffies; | ||
3872 | unsigned int stall_timeout = 10 * HZ; | ||
3873 | unsigned int cpuset_mems_cookie; | 3909 | unsigned int cpuset_mems_cookie; |
3874 | int reserve_flags; | 3910 | int reserve_flags; |
3875 | 3911 | ||
@@ -4001,14 +4037,6 @@ retry: | |||
4001 | if (!can_direct_reclaim) | 4037 | if (!can_direct_reclaim) |
4002 | goto nopage; | 4038 | goto nopage; |
4003 | 4039 | ||
4004 | /* Make sure we know about allocations which stall for too long */ | ||
4005 | if (time_after(jiffies, alloc_start + stall_timeout)) { | ||
4006 | warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, | ||
4007 | "page allocation stalls for %ums, order:%u", | ||
4008 | jiffies_to_msecs(jiffies-alloc_start), order); | ||
4009 | stall_timeout += 10 * HZ; | ||
4010 | } | ||
4011 | |||
4012 | /* Avoid recursion of direct reclaim */ | 4040 | /* Avoid recursion of direct reclaim */ |
4013 | if (current->flags & PF_MEMALLOC) | 4041 | if (current->flags & PF_MEMALLOC) |
4014 | goto nopage; | 4042 | goto nopage; |
@@ -4223,9 +4251,6 @@ out: | |||
4223 | page = NULL; | 4251 | page = NULL; |
4224 | } | 4252 | } |
4225 | 4253 | ||
4226 | if (kmemcheck_enabled && page) | ||
4227 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
4228 | |||
4229 | trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); | 4254 | trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); |
4230 | 4255 | ||
4231 | return page; | 4256 | return page; |
@@ -4262,7 +4287,7 @@ void __free_pages(struct page *page, unsigned int order) | |||
4262 | { | 4287 | { |
4263 | if (put_page_testzero(page)) { | 4288 | if (put_page_testzero(page)) { |
4264 | if (order == 0) | 4289 | if (order == 0) |
4265 | free_hot_cold_page(page, false); | 4290 | free_unref_page(page); |
4266 | else | 4291 | else |
4267 | __free_pages_ok(page, order); | 4292 | __free_pages_ok(page, order); |
4268 | } | 4293 | } |
@@ -4320,7 +4345,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) | |||
4320 | unsigned int order = compound_order(page); | 4345 | unsigned int order = compound_order(page); |
4321 | 4346 | ||
4322 | if (order == 0) | 4347 | if (order == 0) |
4323 | free_hot_cold_page(page, false); | 4348 | free_unref_page(page); |
4324 | else | 4349 | else |
4325 | __free_pages_ok(page, order); | 4350 | __free_pages_ok(page, order); |
4326 | } | 4351 | } |
@@ -6126,6 +6151,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
6126 | } | 6151 | } |
6127 | } | 6152 | } |
6128 | 6153 | ||
6154 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
6129 | static void __ref alloc_node_mem_map(struct pglist_data *pgdat) | 6155 | static void __ref alloc_node_mem_map(struct pglist_data *pgdat) |
6130 | { | 6156 | { |
6131 | unsigned long __maybe_unused start = 0; | 6157 | unsigned long __maybe_unused start = 0; |
@@ -6135,7 +6161,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) | |||
6135 | if (!pgdat->node_spanned_pages) | 6161 | if (!pgdat->node_spanned_pages) |
6136 | return; | 6162 | return; |
6137 | 6163 | ||
6138 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
6139 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); | 6164 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); |
6140 | offset = pgdat->node_start_pfn - start; | 6165 | offset = pgdat->node_start_pfn - start; |
6141 | /* ia64 gets its own node_mem_map, before this, without bootmem */ | 6166 | /* ia64 gets its own node_mem_map, before this, without bootmem */ |
@@ -6157,6 +6182,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) | |||
6157 | pgdat->node_id); | 6182 | pgdat->node_id); |
6158 | pgdat->node_mem_map = map + offset; | 6183 | pgdat->node_mem_map = map + offset; |
6159 | } | 6184 | } |
6185 | pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", | ||
6186 | __func__, pgdat->node_id, (unsigned long)pgdat, | ||
6187 | (unsigned long)pgdat->node_mem_map); | ||
6160 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 6188 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
6161 | /* | 6189 | /* |
6162 | * With no DISCONTIG, the global mem_map is just set as node 0's | 6190 | * With no DISCONTIG, the global mem_map is just set as node 0's |
@@ -6169,8 +6197,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) | |||
6169 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 6197 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
6170 | } | 6198 | } |
6171 | #endif | 6199 | #endif |
6172 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | ||
6173 | } | 6200 | } |
6201 | #else | ||
6202 | static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } | ||
6203 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | ||
6174 | 6204 | ||
6175 | void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | 6205 | void __paginginit free_area_init_node(int nid, unsigned long *zones_size, |
6176 | unsigned long node_start_pfn, unsigned long *zholes_size) | 6206 | unsigned long node_start_pfn, unsigned long *zholes_size) |
@@ -6197,16 +6227,49 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
6197 | zones_size, zholes_size); | 6227 | zones_size, zholes_size); |
6198 | 6228 | ||
6199 | alloc_node_mem_map(pgdat); | 6229 | alloc_node_mem_map(pgdat); |
6200 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
6201 | printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", | ||
6202 | nid, (unsigned long)pgdat, | ||
6203 | (unsigned long)pgdat->node_mem_map); | ||
6204 | #endif | ||
6205 | 6230 | ||
6206 | reset_deferred_meminit(pgdat); | 6231 | reset_deferred_meminit(pgdat); |
6207 | free_area_init_core(pgdat); | 6232 | free_area_init_core(pgdat); |
6208 | } | 6233 | } |
6209 | 6234 | ||
6235 | #ifdef CONFIG_HAVE_MEMBLOCK | ||
6236 | /* | ||
6237 | * Only struct pages that are backed by physical memory are zeroed and | ||
6238 | * initialized by going through __init_single_page(). But, there are some | ||
6239 | * struct pages which are reserved in memblock allocator and their fields | ||
6240 | * may be accessed (for example page_to_pfn() on some configuration accesses | ||
6241 | * flags). We must explicitly zero those struct pages. | ||
6242 | */ | ||
6243 | void __paginginit zero_resv_unavail(void) | ||
6244 | { | ||
6245 | phys_addr_t start, end; | ||
6246 | unsigned long pfn; | ||
6247 | u64 i, pgcnt; | ||
6248 | |||
6249 | /* | ||
6250 | * Loop through ranges that are reserved, but do not have reported | ||
6251 | * physical memory backing. | ||
6252 | */ | ||
6253 | pgcnt = 0; | ||
6254 | for_each_resv_unavail_range(i, &start, &end) { | ||
6255 | for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { | ||
6256 | mm_zero_struct_page(pfn_to_page(pfn)); | ||
6257 | pgcnt++; | ||
6258 | } | ||
6259 | } | ||
6260 | |||
6261 | /* | ||
6262 | * Struct pages that do not have backing memory. This could be because | ||
6263 | * firmware is using some of this memory, or for some other reasons. | ||
6264 | * Once memblock is changed so such behaviour is not allowed: i.e. | ||
6265 | * list of "reserved" memory must be a subset of list of "memory", then | ||
6266 | * this code can be removed. | ||
6267 | */ | ||
6268 | if (pgcnt) | ||
6269 | pr_info("Reserved but unavailable: %lld pages", pgcnt); | ||
6270 | } | ||
6271 | #endif /* CONFIG_HAVE_MEMBLOCK */ | ||
6272 | |||
6210 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 6273 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
6211 | 6274 | ||
6212 | #if MAX_NUMNODES > 1 | 6275 | #if MAX_NUMNODES > 1 |
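zero_resv_unavail() above walks each reserved-but-unavailable range from PFN_DOWN(start) to PFN_UP(end) so that a page only partially covered by the range is still zeroed. The rounding is shown in a small standalone program; the addresses are arbitrary and PAGE_SHIFT is assumed to be 12:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

int main(void)
{
    /* A reserved-but-unavailable range that is not page aligned. */
    unsigned long long start = 0x1200;  /* inside pfn 1 */
    unsigned long long end   = 0x3010;  /* spills into pfn 3 */
    unsigned long pfn, count = 0;

    /*
     * Rounding start down and end up makes the walk cover every
     * struct page that overlaps the range at all: a partially
     * covered page still has to be zeroed.
     */
    for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
        printf("would zero struct page for pfn %lu\n", pfn);
        count++;
    }

    printf("pages touched: %lu\n", count);  /* 3: pfns 1, 2 and 3 */
    return 0;
}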
@@ -6630,6 +6693,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
6630 | node_set_state(nid, N_MEMORY); | 6693 | node_set_state(nid, N_MEMORY); |
6631 | check_for_memory(pgdat, nid); | 6694 | check_for_memory(pgdat, nid); |
6632 | } | 6695 | } |
6696 | zero_resv_unavail(); | ||
6633 | } | 6697 | } |
6634 | 6698 | ||
6635 | static int __init cmdline_parse_core(char *p, unsigned long *core) | 6699 | static int __init cmdline_parse_core(char *p, unsigned long *core) |
@@ -6793,6 +6857,7 @@ void __init free_area_init(unsigned long *zones_size) | |||
6793 | { | 6857 | { |
6794 | free_area_init_node(0, zones_size, | 6858 | free_area_init_node(0, zones_size, |
6795 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 6859 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
6860 | zero_resv_unavail(); | ||
6796 | } | 6861 | } |
6797 | 6862 | ||
6798 | static int page_alloc_cpu_dead(unsigned int cpu) | 6863 | static int page_alloc_cpu_dead(unsigned int cpu) |
@@ -7305,18 +7370,17 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
7305 | 7370 | ||
7306 | log2qty = ilog2(numentries); | 7371 | log2qty = ilog2(numentries); |
7307 | 7372 | ||
7308 | /* | ||
7309 | * memblock allocator returns zeroed memory already, so HASH_ZERO is | ||
7310 | * currently not used when HASH_EARLY is specified. | ||
7311 | */ | ||
7312 | gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; | 7373 | gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; |
7313 | do { | 7374 | do { |
7314 | size = bucketsize << log2qty; | 7375 | size = bucketsize << log2qty; |
7315 | if (flags & HASH_EARLY) | 7376 | if (flags & HASH_EARLY) { |
7316 | table = memblock_virt_alloc_nopanic(size, 0); | 7377 | if (flags & HASH_ZERO) |
7317 | else if (hashdist) | 7378 | table = memblock_virt_alloc_nopanic(size, 0); |
7379 | else | ||
7380 | table = memblock_virt_alloc_raw(size, 0); | ||
7381 | } else if (hashdist) { | ||
7318 | table = __vmalloc(size, gfp_flags, PAGE_KERNEL); | 7382 | table = __vmalloc(size, gfp_flags, PAGE_KERNEL); |
7319 | else { | 7383 | } else { |
7320 | /* | 7384 | /* |
7321 | * If bucketsize is not a power-of-two, we may free | 7385 | * If bucketsize is not a power-of-two, we may free |
7322 | * some pages at the end of hash table which | 7386 | * some pages at the end of hash table which |
@@ -7353,10 +7417,10 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
7353 | * race condition. So you can't expect this function should be exact. | 7417 | * race condition. So you can't expect this function should be exact. |
7354 | */ | 7418 | */ |
7355 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | 7419 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, |
7420 | int migratetype, | ||
7356 | bool skip_hwpoisoned_pages) | 7421 | bool skip_hwpoisoned_pages) |
7357 | { | 7422 | { |
7358 | unsigned long pfn, iter, found; | 7423 | unsigned long pfn, iter, found; |
7359 | int mt; | ||
7360 | 7424 | ||
7361 | /* | 7425 | /* |
7362 | * For avoiding noise data, lru_add_drain_all() should be called | 7426 | * For avoiding noise data, lru_add_drain_all() should be called |
@@ -7364,8 +7428,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
7364 | */ | 7428 | */ |
7365 | if (zone_idx(zone) == ZONE_MOVABLE) | 7429 | if (zone_idx(zone) == ZONE_MOVABLE) |
7366 | return false; | 7430 | return false; |
7367 | mt = get_pageblock_migratetype(page); | 7431 | |
7368 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) | 7432 | /* |
7433 | * CMA allocations (alloc_contig_range) really need to mark isolate | ||
7434 | * CMA pageblocks even when they are not movable in fact so consider | ||
7435 | * them movable here. | ||
7436 | */ | ||
7437 | if (is_migrate_cma(migratetype) && | ||
7438 | is_migrate_cma(get_pageblock_migratetype(page))) | ||
7369 | return false; | 7439 | return false; |
7370 | 7440 | ||
7371 | pfn = page_to_pfn(page); | 7441 | pfn = page_to_pfn(page); |
@@ -7377,6 +7447,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
7377 | 7447 | ||
7378 | page = pfn_to_page(check); | 7448 | page = pfn_to_page(check); |
7379 | 7449 | ||
7450 | if (PageReserved(page)) | ||
7451 | return true; | ||
7452 | |||
7380 | /* | 7453 | /* |
7381 | * Hugepages are not in LRU lists, but they're movable. | 7454 | * Hugepages are not in LRU lists, but they're movable. |
7382 | * We need not scan over tail pages bacause we don't | 7455 | * We need not scan over tail pages bacause we don't |
@@ -7450,7 +7523,7 @@ bool is_pageblock_removable_nolock(struct page *page) | |||
7450 | if (!zone_spans_pfn(zone, pfn)) | 7523 | if (!zone_spans_pfn(zone, pfn)) |
7451 | return false; | 7524 | return false; |
7452 | 7525 | ||
7453 | return !has_unmovable_pages(zone, page, 0, true); | 7526 | return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); |
7454 | } | 7527 | } |
7455 | 7528 | ||
7456 | #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) | 7529 | #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) |
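With the new migratetype argument, has_unmovable_pages() only takes the CMA shortcut when the isolation request itself is for CMA and the pageblock is CMA; a plain MIGRATE_MOVABLE caller must scan the block. A toy model of just that decision, with simplified enum values and an is_migrate_cma() stand-in:

#include <stdbool.h>
#include <stdio.h>

enum migratetype { MIGRATE_MOVABLE, MIGRATE_CMA, MIGRATE_OTHER };

static bool is_migrate_cma(enum migratetype mt)
{
    return mt == MIGRATE_CMA;
}

/*
 * Toy model of the new check: a CMA pageblock is presumed movable only
 * when the isolation request itself comes from CMA (alloc_contig_range);
 * a MIGRATE_MOVABLE request no longer gets that shortcut and must scan
 * the block for unmovable pages.
 */
static bool skip_scan(enum migratetype request_mt, enum migratetype block_mt)
{
    return is_migrate_cma(request_mt) && is_migrate_cma(block_mt);
}

int main(void)
{
    printf("CMA request on CMA block:     %d\n",
           skip_scan(MIGRATE_CMA, MIGRATE_CMA));        /* 1 */
    printf("movable request on CMA block: %d\n",
           skip_scan(MIGRATE_MOVABLE, MIGRATE_CMA));    /* 0 */
    return 0;
}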
diff --git a/mm/page_ext.c b/mm/page_ext.c index 4f0367d472c4..2c16216c29b6 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c | |||
@@ -125,7 +125,6 @@ struct page_ext *lookup_page_ext(struct page *page) | |||
125 | struct page_ext *base; | 125 | struct page_ext *base; |
126 | 126 | ||
127 | base = NODE_DATA(page_to_nid(page))->node_page_ext; | 127 | base = NODE_DATA(page_to_nid(page))->node_page_ext; |
128 | #if defined(CONFIG_DEBUG_VM) | ||
129 | /* | 128 | /* |
130 | * The sanity checks the page allocator does upon freeing a | 129 | * The sanity checks the page allocator does upon freeing a |
131 | * page can reach here before the page_ext arrays are | 130 | * page can reach here before the page_ext arrays are |
@@ -134,7 +133,6 @@ struct page_ext *lookup_page_ext(struct page *page) | |||
134 | */ | 133 | */ |
135 | if (unlikely(!base)) | 134 | if (unlikely(!base)) |
136 | return NULL; | 135 | return NULL; |
137 | #endif | ||
138 | index = pfn - round_down(node_start_pfn(page_to_nid(page)), | 136 | index = pfn - round_down(node_start_pfn(page_to_nid(page)), |
139 | MAX_ORDER_NR_PAGES); | 137 | MAX_ORDER_NR_PAGES); |
140 | return get_entry(base, index); | 138 | return get_entry(base, index); |
@@ -199,7 +197,6 @@ struct page_ext *lookup_page_ext(struct page *page) | |||
199 | { | 197 | { |
200 | unsigned long pfn = page_to_pfn(page); | 198 | unsigned long pfn = page_to_pfn(page); |
201 | struct mem_section *section = __pfn_to_section(pfn); | 199 | struct mem_section *section = __pfn_to_section(pfn); |
202 | #if defined(CONFIG_DEBUG_VM) | ||
203 | /* | 200 | /* |
204 | * The sanity checks the page allocator does upon freeing a | 201 | * The sanity checks the page allocator does upon freeing a |
205 | * page can reach here before the page_ext arrays are | 202 | * page can reach here before the page_ext arrays are |
@@ -208,7 +205,6 @@ struct page_ext *lookup_page_ext(struct page *page) | |||
208 | */ | 205 | */ |
209 | if (!section->page_ext) | 206 | if (!section->page_ext) |
210 | return NULL; | 207 | return NULL; |
211 | #endif | ||
212 | return get_entry(section->page_ext, pfn); | 208 | return get_entry(section->page_ext, pfn); |
213 | } | 209 | } |
214 | 210 | ||
diff --git a/mm/page_io.c b/mm/page_io.c index cd52b9cc169b..e93f1a4cacd7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -347,7 +347,7 @@ out: | |||
347 | return ret; | 347 | return ret; |
348 | } | 348 | } |
349 | 349 | ||
350 | int swap_readpage(struct page *page, bool do_poll) | 350 | int swap_readpage(struct page *page, bool synchronous) |
351 | { | 351 | { |
352 | struct bio *bio; | 352 | struct bio *bio; |
353 | int ret = 0; | 353 | int ret = 0; |
@@ -355,7 +355,7 @@ int swap_readpage(struct page *page, bool do_poll) | |||
355 | blk_qc_t qc; | 355 | blk_qc_t qc; |
356 | struct gendisk *disk; | 356 | struct gendisk *disk; |
357 | 357 | ||
358 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); | 358 | VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); |
359 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 359 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
360 | VM_BUG_ON_PAGE(PageUptodate(page), page); | 360 | VM_BUG_ON_PAGE(PageUptodate(page), page); |
361 | if (frontswap_load(page) == 0) { | 361 | if (frontswap_load(page) == 0) { |
@@ -403,7 +403,7 @@ int swap_readpage(struct page *page, bool do_poll) | |||
403 | count_vm_event(PSWPIN); | 403 | count_vm_event(PSWPIN); |
404 | bio_get(bio); | 404 | bio_get(bio); |
405 | qc = submit_bio(bio); | 405 | qc = submit_bio(bio); |
406 | while (do_poll) { | 406 | while (synchronous) { |
407 | set_current_state(TASK_UNINTERRUPTIBLE); | 407 | set_current_state(TASK_UNINTERRUPTIBLE); |
408 | if (!READ_ONCE(bio->bi_private)) | 408 | if (!READ_ONCE(bio->bi_private)) |
409 | break; | 409 | break; |
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 44f213935bf6..165ed8117bd1 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -15,7 +15,7 @@ | |||
15 | #define CREATE_TRACE_POINTS | 15 | #define CREATE_TRACE_POINTS |
16 | #include <trace/events/page_isolation.h> | 16 | #include <trace/events/page_isolation.h> |
17 | 17 | ||
18 | static int set_migratetype_isolate(struct page *page, | 18 | static int set_migratetype_isolate(struct page *page, int migratetype, |
19 | bool skip_hwpoisoned_pages) | 19 | bool skip_hwpoisoned_pages) |
20 | { | 20 | { |
21 | struct zone *zone; | 21 | struct zone *zone; |
@@ -52,7 +52,7 @@ static int set_migratetype_isolate(struct page *page, | |||
52 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | 52 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. |
53 | * We just check MOVABLE pages. | 53 | * We just check MOVABLE pages. |
54 | */ | 54 | */ |
55 | if (!has_unmovable_pages(zone, page, arg.pages_found, | 55 | if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, |
56 | skip_hwpoisoned_pages)) | 56 | skip_hwpoisoned_pages)) |
57 | ret = 0; | 57 | ret = 0; |
58 | 58 | ||
@@ -64,14 +64,14 @@ static int set_migratetype_isolate(struct page *page, | |||
64 | out: | 64 | out: |
65 | if (!ret) { | 65 | if (!ret) { |
66 | unsigned long nr_pages; | 66 | unsigned long nr_pages; |
67 | int migratetype = get_pageblock_migratetype(page); | 67 | int mt = get_pageblock_migratetype(page); |
68 | 68 | ||
69 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | 69 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); |
70 | zone->nr_isolate_pageblock++; | 70 | zone->nr_isolate_pageblock++; |
71 | nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, | 71 | nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, |
72 | NULL); | 72 | NULL); |
73 | 73 | ||
74 | __mod_zone_freepage_state(zone, -nr_pages, migratetype); | 74 | __mod_zone_freepage_state(zone, -nr_pages, mt); |
75 | } | 75 | } |
76 | 76 | ||
77 | spin_unlock_irqrestore(&zone->lock, flags); | 77 | spin_unlock_irqrestore(&zone->lock, flags); |
@@ -183,7 +183,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | |||
183 | pfn += pageblock_nr_pages) { | 183 | pfn += pageblock_nr_pages) { |
184 | page = __first_valid_page(pfn, pageblock_nr_pages); | 184 | page = __first_valid_page(pfn, pageblock_nr_pages); |
185 | if (page && | 185 | if (page && |
186 | set_migratetype_isolate(page, skip_hwpoisoned_pages)) { | 186 | set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) { |
187 | undo_pfn = pfn; | 187 | undo_pfn = pfn; |
188 | goto undo; | 188 | goto undo; |
189 | } | 189 | } |
diff --git a/mm/page_owner.c b/mm/page_owner.c index 4f44b95b9d1e..8592543a0f15 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c | |||
@@ -20,9 +20,9 @@ | |||
20 | #define PAGE_OWNER_STACK_DEPTH (16) | 20 | #define PAGE_OWNER_STACK_DEPTH (16) |
21 | 21 | ||
22 | struct page_owner { | 22 | struct page_owner { |
23 | unsigned int order; | 23 | unsigned short order; |
24 | short last_migrate_reason; | ||
24 | gfp_t gfp_mask; | 25 | gfp_t gfp_mask; |
25 | int last_migrate_reason; | ||
26 | depot_stack_handle_t handle; | 26 | depot_stack_handle_t handle; |
27 | }; | 27 | }; |
28 | 28 | ||
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 15dab691ea70..9158e5a81391 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c | |||
@@ -81,7 +81,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, | |||
81 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, | 81 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, |
82 | struct page **pages, int page_start, int page_end) | 82 | struct page **pages, int page_start, int page_end) |
83 | { | 83 | { |
84 | const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; | 84 | const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM; |
85 | unsigned int cpu, tcpu; | 85 | unsigned int cpu, tcpu; |
86 | int i; | 86 | int i; |
87 | 87 | ||
@@ -899,7 +899,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
899 | mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); | 899 | mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); |
900 | 900 | ||
901 | while (page_vma_mapped_walk(&pvmw)) { | 901 | while (page_vma_mapped_walk(&pvmw)) { |
902 | unsigned long cstart, cend; | 902 | unsigned long cstart; |
903 | int ret = 0; | 903 | int ret = 0; |
904 | 904 | ||
905 | cstart = address = pvmw.address; | 905 | cstart = address = pvmw.address; |
@@ -915,7 +915,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
915 | entry = pte_wrprotect(entry); | 915 | entry = pte_wrprotect(entry); |
916 | entry = pte_mkclean(entry); | 916 | entry = pte_mkclean(entry); |
917 | set_pte_at(vma->vm_mm, address, pte, entry); | 917 | set_pte_at(vma->vm_mm, address, pte, entry); |
918 | cend = cstart + PAGE_SIZE; | ||
919 | ret = 1; | 918 | ret = 1; |
920 | } else { | 919 | } else { |
921 | #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE | 920 | #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE |
@@ -931,7 +930,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
931 | entry = pmd_mkclean(entry); | 930 | entry = pmd_mkclean(entry); |
932 | set_pmd_at(vma->vm_mm, address, pmd, entry); | 931 | set_pmd_at(vma->vm_mm, address, pmd, entry); |
933 | cstart &= PMD_MASK; | 932 | cstart &= PMD_MASK; |
934 | cend = cstart + PMD_SIZE; | ||
935 | ret = 1; | 933 | ret = 1; |
936 | #else | 934 | #else |
937 | /* unexpected pmd-mapped page? */ | 935 | /* unexpected pmd-mapped page? */ |
@@ -939,10 +937,15 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
939 | #endif | 937 | #endif |
940 | } | 938 | } |
941 | 939 | ||
942 | if (ret) { | 940 | /* |
943 | mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend); | 941 | * No need to call mmu_notifier_invalidate_range() as we are |
942 | * downgrading page table protection not changing it to point | ||
943 | * to a new page. | ||
944 | * | ||
945 | * See Documentation/vm/mmu_notifier.txt | ||
946 | */ | ||
947 | if (ret) | ||
944 | (*cleaned)++; | 948 | (*cleaned)++; |
945 | } | ||
946 | } | 949 | } |
947 | 950 | ||
948 | mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); | 951 | mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); |
@@ -1318,7 +1321,7 @@ void page_remove_rmap(struct page *page, bool compound) | |||
1318 | * It would be tidy to reset the PageAnon mapping here, | 1321 | * It would be tidy to reset the PageAnon mapping here, |
1319 | * but that might overwrite a racing page_add_anon_rmap | 1322 | * but that might overwrite a racing page_add_anon_rmap |
1320 | * which increments mapcount after us but sets mapping | 1323 | * which increments mapcount after us but sets mapping |
1321 | * before us: so leave the reset to free_hot_cold_page, | 1324 | * before us: so leave the reset to free_unref_page, |
1322 | * and remember that it's only reliable while mapped. | 1325 | * and remember that it's only reliable while mapped. |
1323 | * Leaving it set also helps swapoff to reinstate ptes | 1326 | * Leaving it set also helps swapoff to reinstate ptes |
1324 | * faster for those pages still in swapcache. | 1327 | * faster for those pages still in swapcache. |
@@ -1426,6 +1429,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1426 | if (pte_soft_dirty(pteval)) | 1429 | if (pte_soft_dirty(pteval)) |
1427 | swp_pte = pte_swp_mksoft_dirty(swp_pte); | 1430 | swp_pte = pte_swp_mksoft_dirty(swp_pte); |
1428 | set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); | 1431 | set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); |
1432 | /* | ||
1433 | * No need to invalidate here; it will synchronize | ||
1434 | * against the special swap migration pte. | ||
1435 | */ | ||
1429 | goto discard; | 1436 | goto discard; |
1430 | } | 1437 | } |
1431 | 1438 | ||
@@ -1483,6 +1490,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1483 | * will take care of the rest. | 1490 | * will take care of the rest. |
1484 | */ | 1491 | */ |
1485 | dec_mm_counter(mm, mm_counter(page)); | 1492 | dec_mm_counter(mm, mm_counter(page)); |
1493 | /* We have to invalidate as we cleared the pte */ | ||
1494 | mmu_notifier_invalidate_range(mm, address, | ||
1495 | address + PAGE_SIZE); | ||
1486 | } else if (IS_ENABLED(CONFIG_MIGRATION) && | 1496 | } else if (IS_ENABLED(CONFIG_MIGRATION) && |
1487 | (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { | 1497 | (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { |
1488 | swp_entry_t entry; | 1498 | swp_entry_t entry; |
@@ -1498,6 +1508,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1498 | if (pte_soft_dirty(pteval)) | 1508 | if (pte_soft_dirty(pteval)) |
1499 | swp_pte = pte_swp_mksoft_dirty(swp_pte); | 1509 | swp_pte = pte_swp_mksoft_dirty(swp_pte); |
1500 | set_pte_at(mm, address, pvmw.pte, swp_pte); | 1510 | set_pte_at(mm, address, pvmw.pte, swp_pte); |
1511 | /* | ||
1512 | * No need to invalidate here; it will synchronize | ||
1513 | * against the special swap migration pte. | ||
1514 | */ | ||
1501 | } else if (PageAnon(page)) { | 1515 | } else if (PageAnon(page)) { |
1502 | swp_entry_t entry = { .val = page_private(subpage) }; | 1516 | swp_entry_t entry = { .val = page_private(subpage) }; |
1503 | pte_t swp_pte; | 1517 | pte_t swp_pte; |
@@ -1509,6 +1523,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1509 | WARN_ON_ONCE(1); | 1523 | WARN_ON_ONCE(1); |
1510 | ret = false; | 1524 | ret = false; |
1511 | /* We have to invalidate as we cleared the pte */ | 1525 | /* We have to invalidate as we cleared the pte */ |
1526 | mmu_notifier_invalidate_range(mm, address, | ||
1527 | address + PAGE_SIZE); | ||
1512 | page_vma_mapped_walk_done(&pvmw); | 1528 | page_vma_mapped_walk_done(&pvmw); |
1513 | break; | 1529 | break; |
1514 | } | 1530 | } |
@@ -1516,6 +1532,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1516 | /* MADV_FREE page check */ | 1532 | /* MADV_FREE page check */ |
1517 | if (!PageSwapBacked(page)) { | 1533 | if (!PageSwapBacked(page)) { |
1518 | if (!PageDirty(page)) { | 1534 | if (!PageDirty(page)) { |
1535 | /* Invalidate as we cleared the pte */ | ||
1536 | mmu_notifier_invalidate_range(mm, | ||
1537 | address, address + PAGE_SIZE); | ||
1519 | dec_mm_counter(mm, MM_ANONPAGES); | 1538 | dec_mm_counter(mm, MM_ANONPAGES); |
1520 | goto discard; | 1539 | goto discard; |
1521 | } | 1540 | } |
@@ -1549,13 +1568,39 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1549 | if (pte_soft_dirty(pteval)) | 1568 | if (pte_soft_dirty(pteval)) |
1550 | swp_pte = pte_swp_mksoft_dirty(swp_pte); | 1569 | swp_pte = pte_swp_mksoft_dirty(swp_pte); |
1551 | set_pte_at(mm, address, pvmw.pte, swp_pte); | 1570 | set_pte_at(mm, address, pvmw.pte, swp_pte); |
1552 | } else | 1571 | /* Invalidate as we cleared the pte */ |
1572 | mmu_notifier_invalidate_range(mm, address, | ||
1573 | address + PAGE_SIZE); | ||
1574 | } else { | ||
1575 | /* | ||
1576 | * We should not need to notify here as we reach this | ||
1577 | * case only from freeze_page(), which is itself only called | ||
1578 | * from split_huge_page_to_list(), so everything below must | ||
1579 | * be true: | ||
1580 | * - page is not anonymous | ||
1581 | * - page is locked | ||
1582 | * | ||
1583 | * So as it is a locked file-backed page, it cannot be | ||
1584 | * removed from the page cache and replaced by a new | ||
1585 | * page before mmu_notifier_invalidate_range_end, so no | ||
1586 | * concurrent thread might update its page table to | ||
1587 | * point at a new page while a device is still using this | ||
1588 | * page. | ||
1589 | * | ||
1590 | * See Documentation/vm/mmu_notifier.txt | ||
1591 | */ | ||
1553 | dec_mm_counter(mm, mm_counter_file(page)); | 1592 | dec_mm_counter(mm, mm_counter_file(page)); |
1593 | } | ||
1554 | discard: | 1594 | discard: |
1595 | /* | ||
1596 | * No need to call mmu_notifier_invalidate_range() as it has been | ||
1597 | * done above for all cases requiring it to happen under page | ||
1598 | * table lock before mmu_notifier_invalidate_range_end() | ||
1599 | * | ||
1600 | * See Documentation/vm/mmu_notifier.txt | ||
1601 | */ | ||
1555 | page_remove_rmap(subpage, PageHuge(page)); | 1602 | page_remove_rmap(subpage, PageHuge(page)); |
1556 | put_page(page); | 1603 | put_page(page); |
1557 | mmu_notifier_invalidate_range(mm, address, | ||
1558 | address + PAGE_SIZE); | ||
1559 | } | 1604 | } |
1560 | 1605 | ||
1561 | mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); | 1606 | mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); |
diff --git a/mm/shmem.c b/mm/shmem.c index 07a1d22807be..ab22eaa2412e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -338,7 +338,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, | |||
338 | if (item != expected) | 338 | if (item != expected) |
339 | return -ENOENT; | 339 | return -ENOENT; |
340 | __radix_tree_replace(&mapping->page_tree, node, pslot, | 340 | __radix_tree_replace(&mapping->page_tree, node, pslot, |
341 | replacement, NULL, NULL); | 341 | replacement, NULL); |
342 | return 0; | 342 | return 0; |
343 | } | 343 | } |
344 | 344 | ||
@@ -747,7 +747,7 @@ void shmem_unlock_mapping(struct address_space *mapping) | |||
747 | pgoff_t indices[PAGEVEC_SIZE]; | 747 | pgoff_t indices[PAGEVEC_SIZE]; |
748 | pgoff_t index = 0; | 748 | pgoff_t index = 0; |
749 | 749 | ||
750 | pagevec_init(&pvec, 0); | 750 | pagevec_init(&pvec); |
751 | /* | 751 | /* |
752 | * Minor point, but we might as well stop if someone else SHM_LOCKs it. | 752 | * Minor point, but we might as well stop if someone else SHM_LOCKs it. |
753 | */ | 753 | */ |
@@ -790,7 +790,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
790 | if (lend == -1) | 790 | if (lend == -1) |
791 | end = -1; /* unsigned, so actually very big */ | 791 | end = -1; /* unsigned, so actually very big */ |
792 | 792 | ||
793 | pagevec_init(&pvec, 0); | 793 | pagevec_init(&pvec); |
794 | index = start; | 794 | index = start; |
795 | while (index < end) { | 795 | while (index < end) { |
796 | pvec.nr = find_get_entries(mapping, index, | 796 | pvec.nr = find_get_entries(mapping, index, |
@@ -2528,7 +2528,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, | |||
2528 | bool done = false; | 2528 | bool done = false; |
2529 | int i; | 2529 | int i; |
2530 | 2530 | ||
2531 | pagevec_init(&pvec, 0); | 2531 | pagevec_init(&pvec); |
2532 | pvec.nr = 1; /* start small: we may be there already */ | 2532 | pvec.nr = 1; /* start small: we may be there already */ |
2533 | while (!done) { | 2533 | while (!done) { |
2534 | pvec.nr = find_get_entries(mapping, index, | 2534 | pvec.nr = find_get_entries(mapping, index, |
@@ -3862,12 +3862,11 @@ static void shmem_init_inode(void *foo) | |||
3862 | inode_init_once(&info->vfs_inode); | 3862 | inode_init_once(&info->vfs_inode); |
3863 | } | 3863 | } |
3864 | 3864 | ||
3865 | static int shmem_init_inodecache(void) | 3865 | static void shmem_init_inodecache(void) |
3866 | { | 3866 | { |
3867 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", | 3867 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", |
3868 | sizeof(struct shmem_inode_info), | 3868 | sizeof(struct shmem_inode_info), |
3869 | 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); | 3869 | 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); |
3870 | return 0; | ||
3871 | } | 3870 | } |
3872 | 3871 | ||
3873 | static void shmem_destroy_inodecache(void) | 3872 | static void shmem_destroy_inodecache(void) |
@@ -3991,9 +3990,7 @@ int __init shmem_init(void) | |||
3991 | if (shmem_inode_cachep) | 3990 | if (shmem_inode_cachep) |
3992 | return 0; | 3991 | return 0; |
3993 | 3992 | ||
3994 | error = shmem_init_inodecache(); | 3993 | shmem_init_inodecache(); |
3995 | if (error) | ||
3996 | goto out3; | ||
3997 | 3994 | ||
3998 | error = register_filesystem(&shmem_fs_type); | 3995 | error = register_filesystem(&shmem_fs_type); |
3999 | if (error) { | 3996 | if (error) { |
@@ -4020,7 +4017,6 @@ out1: | |||
4020 | unregister_filesystem(&shmem_fs_type); | 4017 | unregister_filesystem(&shmem_fs_type); |
4021 | out2: | 4018 | out2: |
4022 | shmem_destroy_inodecache(); | 4019 | shmem_destroy_inodecache(); |
4023 | out3: | ||
4024 | shm_mnt = ERR_PTR(error); | 4020 | shm_mnt = ERR_PTR(error); |
4025 | return error; | 4021 | return error; |
4026 | } | 4022 | } |
@@ -4102,6 +4098,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) | |||
4102 | if (i_size >= HPAGE_PMD_SIZE && | 4098 | if (i_size >= HPAGE_PMD_SIZE && |
4103 | i_size >> PAGE_SHIFT >= off) | 4099 | i_size >> PAGE_SHIFT >= off) |
4104 | return true; | 4100 | return true; |
4101 | /* fall through */ | ||
4105 | case SHMEM_HUGE_ADVISE: | 4102 | case SHMEM_HUGE_ADVISE: |
4106 | /* TODO: implement fadvise() hints */ | 4103 | /* TODO: implement fadvise() hints */ |
4107 | return (vma->vm_flags & VM_HUGEPAGE); | 4104 | return (vma->vm_flags & VM_HUGEPAGE); |
@@ -114,7 +114,6 @@ | |||
114 | #include <linux/rtmutex.h> | 114 | #include <linux/rtmutex.h> |
115 | #include <linux/reciprocal_div.h> | 115 | #include <linux/reciprocal_div.h> |
116 | #include <linux/debugobjects.h> | 116 | #include <linux/debugobjects.h> |
117 | #include <linux/kmemcheck.h> | ||
118 | #include <linux/memory.h> | 117 | #include <linux/memory.h> |
119 | #include <linux/prefetch.h> | 118 | #include <linux/prefetch.h> |
120 | #include <linux/sched/task_stack.h> | 119 | #include <linux/sched/task_stack.h> |
@@ -252,8 +251,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) | |||
252 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ | 251 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ |
253 | } while (0) | 252 | } while (0) |
254 | 253 | ||
255 | #define CFLGS_OBJFREELIST_SLAB (0x40000000UL) | 254 | #define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U) |
256 | #define CFLGS_OFF_SLAB (0x80000000UL) | 255 | #define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U) |
257 | #define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) | 256 | #define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) |
258 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) | 257 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) |
259 | 258 | ||
@@ -441,7 +440,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | |||
441 | * Calculate the number of objects and left-over bytes for a given buffer size. | 440 | * Calculate the number of objects and left-over bytes for a given buffer size. |
442 | */ | 441 | */ |
443 | static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, | 442 | static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, |
444 | unsigned long flags, size_t *left_over) | 443 | slab_flags_t flags, size_t *left_over) |
445 | { | 444 | { |
446 | unsigned int num; | 445 | unsigned int num; |
447 | size_t slab_size = PAGE_SIZE << gfporder; | 446 | size_t slab_size = PAGE_SIZE << gfporder; |
@@ -1410,10 +1409,8 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, | |||
1410 | int nr_pages; | 1409 | int nr_pages; |
1411 | 1410 | ||
1412 | flags |= cachep->allocflags; | 1411 | flags |= cachep->allocflags; |
1413 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | ||
1414 | flags |= __GFP_RECLAIMABLE; | ||
1415 | 1412 | ||
1416 | page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); | 1413 | page = __alloc_pages_node(nodeid, flags, cachep->gfporder); |
1417 | if (!page) { | 1414 | if (!page) { |
1418 | slab_out_of_memory(cachep, flags, nodeid); | 1415 | slab_out_of_memory(cachep, flags, nodeid); |
1419 | return NULL; | 1416 | return NULL; |
@@ -1435,15 +1432,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, | |||
1435 | if (sk_memalloc_socks() && page_is_pfmemalloc(page)) | 1432 | if (sk_memalloc_socks() && page_is_pfmemalloc(page)) |
1436 | SetPageSlabPfmemalloc(page); | 1433 | SetPageSlabPfmemalloc(page); |
1437 | 1434 | ||
1438 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | ||
1439 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | ||
1440 | |||
1441 | if (cachep->ctor) | ||
1442 | kmemcheck_mark_uninitialized_pages(page, nr_pages); | ||
1443 | else | ||
1444 | kmemcheck_mark_unallocated_pages(page, nr_pages); | ||
1445 | } | ||
1446 | |||
1447 | return page; | 1435 | return page; |
1448 | } | 1436 | } |
1449 | 1437 | ||
@@ -1455,8 +1443,6 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) | |||
1455 | int order = cachep->gfporder; | 1443 | int order = cachep->gfporder; |
1456 | unsigned long nr_freed = (1 << order); | 1444 | unsigned long nr_freed = (1 << order); |
1457 | 1445 | ||
1458 | kmemcheck_free_shadow(page, order); | ||
1459 | |||
1460 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1446 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1461 | mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); | 1447 | mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); |
1462 | else | 1448 | else |
@@ -1761,7 +1747,7 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) | |||
1761 | * towards high-order requests, this should be changed. | 1747 | * towards high-order requests, this should be changed. |
1762 | */ | 1748 | */ |
1763 | static size_t calculate_slab_order(struct kmem_cache *cachep, | 1749 | static size_t calculate_slab_order(struct kmem_cache *cachep, |
1764 | size_t size, unsigned long flags) | 1750 | size_t size, slab_flags_t flags) |
1765 | { | 1751 | { |
1766 | size_t left_over = 0; | 1752 | size_t left_over = 0; |
1767 | int gfporder; | 1753 | int gfporder; |
@@ -1888,8 +1874,8 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
1888 | return 0; | 1874 | return 0; |
1889 | } | 1875 | } |
1890 | 1876 | ||
1891 | unsigned long kmem_cache_flags(unsigned long object_size, | 1877 | slab_flags_t kmem_cache_flags(unsigned long object_size, |
1892 | unsigned long flags, const char *name, | 1878 | slab_flags_t flags, const char *name, |
1893 | void (*ctor)(void *)) | 1879 | void (*ctor)(void *)) |
1894 | { | 1880 | { |
1895 | return flags; | 1881 | return flags; |
@@ -1897,7 +1883,7 @@ unsigned long kmem_cache_flags(unsigned long object_size, | |||
1897 | 1883 | ||
1898 | struct kmem_cache * | 1884 | struct kmem_cache * |
1899 | __kmem_cache_alias(const char *name, size_t size, size_t align, | 1885 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
1900 | unsigned long flags, void (*ctor)(void *)) | 1886 | slab_flags_t flags, void (*ctor)(void *)) |
1901 | { | 1887 | { |
1902 | struct kmem_cache *cachep; | 1888 | struct kmem_cache *cachep; |
1903 | 1889 | ||
@@ -1915,7 +1901,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, | |||
1915 | } | 1901 | } |
1916 | 1902 | ||
1917 | static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, | 1903 | static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, |
1918 | size_t size, unsigned long flags) | 1904 | size_t size, slab_flags_t flags) |
1919 | { | 1905 | { |
1920 | size_t left; | 1906 | size_t left; |
1921 | 1907 | ||
@@ -1938,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, | |||
1938 | } | 1924 | } |
1939 | 1925 | ||
1940 | static bool set_off_slab_cache(struct kmem_cache *cachep, | 1926 | static bool set_off_slab_cache(struct kmem_cache *cachep, |
1941 | size_t size, unsigned long flags) | 1927 | size_t size, slab_flags_t flags) |
1942 | { | 1928 | { |
1943 | size_t left; | 1929 | size_t left; |
1944 | 1930 | ||
@@ -1972,7 +1958,7 @@ static bool set_off_slab_cache(struct kmem_cache *cachep, | |||
1972 | } | 1958 | } |
1973 | 1959 | ||
1974 | static bool set_on_slab_cache(struct kmem_cache *cachep, | 1960 | static bool set_on_slab_cache(struct kmem_cache *cachep, |
1975 | size_t size, unsigned long flags) | 1961 | size_t size, slab_flags_t flags) |
1976 | { | 1962 | { |
1977 | size_t left; | 1963 | size_t left; |
1978 | 1964 | ||
@@ -2008,8 +1994,7 @@ static bool set_on_slab_cache(struct kmem_cache *cachep, | |||
2008 | * cacheline. This can be beneficial if you're counting cycles as closely | 1994 | * cacheline. This can be beneficial if you're counting cycles as closely |
2009 | * as davem. | 1995 | * as davem. |
2010 | */ | 1996 | */ |
2011 | int | 1997 | int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) |
2012 | __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | ||
2013 | { | 1998 | { |
2014 | size_t ralign = BYTES_PER_WORD; | 1999 | size_t ralign = BYTES_PER_WORD; |
2015 | gfp_t gfp; | 2000 | gfp_t gfp; |
@@ -2144,6 +2129,8 @@ done: | |||
2144 | cachep->allocflags = __GFP_COMP; | 2129 | cachep->allocflags = __GFP_COMP; |
2145 | if (flags & SLAB_CACHE_DMA) | 2130 | if (flags & SLAB_CACHE_DMA) |
2146 | cachep->allocflags |= GFP_DMA; | 2131 | cachep->allocflags |= GFP_DMA; |
2132 | if (flags & SLAB_RECLAIM_ACCOUNT) | ||
2133 | cachep->allocflags |= __GFP_RECLAIMABLE; | ||
2147 | cachep->size = size; | 2134 | cachep->size = size; |
2148 | cachep->reciprocal_buffer_size = reciprocal_value(size); | 2135 | cachep->reciprocal_buffer_size = reciprocal_value(size); |
2149 | 2136 | ||
@@ -3516,8 +3503,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, | |||
3516 | kmemleak_free_recursive(objp, cachep->flags); | 3503 | kmemleak_free_recursive(objp, cachep->flags); |
3517 | objp = cache_free_debugcheck(cachep, objp, caller); | 3504 | objp = cache_free_debugcheck(cachep, objp, caller); |
3518 | 3505 | ||
3519 | kmemcheck_slab_free(cachep, objp, cachep->object_size); | ||
3520 | |||
3521 | /* | 3506 | /* |
3522 | * Skip calling cache_free_alien() when the platform is not numa. | 3507 | * Skip calling cache_free_alien() when the platform is not numa. |
3523 | * This will avoid cache misses that happen while accessing slabp (which | 3508 | * This will avoid cache misses that happen while accessing slabp (which |
@@ -4097,7 +4082,6 @@ out: | |||
4097 | schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); | 4082 | schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); |
4098 | } | 4083 | } |
4099 | 4084 | ||
4100 | #ifdef CONFIG_SLABINFO | ||
4101 | void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) | 4085 | void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) |
4102 | { | 4086 | { |
4103 | unsigned long active_objs, num_objs, active_slabs; | 4087 | unsigned long active_objs, num_objs, active_slabs; |
@@ -4405,7 +4389,6 @@ static int __init slab_proc_init(void) | |||
4405 | return 0; | 4389 | return 0; |
4406 | } | 4390 | } |
4407 | module_init(slab_proc_init); | 4391 | module_init(slab_proc_init); |
4408 | #endif | ||
4409 | 4392 | ||
4410 | #ifdef CONFIG_HARDENED_USERCOPY | 4393 | #ifdef CONFIG_HARDENED_USERCOPY |
4411 | /* | 4394 | /* |
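As a hedged illustration of the mm/slab.c change above that ORs __GFP_RECLAIMABLE into cachep->allocflags: a minimal sketch of a cache created with SLAB_RECLAIM_ACCOUNT. The cache name, object size and init function below are made up for illustration and are not part of this patch.

#include <linux/init.h>
#include <linux/slab.h>

static struct kmem_cache *demo_cache;

static int __init demo_cache_init(void)
{
	/* With the hunk above applied, SLAB also propagates this flag into
	 * cachep->allocflags as __GFP_RECLAIMABLE, so backing pages are
	 * grouped with other reclaimable allocations and the cache is
	 * skipped by dump_unreclaimable_slab() added later in this diff. */
	demo_cache = kmem_cache_create("demo_reclaimable", 128, 0,
				       SLAB_RECLAIM_ACCOUNT, NULL);
	return demo_cache ? 0 : -ENOMEM;
}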
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -21,7 +21,7 @@ struct kmem_cache { | |||
21 | unsigned int object_size;/* The original size of the object */ | 21 | unsigned int object_size;/* The original size of the object */ |
22 | unsigned int size; /* The aligned/padded/added on size */ | 22 | unsigned int size; /* The aligned/padded/added on size */ |
23 | unsigned int align; /* Alignment as calculated */ | 23 | unsigned int align; /* Alignment as calculated */ |
24 | unsigned long flags; /* Active flags on the slab */ | 24 | slab_flags_t flags; /* Active flags on the slab */ |
25 | const char *name; /* Slab name for sysfs */ | 25 | const char *name; /* Slab name for sysfs */ |
26 | int refcount; /* Use counter */ | 26 | int refcount; /* Use counter */ |
27 | void (*ctor)(void *); /* Called on object slot creation */ | 27 | void (*ctor)(void *); /* Called on object slot creation */ |
@@ -40,7 +40,6 @@ struct kmem_cache { | |||
40 | 40 | ||
41 | #include <linux/memcontrol.h> | 41 | #include <linux/memcontrol.h> |
42 | #include <linux/fault-inject.h> | 42 | #include <linux/fault-inject.h> |
43 | #include <linux/kmemcheck.h> | ||
44 | #include <linux/kasan.h> | 43 | #include <linux/kasan.h> |
45 | #include <linux/kmemleak.h> | 44 | #include <linux/kmemleak.h> |
46 | #include <linux/random.h> | 45 | #include <linux/random.h> |
@@ -79,13 +78,13 @@ extern const struct kmalloc_info_struct { | |||
79 | unsigned long size; | 78 | unsigned long size; |
80 | } kmalloc_info[]; | 79 | } kmalloc_info[]; |
81 | 80 | ||
82 | unsigned long calculate_alignment(unsigned long flags, | 81 | unsigned long calculate_alignment(slab_flags_t flags, |
83 | unsigned long align, unsigned long size); | 82 | unsigned long align, unsigned long size); |
84 | 83 | ||
85 | #ifndef CONFIG_SLOB | 84 | #ifndef CONFIG_SLOB |
86 | /* Kmalloc array related functions */ | 85 | /* Kmalloc array related functions */ |
87 | void setup_kmalloc_cache_index_table(void); | 86 | void setup_kmalloc_cache_index_table(void); |
88 | void create_kmalloc_caches(unsigned long); | 87 | void create_kmalloc_caches(slab_flags_t); |
89 | 88 | ||
90 | /* Find the kmalloc slab corresponding for a certain size */ | 89 | /* Find the kmalloc slab corresponding for a certain size */ |
91 | struct kmem_cache *kmalloc_slab(size_t, gfp_t); | 90 | struct kmem_cache *kmalloc_slab(size_t, gfp_t); |
@@ -93,32 +92,32 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t); | |||
93 | 92 | ||
94 | 93 | ||
95 | /* Functions provided by the slab allocators */ | 94 | /* Functions provided by the slab allocators */ |
96 | extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); | 95 | int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); |
97 | 96 | ||
98 | extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, | 97 | extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, |
99 | unsigned long flags); | 98 | slab_flags_t flags); |
100 | extern void create_boot_cache(struct kmem_cache *, const char *name, | 99 | extern void create_boot_cache(struct kmem_cache *, const char *name, |
101 | size_t size, unsigned long flags); | 100 | size_t size, slab_flags_t flags); |
102 | 101 | ||
103 | int slab_unmergeable(struct kmem_cache *s); | 102 | int slab_unmergeable(struct kmem_cache *s); |
104 | struct kmem_cache *find_mergeable(size_t size, size_t align, | 103 | struct kmem_cache *find_mergeable(size_t size, size_t align, |
105 | unsigned long flags, const char *name, void (*ctor)(void *)); | 104 | slab_flags_t flags, const char *name, void (*ctor)(void *)); |
106 | #ifndef CONFIG_SLOB | 105 | #ifndef CONFIG_SLOB |
107 | struct kmem_cache * | 106 | struct kmem_cache * |
108 | __kmem_cache_alias(const char *name, size_t size, size_t align, | 107 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
109 | unsigned long flags, void (*ctor)(void *)); | 108 | slab_flags_t flags, void (*ctor)(void *)); |
110 | 109 | ||
111 | unsigned long kmem_cache_flags(unsigned long object_size, | 110 | slab_flags_t kmem_cache_flags(unsigned long object_size, |
112 | unsigned long flags, const char *name, | 111 | slab_flags_t flags, const char *name, |
113 | void (*ctor)(void *)); | 112 | void (*ctor)(void *)); |
114 | #else | 113 | #else |
115 | static inline struct kmem_cache * | 114 | static inline struct kmem_cache * |
116 | __kmem_cache_alias(const char *name, size_t size, size_t align, | 115 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
117 | unsigned long flags, void (*ctor)(void *)) | 116 | slab_flags_t flags, void (*ctor)(void *)) |
118 | { return NULL; } | 117 | { return NULL; } |
119 | 118 | ||
120 | static inline unsigned long kmem_cache_flags(unsigned long object_size, | 119 | static inline slab_flags_t kmem_cache_flags(unsigned long object_size, |
121 | unsigned long flags, const char *name, | 120 | slab_flags_t flags, const char *name, |
122 | void (*ctor)(void *)) | 121 | void (*ctor)(void *)) |
123 | { | 122 | { |
124 | return flags; | 123 | return flags; |
@@ -142,10 +141,10 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, | |||
142 | #if defined(CONFIG_SLAB) | 141 | #if defined(CONFIG_SLAB) |
143 | #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ | 142 | #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ |
144 | SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ | 143 | SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ |
145 | SLAB_NOTRACK | SLAB_ACCOUNT) | 144 | SLAB_ACCOUNT) |
146 | #elif defined(CONFIG_SLUB) | 145 | #elif defined(CONFIG_SLUB) |
147 | #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ | 146 | #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ |
148 | SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT) | 147 | SLAB_TEMPORARY | SLAB_ACCOUNT) |
149 | #else | 148 | #else |
150 | #define SLAB_CACHE_FLAGS (0) | 149 | #define SLAB_CACHE_FLAGS (0) |
151 | #endif | 150 | #endif |
@@ -164,7 +163,6 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, | |||
164 | SLAB_NOLEAKTRACE | \ | 163 | SLAB_NOLEAKTRACE | \ |
165 | SLAB_RECLAIM_ACCOUNT | \ | 164 | SLAB_RECLAIM_ACCOUNT | \ |
166 | SLAB_TEMPORARY | \ | 165 | SLAB_TEMPORARY | \ |
167 | SLAB_NOTRACK | \ | ||
168 | SLAB_ACCOUNT) | 166 | SLAB_ACCOUNT) |
169 | 167 | ||
170 | int __kmem_cache_shutdown(struct kmem_cache *); | 168 | int __kmem_cache_shutdown(struct kmem_cache *); |
@@ -439,7 +437,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, | |||
439 | for (i = 0; i < size; i++) { | 437 | for (i = 0; i < size; i++) { |
440 | void *object = p[i]; | 438 | void *object = p[i]; |
441 | 439 | ||
442 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | ||
443 | kmemleak_alloc_recursive(object, s->object_size, 1, | 440 | kmemleak_alloc_recursive(object, s->object_size, 1, |
444 | s->flags, flags); | 441 | s->flags, flags); |
445 | kasan_slab_alloc(s, object, flags); | 442 | kasan_slab_alloc(s, object, flags); |
@@ -506,6 +503,14 @@ void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos); | |||
506 | void memcg_slab_stop(struct seq_file *m, void *p); | 503 | void memcg_slab_stop(struct seq_file *m, void *p); |
507 | int memcg_slab_show(struct seq_file *m, void *p); | 504 | int memcg_slab_show(struct seq_file *m, void *p); |
508 | 505 | ||
506 | #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) | ||
507 | void dump_unreclaimable_slab(void); | ||
508 | #else | ||
509 | static inline void dump_unreclaimable_slab(void) | ||
510 | { | ||
511 | } | ||
512 | #endif | ||
513 | |||
509 | void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); | 514 | void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); |
510 | 515 | ||
511 | #ifdef CONFIG_SLAB_FREELIST_RANDOM | 516 | #ifdef CONFIG_SLAB_FREELIST_RANDOM |
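All of the mm/slab.h prototypes above switch from unsigned long to slab_flags_t; the type itself is introduced outside this mm-only diffstat (presumably in include/linux/types.h). A sketch of what that companion definition is assumed to look like, consistent with the __force casts used later in the mm/slub.c hunks; the exact location and spelling are assumptions, not taken from this diff.

/* Assumed companion change, not shown here: a sparse __bitwise type so
 * that mixing raw integers with slab flags triggers a sparse warning. */
typedef unsigned __bitwise slab_flags_t;

/* One consequence visible in this diff: printing the flags with %lx now
 * needs an explicit cast, as in the panic() call in kmem_cache_open(). */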
diff --git a/mm/slab_common.c b/mm/slab_common.c index 0d7fe71ff5e4..c8cb36774ba1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -44,7 +44,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, | |||
44 | SLAB_FAILSLAB | SLAB_KASAN) | 44 | SLAB_FAILSLAB | SLAB_KASAN) |
45 | 45 | ||
46 | #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ | 46 | #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ |
47 | SLAB_NOTRACK | SLAB_ACCOUNT) | 47 | SLAB_ACCOUNT) |
48 | 48 | ||
49 | /* | 49 | /* |
50 | * Merge control. If this is set then no merging of slab caches will occur. | 50 | * Merge control. If this is set then no merging of slab caches will occur. |
@@ -291,7 +291,7 @@ int slab_unmergeable(struct kmem_cache *s) | |||
291 | } | 291 | } |
292 | 292 | ||
293 | struct kmem_cache *find_mergeable(size_t size, size_t align, | 293 | struct kmem_cache *find_mergeable(size_t size, size_t align, |
294 | unsigned long flags, const char *name, void (*ctor)(void *)) | 294 | slab_flags_t flags, const char *name, void (*ctor)(void *)) |
295 | { | 295 | { |
296 | struct kmem_cache *s; | 296 | struct kmem_cache *s; |
297 | 297 | ||
@@ -341,7 +341,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, | |||
341 | * Figure out what the alignment of the objects will be given a set of | 341 | * Figure out what the alignment of the objects will be given a set of |
342 | * flags, a user specified alignment and the size of the objects. | 342 | * flags, a user specified alignment and the size of the objects. |
343 | */ | 343 | */ |
344 | unsigned long calculate_alignment(unsigned long flags, | 344 | unsigned long calculate_alignment(slab_flags_t flags, |
345 | unsigned long align, unsigned long size) | 345 | unsigned long align, unsigned long size) |
346 | { | 346 | { |
347 | /* | 347 | /* |
@@ -366,7 +366,7 @@ unsigned long calculate_alignment(unsigned long flags, | |||
366 | 366 | ||
367 | static struct kmem_cache *create_cache(const char *name, | 367 | static struct kmem_cache *create_cache(const char *name, |
368 | size_t object_size, size_t size, size_t align, | 368 | size_t object_size, size_t size, size_t align, |
369 | unsigned long flags, void (*ctor)(void *), | 369 | slab_flags_t flags, void (*ctor)(void *), |
370 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) | 370 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) |
371 | { | 371 | { |
372 | struct kmem_cache *s; | 372 | struct kmem_cache *s; |
@@ -431,7 +431,7 @@ out_free_cache: | |||
431 | */ | 431 | */ |
432 | struct kmem_cache * | 432 | struct kmem_cache * |
433 | kmem_cache_create(const char *name, size_t size, size_t align, | 433 | kmem_cache_create(const char *name, size_t size, size_t align, |
434 | unsigned long flags, void (*ctor)(void *)) | 434 | slab_flags_t flags, void (*ctor)(void *)) |
435 | { | 435 | { |
436 | struct kmem_cache *s = NULL; | 436 | struct kmem_cache *s = NULL; |
437 | const char *cache_name; | 437 | const char *cache_name; |
@@ -879,7 +879,7 @@ bool slab_is_available(void) | |||
879 | #ifndef CONFIG_SLOB | 879 | #ifndef CONFIG_SLOB |
880 | /* Create a cache during boot when no slab services are available yet */ | 880 | /* Create a cache during boot when no slab services are available yet */ |
881 | void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, | 881 | void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, |
882 | unsigned long flags) | 882 | slab_flags_t flags) |
883 | { | 883 | { |
884 | int err; | 884 | int err; |
885 | 885 | ||
@@ -899,7 +899,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz | |||
899 | } | 899 | } |
900 | 900 | ||
901 | struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, | 901 | struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, |
902 | unsigned long flags) | 902 | slab_flags_t flags) |
903 | { | 903 | { |
904 | struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | 904 | struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); |
905 | 905 | ||
@@ -1057,7 +1057,7 @@ void __init setup_kmalloc_cache_index_table(void) | |||
1057 | } | 1057 | } |
1058 | } | 1058 | } |
1059 | 1059 | ||
1060 | static void __init new_kmalloc_cache(int idx, unsigned long flags) | 1060 | static void __init new_kmalloc_cache(int idx, slab_flags_t flags) |
1061 | { | 1061 | { |
1062 | kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, | 1062 | kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, |
1063 | kmalloc_info[idx].size, flags); | 1063 | kmalloc_info[idx].size, flags); |
@@ -1068,7 +1068,7 @@ static void __init new_kmalloc_cache(int idx, unsigned long flags) | |||
1068 | * may already have been created because they were needed to | 1068 | * may already have been created because they were needed to |
1069 | * enable allocations for slab creation. | 1069 | * enable allocations for slab creation. |
1070 | */ | 1070 | */ |
1071 | void __init create_kmalloc_caches(unsigned long flags) | 1071 | void __init create_kmalloc_caches(slab_flags_t flags) |
1072 | { | 1072 | { |
1073 | int i; | 1073 | int i; |
1074 | 1074 | ||
@@ -1184,8 +1184,7 @@ void cache_random_seq_destroy(struct kmem_cache *cachep) | |||
1184 | } | 1184 | } |
1185 | #endif /* CONFIG_SLAB_FREELIST_RANDOM */ | 1185 | #endif /* CONFIG_SLAB_FREELIST_RANDOM */ |
1186 | 1186 | ||
1187 | #ifdef CONFIG_SLABINFO | 1187 | #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) |
1188 | |||
1189 | #ifdef CONFIG_SLAB | 1188 | #ifdef CONFIG_SLAB |
1190 | #define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR) | 1189 | #define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR) |
1191 | #else | 1190 | #else |
@@ -1281,7 +1280,41 @@ static int slab_show(struct seq_file *m, void *p) | |||
1281 | return 0; | 1280 | return 0; |
1282 | } | 1281 | } |
1283 | 1282 | ||
1284 | #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) | 1283 | void dump_unreclaimable_slab(void) |
1284 | { | ||
1285 | struct kmem_cache *s, *s2; | ||
1286 | struct slabinfo sinfo; | ||
1287 | |||
1288 | /* | ||
1289 | * Acquiring slab_mutex here is risky since we would rather not | ||
1290 | * sleep in the OOM path, yet walking slab_caches without the | ||
1291 | * mutex risks a crash if the list changes under us. | ||
1292 | * Use mutex_trylock to protect the traversal, and dump nothing | ||
1293 | * if the mutex cannot be taken. | ||
1294 | */ | ||
1295 | if (!mutex_trylock(&slab_mutex)) { | ||
1296 | pr_warn("excessive unreclaimable slab but cannot dump stats\n"); | ||
1297 | return; | ||
1298 | } | ||
1299 | |||
1300 | pr_info("Unreclaimable slab info:\n"); | ||
1301 | pr_info("Name Used Total\n"); | ||
1302 | |||
1303 | list_for_each_entry_safe(s, s2, &slab_caches, list) { | ||
1304 | if (!is_root_cache(s) || (s->flags & SLAB_RECLAIM_ACCOUNT)) | ||
1305 | continue; | ||
1306 | |||
1307 | get_slabinfo(s, &sinfo); | ||
1308 | |||
1309 | if (sinfo.num_objs > 0) | ||
1310 | pr_info("%-17s %10luKB %10luKB\n", cache_name(s), | ||
1311 | (sinfo.active_objs * s->size) / 1024, | ||
1312 | (sinfo.num_objs * s->size) / 1024); | ||
1313 | } | ||
1314 | mutex_unlock(&slab_mutex); | ||
1315 | } | ||
1316 | |||
1317 | #if defined(CONFIG_MEMCG) | ||
1285 | void *memcg_slab_start(struct seq_file *m, loff_t *pos) | 1318 | void *memcg_slab_start(struct seq_file *m, loff_t *pos) |
1286 | { | 1319 | { |
1287 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 1320 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
@@ -1355,7 +1388,7 @@ static int __init slab_proc_init(void) | |||
1355 | return 0; | 1388 | return 0; |
1356 | } | 1389 | } |
1357 | module_init(slab_proc_init); | 1390 | module_init(slab_proc_init); |
1358 | #endif /* CONFIG_SLABINFO */ | 1391 | #endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */ |
1359 | 1392 | ||
1360 | static __always_inline void *__do_krealloc(const void *p, size_t new_size, | 1393 | static __always_inline void *__do_krealloc(const void *p, size_t new_size, |
1361 | gfp_t flags) | 1394 | gfp_t flags) |
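dump_unreclaimable_slab() above is declared in mm/slab.h but none of its callers appear in these hunks; a hedged sketch of the kind of mm-internal call site it seems intended for. The helper name and the threshold check are illustrative, not from the patch.

/* Illustrative only: report unreclaimable slab usage when an OOM event
 * suggests that slab, rather than user memory, is consuming the system. */
static void demo_oom_slab_report(unsigned long nr_lru_pages)
{
	if (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru_pages)
		dump_unreclaimable_slab();	/* trylocks slab_mutex internally */
}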
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -330,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
330 | BUG_ON(!b); | 330 | BUG_ON(!b); |
331 | spin_unlock_irqrestore(&slob_lock, flags); | 331 | spin_unlock_irqrestore(&slob_lock, flags); |
332 | } | 332 | } |
333 | if (unlikely((gfp & __GFP_ZERO) && b)) | 333 | if (unlikely(gfp & __GFP_ZERO)) |
334 | memset(b, 0, size); | 334 | memset(b, 0, size); |
335 | return b; | 335 | return b; |
336 | } | 336 | } |
@@ -524,7 +524,7 @@ size_t ksize(const void *block) | |||
524 | } | 524 | } |
525 | EXPORT_SYMBOL(ksize); | 525 | EXPORT_SYMBOL(ksize); |
526 | 526 | ||
527 | int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) | 527 | int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) |
528 | { | 528 | { |
529 | if (flags & SLAB_TYPESAFE_BY_RCU) { | 529 | if (flags & SLAB_TYPESAFE_BY_RCU) { |
530 | /* leave room for rcu footer at the end of object */ | 530 | /* leave room for rcu footer at the end of object */ |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -22,7 +22,6 @@ | |||
22 | #include <linux/notifier.h> | 22 | #include <linux/notifier.h> |
23 | #include <linux/seq_file.h> | 23 | #include <linux/seq_file.h> |
24 | #include <linux/kasan.h> | 24 | #include <linux/kasan.h> |
25 | #include <linux/kmemcheck.h> | ||
26 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
27 | #include <linux/cpuset.h> | 26 | #include <linux/cpuset.h> |
28 | #include <linux/mempolicy.h> | 27 | #include <linux/mempolicy.h> |
@@ -193,8 +192,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) | |||
193 | #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ | 192 | #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ |
194 | 193 | ||
195 | /* Internal SLUB flags */ | 194 | /* Internal SLUB flags */ |
196 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ | 195 | /* Poison object */ |
197 | #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ | 196 | #define __OBJECT_POISON ((slab_flags_t __force)0x80000000U) |
197 | /* Use cmpxchg_double */ | ||
198 | #define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U) | ||
198 | 199 | ||
199 | /* | 200 | /* |
200 | * Tracking user of a slab. | 201 | * Tracking user of a slab. |
@@ -485,9 +486,9 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) | |||
485 | * Debug settings: | 486 | * Debug settings: |
486 | */ | 487 | */ |
487 | #if defined(CONFIG_SLUB_DEBUG_ON) | 488 | #if defined(CONFIG_SLUB_DEBUG_ON) |
488 | static int slub_debug = DEBUG_DEFAULT_FLAGS; | 489 | static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; |
489 | #else | 490 | #else |
490 | static int slub_debug; | 491 | static slab_flags_t slub_debug; |
491 | #endif | 492 | #endif |
492 | 493 | ||
493 | static char *slub_debug_slabs; | 494 | static char *slub_debug_slabs; |
@@ -1289,8 +1290,8 @@ out: | |||
1289 | 1290 | ||
1290 | __setup("slub_debug", setup_slub_debug); | 1291 | __setup("slub_debug", setup_slub_debug); |
1291 | 1292 | ||
1292 | unsigned long kmem_cache_flags(unsigned long object_size, | 1293 | slab_flags_t kmem_cache_flags(unsigned long object_size, |
1293 | unsigned long flags, const char *name, | 1294 | slab_flags_t flags, const char *name, |
1294 | void (*ctor)(void *)) | 1295 | void (*ctor)(void *)) |
1295 | { | 1296 | { |
1296 | /* | 1297 | /* |
@@ -1322,8 +1323,8 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, | |||
1322 | struct page *page) {} | 1323 | struct page *page) {} |
1323 | static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, | 1324 | static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, |
1324 | struct page *page) {} | 1325 | struct page *page) {} |
1325 | unsigned long kmem_cache_flags(unsigned long object_size, | 1326 | slab_flags_t kmem_cache_flags(unsigned long object_size, |
1326 | unsigned long flags, const char *name, | 1327 | slab_flags_t flags, const char *name, |
1327 | void (*ctor)(void *)) | 1328 | void (*ctor)(void *)) |
1328 | { | 1329 | { |
1329 | return flags; | 1330 | return flags; |
@@ -1370,12 +1371,11 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x) | |||
1370 | * So in order to make the debug calls that expect irqs to be | 1371 | * So in order to make the debug calls that expect irqs to be |
1371 | * disabled we need to disable interrupts temporarily. | 1372 | * disabled we need to disable interrupts temporarily. |
1372 | */ | 1373 | */ |
1373 | #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) | 1374 | #ifdef CONFIG_LOCKDEP |
1374 | { | 1375 | { |
1375 | unsigned long flags; | 1376 | unsigned long flags; |
1376 | 1377 | ||
1377 | local_irq_save(flags); | 1378 | local_irq_save(flags); |
1378 | kmemcheck_slab_free(s, x, s->object_size); | ||
1379 | debug_check_no_locks_freed(x, s->object_size); | 1379 | debug_check_no_locks_freed(x, s->object_size); |
1380 | local_irq_restore(flags); | 1380 | local_irq_restore(flags); |
1381 | } | 1381 | } |
@@ -1399,8 +1399,7 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, | |||
1399 | * Compiler cannot detect this function can be removed if slab_free_hook() | 1399 | * Compiler cannot detect this function can be removed if slab_free_hook() |
1400 | * evaluates to nothing. Thus, catch all relevant config debug options here. | 1400 | * evaluates to nothing. Thus, catch all relevant config debug options here. |
1401 | */ | 1401 | */ |
1402 | #if defined(CONFIG_KMEMCHECK) || \ | 1402 | #if defined(CONFIG_LOCKDEP) || \ |
1403 | defined(CONFIG_LOCKDEP) || \ | ||
1404 | defined(CONFIG_DEBUG_KMEMLEAK) || \ | 1403 | defined(CONFIG_DEBUG_KMEMLEAK) || \ |
1405 | defined(CONFIG_DEBUG_OBJECTS_FREE) || \ | 1404 | defined(CONFIG_DEBUG_OBJECTS_FREE) || \ |
1406 | defined(CONFIG_KASAN) | 1405 | defined(CONFIG_KASAN) |
@@ -1436,8 +1435,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, | |||
1436 | struct page *page; | 1435 | struct page *page; |
1437 | int order = oo_order(oo); | 1436 | int order = oo_order(oo); |
1438 | 1437 | ||
1439 | flags |= __GFP_NOTRACK; | ||
1440 | |||
1441 | if (node == NUMA_NO_NODE) | 1438 | if (node == NUMA_NO_NODE) |
1442 | page = alloc_pages(flags, order); | 1439 | page = alloc_pages(flags, order); |
1443 | else | 1440 | else |
@@ -1596,22 +1593,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1596 | stat(s, ORDER_FALLBACK); | 1593 | stat(s, ORDER_FALLBACK); |
1597 | } | 1594 | } |
1598 | 1595 | ||
1599 | if (kmemcheck_enabled && | ||
1600 | !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { | ||
1601 | int pages = 1 << oo_order(oo); | ||
1602 | |||
1603 | kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); | ||
1604 | |||
1605 | /* | ||
1606 | * Objects from caches that have a constructor don't get | ||
1607 | * cleared when they're allocated, so we need to do it here. | ||
1608 | */ | ||
1609 | if (s->ctor) | ||
1610 | kmemcheck_mark_uninitialized_pages(page, pages); | ||
1611 | else | ||
1612 | kmemcheck_mark_unallocated_pages(page, pages); | ||
1613 | } | ||
1614 | |||
1615 | page->objects = oo_objects(oo); | 1596 | page->objects = oo_objects(oo); |
1616 | 1597 | ||
1617 | order = compound_order(page); | 1598 | order = compound_order(page); |
@@ -1687,8 +1668,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1687 | check_object(s, page, p, SLUB_RED_INACTIVE); | 1668 | check_object(s, page, p, SLUB_RED_INACTIVE); |
1688 | } | 1669 | } |
1689 | 1670 | ||
1690 | kmemcheck_free_shadow(page, compound_order(page)); | ||
1691 | |||
1692 | mod_lruvec_page_state(page, | 1671 | mod_lruvec_page_state(page, |
1693 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1672 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
1694 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1673 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
@@ -3477,7 +3456,7 @@ static void set_cpu_partial(struct kmem_cache *s) | |||
3477 | */ | 3456 | */ |
3478 | static int calculate_sizes(struct kmem_cache *s, int forced_order) | 3457 | static int calculate_sizes(struct kmem_cache *s, int forced_order) |
3479 | { | 3458 | { |
3480 | unsigned long flags = s->flags; | 3459 | slab_flags_t flags = s->flags; |
3481 | size_t size = s->object_size; | 3460 | size_t size = s->object_size; |
3482 | int order; | 3461 | int order; |
3483 | 3462 | ||
@@ -3593,7 +3572,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3593 | return !!oo_objects(s->oo); | 3572 | return !!oo_objects(s->oo); |
3594 | } | 3573 | } |
3595 | 3574 | ||
3596 | static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) | 3575 | static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) |
3597 | { | 3576 | { |
3598 | s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); | 3577 | s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); |
3599 | s->reserved = 0; | 3578 | s->reserved = 0; |
@@ -3655,7 +3634,7 @@ error: | |||
3655 | if (flags & SLAB_PANIC) | 3634 | if (flags & SLAB_PANIC) |
3656 | panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", | 3635 | panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", |
3657 | s->name, (unsigned long)s->size, s->size, | 3636 | s->name, (unsigned long)s->size, s->size, |
3658 | oo_order(s->oo), s->offset, flags); | 3637 | oo_order(s->oo), s->offset, (unsigned long)flags); |
3659 | return -EINVAL; | 3638 | return -EINVAL; |
3660 | } | 3639 | } |
3661 | 3640 | ||
@@ -3792,7 +3771,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | |||
3792 | struct page *page; | 3771 | struct page *page; |
3793 | void *ptr = NULL; | 3772 | void *ptr = NULL; |
3794 | 3773 | ||
3795 | flags |= __GFP_COMP | __GFP_NOTRACK; | 3774 | flags |= __GFP_COMP; |
3796 | page = alloc_pages_node(node, flags, get_order(size)); | 3775 | page = alloc_pages_node(node, flags, get_order(size)); |
3797 | if (page) | 3776 | if (page) |
3798 | ptr = page_address(page); | 3777 | ptr = page_address(page); |
@@ -4245,7 +4224,7 @@ void __init kmem_cache_init_late(void) | |||
4245 | 4224 | ||
4246 | struct kmem_cache * | 4225 | struct kmem_cache * |
4247 | __kmem_cache_alias(const char *name, size_t size, size_t align, | 4226 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
4248 | unsigned long flags, void (*ctor)(void *)) | 4227 | slab_flags_t flags, void (*ctor)(void *)) |
4249 | { | 4228 | { |
4250 | struct kmem_cache *s, *c; | 4229 | struct kmem_cache *s, *c; |
4251 | 4230 | ||
@@ -4275,7 +4254,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, | |||
4275 | return s; | 4254 | return s; |
4276 | } | 4255 | } |
4277 | 4256 | ||
4278 | int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) | 4257 | int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) |
4279 | { | 4258 | { |
4280 | int err; | 4259 | int err; |
4281 | 4260 | ||
@@ -5655,8 +5634,6 @@ static char *create_unique_id(struct kmem_cache *s) | |||
5655 | *p++ = 'a'; | 5634 | *p++ = 'a'; |
5656 | if (s->flags & SLAB_CONSISTENCY_CHECKS) | 5635 | if (s->flags & SLAB_CONSISTENCY_CHECKS) |
5657 | *p++ = 'F'; | 5636 | *p++ = 'F'; |
5658 | if (!(s->flags & SLAB_NOTRACK)) | ||
5659 | *p++ = 't'; | ||
5660 | if (s->flags & SLAB_ACCOUNT) | 5637 | if (s->flags & SLAB_ACCOUNT) |
5661 | *p++ = 'A'; | 5638 | *p++ = 'A'; |
5662 | if (p != name + 1) | 5639 | if (p != name + 1) |
@@ -5704,6 +5681,10 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
5704 | return 0; | 5681 | return 0; |
5705 | } | 5682 | } |
5706 | 5683 | ||
5684 | if (!unmergeable && disable_higher_order_debug && | ||
5685 | (slub_debug & DEBUG_METADATA_FLAGS)) | ||
5686 | unmergeable = 1; | ||
5687 | |||
5707 | if (unmergeable) { | 5688 | if (unmergeable) { |
5708 | /* | 5689 | /* |
5709 | * Slabcache can never be merged so we can use the name proper. | 5690 | * Slabcache can never be merged so we can use the name proper. |
@@ -5852,7 +5833,7 @@ __initcall(slab_sysfs_init); | |||
5852 | /* | 5833 | /* |
5853 | * The /proc/slabinfo ABI | 5834 | * The /proc/slabinfo ABI |
5854 | */ | 5835 | */ |
5855 | #ifdef CONFIG_SLABINFO | 5836 | #ifdef CONFIG_SLUB_DEBUG |
5856 | void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) | 5837 | void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) |
5857 | { | 5838 | { |
5858 | unsigned long nr_slabs = 0; | 5839 | unsigned long nr_slabs = 0; |
@@ -5884,4 +5865,4 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
5884 | { | 5865 | { |
5885 | return -EIO; | 5866 | return -EIO; |
5886 | } | 5867 | } |
5887 | #endif /* CONFIG_SLABINFO */ | 5868 | #endif /* CONFIG_SLUB_DEBUG */ |
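The __OBJECT_POISON/__CMPXCHG_DOUBLE rewrite near the top of the mm/slub.c hunks shows what the conversion looks like for SLUB's internal flags; assuming slab_flags_t is a sparse __bitwise type (see the sketch after the mm/slab.h hunks), a short illustration of why the __force casts are needed. This fragment is written as if it lived inside mm/slub.c and is not part of the patch.

static slab_flags_t demo_internal_flag(void)
{
	slab_flags_t f;

	f = (slab_flags_t __force)0x40000000U;	/* OK: explicit __force cast */
	/* f = 0x40000000U; */			/* would now draw a sparse warning */
	return f | __CMPXCHG_DOUBLE;		/* typed values combine as before */
}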
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 478ce6d4a2c4..17acf01791fa 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -42,7 +42,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node, | |||
42 | unsigned long align, | 42 | unsigned long align, |
43 | unsigned long goal) | 43 | unsigned long goal) |
44 | { | 44 | { |
45 | return memblock_virt_alloc_try_nid(size, align, goal, | 45 | return memblock_virt_alloc_try_nid_raw(size, align, goal, |
46 | BOOTMEM_ALLOC_ACCESSIBLE, node); | 46 | BOOTMEM_ALLOC_ACCESSIBLE, node); |
47 | } | 47 | } |
48 | 48 | ||
@@ -53,13 +53,20 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) | |||
53 | { | 53 | { |
54 | /* If the main allocator is up use that, fallback to bootmem. */ | 54 | /* If the main allocator is up use that, fallback to bootmem. */ |
55 | if (slab_is_available()) { | 55 | if (slab_is_available()) { |
56 | gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; | ||
57 | int order = get_order(size); | ||
58 | static bool warned; | ||
56 | struct page *page; | 59 | struct page *page; |
57 | 60 | ||
58 | page = alloc_pages_node(node, | 61 | page = alloc_pages_node(node, gfp_mask, order); |
59 | GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, | ||
60 | get_order(size)); | ||
61 | if (page) | 62 | if (page) |
62 | return page_address(page); | 63 | return page_address(page); |
64 | |||
65 | if (!warned) { | ||
66 | warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL, | ||
67 | "vmemmap alloc failure: order:%u", order); | ||
68 | warned = true; | ||
69 | } | ||
63 | return NULL; | 70 | return NULL; |
64 | } else | 71 | } else |
65 | return __earlyonly_bootmem_alloc(node, size, size, | 72 | return __earlyonly_bootmem_alloc(node, size, size, |
@@ -180,11 +187,22 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) | |||
180 | return pte; | 187 | return pte; |
181 | } | 188 | } |
182 | 189 | ||
190 | static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node) | ||
191 | { | ||
192 | void *p = vmemmap_alloc_block(size, node); | ||
193 | |||
194 | if (!p) | ||
195 | return NULL; | ||
196 | memset(p, 0, size); | ||
197 | |||
198 | return p; | ||
199 | } | ||
200 | |||
183 | pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) | 201 | pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) |
184 | { | 202 | { |
185 | pmd_t *pmd = pmd_offset(pud, addr); | 203 | pmd_t *pmd = pmd_offset(pud, addr); |
186 | if (pmd_none(*pmd)) { | 204 | if (pmd_none(*pmd)) { |
187 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | 205 | void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); |
188 | if (!p) | 206 | if (!p) |
189 | return NULL; | 207 | return NULL; |
190 | pmd_populate_kernel(&init_mm, pmd, p); | 208 | pmd_populate_kernel(&init_mm, pmd, p); |
@@ -196,7 +214,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) | |||
196 | { | 214 | { |
197 | pud_t *pud = pud_offset(p4d, addr); | 215 | pud_t *pud = pud_offset(p4d, addr); |
198 | if (pud_none(*pud)) { | 216 | if (pud_none(*pud)) { |
199 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | 217 | void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); |
200 | if (!p) | 218 | if (!p) |
201 | return NULL; | 219 | return NULL; |
202 | pud_populate(&init_mm, pud, p); | 220 | pud_populate(&init_mm, pud, p); |
@@ -208,7 +226,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) | |||
208 | { | 226 | { |
209 | p4d_t *p4d = p4d_offset(pgd, addr); | 227 | p4d_t *p4d = p4d_offset(pgd, addr); |
210 | if (p4d_none(*p4d)) { | 228 | if (p4d_none(*p4d)) { |
211 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | 229 | void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); |
212 | if (!p) | 230 | if (!p) |
213 | return NULL; | 231 | return NULL; |
214 | p4d_populate(&init_mm, p4d, p); | 232 | p4d_populate(&init_mm, p4d, p); |
@@ -220,7 +238,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) | |||
220 | { | 238 | { |
221 | pgd_t *pgd = pgd_offset_k(addr); | 239 | pgd_t *pgd = pgd_offset_k(addr); |
222 | if (pgd_none(*pgd)) { | 240 | if (pgd_none(*pgd)) { |
223 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | 241 | void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); |
224 | if (!p) | 242 | if (!p) |
225 | return NULL; | 243 | return NULL; |
226 | pgd_populate(&init_mm, pgd, p); | 244 | pgd_populate(&init_mm, pgd, p); |
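The switch to the _raw memblock allocator plus the new vmemmap_alloc_block_zero() helper separates allocations the caller fully initializes from page-table pages that must start out zeroed. A small sketch of the resulting pattern, written as if inside mm/sparse-vmemmap.c; map_size and nid are assumed to come from the caller, and the _raw variant skipping the allocator's memset is inferred from how it is used here.

/* Backing store whose every byte the caller will write anyway: take the
 * _raw variant and avoid a second pass over the memory. */
void *map = memblock_virt_alloc_try_nid_raw(map_size, PAGE_SIZE,
		__pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);

/* A page-table page, by contrast, must be zeroed before use, which is
 * exactly what vmemmap_alloc_block_zero() now provides. */
void *pt = vmemmap_alloc_block_zero(PAGE_SIZE, nid);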
diff --git a/mm/sparse.c b/mm/sparse.c index 60805abf98af..7a5dacaa06e3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -453,9 +453,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, | |||
453 | } | 453 | } |
454 | 454 | ||
455 | size = PAGE_ALIGN(size); | 455 | size = PAGE_ALIGN(size); |
456 | map = memblock_virt_alloc_try_nid(size * map_count, | 456 | map = memblock_virt_alloc_try_nid_raw(size * map_count, |
457 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS), | 457 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
458 | BOOTMEM_ALLOC_ACCESSIBLE, nodeid); | 458 | BOOTMEM_ALLOC_ACCESSIBLE, nodeid); |
459 | if (map) { | 459 | if (map) { |
460 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 460 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
461 | if (!present_section_nr(pnum)) | 461 | if (!present_section_nr(pnum)) |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -76,7 +76,7 @@ static void __page_cache_release(struct page *page) | |||
76 | static void __put_single_page(struct page *page) | 76 | static void __put_single_page(struct page *page) |
77 | { | 77 | { |
78 | __page_cache_release(page); | 78 | __page_cache_release(page); |
79 | free_hot_cold_page(page, false); | 79 | free_unref_page(page); |
80 | } | 80 | } |
81 | 81 | ||
82 | static void __put_compound_page(struct page *page) | 82 | static void __put_compound_page(struct page *page) |
@@ -210,7 +210,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, | |||
210 | } | 210 | } |
211 | if (pgdat) | 211 | if (pgdat) |
212 | spin_unlock_irqrestore(&pgdat->lru_lock, flags); | 212 | spin_unlock_irqrestore(&pgdat->lru_lock, flags); |
213 | release_pages(pvec->pages, pvec->nr, pvec->cold); | 213 | release_pages(pvec->pages, pvec->nr); |
214 | pagevec_reinit(pvec); | 214 | pagevec_reinit(pvec); |
215 | } | 215 | } |
216 | 216 | ||
@@ -740,7 +740,7 @@ void lru_add_drain_all(void) | |||
740 | * Decrement the reference count on all the pages in @pages. If it | 740 | * Decrement the reference count on all the pages in @pages. If it |
741 | * fell to zero, remove the page from the LRU and free it. | 741 | * fell to zero, remove the page from the LRU and free it. |
742 | */ | 742 | */ |
743 | void release_pages(struct page **pages, int nr, bool cold) | 743 | void release_pages(struct page **pages, int nr) |
744 | { | 744 | { |
745 | int i; | 745 | int i; |
746 | LIST_HEAD(pages_to_free); | 746 | LIST_HEAD(pages_to_free); |
@@ -817,7 +817,7 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
817 | spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); | 817 | spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); |
818 | 818 | ||
819 | mem_cgroup_uncharge_list(&pages_to_free); | 819 | mem_cgroup_uncharge_list(&pages_to_free); |
820 | free_hot_cold_page_list(&pages_to_free, cold); | 820 | free_unref_page_list(&pages_to_free); |
821 | } | 821 | } |
822 | EXPORT_SYMBOL(release_pages); | 822 | EXPORT_SYMBOL(release_pages); |
823 | 823 | ||
@@ -833,8 +833,11 @@ EXPORT_SYMBOL(release_pages); | |||
833 | */ | 833 | */ |
834 | void __pagevec_release(struct pagevec *pvec) | 834 | void __pagevec_release(struct pagevec *pvec) |
835 | { | 835 | { |
836 | lru_add_drain(); | 836 | if (!pvec->percpu_pvec_drained) { |
837 | release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); | 837 | lru_add_drain(); |
838 | pvec->percpu_pvec_drained = true; | ||
839 | } | ||
840 | release_pages(pvec->pages, pagevec_count(pvec)); | ||
838 | pagevec_reinit(pvec); | 841 | pagevec_reinit(pvec); |
839 | } | 842 | } |
840 | EXPORT_SYMBOL(__pagevec_release); | 843 | EXPORT_SYMBOL(__pagevec_release); |
@@ -986,15 +989,25 @@ unsigned pagevec_lookup_range(struct pagevec *pvec, | |||
986 | } | 989 | } |
987 | EXPORT_SYMBOL(pagevec_lookup_range); | 990 | EXPORT_SYMBOL(pagevec_lookup_range); |
988 | 991 | ||
989 | unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, | 992 | unsigned pagevec_lookup_range_tag(struct pagevec *pvec, |
990 | pgoff_t *index, int tag, unsigned nr_pages) | 993 | struct address_space *mapping, pgoff_t *index, pgoff_t end, |
994 | int tag) | ||
991 | { | 995 | { |
992 | pvec->nr = find_get_pages_tag(mapping, index, tag, | 996 | pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, |
993 | nr_pages, pvec->pages); | 997 | PAGEVEC_SIZE, pvec->pages); |
994 | return pagevec_count(pvec); | 998 | return pagevec_count(pvec); |
995 | } | 999 | } |
996 | EXPORT_SYMBOL(pagevec_lookup_tag); | 1000 | EXPORT_SYMBOL(pagevec_lookup_range_tag); |
997 | 1001 | ||
1002 | unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, | ||
1003 | struct address_space *mapping, pgoff_t *index, pgoff_t end, | ||
1004 | int tag, unsigned max_pages) | ||
1005 | { | ||
1006 | pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, | ||
1007 | min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); | ||
1008 | return pagevec_count(pvec); | ||
1009 | } | ||
1010 | EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); | ||
998 | /* | 1011 | /* |
999 | * Perform any setup for the swap system | 1012 | * Perform any setup for the swap system |
1000 | */ | 1013 | */ |
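A hedged sketch of a typical caller after the pagevec API changes above: pagevec_init() loses its cold argument and the tag lookup becomes range-limited, so writeback-style loops no longer need their own past-the-end checks. The function below is illustrative (linux/pagevec.h and linux/pagemap.h provide the declarations); mapping, start and end are assumed to come from the caller.

static void demo_walk_dirty_range(struct address_space *mapping,
				  pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t index = start;
	int i;

	pagevec_init(&pvec);	/* no "cold" argument any more */
	while (pagevec_lookup_range_tag(&pvec, mapping, &index, end,
					PAGECACHE_TAG_DIRTY)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			/* operate on pvec.pages[i]; every page returned
			 * lies within [start, end] */
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}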
diff --git a/mm/swap_slots.c b/mm/swap_slots.c index d81cfc5a43d5..bebc19292018 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c | |||
@@ -149,6 +149,13 @@ static int alloc_swap_slot_cache(unsigned int cpu) | |||
149 | cache->nr = 0; | 149 | cache->nr = 0; |
150 | cache->cur = 0; | 150 | cache->cur = 0; |
151 | cache->n_ret = 0; | 151 | cache->n_ret = 0; |
152 | /* | ||
153 | * We initialized alloc_lock and free_lock earlier. We use | ||
154 | * !cache->slots or !cache->slots_ret to know if it is safe to acquire | ||
155 | * the corresponding lock and use the cache. Memory barrier below | ||
156 | * ensures the assumption. | ||
157 | */ | ||
158 | mb(); | ||
152 | cache->slots = slots; | 159 | cache->slots = slots; |
153 | slots = NULL; | 160 | slots = NULL; |
154 | cache->slots_ret = slots_ret; | 161 | cache->slots_ret = slots_ret; |
@@ -275,7 +282,7 @@ int free_swap_slot(swp_entry_t entry) | |||
275 | struct swap_slots_cache *cache; | 282 | struct swap_slots_cache *cache; |
276 | 283 | ||
277 | cache = raw_cpu_ptr(&swp_slots); | 284 | cache = raw_cpu_ptr(&swp_slots); |
278 | if (use_swap_slot_cache && cache->slots_ret) { | 285 | if (likely(use_swap_slot_cache && cache->slots_ret)) { |
279 | spin_lock_irq(&cache->free_lock); | 286 | spin_lock_irq(&cache->free_lock); |
280 | /* Swap slots cache may be deactivated before acquiring lock */ | 287 | /* Swap slots cache may be deactivated before acquiring lock */ |
281 | if (!use_swap_slot_cache || !cache->slots_ret) { | 288 | if (!use_swap_slot_cache || !cache->slots_ret) { |
@@ -326,7 +333,7 @@ swp_entry_t get_swap_page(struct page *page) | |||
326 | */ | 333 | */ |
327 | cache = raw_cpu_ptr(&swp_slots); | 334 | cache = raw_cpu_ptr(&swp_slots); |
328 | 335 | ||
329 | if (check_cache_active()) { | 336 | if (likely(check_cache_active() && cache->slots)) { |
330 | mutex_lock(&cache->alloc_lock); | 337 | mutex_lock(&cache->alloc_lock); |
331 | if (cache->slots) { | 338 | if (cache->slots) { |
332 | repeat: | 339 | repeat: |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 326439428daf..39ae7cfad90f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -36,9 +36,9 @@ static const struct address_space_operations swap_aops = { | |||
36 | #endif | 36 | #endif |
37 | }; | 37 | }; |
38 | 38 | ||
39 | struct address_space *swapper_spaces[MAX_SWAPFILES]; | 39 | struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; |
40 | static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; | 40 | static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; |
41 | bool swap_vma_readahead = true; | 41 | bool swap_vma_readahead __read_mostly = true; |
42 | 42 | ||
43 | #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) | 43 | #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) |
44 | #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) | 44 | #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) |
@@ -319,7 +319,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) | |||
319 | lru_add_drain(); | 319 | lru_add_drain(); |
320 | for (i = 0; i < nr; i++) | 320 | for (i = 0; i < nr; i++) |
321 | free_swap_cache(pagep[i]); | 321 | free_swap_cache(pagep[i]); |
322 | release_pages(pagep, nr, false); | 322 | release_pages(pagep, nr); |
323 | } | 323 | } |
324 | 324 | ||
325 | /* | 325 | /* |
@@ -559,6 +559,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
559 | unsigned long offset = entry_offset; | 559 | unsigned long offset = entry_offset; |
560 | unsigned long start_offset, end_offset; | 560 | unsigned long start_offset, end_offset; |
561 | unsigned long mask; | 561 | unsigned long mask; |
562 | struct swap_info_struct *si = swp_swap_info(entry); | ||
562 | struct blk_plug plug; | 563 | struct blk_plug plug; |
563 | bool do_poll = true, page_allocated; | 564 | bool do_poll = true, page_allocated; |
564 | 565 | ||
@@ -572,6 +573,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
572 | end_offset = offset | mask; | 573 | end_offset = offset | mask; |
573 | if (!start_offset) /* First page is swap header. */ | 574 | if (!start_offset) /* First page is swap header. */ |
574 | start_offset++; | 575 | start_offset++; |
576 | if (end_offset >= si->max) | ||
577 | end_offset = si->max - 1; | ||
575 | 578 | ||
576 | blk_start_plug(&plug); | 579 | blk_start_plug(&plug); |
577 | for (offset = start_offset; offset <= end_offset ; offset++) { | 580 | for (offset = start_offset; offset <= end_offset ; offset++) { |
diff --git a/mm/swapfile.c b/mm/swapfile.c index e47a21e64764..3074b02eaa09 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1328,6 +1328,13 @@ int page_swapcount(struct page *page) | |||
1328 | return count; | 1328 | return count; |
1329 | } | 1329 | } |
1330 | 1330 | ||
1331 | int __swap_count(struct swap_info_struct *si, swp_entry_t entry) | ||
1332 | { | ||
1333 | pgoff_t offset = swp_offset(entry); | ||
1334 | |||
1335 | return swap_count(si->swap_map[offset]); | ||
1336 | } | ||
1337 | |||
1331 | static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) | 1338 | static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) |
1332 | { | 1339 | { |
1333 | int count = 0; | 1340 | int count = 0; |
@@ -3169,6 +3176,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
3169 | if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) | 3176 | if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) |
3170 | p->flags |= SWP_STABLE_WRITES; | 3177 | p->flags |= SWP_STABLE_WRITES; |
3171 | 3178 | ||
3179 | if (bdi_cap_synchronous_io(inode_to_bdi(inode))) | ||
3180 | p->flags |= SWP_SYNCHRONOUS_IO; | ||
3181 | |||
3172 | if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 3182 | if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
3173 | int cpu; | 3183 | int cpu; |
3174 | unsigned long ci, nr_cluster; | 3184 | unsigned long ci, nr_cluster; |
@@ -3452,10 +3462,15 @@ int swapcache_prepare(swp_entry_t entry) | |||
3452 | return __swap_duplicate(entry, SWAP_HAS_CACHE); | 3462 | return __swap_duplicate(entry, SWAP_HAS_CACHE); |
3453 | } | 3463 | } |
3454 | 3464 | ||
3465 | struct swap_info_struct *swp_swap_info(swp_entry_t entry) | ||
3466 | { | ||
3467 | return swap_info[swp_type(entry)]; | ||
3468 | } | ||
3469 | |||
3455 | struct swap_info_struct *page_swap_info(struct page *page) | 3470 | struct swap_info_struct *page_swap_info(struct page *page) |
3456 | { | 3471 | { |
3457 | swp_entry_t swap = { .val = page_private(page) }; | 3472 | swp_entry_t entry = { .val = page_private(page) }; |
3458 | return swap_info[swp_type(swap)]; | 3473 | return swp_swap_info(entry); |
3459 | } | 3474 | } |
3460 | 3475 | ||
3461 | /* | 3476 | /* |
@@ -3463,7 +3478,6 @@ struct swap_info_struct *page_swap_info(struct page *page) | |||
3463 | */ | 3478 | */ |
3464 | struct address_space *__page_file_mapping(struct page *page) | 3479 | struct address_space *__page_file_mapping(struct page *page) |
3465 | { | 3480 | { |
3466 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); | ||
3467 | return page_swap_info(page)->swap_file->f_mapping; | 3481 | return page_swap_info(page)->swap_file->f_mapping; |
3468 | } | 3482 | } |
3469 | EXPORT_SYMBOL_GPL(__page_file_mapping); | 3483 | EXPORT_SYMBOL_GPL(__page_file_mapping); |
@@ -3471,7 +3485,6 @@ EXPORT_SYMBOL_GPL(__page_file_mapping); | |||
3471 | pgoff_t __page_file_index(struct page *page) | 3485 | pgoff_t __page_file_index(struct page *page) |
3472 | { | 3486 | { |
3473 | swp_entry_t swap = { .val = page_private(page) }; | 3487 | swp_entry_t swap = { .val = page_private(page) }; |
3474 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); | ||
3475 | return swp_offset(swap); | 3488 | return swp_offset(swap); |
3476 | } | 3489 | } |
3477 | EXPORT_SYMBOL_GPL(__page_file_index); | 3490 | EXPORT_SYMBOL_GPL(__page_file_index); |
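The new swp_swap_info(), __swap_count() and SWP_SYNCHRONOUS_IO flag above look like building blocks for a swap-in fast path added elsewhere in this series; a hedged sketch of how they presumably compose, with vmf standing in for the (assumed) page-fault context.

swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
struct swap_info_struct *si = swp_swap_info(entry);

if ((si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(si, entry) == 1) {
	/* A single-user entry on a synchronous swap device: a caller
	 * could read the page directly, skipping the swap cache and the
	 * readahead machinery entirely. */
}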
diff --git a/mm/truncate.c b/mm/truncate.c index 2330223841fb..e4b4cf0f4070 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -25,44 +25,85 @@ | |||
25 | #include <linux/rmap.h> | 25 | #include <linux/rmap.h> |
26 | #include "internal.h" | 26 | #include "internal.h" |
27 | 27 | ||
28 | static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, | 28 | /* |
29 | void *entry) | 29 | * Regular page slots are stabilized by the page lock even without the tree |
30 | * itself locked. These unlocked entries need verification under the tree | ||
31 | * lock. | ||
32 | */ | ||
33 | static inline void __clear_shadow_entry(struct address_space *mapping, | ||
34 | pgoff_t index, void *entry) | ||
30 | { | 35 | { |
31 | struct radix_tree_node *node; | 36 | struct radix_tree_node *node; |
32 | void **slot; | 37 | void **slot; |
33 | 38 | ||
34 | spin_lock_irq(&mapping->tree_lock); | ||
35 | /* | ||
36 | * Regular page slots are stabilized by the page lock even | ||
37 | * without the tree itself locked. These unlocked entries | ||
38 | * need verification under the tree lock. | ||
39 | */ | ||
40 | if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) | 39 | if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) |
41 | goto unlock; | 40 | return; |
42 | if (*slot != entry) | 41 | if (*slot != entry) |
43 | goto unlock; | 42 | return; |
44 | __radix_tree_replace(&mapping->page_tree, node, slot, NULL, | 43 | __radix_tree_replace(&mapping->page_tree, node, slot, NULL, |
45 | workingset_update_node, mapping); | 44 | workingset_update_node); |
46 | mapping->nrexceptional--; | 45 | mapping->nrexceptional--; |
47 | unlock: | 46 | } |
47 | |||
48 | static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, | ||
49 | void *entry) | ||
50 | { | ||
51 | spin_lock_irq(&mapping->tree_lock); | ||
52 | __clear_shadow_entry(mapping, index, entry); | ||
48 | spin_unlock_irq(&mapping->tree_lock); | 53 | spin_unlock_irq(&mapping->tree_lock); |
49 | } | 54 | } |
50 | 55 | ||
51 | /* | 56 | /* |
52 | * Unconditionally remove exceptional entry. Usually called from truncate path. | 57 | * Unconditionally remove exceptional entries. Usually called from truncate |
58 | * path. Note that the pagevec may be altered by this function by removing | ||
59 | * exceptional entries similar to what pagevec_remove_exceptionals does. | ||
53 | */ | 60 | */ |
54 | static void truncate_exceptional_entry(struct address_space *mapping, | 61 | static void truncate_exceptional_pvec_entries(struct address_space *mapping, |
55 | pgoff_t index, void *entry) | 62 | struct pagevec *pvec, pgoff_t *indices, |
63 | pgoff_t end) | ||
56 | { | 64 | { |
65 | int i, j; | ||
66 | bool dax, lock; | ||
67 | |||
57 | /* Handled by shmem itself */ | 68 | /* Handled by shmem itself */ |
58 | if (shmem_mapping(mapping)) | 69 | if (shmem_mapping(mapping)) |
59 | return; | 70 | return; |
60 | 71 | ||
61 | if (dax_mapping(mapping)) { | 72 | for (j = 0; j < pagevec_count(pvec); j++) |
62 | dax_delete_mapping_entry(mapping, index); | 73 | if (radix_tree_exceptional_entry(pvec->pages[j])) |
74 | break; | ||
75 | |||
76 | if (j == pagevec_count(pvec)) | ||
63 | return; | 77 | return; |
78 | |||
79 | dax = dax_mapping(mapping); | ||
80 | lock = !dax && indices[j] < end; | ||
81 | if (lock) | ||
82 | spin_lock_irq(&mapping->tree_lock); | ||
83 | |||
84 | for (i = j; i < pagevec_count(pvec); i++) { | ||
85 | struct page *page = pvec->pages[i]; | ||
86 | pgoff_t index = indices[i]; | ||
87 | |||
88 | if (!radix_tree_exceptional_entry(page)) { | ||
89 | pvec->pages[j++] = page; | ||
90 | continue; | ||
91 | } | ||
92 | |||
93 | if (index >= end) | ||
94 | continue; | ||
95 | |||
96 | if (unlikely(dax)) { | ||
97 | dax_delete_mapping_entry(mapping, index); | ||
98 | continue; | ||
99 | } | ||
100 | |||
101 | __clear_shadow_entry(mapping, index, page); | ||
64 | } | 102 | } |
65 | clear_shadow_entry(mapping, index, entry); | 103 | |
104 | if (lock) | ||
105 | spin_unlock_irq(&mapping->tree_lock); | ||
106 | pvec->nr = j; | ||
66 | } | 107 | } |
67 | 108 | ||
68 | /* | 109 | /* |
@@ -134,11 +175,17 @@ void do_invalidatepage(struct page *page, unsigned int offset, | |||
134 | * its lock, b) when a concurrent invalidate_mapping_pages got there first and | 175 | * its lock, b) when a concurrent invalidate_mapping_pages got there first and |
135 | * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. | 176 | * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. |
136 | */ | 177 | */ |
137 | static int | 178 | static void |
138 | truncate_complete_page(struct address_space *mapping, struct page *page) | 179 | truncate_cleanup_page(struct address_space *mapping, struct page *page) |
139 | { | 180 | { |
140 | if (page->mapping != mapping) | 181 | if (page_mapped(page)) { |
141 | return -EIO; | 182 | loff_t holelen; |
183 | |||
184 | holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; | ||
185 | unmap_mapping_range(mapping, | ||
186 | (loff_t)page->index << PAGE_SHIFT, | ||
187 | holelen, 0); | ||
188 | } | ||
142 | 189 | ||
143 | if (page_has_private(page)) | 190 | if (page_has_private(page)) |
144 | do_invalidatepage(page, 0, PAGE_SIZE); | 191 | do_invalidatepage(page, 0, PAGE_SIZE); |
@@ -150,8 +197,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
150 | */ | 197 | */ |
151 | cancel_dirty_page(page); | 198 | cancel_dirty_page(page); |
152 | ClearPageMappedToDisk(page); | 199 | ClearPageMappedToDisk(page); |
153 | delete_from_page_cache(page); | ||
154 | return 0; | ||
155 | } | 200 | } |
156 | 201 | ||
157 | /* | 202 | /* |
@@ -180,16 +225,14 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
180 | 225 | ||
181 | int truncate_inode_page(struct address_space *mapping, struct page *page) | 226 | int truncate_inode_page(struct address_space *mapping, struct page *page) |
182 | { | 227 | { |
183 | loff_t holelen; | ||
184 | VM_BUG_ON_PAGE(PageTail(page), page); | 228 | VM_BUG_ON_PAGE(PageTail(page), page); |
185 | 229 | ||
186 | holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; | 230 | if (page->mapping != mapping) |
187 | if (page_mapped(page)) { | 231 | return -EIO; |
188 | unmap_mapping_range(mapping, | 232 | |
189 | (loff_t)page->index << PAGE_SHIFT, | 233 | truncate_cleanup_page(mapping, page); |
190 | holelen, 0); | 234 | delete_from_page_cache(page); |
191 | } | 235 | return 0; |
192 | return truncate_complete_page(mapping, page); | ||
193 | } | 236 | } |
194 | 237 | ||
195 | /* | 238 | /* |
@@ -287,11 +330,19 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
287 | else | 330 | else |
288 | end = (lend + 1) >> PAGE_SHIFT; | 331 | end = (lend + 1) >> PAGE_SHIFT; |
289 | 332 | ||
290 | pagevec_init(&pvec, 0); | 333 | pagevec_init(&pvec); |
291 | index = start; | 334 | index = start; |
292 | while (index < end && pagevec_lookup_entries(&pvec, mapping, index, | 335 | while (index < end && pagevec_lookup_entries(&pvec, mapping, index, |
293 | min(end - index, (pgoff_t)PAGEVEC_SIZE), | 336 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
294 | indices)) { | 337 | indices)) { |
338 | /* | ||
339 | * Pagevec array has exceptional entries and we may also fail | ||
340 | * to lock some pages. So we store pages that can be deleted | ||
341 | * in a new pagevec. | ||
342 | */ | ||
343 | struct pagevec locked_pvec; | ||
344 | |||
345 | pagevec_init(&locked_pvec); | ||
295 | for (i = 0; i < pagevec_count(&pvec); i++) { | 346 | for (i = 0; i < pagevec_count(&pvec); i++) { |
296 | struct page *page = pvec.pages[i]; | 347 | struct page *page = pvec.pages[i]; |
297 | 348 | ||
@@ -300,11 +351,8 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
300 | if (index >= end) | 351 | if (index >= end) |
301 | break; | 352 | break; |
302 | 353 | ||
303 | if (radix_tree_exceptional_entry(page)) { | 354 | if (radix_tree_exceptional_entry(page)) |
304 | truncate_exceptional_entry(mapping, index, | ||
305 | page); | ||
306 | continue; | 355 | continue; |
307 | } | ||
308 | 356 | ||
309 | if (!trylock_page(page)) | 357 | if (!trylock_page(page)) |
310 | continue; | 358 | continue; |
@@ -313,15 +361,22 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
313 | unlock_page(page); | 361 | unlock_page(page); |
314 | continue; | 362 | continue; |
315 | } | 363 | } |
316 | truncate_inode_page(mapping, page); | 364 | if (page->mapping != mapping) { |
317 | unlock_page(page); | 365 | unlock_page(page); |
366 | continue; | ||
367 | } | ||
368 | pagevec_add(&locked_pvec, page); | ||
318 | } | 369 | } |
319 | pagevec_remove_exceptionals(&pvec); | 370 | for (i = 0; i < pagevec_count(&locked_pvec); i++) |
371 | truncate_cleanup_page(mapping, locked_pvec.pages[i]); | ||
372 | delete_from_page_cache_batch(mapping, &locked_pvec); | ||
373 | for (i = 0; i < pagevec_count(&locked_pvec); i++) | ||
374 | unlock_page(locked_pvec.pages[i]); | ||
375 | truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); | ||
320 | pagevec_release(&pvec); | 376 | pagevec_release(&pvec); |
321 | cond_resched(); | 377 | cond_resched(); |
322 | index++; | 378 | index++; |
323 | } | 379 | } |
324 | |||
325 | if (partial_start) { | 380 | if (partial_start) { |
326 | struct page *page = find_lock_page(mapping, start - 1); | 381 | struct page *page = find_lock_page(mapping, start - 1); |
327 | if (page) { | 382 | if (page) { |
@@ -379,6 +434,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
379 | pagevec_release(&pvec); | 434 | pagevec_release(&pvec); |
380 | break; | 435 | break; |
381 | } | 436 | } |
437 | |||
382 | for (i = 0; i < pagevec_count(&pvec); i++) { | 438 | for (i = 0; i < pagevec_count(&pvec); i++) { |
383 | struct page *page = pvec.pages[i]; | 439 | struct page *page = pvec.pages[i]; |
384 | 440 | ||
@@ -390,11 +446,8 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
390 | break; | 446 | break; |
391 | } | 447 | } |
392 | 448 | ||
393 | if (radix_tree_exceptional_entry(page)) { | 449 | if (radix_tree_exceptional_entry(page)) |
394 | truncate_exceptional_entry(mapping, index, | ||
395 | page); | ||
396 | continue; | 450 | continue; |
397 | } | ||
398 | 451 | ||
399 | lock_page(page); | 452 | lock_page(page); |
400 | WARN_ON(page_to_index(page) != index); | 453 | WARN_ON(page_to_index(page) != index); |
@@ -402,7 +455,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
402 | truncate_inode_page(mapping, page); | 455 | truncate_inode_page(mapping, page); |
403 | unlock_page(page); | 456 | unlock_page(page); |
404 | } | 457 | } |
405 | pagevec_remove_exceptionals(&pvec); | 458 | truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); |
406 | pagevec_release(&pvec); | 459 | pagevec_release(&pvec); |
407 | index++; | 460 | index++; |
408 | } | 461 | } |
@@ -500,7 +553,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
500 | unsigned long count = 0; | 553 | unsigned long count = 0; |
501 | int i; | 554 | int i; |
502 | 555 | ||
503 | pagevec_init(&pvec, 0); | 556 | pagevec_init(&pvec); |
504 | while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, | 557 | while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, |
505 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | 558 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, |
506 | indices)) { | 559 | indices)) { |
@@ -630,7 +683,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
630 | if (mapping->nrpages == 0 && mapping->nrexceptional == 0) | 683 | if (mapping->nrpages == 0 && mapping->nrexceptional == 0) |
631 | goto out; | 684 | goto out; |
632 | 685 | ||
633 | pagevec_init(&pvec, 0); | 686 | pagevec_init(&pvec); |
634 | index = start; | 687 | index = start; |
635 | while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, | 688 | while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, |
636 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | 689 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, |
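The mm/truncate.c hunks above move truncation from per-page deletion to a two-pass batch: pages that can be trylocked (and still belong to the mapping) are collected into a second pagevec, cleaned up, removed from the page cache with delete_from_page_cache_batch(), and only then unlocked, while exceptional entries are handled afterwards by truncate_exceptional_pvec_entries(). The fragment below is a minimal userspace sketch of that collect-then-batch shape only; the item/batch types and the try_lock_item()/cleanup_item()/delete_batch() helpers are illustrative stand-ins, not kernel APIs.

/* Minimal sketch of the collect-then-batch pattern; all types and
 * helpers here are illustrative stand-ins, not kernel interfaces. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define BATCH_SIZE 15                     /* stands in for PAGEVEC_SIZE */

struct item { int id; bool locked; bool exceptional; };
struct batch { struct item *slots[BATCH_SIZE]; size_t nr; };

static bool try_lock_item(struct item *it)
{
	if (it->locked)
		return false;
	it->locked = true;
	return true;
}

static void unlock_item(struct item *it)  { it->locked = false; }
static void cleanup_item(struct item *it) { (void)it; /* e.g. unmap, drop buffers */ }
static void delete_batch(struct batch *b) { printf("batched delete of %zu items\n", b->nr); }

static void truncate_batch(struct item *found, size_t nr_found)
{
	struct batch locked = { .nr = 0 };
	size_t i;

	/* Pass 1: skip exceptional entries and items we cannot lock,
	 * gather everything else into a private batch. */
	for (i = 0; i < nr_found && locked.nr < BATCH_SIZE; i++) {
		struct item *it = &found[i];

		if (it->exceptional || !try_lock_item(it))
			continue;
		locked.slots[locked.nr++] = it;
	}

	/* Pass 2: per-item cleanup, one batched delete, then unlock. */
	for (i = 0; i < locked.nr; i++)
		cleanup_item(locked.slots[i]);
	delete_batch(&locked);
	for (i = 0; i < locked.nr; i++)
		unlock_item(locked.slots[i]);
}

int main(void)
{
	struct item items[] = {
		{ .id = 1 }, { .id = 2, .locked = true },
		{ .id = 3 }, { .id = 4, .exceptional = true },
	};

	truncate_batch(items, sizeof(items) / sizeof(items[0]));
	return 0;
}

The point of the two passes is that the batched removal can do its bookkeeping once per pagevec rather than once per page, while each page stays locked across its cleanup and removal.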
diff --git a/mm/vmscan.c b/mm/vmscan.c index 15b483ef6440..c02c850ea349 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1349,7 +1349,7 @@ keep: | |||
1349 | 1349 | ||
1350 | mem_cgroup_uncharge_list(&free_pages); | 1350 | mem_cgroup_uncharge_list(&free_pages); |
1351 | try_to_unmap_flush(); | 1351 | try_to_unmap_flush(); |
1352 | free_hot_cold_page_list(&free_pages, true); | 1352 | free_unref_page_list(&free_pages); |
1353 | 1353 | ||
1354 | list_splice(&ret_pages, page_list); | 1354 | list_splice(&ret_pages, page_list); |
1355 | count_vm_events(PGACTIVATE, pgactivate); | 1355 | count_vm_events(PGACTIVATE, pgactivate); |
@@ -1824,7 +1824,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1824 | spin_unlock_irq(&pgdat->lru_lock); | 1824 | spin_unlock_irq(&pgdat->lru_lock); |
1825 | 1825 | ||
1826 | mem_cgroup_uncharge_list(&page_list); | 1826 | mem_cgroup_uncharge_list(&page_list); |
1827 | free_hot_cold_page_list(&page_list, true); | 1827 | free_unref_page_list(&page_list); |
1828 | 1828 | ||
1829 | /* | 1829 | /* |
1830 | * If reclaim is isolating dirty pages under writeback, it implies | 1830 | * If reclaim is isolating dirty pages under writeback, it implies |
@@ -2063,7 +2063,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
2063 | spin_unlock_irq(&pgdat->lru_lock); | 2063 | spin_unlock_irq(&pgdat->lru_lock); |
2064 | 2064 | ||
2065 | mem_cgroup_uncharge_list(&l_hold); | 2065 | mem_cgroup_uncharge_list(&l_hold); |
2066 | free_hot_cold_page_list(&l_hold, true); | 2066 | free_unref_page_list(&l_hold); |
2067 | trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, | 2067 | trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, |
2068 | nr_deactivate, nr_rotated, sc->priority, file); | 2068 | nr_deactivate, nr_rotated, sc->priority, file); |
2069 | } | 2069 | } |
@@ -2082,7 +2082,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
2082 | * If that fails and refaulting is observed, the inactive list grows. | 2082 | * If that fails and refaulting is observed, the inactive list grows. |
2083 | * | 2083 | * |
2084 | * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages | 2084 | * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages |
2085 | * on this LRU, maintained by the pageout code. A zone->inactive_ratio | 2085 | * on this LRU, maintained by the pageout code. An inactive_ratio |
2086 | * of 3 means 3:1 or 25% of the pages are kept on the inactive list. | 2086 | * of 3 means 3:1 or 25% of the pages are kept on the inactive list. |
2087 | * | 2087 | * |
2088 | * total target max | 2088 | * total target max |
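The reworded comment in mm/vmscan.c describes inactive_ratio as the target active:inactive split: a ratio of 3 means 3:1, so a quarter of the pages stay on the inactive list. A minimal sketch of just that arithmetic follows; the helper name and the direct division are illustrative, and the kernel derives the ratio from the list size rather than taking it as a parameter.

#include <stdio.h>

/* Sketch only: turns an inactive_ratio into a target inactive-list size. */
static unsigned long inactive_target(unsigned long total_pages,
				     unsigned int inactive_ratio)
{
	/* A ratio of N means N:1 active:inactive, i.e. 1/(N+1) of the total. */
	return total_pages / (inactive_ratio + 1);
}

int main(void)
{
	/* With ratio 3, 250 of 1000 pages (25%) are kept inactive. */
	printf("%lu\n", inactive_target(1000, 3));
	return 0;
}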
diff --git a/mm/vmstat.c b/mm/vmstat.c index 4bb13e72ac97..40b2db6db6b1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -32,6 +32,77 @@ | |||
32 | 32 | ||
33 | #define NUMA_STATS_THRESHOLD (U16_MAX - 2) | 33 | #define NUMA_STATS_THRESHOLD (U16_MAX - 2) |
34 | 34 | ||
35 | #ifdef CONFIG_NUMA | ||
36 | int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; | ||
37 | |||
38 | /* zero numa counters within a zone */ | ||
39 | static void zero_zone_numa_counters(struct zone *zone) | ||
40 | { | ||
41 | int item, cpu; | ||
42 | |||
43 | for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) { | ||
44 | atomic_long_set(&zone->vm_numa_stat[item], 0); | ||
45 | for_each_online_cpu(cpu) | ||
46 | per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item] | ||
47 | = 0; | ||
48 | } | ||
49 | } | ||
50 | |||
51 | /* zero numa counters of all the populated zones */ | ||
52 | static void zero_zones_numa_counters(void) | ||
53 | { | ||
54 | struct zone *zone; | ||
55 | |||
56 | for_each_populated_zone(zone) | ||
57 | zero_zone_numa_counters(zone); | ||
58 | } | ||
59 | |||
60 | /* zero global numa counters */ | ||
61 | static void zero_global_numa_counters(void) | ||
62 | { | ||
63 | int item; | ||
64 | |||
65 | for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) | ||
66 | atomic_long_set(&vm_numa_stat[item], 0); | ||
67 | } | ||
68 | |||
69 | static void invalid_numa_statistics(void) | ||
70 | { | ||
71 | zero_zones_numa_counters(); | ||
72 | zero_global_numa_counters(); | ||
73 | } | ||
74 | |||
75 | static DEFINE_MUTEX(vm_numa_stat_lock); | ||
76 | |||
77 | int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, | ||
78 | void __user *buffer, size_t *length, loff_t *ppos) | ||
79 | { | ||
80 | int ret, oldval; | ||
81 | |||
82 | mutex_lock(&vm_numa_stat_lock); | ||
83 | if (write) | ||
84 | oldval = sysctl_vm_numa_stat; | ||
85 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
86 | if (ret || !write) | ||
87 | goto out; | ||
88 | |||
89 | if (oldval == sysctl_vm_numa_stat) | ||
90 | goto out; | ||
91 | else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) { | ||
92 | static_branch_enable(&vm_numa_stat_key); | ||
93 | pr_info("enable numa statistics\n"); | ||
94 | } else { | ||
95 | static_branch_disable(&vm_numa_stat_key); | ||
96 | invalid_numa_statistics(); | ||
97 | pr_info("disable numa statistics, and clear numa counters\n"); | ||
98 | } | ||
99 | |||
100 | out: | ||
101 | mutex_unlock(&vm_numa_stat_lock); | ||
102 | return ret; | ||
103 | } | ||
104 | #endif | ||
105 | |||
35 | #ifdef CONFIG_VM_EVENT_COUNTERS | 106 | #ifdef CONFIG_VM_EVENT_COUNTERS |
36 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; | 107 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; |
37 | EXPORT_PER_CPU_SYMBOL(vm_event_states); | 108 | EXPORT_PER_CPU_SYMBOL(vm_event_states); |
@@ -1564,11 +1635,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
1564 | } | 1635 | } |
1565 | seq_printf(m, | 1636 | seq_printf(m, |
1566 | "\n node_unreclaimable: %u" | 1637 | "\n node_unreclaimable: %u" |
1567 | "\n start_pfn: %lu" | 1638 | "\n start_pfn: %lu", |
1568 | "\n node_inactive_ratio: %u", | ||
1569 | pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, | 1639 | pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, |
1570 | zone->zone_start_pfn, | 1640 | zone->zone_start_pfn); |
1571 | zone->zone_pgdat->inactive_ratio); | ||
1572 | seq_putc(m, '\n'); | 1641 | seq_putc(m, '\n'); |
1573 | } | 1642 | } |
1574 | 1643 | ||
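The new sysctl_vm_numa_stat_handler() added to mm/vmstat.c follows a common sysctl-handler shape: take a mutex, remember the old value, let proc_dointvec_minmax() parse the write, and only act (flip the static branch, zero the NUMA counters) when the value actually changed. Below is a simplified userspace model of that change-detection pattern; the flag, counters and helper names are made-up stand-ins, not kernel interfaces.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Stand-ins for the sysctl knob, the static branch and the NUMA counters. */
static int numa_stat_enabled = 1;
static bool stats_fast_path_on = true;
static long numa_counters[4];

static pthread_mutex_t knob_lock = PTHREAD_MUTEX_INITIALIZER;

/* Models the "parse, compare with the old value, act only on change" shape. */
static int set_numa_stat(int newval)
{
	int oldval;

	pthread_mutex_lock(&knob_lock);
	oldval = numa_stat_enabled;
	numa_stat_enabled = newval;             /* proc_dointvec_minmax() analogue */

	if (oldval == numa_stat_enabled)
		goto out;                       /* unchanged: nothing to do */

	if (numa_stat_enabled) {
		stats_fast_path_on = true;      /* static_branch_enable() analogue */
		printf("enable numa statistics\n");
	} else {
		stats_fast_path_on = false;     /* static_branch_disable() analogue */
		memset(numa_counters, 0, sizeof(numa_counters));
		printf("disable numa statistics, and clear numa counters\n");
	}
out:
	pthread_mutex_unlock(&knob_lock);
	return 0;
}

int main(void)
{
	set_numa_stat(0);   /* disables the fast path and zeroes the counters */
	set_numa_stat(0);   /* unchanged: no-op */
	set_numa_stat(1);   /* re-enables */
	return 0;
}

The early old-versus-new comparison is what makes repeated writes of the same value cheap and keeps the counter-zeroing from firing spuriously.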
diff --git a/mm/workingset.c b/mm/workingset.c index b997c9de28f6..b7d616a3bbbe 100644 --- a/mm/workingset.c +++ b/mm/workingset.c | |||
@@ -340,14 +340,8 @@ out: | |||
340 | 340 | ||
341 | static struct list_lru shadow_nodes; | 341 | static struct list_lru shadow_nodes; |
342 | 342 | ||
343 | void workingset_update_node(struct radix_tree_node *node, void *private) | 343 | void workingset_update_node(struct radix_tree_node *node) |
344 | { | 344 | { |
345 | struct address_space *mapping = private; | ||
346 | |||
347 | /* Only regular page cache has shadow entries */ | ||
348 | if (dax_mapping(mapping) || shmem_mapping(mapping)) | ||
349 | return; | ||
350 | |||
351 | /* | 345 | /* |
352 | * Track non-empty nodes that contain only shadow entries; | 346 | * Track non-empty nodes that contain only shadow entries; |
353 | * unlink those that contain pages or are being freed. | 347 | * unlink those that contain pages or are being freed. |
@@ -475,7 +469,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, | |||
475 | goto out_invalid; | 469 | goto out_invalid; |
476 | inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); | 470 | inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); |
477 | __radix_tree_delete_node(&mapping->page_tree, node, | 471 | __radix_tree_delete_node(&mapping->page_tree, node, |
478 | workingset_update_node, mapping); | 472 | workingset_lookup_update(mapping)); |
479 | 473 | ||
480 | out_invalid: | 474 | out_invalid: |
481 | spin_unlock(&mapping->tree_lock); | 475 | spin_unlock(&mapping->tree_lock); |
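The mm/workingset.c change drops workingset_update_node()'s private mapping argument, and the removed DAX/shmem check is instead handled where the update callback is chosen (the shadow_lru_isolate() hunk now passes workingset_lookup_update(mapping)). What remains in the function is the bookkeeping described by the surviving comment: keep nodes that hold nothing but shadow entries on the shadow-nodes LRU, and unlink the rest. A rough model of that decision, using made-up node fields and a boolean in place of the list_lru linkage:

#include <stdbool.h>
#include <stdio.h>

/* Made-up model of a radix tree node's bookkeeping state. */
struct node_state {
	unsigned int count;        /* slots in use */
	unsigned int exceptional;  /* slots holding shadow entries */
	bool on_shadow_lru;        /* linked on the shadow-nodes LRU? */
};

/* Track non-empty nodes that contain only shadow entries; unlink the rest. */
static void update_node(struct node_state *n)
{
	bool only_shadows = n->count && n->count == n->exceptional;

	if (only_shadows && !n->on_shadow_lru)
		n->on_shadow_lru = true;    /* now reclaimable by the shrinker */
	else if (!only_shadows && n->on_shadow_lru)
		n->on_shadow_lru = false;   /* holds pages or is being freed */
}

int main(void)
{
	struct node_state n = { .count = 4, .exceptional = 4 };

	update_node(&n);
	printf("on shadow LRU: %d\n", n.on_shadow_lru);   /* 1 */

	n.exceptional = 3;          /* a real page appeared in the node */
	update_node(&n);
	printf("on shadow LRU: %d\n", n.on_shadow_lru);   /* 0 */
	return 0;
}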
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7c38e850a8fc..685049a9048d 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -1349,7 +1349,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
1349 | * pools/users, we can't allow mapping in interrupt context | 1349 | * pools/users, we can't allow mapping in interrupt context |
1350 | * because it can corrupt another users mappings. | 1350 | * because it can corrupt another users mappings. |
1351 | */ | 1351 | */ |
1352 | WARN_ON_ONCE(in_interrupt()); | 1352 | BUG_ON(in_interrupt()); |
1353 | 1353 | ||
1354 | /* From now on, migration cannot move the object */ | 1354 | /* From now on, migration cannot move the object */ |
1355 | pin_tag(handle); | 1355 | pin_tag(handle); |
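The zsmalloc hunk tightens the guard in zs_map_object(): mapping from interrupt context, which the comment above it says can corrupt another user's per-CPU mapping, is now treated as fatal (BUG_ON) rather than a one-time warning (WARN_ON_ONCE). A userspace analogue of promoting such a precondition from "warn and continue" to "stop immediately", with a plain flag standing in for in_interrupt():

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Made-up stand-in for in_interrupt(); here it is just a flag. */
static bool in_irq_context;

static void map_object(void)
{
	/* Old behaviour: complain once and continue (WARN_ON_ONCE analogue).
	 * New behaviour: refuse to continue (BUG_ON analogue), since going on
	 * could silently corrupt state shared with other users. */
	assert(!in_irq_context);

	printf("object mapped\n");
}

int main(void)
{
	in_irq_context = false;
	map_object();           /* allowed */

	in_irq_context = true;
	/* map_object(); */     /* would abort here, like BUG_ON() would */
	return 0;
}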