Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            9
-rw-r--r--  mm/backing-dev.c      4
-rw-r--r--  mm/cleancache.c       4
-rw-r--r--  mm/filemap.c          1
-rw-r--r--  mm/gup.c              4
-rw-r--r--  mm/huge_memory.c     89
-rw-r--r--  mm/hugetlb.c          7
-rw-r--r--  mm/internal.h        31
-rw-r--r--  mm/memblock.c         2
-rw-r--r--  mm/memory.c          14
-rw-r--r--  mm/mempolicy.c       14
-rw-r--r--  mm/mmap.c            85
-rw-r--r--  mm/mprotect.c         6
-rw-r--r--  mm/mremap.c           2
-rw-r--r--  mm/page_alloc.c       7
-rw-r--r--  mm/pgtable-generic.c  8
-rw-r--r--  mm/util.c            27
-rw-r--r--  mm/vmpressure.c       3
-rw-r--r--  mm/vmscan.c           2
-rw-r--r--  mm/vmstat.c          70
20 files changed, 222 insertions, 167 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 97a4e06b15c0..03cbfa072f42 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -624,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
         bool
 
 config DEFERRED_STRUCT_PAGE_INIT
-        bool "Defer initialisation of struct pages to kswapd"
+        bool "Defer initialisation of struct pages to kthreads"
         default n
         depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
         depends on MEMORY_HOTPLUG
@@ -633,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT
           single thread. On very large machines this can take a considerable
           amount of time. If this option is set, large machines will bring up
           a subset of memmap at boot and then initialise the rest in parallel
-          when kswapd starts. This has a potential performance impact on
-          processes running early in the lifetime of the systemm until kswapd
-          finishes the initialisation.
+          by starting one-off "pgdatinitX" kernel thread for each node X. This
+          has a potential performance impact on processes running early in the
+          lifetime of the system until these kthreads finish the
+          initialisation.
 
 config IDLE_PAGE_TRACKING
         bool "Enable idle page tracking"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index cc5d29d2da9b..c554d173a65f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -328,7 +328,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
         return 0;
 
 out_destroy_stat:
-        while (--i)
+        while (i--)
                 percpu_counter_destroy(&wb->stat[i]);
         fprop_local_destroy_percpu(&wb->completions);
 out_put_cong:
@@ -989,7 +989,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
                  * here rather than calling cond_resched().
                  */
                 if (current->flags & PF_WQ_WORKER)
-                        schedule_timeout(1);
+                        schedule_timeout_uninterruptible(1);
                 else
                         cond_resched();
 
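
The wb_init() error-path change in the first hunk above fixes a classic cleanup-loop off-by-one: while (--i) never reaches index 0 (and underflows if the very first allocation failed with i == 0), while while (i--) tears down exactly the entries that were initialised. A minimal userspace sketch of the two loop shapes, with a hypothetical destroy_slot() standing in for percpu_counter_destroy():

    #include <stdio.h>

    /* Hypothetical per-slot teardown, standing in for percpu_counter_destroy(). */
    static void destroy_slot(int i) { printf("destroy slot %d\n", i); }

    int main(void)
    {
            int nr = 4;
            int i;

            /* Pretend slots 0..3 were initialised and slot 4 failed, so i == 4. */
            i = nr;

            /* Buggy form: while (--i) visits 3, 2, 1 only and skips slot 0. */
            /* Fixed form: i-- visits 3, 2, 1, 0 and stops before -1. */
            while (i--)
                    destroy_slot(i);

            return 0;
    }
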
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 8fc50811119b..ba5d8f3e6d68 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -22,7 +22,7 @@
  * cleancache_ops is set by cleancache_register_ops to contain the pointers
  * to the cleancache "backend" implementation functions.
  */
-static struct cleancache_ops *cleancache_ops __read_mostly;
+static const struct cleancache_ops *cleancache_ops __read_mostly;
 
 /*
  * Counters available via /sys/kernel/debug/cleancache (if debugfs is
@@ -49,7 +49,7 @@ static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
 /*
  * Register operations for cleancache. Returns 0 on success.
  */
-int cleancache_register_ops(struct cleancache_ops *ops)
+int cleancache_register_ops(const struct cleancache_ops *ops)
 {
         if (cmpxchg(&cleancache_ops, NULL, ops))
                 return -EBUSY;
diff --git a/mm/filemap.c b/mm/filemap.c
index bc943867d68c..23edccecadb0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1890,6 +1890,7 @@ EXPORT_SYMBOL(generic_file_read_iter);
  * page_cache_read - adds requested page to the page cache if not already there
  * @file: file to read
  * @offset: page index
+ * @gfp_mask: memory allocation flags
  *
  * This adds the requested page to the page cache if it isn't already there,
  * and schedules an I/O to read in its contents from disk.
diff --git a/mm/gup.c b/mm/gup.c
index b64a36175884..7bf19ffa2199 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -430,10 +430,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
                          * Anon pages in shared mappings are surprising: now
                          * just reject it.
                          */
-                        if (!is_cow_mapping(vm_flags)) {
-                                WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
+                        if (!is_cow_mapping(vm_flags))
                                 return -EFAULT;
-                        }
                 }
         } else if (!(vm_flags & VM_READ)) {
                 if (!(gup_flags & FOLL_FORCE))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fd3a07b3e6f4..08fc0ba2207e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -138,9 +138,6 @@ static struct khugepaged_scan khugepaged_scan = {
         .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
-static DEFINE_SPINLOCK(split_queue_lock);
-static LIST_HEAD(split_queue);
-static unsigned long split_queue_len;
 static struct shrinker deferred_split_shrinker;
 
 static void set_recommended_min_free_kbytes(void)
@@ -861,7 +858,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
                 return false;
         entry = mk_pmd(zero_page, vma->vm_page_prot);
         entry = pmd_mkhuge(entry);
-        pgtable_trans_huge_deposit(mm, pmd, pgtable);
+        if (pgtable)
+                pgtable_trans_huge_deposit(mm, pmd, pgtable);
         set_pmd_at(mm, haddr, pmd, entry);
         atomic_long_inc(&mm->nr_ptes);
         return true;
@@ -1039,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         spinlock_t *dst_ptl, *src_ptl;
         struct page *src_page;
         pmd_t pmd;
-        pgtable_t pgtable;
+        pgtable_t pgtable = NULL;
         int ret;
 
-        ret = -ENOMEM;
-        pgtable = pte_alloc_one(dst_mm, addr);
-        if (unlikely(!pgtable))
-                goto out;
+        if (!vma_is_dax(vma)) {
+                ret = -ENOMEM;
+                pgtable = pte_alloc_one(dst_mm, addr);
+                if (unlikely(!pgtable))
+                        goto out;
+        }
 
         dst_ptl = pmd_lock(dst_mm, dst_pmd);
         src_ptl = pmd_lockptr(src_mm, src_pmd);
@@ -1076,7 +1076,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 goto out_unlock;
         }
 
-        if (pmd_trans_huge(pmd)) {
+        if (!vma_is_dax(vma)) {
                 /* thp accounting separate from pmd_devmap accounting */
                 src_page = pmd_page(pmd);
                 VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
@@ -3358,6 +3358,7 @@ int total_mapcount(struct page *page)
 int split_huge_page_to_list(struct page *page, struct list_head *list)
 {
         struct page *head = compound_head(page);
+        struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
         struct anon_vma *anon_vma;
         int count, mapcount, ret;
         bool mlocked;
@@ -3401,19 +3402,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         lru_add_drain();
 
         /* Prevent deferred_split_scan() touching ->_count */
-        spin_lock_irqsave(&split_queue_lock, flags);
+        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
         count = page_count(head);
         mapcount = total_mapcount(head);
         if (!mapcount && count == 1) {
                 if (!list_empty(page_deferred_list(head))) {
-                        split_queue_len--;
+                        pgdata->split_queue_len--;
                         list_del(page_deferred_list(head));
                 }
-                spin_unlock_irqrestore(&split_queue_lock, flags);
+                spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                 __split_huge_page(page, list);
                 ret = 0;
         } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
-                spin_unlock_irqrestore(&split_queue_lock, flags);
+                spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                 pr_alert("total_mapcount: %u, page_count(): %u\n",
                                 mapcount, count);
                 if (PageTail(page))
@@ -3421,7 +3422,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                 dump_page(page, "total_mapcount(head) > 0");
                 BUG();
         } else {
-                spin_unlock_irqrestore(&split_queue_lock, flags);
+                spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                 unfreeze_page(anon_vma, head);
                 ret = -EBUSY;
         }
@@ -3436,64 +3437,65 @@ out:
 
 void free_transhuge_page(struct page *page)
 {
+        struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
         unsigned long flags;
 
-        spin_lock_irqsave(&split_queue_lock, flags);
+        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
         if (!list_empty(page_deferred_list(page))) {
-                split_queue_len--;
+                pgdata->split_queue_len--;
                 list_del(page_deferred_list(page));
         }
-        spin_unlock_irqrestore(&split_queue_lock, flags);
+        spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
         free_compound_page(page);
 }
 
 void deferred_split_huge_page(struct page *page)
 {
+        struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
         unsigned long flags;
 
         VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 
-        spin_lock_irqsave(&split_queue_lock, flags);
+        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
         if (list_empty(page_deferred_list(page))) {
-                list_add_tail(page_deferred_list(page), &split_queue);
-                split_queue_len++;
+                list_add_tail(page_deferred_list(page), &pgdata->split_queue);
+                pgdata->split_queue_len++;
         }
-        spin_unlock_irqrestore(&split_queue_lock, flags);
+        spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 }
 
 static unsigned long deferred_split_count(struct shrinker *shrink,
                 struct shrink_control *sc)
 {
-        /*
-         * Split a page from split_queue will free up at least one page,
-         * at most HPAGE_PMD_NR - 1. We don't track exact number.
-         * Let's use HPAGE_PMD_NR / 2 as ballpark.
-         */
-        return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+        struct pglist_data *pgdata = NODE_DATA(sc->nid);
+        return ACCESS_ONCE(pgdata->split_queue_len);
 }
 
 static unsigned long deferred_split_scan(struct shrinker *shrink,
                 struct shrink_control *sc)
 {
+        struct pglist_data *pgdata = NODE_DATA(sc->nid);
         unsigned long flags;
         LIST_HEAD(list), *pos, *next;
         struct page *page;
         int split = 0;
 
-        spin_lock_irqsave(&split_queue_lock, flags);
-        list_splice_init(&split_queue, &list);
-
+        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
         /* Take pin on all head pages to avoid freeing them under us */
-        list_for_each_safe(pos, next, &list) {
+        list_for_each_safe(pos, next, &pgdata->split_queue) {
                 page = list_entry((void *)pos, struct page, mapping);
                 page = compound_head(page);
-                /* race with put_compound_page() */
-                if (!get_page_unless_zero(page)) {
+                if (get_page_unless_zero(page)) {
+                        list_move(page_deferred_list(page), &list);
+                } else {
+                        /* We lost race with put_compound_page() */
                         list_del_init(page_deferred_list(page));
-                        split_queue_len--;
+                        pgdata->split_queue_len--;
                 }
+                if (!--sc->nr_to_scan)
+                        break;
         }
-        spin_unlock_irqrestore(&split_queue_lock, flags);
+        spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 
         list_for_each_safe(pos, next, &list) {
                 page = list_entry((void *)pos, struct page, mapping);
@@ -3505,17 +3507,24 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
                 put_page(page);
         }
 
-        spin_lock_irqsave(&split_queue_lock, flags);
-        list_splice_tail(&list, &split_queue);
-        spin_unlock_irqrestore(&split_queue_lock, flags);
+        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+        list_splice_tail(&list, &pgdata->split_queue);
+        spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 
-        return split * HPAGE_PMD_NR / 2;
+        /*
+         * Stop shrinker if we didn't split any page, but the queue is empty.
+         * This can happen if pages were freed under us.
+         */
+        if (!split && list_empty(&pgdata->split_queue))
+                return SHRINK_STOP;
+        return split;
 }
 
 static struct shrinker deferred_split_shrinker = {
         .count_objects = deferred_split_count,
         .scan_objects = deferred_split_scan,
         .seeks = DEFAULT_SEEKS,
+        .flags = SHRINKER_NUMA_AWARE,
 };
 
 #ifdef CONFIG_DEBUG_FS
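
The huge_memory.c changes above make the deferred-split queue per-node (split_queue_lock, split_queue and split_queue_len move into struct pglist_data) and register the shrinker as SHRINKER_NUMA_AWARE, so count_objects reports only the queue length for sc->nid and scan_objects returns SHRINK_STOP once nothing was split and that node's queue is empty. Below is a rough, single-threaded model of that count/scan contract in plain C; node_queue, the items array and the "even pages split successfully" rule are all invented for illustration and are not the kernel implementation:

    #include <stdio.h>

    #define SHRINK_STOP (~0UL)

    /* Toy stand-in for the per-node fields the patch adds to pglist_data. */
    struct node_queue {
            unsigned long len;      /* split_queue_len */
            int items[16];          /* pretend deferred huge pages */
    };

    static unsigned long deferred_count(struct node_queue *nq)
    {
            /* count_objects: just report the per-node queue length. */
            return nq->len;
    }

    static unsigned long deferred_scan(struct node_queue *nq, unsigned long nr_to_scan)
    {
            unsigned long split = 0;

            while (nq->len && nr_to_scan--) {
                    int page = nq->items[--nq->len];

                    /* Pretend splitting succeeds for even "pages" only. */
                    if (page % 2 == 0)
                            split++;
            }

            /* Nothing split and the queue is empty: tell the caller to stop. */
            if (!split && nq->len == 0)
                    return SHRINK_STOP;
            return split;
    }

    int main(void)
    {
            struct node_queue nq = { .len = 4, .items = { 1, 2, 3, 4 } };

            printf("count=%lu\n", deferred_count(&nq));
            printf("scan -> %lu\n", deferred_scan(&nq, 32));
            printf("scan again -> %lu (SHRINK_STOP=%lu)\n",
                   deferred_scan(&nq, 32), SHRINK_STOP);
            return 0;
    }
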
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 12908dcf5831..06ae13e869d0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1001,7 +1001,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
                 ((node = hstate_next_node_to_free(hs, mask)) || 1);     \
                 nr_nodes--)
 
-#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA))
 static void destroy_compound_gigantic_page(struct page *page,
                                         unsigned int order)
 {
@@ -1214,8 +1214,8 @@ void free_huge_page(struct page *page)
 
         set_page_private(page, 0);
         page->mapping = NULL;
-        BUG_ON(page_count(page));
-        BUG_ON(page_mapcount(page));
+        VM_BUG_ON_PAGE(page_count(page), page);
+        VM_BUG_ON_PAGE(page_mapcount(page), page);
         restore_reserve = PagePrivate(page);
         ClearPagePrivate(page);
 
@@ -1286,6 +1286,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
                 set_page_count(p, 0);
                 set_compound_head(p, page);
         }
+        atomic_set(compound_mapcount_ptr(page), -1);
 }
 
 /*
diff --git a/mm/internal.h b/mm/internal.h
index ed8b5ffcf9b1..a38a21ebddb4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -216,6 +216,37 @@ static inline bool is_cow_mapping(vm_flags_t flags)
         return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
 
+/*
+ * These three helpers classifies VMAs for virtual memory accounting.
+ */
+
+/*
+ * Executable code area - executable, not writable, not stack
+ */
+static inline bool is_exec_mapping(vm_flags_t flags)
+{
+        return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
+}
+
+/*
+ * Stack area - atomatically grows in one direction
+ *
+ * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
+ * do_mmap() forbids all other combinations.
+ */
+static inline bool is_stack_mapping(vm_flags_t flags)
+{
+        return (flags & VM_STACK) == VM_STACK;
+}
+
+/*
+ * Data area - private, writable, not stack
+ */
+static inline bool is_data_mapping(vm_flags_t flags)
+{
+        return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
+}
+
 /* mm/util.c */
 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
                 struct vm_area_struct *prev, struct rb_node *rb_parent);
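
The new is_exec_mapping()/is_stack_mapping()/is_data_mapping() helpers classify a VMA for accounting purely by masking flag bits and comparing against an expected pattern. A small self-contained model of the same mask-and-compare idiom; the flag values below are made up for illustration and are not the kernel's real VM_* constants:

    #include <stdio.h>
    #include <stdbool.h>

    /* Illustrative flag bits only; the real VM_* values live in <linux/mm.h>. */
    #define VM_WRITE   0x1u
    #define VM_EXEC    0x2u
    #define VM_SHARED  0x4u
    #define VM_STACK   0x8u  /* stands in for VM_GROWSUP/VM_GROWSDOWN */

    typedef unsigned int vm_flags_t;

    /* Executable code area - executable, not writable, not stack. */
    static bool is_exec_mapping(vm_flags_t flags)
    {
            return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
    }

    /* Stack area - the VM_STACK bit set. */
    static bool is_stack_mapping(vm_flags_t flags)
    {
            return (flags & VM_STACK) == VM_STACK;
    }

    /* Data area - private (not shared), writable, not stack. */
    static bool is_data_mapping(vm_flags_t flags)
    {
            return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
    }

    int main(void)
    {
            vm_flags_t text = VM_EXEC;
            vm_flags_t heap = VM_WRITE;
            vm_flags_t stack = VM_WRITE | VM_STACK;

            printf("text:  exec=%d data=%d stack=%d\n",
                   is_exec_mapping(text), is_data_mapping(text), is_stack_mapping(text));
            printf("heap:  exec=%d data=%d stack=%d\n",
                   is_exec_mapping(heap), is_data_mapping(heap), is_stack_mapping(heap));
            printf("stack: exec=%d data=%d stack=%d\n",
                   is_exec_mapping(stack), is_data_mapping(stack), is_stack_mapping(stack));
            return 0;
    }

For ordinary mappings the three categories come out mutually exclusive, which is what lets vm_stat_account() in the mmap.c hunks further down use a plain if/else-if chain.
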
diff --git a/mm/memblock.c b/mm/memblock.c
index d2ed81e59a94..dd7989929f13 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1448,7 +1448,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
  * Remaining API functions
  */
 
-phys_addr_t __init memblock_phys_mem_size(void)
+phys_addr_t __init_memblock memblock_phys_mem_size(void)
 {
         return memblock.memory.total_size;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 5aa4f55eb786..38090ca37a08 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1612,10 +1612,15 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
          * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
          * without pte special, it would there be refcounted as a normal page.
          */
-        if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) {
+        if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
                 struct page *page;
 
-                page = pfn_t_to_page(pfn);
+                /*
+                 * At this point we are committed to insert_page()
+                 * regardless of whether the caller specified flags that
+                 * result in pfn_t_has_page() == false.
+                 */
+                page = pfn_to_page(pfn_t_to_pfn(pfn));
                 return insert_page(vma, addr, page, vma->vm_page_prot);
         }
         return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
@@ -2253,11 +2258,6 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
 
         page_cache_get(old_page);
 
-        /*
-         * Only catch write-faults on shared writable pages,
-         * read-only shared pages can get COWed by
-         * get_user_pages(.write=1, .force=1).
-         */
         if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                 int tmp;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 27d135408a22..4c4187c0e1de 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -548,8 +548,7 @@ retry:
                         goto retry;
                 }
 
-                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-                        migrate_page_add(page, qp->pagelist, flags);
+                migrate_page_add(page, qp->pagelist, flags);
         }
         pte_unmap_unlock(pte - 1, ptl);
         cond_resched();
@@ -625,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
         unsigned long endvma = vma->vm_end;
         unsigned long flags = qp->flags;
 
-        if (vma->vm_flags & VM_PFNMAP)
+        if (!vma_migratable(vma))
                 return 1;
 
         if (endvma > end)
@@ -644,16 +643,13 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 
         if (flags & MPOL_MF_LAZY) {
                 /* Similar to task_numa_work, skip inaccessible VMAs */
-                if (vma_migratable(vma) &&
-                        vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+                if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
                         change_prot_numa(vma, start, endvma);
                 return 1;
         }
 
-        if ((flags & MPOL_MF_STRICT) ||
-            ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-             vma_migratable(vma)))
-                /* queue pages from current vma */
+        /* queue pages from current vma */
+        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                 return 0;
         return 1;
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 407ab434d5ee..e2e9f48b06c2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -42,6 +42,7 @@
 #include <linux/memory.h>
 #include <linux/printk.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/moduleparam.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -69,6 +70,8 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
 int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
 #endif
 
+static bool ignore_rlimit_data = true;
+core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
 
 static void unmap_region(struct mm_struct *mm,
                 struct vm_area_struct *vma, struct vm_area_struct *prev,
@@ -387,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
 }
 
 #ifdef CONFIG_DEBUG_VM_RB
-static int browse_rb(struct rb_root *root)
+static int browse_rb(struct mm_struct *mm)
 {
+        struct rb_root *root = &mm->mm_rb;
         int i = 0, j, bug = 0;
         struct rb_node *nd, *pn = NULL;
         unsigned long prev = 0, pend = 0;
@@ -411,12 +415,14 @@ static int browse_rb(struct rb_root *root)
                                   vma->vm_start, vma->vm_end);
                         bug = 1;
                 }
+                spin_lock(&mm->page_table_lock);
                 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
                         pr_emerg("free gap %lx, correct %lx\n",
                                vma->rb_subtree_gap,
                                vma_compute_subtree_gap(vma));
                         bug = 1;
                 }
+                spin_unlock(&mm->page_table_lock);
                 i++;
                 pn = nd;
                 prev = vma->vm_start;
@@ -453,12 +459,16 @@ static void validate_mm(struct mm_struct *mm)
         struct vm_area_struct *vma = mm->mmap;
 
         while (vma) {
+                struct anon_vma *anon_vma = vma->anon_vma;
                 struct anon_vma_chain *avc;
 
-                vma_lock_anon_vma(vma);
-                list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
-                        anon_vma_interval_tree_verify(avc);
-                vma_unlock_anon_vma(vma);
+                if (anon_vma) {
+                        anon_vma_lock_read(anon_vma);
+                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+                                anon_vma_interval_tree_verify(avc);
+                        anon_vma_unlock_read(anon_vma);
+                }
+
                 highest_address = vma->vm_end;
                 vma = vma->vm_next;
                 i++;
@@ -472,7 +482,7 @@ static void validate_mm(struct mm_struct *mm)
                           mm->highest_vm_end, highest_address);
                 bug = 1;
         }
-        i = browse_rb(&mm->mm_rb);
+        i = browse_rb(mm);
         if (i != mm->map_count) {
                 if (i != -1)
                         pr_emerg("map_count %d rb %d\n", mm->map_count, i);
@@ -2139,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 {
         struct mm_struct *mm = vma->vm_mm;
-        int error;
+        int error = 0;
 
         if (!(vma->vm_flags & VM_GROWSUP))
                 return -EFAULT;
 
-        /*
-         * We must make sure the anon_vma is allocated
-         * so that the anon_vma locking is not a noop.
-         */
+        /* Guard against wrapping around to address 0. */
+        if (address < PAGE_ALIGN(address+4))
+                address = PAGE_ALIGN(address+4);
+        else
+                return -ENOMEM;
+
+        /* We must make sure the anon_vma is allocated. */
         if (unlikely(anon_vma_prepare(vma)))
                 return -ENOMEM;
-        vma_lock_anon_vma(vma);
 
         /*
          * vma->vm_start/vm_end cannot change under us because the caller
          * is required to hold the mmap_sem in read mode. We need the
          * anon_vma lock to serialize against concurrent expand_stacks.
-         * Also guard against wrapping around to address 0.
          */
-        if (address < PAGE_ALIGN(address+4))
-                address = PAGE_ALIGN(address+4);
-        else {
-                vma_unlock_anon_vma(vma);
-                return -ENOMEM;
-        }
-        error = 0;
+        anon_vma_lock_write(vma->anon_vma);
 
         /* Somebody else might have raced and expanded it already */
         if (address > vma->vm_end) {
@@ -2182,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                          * updates, but we only hold a shared mmap_sem
                          * lock here, so we need to protect against
                          * concurrent vma expansions.
-                         * vma_lock_anon_vma() doesn't help here, as
+                         * anon_vma_lock_write() doesn't help here, as
                          * we don't guarantee that all growable vmas
                          * in a mm share the same root anon vma.
                          * So, we reuse mm->page_table_lock to guard
@@ -2205,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                         }
                 }
         }
-        vma_unlock_anon_vma(vma);
+        anon_vma_unlock_write(vma->anon_vma);
         khugepaged_enter_vma_merge(vma, vma->vm_flags);
         validate_mm(mm);
         return error;
@@ -2221,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma,
         struct mm_struct *mm = vma->vm_mm;
         int error;
 
-        /*
-         * We must make sure the anon_vma is allocated
-         * so that the anon_vma locking is not a noop.
-         */
-        if (unlikely(anon_vma_prepare(vma)))
-                return -ENOMEM;
-
         address &= PAGE_MASK;
         error = security_mmap_addr(address);
         if (error)
                 return error;
 
-        vma_lock_anon_vma(vma);
+        /* We must make sure the anon_vma is allocated. */
+        if (unlikely(anon_vma_prepare(vma)))
+                return -ENOMEM;
 
         /*
          * vma->vm_start/vm_end cannot change under us because the caller
          * is required to hold the mmap_sem in read mode. We need the
          * anon_vma lock to serialize against concurrent expand_stacks.
          */
+        anon_vma_lock_write(vma->anon_vma);
 
         /* Somebody else might have raced and expanded it already */
         if (address < vma->vm_start) {
@@ -2257,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma,
                          * updates, but we only hold a shared mmap_sem
                          * lock here, so we need to protect against
                          * concurrent vma expansions.
-                         * vma_lock_anon_vma() doesn't help here, as
+                         * anon_vma_lock_write() doesn't help here, as
                          * we don't guarantee that all growable vmas
                          * in a mm share the same root anon vma.
                          * So, we reuse mm->page_table_lock to guard
@@ -2278,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma,
                         }
                 }
         }
-        vma_unlock_anon_vma(vma);
+        anon_vma_unlock_write(vma->anon_vma);
         khugepaged_enter_vma_merge(vma, vma->vm_flags);
         validate_mm(mm);
         return error;
@@ -2982,9 +2983,17 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
         if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
                 return false;
 
-        if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS &
-                                (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE)
-                return mm->data_vm + npages <= rlimit(RLIMIT_DATA);
+        if (is_data_mapping(flags) &&
+            mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
+                if (ignore_rlimit_data)
+                        pr_warn_once("%s (%d): VmData %lu exceed data ulimit "
+                                     "%lu. Will be forbidden soon.\n",
+                                     current->comm, current->pid,
+                                     (mm->data_vm + npages) << PAGE_SHIFT,
+                                     rlimit(RLIMIT_DATA));
+                else
+                        return false;
+        }
 
         return true;
 }
@@ -2993,11 +3002,11 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
 {
         mm->total_vm += npages;
 
-        if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
+        if (is_exec_mapping(flags))
                 mm->exec_vm += npages;
-        else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)))
+        else if (is_stack_mapping(flags))
                 mm->stack_vm += npages;
-        else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+        else if (is_data_mapping(flags))
                 mm->data_vm += npages;
 }
 
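
The may_expand_vm() hunk above compares the prospective data footprint in pages against RLIMIT_DATA (also shifted to pages), and with ignore_rlimit_data defaulting to true it only warns instead of failing the mapping. A small userspace sketch of the same pages-versus-limit comparison using the standard getrlimit(2) interface; the page counts below are made-up placeholders, not values from any real mm:

    #include <stdio.h>
    #include <sys/resource.h>
    #include <unistd.h>

    int main(void)
    {
            struct rlimit rl;
            long page_size = sysconf(_SC_PAGESIZE);
            unsigned long data_pages = 12345;  /* pretend current mm->data_vm */
            unsigned long npages = 256;        /* pretend new mapping size in pages */

            if (getrlimit(RLIMIT_DATA, &rl) != 0) {
                    perror("getrlimit");
                    return 1;
            }

            if (rl.rlim_cur == RLIM_INFINITY) {
                    printf("RLIMIT_DATA is unlimited\n");
                    return 0;
            }

            /* Same shape as the kernel check: compare page counts, not bytes. */
            if (data_pages + npages > rl.rlim_cur / page_size)
                    printf("VmData %lu would exceed data ulimit %llu\n",
                           (data_pages + npages) * page_size,
                           (unsigned long long)rl.rlim_cur);
            else
                    printf("within RLIMIT_DATA (%llu bytes)\n",
                           (unsigned long long)rl.rlim_cur);
            return 0;
    }
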
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8eb7bb40dc40..f7cb3d4d9c2e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -160,9 +160,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
                 }
 
                 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
-                        if (next - addr != HPAGE_PMD_SIZE)
+                        if (next - addr != HPAGE_PMD_SIZE) {
                                 split_huge_pmd(vma, pmd, addr);
-                        else {
+                                if (pmd_none(*pmd))
+                                        continue;
+                        } else {
                                 int nr_ptes = change_huge_pmd(vma, pmd, addr,
                                                 newprot, prot_numa);
 
diff --git a/mm/mremap.c b/mm/mremap.c
index d77946a997f7..8eeba02fc991 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -210,6 +210,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                                 }
                         }
                         split_huge_pmd(vma, old_pmd, old_addr);
+                        if (pmd_none(*old_pmd))
+                                continue;
                         VM_BUG_ON(pmd_trans_huge(*old_pmd));
                 }
                 if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63358d9f9aa9..838ca8bb64f7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5210,6 +5210,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
         pgdat->numabalancing_migrate_nr_pages = 0;
         pgdat->numabalancing_migrate_next_window = jiffies;
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        spin_lock_init(&pgdat->split_queue_lock);
+        INIT_LIST_HEAD(&pgdat->split_queue);
+        pgdat->split_queue_len = 0;
+#endif
         init_waitqueue_head(&pgdat->kswapd_wait);
         init_waitqueue_head(&pgdat->pfmemalloc_wait);
         pgdat_page_ext_init(pgdat);
@@ -6615,7 +6620,7 @@ bool is_pageblock_removable_nolock(struct page *page)
         return !has_unmovable_pages(zone, page, 0, true);
 }
 
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
 
 static unsigned long pfn_max_align_down(unsigned long pfn)
 {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 9d4767698a1c..06a005b979a7 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -90,9 +90,9 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
  * ARCHes with special requirements for evicting THP backing TLB entries can
  * implement this. Otherwise also, it can help optimize normal TLB flush in
  * THP regime. stock flush_tlb_range() typically has optimization to nuke the
- * entire TLB TLB if flush span is greater than a threshhold, which will
+ * entire TLB if flush span is greater than a threshold, which will
  * likely be true for a single huge page. Thus a single thp flush will
- * invalidate the entire TLB which is not desitable.
+ * invalidate the entire TLB which is not desirable.
  * e.g. see arch/arc: flush_pmd_tlb_range
  */
 #define flush_pmd_tlb_range(vma, addr, end)     flush_tlb_range(vma, addr, end)
@@ -195,7 +195,9 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
         VM_BUG_ON(pmd_trans_huge(*pmdp));
         pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
-        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+        /* collapse entails shooting down ptes not pmd */
+        flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
         return pmd;
 }
 #endif
diff --git a/mm/util.c b/mm/util.c
index c108a6542d05..4fb14ca5a419 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -230,36 +230,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 /* Check if the vma is being used as a stack by this task */
-static int vm_is_stack_for_task(struct task_struct *t,
-                                struct vm_area_struct *vma)
+int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t)
 {
         return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
 }
 
-/*
- * Check if the vma is being used as a stack.
- * If is_group is non-zero, check in the entire thread group or else
- * just check in the current task. Returns the task_struct of the task
- * that the vma is stack for. Must be called under rcu_read_lock().
- */
-struct task_struct *task_of_stack(struct task_struct *task,
-                                struct vm_area_struct *vma, bool in_group)
-{
-        if (vm_is_stack_for_task(task, vma))
-                return task;
-
-        if (in_group) {
-                struct task_struct *t;
-
-                for_each_thread(task, t) {
-                        if (vm_is_stack_for_task(t, vma))
-                                return t;
-                }
-        }
-
-        return NULL;
-}
-
 #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 9a6c0704211c..149fdf6c5c56 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -248,9 +248,8 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 
         if (tree) {
                 spin_lock(&vmpr->sr_lock);
-                vmpr->tree_scanned += scanned;
+                scanned = vmpr->tree_scanned += scanned;
                 vmpr->tree_reclaimed += reclaimed;
-                scanned = vmpr->scanned;
                 spin_unlock(&vmpr->sr_lock);
 
                 if (scanned < vmpressure_win)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb3dd37ccd7c..71b1c29948db 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1443,7 +1443,7 @@ int isolate_lru_page(struct page *page)
         int ret = -EBUSY;
 
         VM_BUG_ON_PAGE(!page_count(page), page);
-        VM_BUG_ON_PAGE(PageTail(page), page);
+        WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
 
         if (PageLRU(page)) {
                 struct zone *zone = page_zone(page);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 40b2c74ddf16..084c6725b373 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1396,10 +1396,15 @@ static void vmstat_update(struct work_struct *w)
                  * Counters were updated so we expect more updates
                  * to occur in the future. Keep on running the
                  * update worker thread.
+                 * If we were marked on cpu_stat_off clear the flag
+                 * so that vmstat_shepherd doesn't schedule us again.
                  */
-                queue_delayed_work_on(smp_processor_id(), vmstat_wq,
-                        this_cpu_ptr(&vmstat_work),
-                        round_jiffies_relative(sysctl_stat_interval));
+                if (!cpumask_test_and_clear_cpu(smp_processor_id(),
+                                                cpu_stat_off)) {
+                        queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+                                this_cpu_ptr(&vmstat_work),
+                                round_jiffies_relative(sysctl_stat_interval));
+                }
         } else {
                 /*
                  * We did not update any counters so the app may be in
@@ -1417,18 +1422,6 @@ static void vmstat_update(struct work_struct *w)
  * until the diffs stay at zero. The function is used by NOHZ and can only be
  * invoked when tick processing is not active.
  */
-void quiet_vmstat(void)
-{
-        if (system_state != SYSTEM_RUNNING)
-                return;
-
-        do {
-                if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
-                        cancel_delayed_work(this_cpu_ptr(&vmstat_work));
-
-        } while (refresh_cpu_vm_stats(false));
-}
-
 /*
  * Check if the diffs for a certain cpu indicate that
  * an update is needed.
@@ -1452,6 +1445,30 @@ static bool need_update(int cpu)
         return false;
 }
 
+void quiet_vmstat(void)
+{
+        if (system_state != SYSTEM_RUNNING)
+                return;
+
+        /*
+         * If we are already in hands of the shepherd then there
+         * is nothing for us to do here.
+         */
+        if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+                return;
+
+        if (!need_update(smp_processor_id()))
+                return;
+
+        /*
+         * Just refresh counters and do not care about the pending delayed
+         * vmstat_update. It doesn't fire that often to matter and canceling
+         * it would be too expensive from this path.
+         * vmstat_shepherd will take care about that for us.
+         */
+        refresh_cpu_vm_stats(false);
+}
+
 
 /*
  * Shepherd worker thread that checks the
@@ -1469,18 +1486,25 @@ static void vmstat_shepherd(struct work_struct *w)
 
         get_online_cpus();
         /* Check processors whose vmstat worker threads have been disabled */
-        for_each_cpu(cpu, cpu_stat_off)
-                if (need_update(cpu) &&
-                        cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
-
-                        queue_delayed_work_on(cpu, vmstat_wq,
-                                &per_cpu(vmstat_work, cpu), 0);
+        for_each_cpu(cpu, cpu_stat_off) {
+                struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
+                if (need_update(cpu)) {
+                        if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+                                queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+                } else {
+                        /*
+                         * Cancel the work if quiet_vmstat has put this
+                         * cpu on cpu_stat_off because the work item might
+                         * be still scheduled
+                         */
+                        cancel_delayed_work(dw);
+                }
+        }
         put_online_cpus();
 
         schedule_delayed_work(&shepherd,
                 round_jiffies_relative(sysctl_stat_interval));
-
 }
 
 static void __init start_shepherd_timer(void)
@@ -1488,7 +1512,7 @@ static void __init start_shepherd_timer(void)
         int cpu;
 
         for_each_possible_cpu(cpu)
-                INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
+                INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
                         vmstat_update);
 
         if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
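
The vmstat rework above redefines who owns a CPU's deferred work item around the cpu_stat_off mask: quiet_vmstat() sets the bit and folds the counters once, vmstat_update() clears it when it keeps itself scheduled, and vmstat_shepherd() either requeues or cancels the (now deferrable) work. A much-simplified, single-threaded model of that set/clear handshake; the bool arrays and the pending_diffs counter are invented stand-ins for the cpumask, the delayed work and need_update(), not the kernel code:

    #include <stdio.h>
    #include <stdbool.h>

    #define NR_CPUS 4

    static bool cpu_stat_off[NR_CPUS];   /* toy stand-in for the cpumask */
    static bool work_queued[NR_CPUS];    /* toy stand-in for the delayed work */
    static int  pending_diffs[NR_CPUS];  /* nonzero => need_update() is true */

    /* test_and_set/test_and_clear semantics: return the old value, then write. */
    static bool test_and_set(bool *bit)   { bool old = *bit; *bit = true;  return old; }
    static bool test_and_clear(bool *bit) { bool old = *bit; *bit = false; return old; }

    /* quiet_vmstat(): hand the cpu over to the shepherd and fold counters once. */
    static void quiet_vmstat(int cpu)
    {
            if (test_and_set(&cpu_stat_off[cpu]))
                    return;                 /* shepherd already owns it */
            if (pending_diffs[cpu])
                    pending_diffs[cpu] = 0; /* models refresh_cpu_vm_stats() */
    }

    /* vmstat_shepherd(): requeue cpus that became active, cancel the rest. */
    static void vmstat_shepherd(void)
    {
            for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                    if (!cpu_stat_off[cpu])
                            continue;
                    if (pending_diffs[cpu]) {
                            if (test_and_clear(&cpu_stat_off[cpu]))
                                    work_queued[cpu] = true;
                    } else {
                            work_queued[cpu] = false; /* models cancel_delayed_work() */
                    }
            }
    }

    int main(void)
    {
            work_queued[1] = true;
            quiet_vmstat(1);          /* cpu1 goes idle with clean counters */
            pending_diffs[2] = 3;
            quiet_vmstat(2);          /* cpu2 goes idle, folds its diffs */
            pending_diffs[2] = 1;     /* ...but dirties them again */

            vmstat_shepherd();
            printf("cpu1: off=%d queued=%d\n", cpu_stat_off[1], work_queued[1]);
            printf("cpu2: off=%d queued=%d\n", cpu_stat_off[2], work_queued[2]);
            return 0;
    }
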