Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |  4
-rw-r--r--  mm/compaction.c     | 20
-rw-r--r--  mm/filemap.c        |  4
-rw-r--r--  mm/huge_memory.c    | 20
-rw-r--r--  mm/ksm.c            |  2
-rw-r--r--  mm/memcontrol.c     | 20
-rw-r--r--  mm/memory-failure.c |  8
-rw-r--r--  mm/memory.c         | 15
-rw-r--r--  mm/migrate.c        | 11
-rw-r--r--  mm/mprotect.c       | 25
-rw-r--r--  mm/page-writeback.c |  5
-rw-r--r--  mm/page_alloc.c     | 30
-rw-r--r--  mm/slub.c           | 38
-rw-r--r--  mm/swap.c           |  4
-rw-r--r--  mm/swap_state.c     | 63
-rw-r--r--  mm/swapfile.c       | 11
-rw-r--r--  mm/vmpressure.c     |  1
-rw-r--r--  mm/vmstat.c         |  4
18 files changed, 190 insertions(+), 95 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 2d9f1504d75e..2888024e0b0a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -575,5 +575,5 @@ config PGTABLE_MAPPING
           then you should select this. This causes zsmalloc to use page table
           mapping rather than copying for object mapping.
 
-          You can check speed with zsmalloc benchmark[1].
-          [1] https://github.com/spartacus06/zsmalloc
+          You can check speed with zsmalloc benchmark:
+          https://github.com/spartacus06/zsmapbench
diff --git a/mm/compaction.c b/mm/compaction.c
index b48c5259ea33..918577595ea8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -251,7 +251,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 {
         int nr_scanned = 0, total_isolated = 0;
         struct page *cursor, *valid_page = NULL;
-        unsigned long nr_strict_required = end_pfn - blockpfn;
         unsigned long flags;
         bool locked = false;
 
@@ -264,11 +263,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 
                 nr_scanned++;
                 if (!pfn_valid_within(blockpfn))
-                        continue;
+                        goto isolate_fail;
+
                 if (!valid_page)
                         valid_page = page;
                 if (!PageBuddy(page))
-                        continue;
+                        goto isolate_fail;
 
                 /*
                  * The zone lock must be held to isolate freepages.
@@ -289,12 +289,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 
                 /* Recheck this is a buddy page under lock */
                 if (!PageBuddy(page))
-                        continue;
+                        goto isolate_fail;
 
                 /* Found a free page, break it into order-0 pages */
                 isolated = split_free_page(page);
-                if (!isolated && strict)
-                        break;
                 total_isolated += isolated;
                 for (i = 0; i < isolated; i++) {
                         list_add(&page->lru, freelist);
@@ -305,7 +303,15 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
                 if (isolated) {
                         blockpfn += isolated - 1;
                         cursor += isolated - 1;
+                        continue;
                 }
+
+isolate_fail:
+                if (strict)
+                        break;
+                else
+                        continue;
+
         }
 
         trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
@@ -315,7 +321,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
          * pages requested were isolated. If there were any failures, 0 is
          * returned and CMA will fail.
          */
-        if (strict && nr_strict_required > total_isolated)
+        if (strict && blockpfn < end_pfn)
                 total_isolated = 0;
 
         if (locked)
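
The new isolate_fail label above merges two bail-out paths: strict (CMA) callers abort the scan on the first page that cannot be isolated, while regular compaction just skips it, and the final "strict && blockpfn < end_pfn" test replaces the old nr_strict_required counter. The standalone C sketch below mirrors only that control flow; is_free() is a made-up stand-in for the PageBuddy()/split_free_page() checks, not kernel code.

    #include <stdbool.h>
    #include <stdio.h>

    /* Made-up stand-in for the PageBuddy()/split_free_page() checks. */
    static bool is_free(int pfn)
    {
        return pfn % 3 != 0;        /* arbitrary pattern for the demo */
    }

    /* Scan [start, end); strict callers need every page, others take what they can. */
    static int isolate_range(int start, int end, bool strict)
    {
        int pfn, isolated = 0;

        for (pfn = start; pfn < end; pfn++) {
            if (!is_free(pfn))
                goto isolate_fail;
            isolated++;
            continue;

    isolate_fail:
            if (strict)
                break;
            else
                continue;
        }

        /* Same idea as the new check: a strict scan must cover the whole range. */
        if (strict && pfn < end)
            isolated = 0;

        return isolated;
    }

    int main(void)
    {
        printf("best-effort: %d\n", isolate_range(0, 10, false));
        printf("strict:      %d\n", isolate_range(0, 10, true));
        return 0;
    }
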
diff --git a/mm/filemap.c b/mm/filemap.c
index d56d3c145b9f..7a13f6ac5421 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2553,8 +2553,8 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
         if (ret > 0) {
                 ssize_t err;
 
-                err = generic_write_sync(file, pos, ret);
-                if (err < 0 && ret > 0)
+                err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+                if (err < 0)
                         ret = err;
         }
         return ret;
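
The generic_file_aio_write() fix reflects that iocb->ki_pos has already been advanced past the freshly written data by the time the sync runs, so the range to sync is [ki_pos - ret, ki_pos) rather than one based on the stale pos argument. A tiny sketch of that bookkeeping with made-up numbers; struct kiocb_like is not the kernel structure.

    #include <stdio.h>

    struct kiocb_like {
        long long ki_pos;           /* file position tracked by the iocb */
    };

    int main(void)
    {
        struct kiocb_like iocb = { .ki_pos = 4096 };
        long long ret = 1024;       /* bytes just written at offset 4096 */

        /* The write path advances ki_pos past the new data ... */
        iocb.ki_pos += ret;         /* ki_pos is now 5120 */

        /* ... so the region that still needs syncing starts at ki_pos - ret. */
        long long sync_start = iocb.ki_pos - ret;

        printf("sync range: [%lld, %lld)\n", sync_start, iocb.ki_pos);
        return 0;
    }
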
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 82166bf974e1..1546655a2d78 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1166,8 +1166,10 @@ alloc:
         } else {
                 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
                                 pmd, orig_pmd, page, haddr);
-                if (ret & VM_FAULT_OOM)
+                if (ret & VM_FAULT_OOM) {
                         split_huge_page(page);
+                        ret |= VM_FAULT_FALLBACK;
+                }
                 put_page(page);
         }
         count_vm_event(THP_FAULT_FALLBACK);
@@ -1179,9 +1181,10 @@ alloc:
                 if (page) {
                         split_huge_page(page);
                         put_page(page);
-                }
+                } else
+                        split_huge_page_pmd(vma, address, pmd);
+                ret |= VM_FAULT_FALLBACK;
                 count_vm_event(THP_FAULT_FALLBACK);
-                ret |= VM_FAULT_OOM;
                 goto out;
         }
 
@@ -1545,6 +1548,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                                 entry = pmd_mknonnuma(entry);
                         entry = pmd_modify(entry, newprot);
                         ret = HPAGE_PMD_NR;
+                        set_pmd_at(mm, addr, pmd, entry);
                         BUG_ON(pmd_write(entry));
                 } else {
                         struct page *page = pmd_page(*pmd);
@@ -1557,16 +1561,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                          */
                         if (!is_huge_zero_page(page) &&
                             !pmd_numa(*pmd)) {
-                                entry = *pmd;
-                                entry = pmd_mknuma(entry);
+                                pmdp_set_numa(mm, addr, pmd);
                                 ret = HPAGE_PMD_NR;
                         }
                 }
-
-                /* Set PMD if cleared earlier */
-                if (ret == HPAGE_PMD_NR)
-                        set_pmd_at(mm, addr, pmd, entry);
-
                 spin_unlock(ptl);
         }
 
@@ -1963,7 +1961,7 @@ out:
         return ret;
 }
 
-#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
 
 int hugepage_madvise(struct vm_area_struct *vma,
                      unsigned long *vm_flags, int advice)
diff --git a/mm/ksm.c b/mm/ksm.c
index aa4c7c7250c1..68710e80994a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item)
 static struct page *page_trans_compound_anon(struct page *page)
 {
         if (PageTransCompound(page)) {
-                struct page *head = compound_trans_head(page);
+                struct page *head = compound_head(page);
                 /*
                  * head may actually be splitted and freed from under
                  * us but it's ok here.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53385cd4e6f0..5b6b0039f725 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1127,8 +1127,8 @@ skip_node:
          * skipping css reference should be safe.
          */
         if (next_css) {
-                if ((next_css->flags & CSS_ONLINE) &&
-                    (next_css == &root->css || css_tryget(next_css)))
+                if ((next_css == &root->css) ||
+                    ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)))
                         return mem_cgroup_from_css(next_css);
 
                 prev_css = next_css;
@@ -1687,7 +1687,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
          * protects memcg_name and makes sure that parallel ooms do not
          * interleave
          */
-        static DEFINE_SPINLOCK(oom_info_lock);
+        static DEFINE_MUTEX(oom_info_lock);
         struct cgroup *task_cgrp;
         struct cgroup *mem_cgrp;
         static char memcg_name[PATH_MAX];
@@ -1698,7 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
         if (!p)
                 return;
 
-        spin_lock(&oom_info_lock);
+        mutex_lock(&oom_info_lock);
         rcu_read_lock();
 
         mem_cgrp = memcg->css.cgroup;
@@ -1767,7 +1767,7 @@ done:
 
                 pr_cont("\n");
         }
-        spin_unlock(&oom_info_lock);
+        mutex_unlock(&oom_info_lock);
 }
 
 /*
@@ -6595,6 +6595,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
         struct mem_cgroup_event *event, *tmp;
+        struct cgroup_subsys_state *iter;
 
         /*
          * Unregister events and notify userspace.
@@ -6611,7 +6612,14 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
         kmem_cgroup_css_offline(memcg);
 
         mem_cgroup_invalidate_reclaim_iterators(memcg);
-        mem_cgroup_reparent_charges(memcg);
+
+        /*
+         * This requires that offlining is serialized. Right now that is
+         * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
+         */
+        css_for_each_descendant_post(iter, css)
+                mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
+
         mem_cgroup_destroy_all_caches(memcg);
         vmpressure_cleanup(&memcg->vmpressure);
 }
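
Turning oom_info_lock into a mutex matches what it protects: a long, multi-line report printed from a context that may sleep, where the only requirement is that parallel reports do not interleave. A rough userspace analogue with pthreads, serializing a multi-line dump the same way; this is not the memcg code itself.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t report_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Print a multi-line report without interleaving with other reporters. */
    static void *print_report(void *arg)
    {
        int id = *(int *)arg;

        pthread_mutex_lock(&report_lock);
        printf("report %d: header\n", id);
        printf("report %d: usage details\n", id);
        printf("report %d: footer\n", id);
        pthread_mutex_unlock(&report_lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[2];
        int ids[2] = { 1, 2 };

        for (int i = 0; i < 2; i++)
            pthread_create(&t[i], NULL, print_report, &ids[i]);
        for (int i = 0; i < 2; i++)
            pthread_join(t[i], NULL);
        return 0;
    }
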
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4f08a2d61487..90002ea43638 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -945,8 +945,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
          * to it. Similarly, page lock is shifted.
          */
         if (hpage != p) {
-                put_page(hpage);
-                get_page(p);
+                if (!(flags & MF_COUNT_INCREASED)) {
+                        put_page(hpage);
+                        get_page(p);
+                }
                 lock_page(p);
                 unlock_page(hpage);
                 *hpagep = p;
@@ -1649,7 +1651,7 @@ int soft_offline_page(struct page *page, int flags)
 {
         int ret;
         unsigned long pfn = page_to_pfn(page);
-        struct page *hpage = compound_trans_head(page);
+        struct page *hpage = compound_head(page);
 
         if (PageHWPoison(page)) {
                 pr_info("soft offline: %#lx page already poisoned\n", pfn);
diff --git a/mm/memory.c b/mm/memory.c
index be6a0c0d4ae0..22dfa617bddb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3348,6 +3348,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 if (ret & VM_FAULT_LOCKED)
                         unlock_page(vmf.page);
                 ret = VM_FAULT_HWPOISON;
+                page_cache_release(vmf.page);
                 goto uncharge_out;
         }
 
@@ -3703,7 +3704,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         if (unlikely(is_vm_hugetlb_page(vma)))
                 return hugetlb_fault(mm, vma, address, flags);
 
-retry:
         pgd = pgd_offset(mm, address);
         pud = pud_alloc(mm, pgd, address);
         if (!pud)
@@ -3741,20 +3741,13 @@ retry:
                 if (dirty && !pmd_write(orig_pmd)) {
                         ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
                                                   orig_pmd);
-                        /*
-                         * If COW results in an oom, the huge pmd will
-                         * have been split, so retry the fault on the
-                         * pte for a smaller charge.
-                         */
-                        if (unlikely(ret & VM_FAULT_OOM))
-                                goto retry;
-                        return ret;
+                        if (!(ret & VM_FAULT_FALLBACK))
+                                return ret;
                 } else {
                         huge_pmd_set_accessed(mm, vma, address, pmd,
                                               orig_pmd, dirty);
+                        return 0;
                 }
-
-                return 0;
         }
 }
 
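
With the retry label gone, __handle_mm_fault() relies on do_huge_pmd_wp_page() reporting VM_FAULT_FALLBACK: when the huge path cannot complete, the caller simply falls through to the normal pte path instead of looping. A compact sketch of that flag-based dispatch follows; the handler names and behavior are hypothetical, only the VM_FAULT_FALLBACK convention is taken from the patch.

    #include <stdio.h>

    #define VM_FAULT_FALLBACK 0x0800    /* illustrative value */

    /* Hypothetical huge-page handler: may ask the caller to fall back. */
    static int handle_huge_fault(int can_use_huge_page)
    {
        if (!can_use_huge_page)
            return VM_FAULT_FALLBACK;
        return 0;                       /* handled as a huge page */
    }

    /* Hypothetical small-page handler. */
    static int handle_pte_fault(void)
    {
        return 0;                       /* handled as a normal page */
    }

    static int handle_fault(int can_use_huge_page)
    {
        int ret = handle_huge_fault(can_use_huge_page);

        /* Fall through to the pte path only when explicitly asked to. */
        if (!(ret & VM_FAULT_FALLBACK))
            return ret;
        return handle_pte_fault();
    }

    int main(void)
    {
        printf("huge path:     %d\n", handle_fault(1));
        printf("fallback path: %d\n", handle_fault(0));
        return 0;
    }
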
diff --git a/mm/migrate.c b/mm/migrate.c
index 482a33d89134..b494fdb9a636 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1158,7 +1158,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
                                         pm->node);
         else
                 return alloc_pages_exact_node(pm->node,
-                                GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
+                                GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
 }
 
 /*
@@ -1544,9 +1544,9 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
         struct page *newpage;
 
         newpage = alloc_pages_exact_node(nid,
-                                         (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
-                                          __GFP_NOMEMALLOC | __GFP_NORETRY |
-                                          __GFP_NOWARN) &
+                                         (GFP_HIGHUSER_MOVABLE |
+                                          __GFP_THISNODE | __GFP_NOMEMALLOC |
+                                          __GFP_NORETRY | __GFP_NOWARN) &
                                          ~GFP_IOFS, 0);
 
         return newpage;
@@ -1747,7 +1747,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                 goto out_dropref;
 
         new_page = alloc_pages_node(node,
-                (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+                (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
+                HPAGE_PMD_ORDER);
         if (!new_page)
                 goto out_fail;
 
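
The migrate.c hunks swap the composite GFP_THISNODE (which, with NUMA enabled, is __GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN) for the single __GFP_THISNODE bit, so the allocations still stay on the target node but no longer look like the special GFP_THISNODE class that the mm/page_alloc.c changes below test for. A small bitmask sketch with stand-in values; the real definitions live in include/linux/gfp.h.

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in bit values; not the real gfp.h constants. */
    #define __GFP_THISNODE  0x01u
    #define __GFP_NORETRY   0x02u
    #define __GFP_NOWARN    0x04u
    #define GFP_THISNODE    (__GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN)

    /* Testing for a composite class requires every one of its bits. */
    static bool gfp_thisnode_allocation(unsigned int gfp_mask)
    {
        return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
    }

    int main(void)
    {
        printf("__GFP_THISNODE alone   -> %d\n",
               gfp_thisnode_allocation(__GFP_THISNODE));    /* 0: not the class */
        printf("full GFP_THISNODE mask -> %d\n",
               gfp_thisnode_allocation(GFP_THISNODE));      /* 1: the class */
        return 0;
    }
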
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 7332c1785744..769a67a15803 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -58,36 +58,27 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                 if (pte_numa(ptent))
                                         ptent = pte_mknonnuma(ptent);
                                 ptent = pte_modify(ptent, newprot);
+                                /*
+                                 * Avoid taking write faults for pages we
+                                 * know to be dirty.
+                                 */
+                                if (dirty_accountable && pte_dirty(ptent))
+                                        ptent = pte_mkwrite(ptent);
+                                ptep_modify_prot_commit(mm, addr, pte, ptent);
                                 updated = true;
                         } else {
                                 struct page *page;
 
-                                ptent = *pte;
                                 page = vm_normal_page(vma, addr, oldpte);
                                 if (page && !PageKsm(page)) {
                                         if (!pte_numa(oldpte)) {
-                                                ptent = pte_mknuma(ptent);
-                                                set_pte_at(mm, addr, pte, ptent);
+                                                ptep_set_numa(mm, addr, pte);
                                                 updated = true;
                                         }
                                 }
                         }
-
-                        /*
-                         * Avoid taking write faults for pages we know to be
-                         * dirty.
-                         */
-                        if (dirty_accountable && pte_dirty(ptent)) {
-                                ptent = pte_mkwrite(ptent);
-                                updated = true;
-                        }
-
                         if (updated)
                                 pages++;
-
-                        /* Only !prot_numa always clears the pte */
-                        if (!prot_numa)
-                                ptep_modify_prot_commit(mm, addr, pte, ptent);
                 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
                         swp_entry_t entry = pte_to_swp_entry(oldpte);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2d30e2cfe804..7106cb1aca8e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2173,11 +2173,12 @@ int __set_page_dirty_nobuffers(struct page *page)
         if (!TestSetPageDirty(page)) {
                 struct address_space *mapping = page_mapping(page);
                 struct address_space *mapping2;
+                unsigned long flags;
 
                 if (!mapping)
                         return 1;
 
-                spin_lock_irq(&mapping->tree_lock);
+                spin_lock_irqsave(&mapping->tree_lock, flags);
                 mapping2 = page_mapping(page);
                 if (mapping2) { /* Race with truncate? */
                         BUG_ON(mapping2 != mapping);
@@ -2186,7 +2187,7 @@ int __set_page_dirty_nobuffers(struct page *page)
                         radix_tree_tag_set(&mapping->page_tree,
                                 page_index(page), PAGECACHE_TAG_DIRTY);
                 }
-                spin_unlock_irq(&mapping->tree_lock);
+                spin_unlock_irqrestore(&mapping->tree_lock, flags);
                 if (mapping->host) {
                         /* !PageAnon && !swapper_space */
                         __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e3758a09a009..3bac76ae4b30 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order)
         __SetPageHead(page);
         for (i = 1; i < nr_pages; i++) {
                 struct page *p = page + i;
-                __SetPageTail(p);
                 set_page_count(p, 0);
                 p->first_page = page;
+                /* Make sure p->first_page is always valid for PageTail() */
+                smp_wmb();
+                __SetPageTail(p);
         }
 }
 
@@ -1236,6 +1238,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
         }
         local_irq_restore(flags);
 }
+static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+{
+        return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
+}
+#else
+static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+{
+        return false;
+}
 #endif
 
 /*
@@ -1572,7 +1583,13 @@ again:
                                           get_pageblock_migratetype(page));
         }
 
-        __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+        /*
+         * NOTE: GFP_THISNODE allocations do not partake in the kswapd
+         * aging protocol, so they can't be fair.
+         */
+        if (!gfp_thisnode_allocation(gfp_flags))
+                __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+
         __count_zone_vm_events(PGALLOC, zone, 1 << order);
         zone_statistics(preferred_zone, zone, gfp_flags);
         local_irq_restore(flags);
@@ -1944,8 +1961,12 @@ zonelist_scan:
                  * ultimately fall back to remote zones that do not
                  * partake in the fairness round-robin cycle of this
                  * zonelist.
+                 *
+                 * NOTE: GFP_THISNODE allocations do not partake in
+                 * the kswapd aging protocol, so they can't be fair.
                  */
-                if (alloc_flags & ALLOC_WMARK_LOW) {
+                if ((alloc_flags & ALLOC_WMARK_LOW) &&
+                    !gfp_thisnode_allocation(gfp_mask)) {
                         if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
                                 continue;
                         if (!zone_local(preferred_zone, zone))
@@ -2501,8 +2522,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
          * allowed per node queues are empty and that nodes are
          * over allocated.
          */
-        if (IS_ENABLED(CONFIG_NUMA) &&
-            (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+        if (gfp_thisnode_allocation(gfp_mask))
                 goto nopage;
 
 restart:
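
The prep_compound_page() hunk publishes p->first_page before the tail flag becomes visible, so any PageTail() observer that sees the flag is guaranteed to see a valid first_page as well. Below is a userspace analogue of that publish/observe pattern using C11 release/acquire atomics in place of smp_wmb(); it is a sketch, not the kernel primitives.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct fake_page {
        struct fake_page *first_page;   /* payload, published first */
        atomic_bool tail;               /* "PageTail" flag, published last */
    };

    static struct fake_page head;
    static struct fake_page tail_page;

    static void *writer(void *arg)
    {
        (void)arg;
        tail_page.first_page = &head;
        /* Release store plays the role of smp_wmb() + __SetPageTail(). */
        atomic_store_explicit(&tail_page.tail, true, memory_order_release);
        return NULL;
    }

    static void *reader(void *arg)
    {
        (void)arg;
        /* If the flag is visible, first_page is guaranteed to be valid. */
        if (atomic_load_explicit(&tail_page.tail, memory_order_acquire))
            printf("tail page, first_page=%p\n", (void *)tail_page.first_page);
        else
            printf("not a tail page (yet)\n");
        return NULL;
    }

    int main(void)
    {
        pthread_t w, r;

        pthread_create(&w, NULL, writer, NULL);
        pthread_create(&r, NULL, reader, NULL);
        pthread_join(w, NULL);
        pthread_join(r, NULL);
        return 0;
    }
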
diff --git a/mm/slub.c b/mm/slub.c
index 7e3e0458bce4..25f14ad8f817 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1004,21 +1004,19 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
 static void add_full(struct kmem_cache *s,
         struct kmem_cache_node *n, struct page *page)
 {
-        lockdep_assert_held(&n->list_lock);
-
         if (!(s->flags & SLAB_STORE_USER))
                 return;
 
+        lockdep_assert_held(&n->list_lock);
         list_add(&page->lru, &n->full);
 }
 
 static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
 {
-        lockdep_assert_held(&n->list_lock);
-
         if (!(s->flags & SLAB_STORE_USER))
                 return;
 
+        lockdep_assert_held(&n->list_lock);
         list_del(&page->lru);
 }
 
@@ -1520,11 +1518,9 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
 /*
  * Management of partially allocated slabs.
  */
-static inline void add_partial(struct kmem_cache_node *n,
-                                struct page *page, int tail)
+static inline void
+__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
 {
-        lockdep_assert_held(&n->list_lock);
-
         n->nr_partial++;
         if (tail == DEACTIVATE_TO_TAIL)
                 list_add_tail(&page->lru, &n->partial);
@@ -1532,15 +1528,27 @@ static inline void add_partial(struct kmem_cache_node *n,
                 list_add(&page->lru, &n->partial);
 }
 
-static inline void remove_partial(struct kmem_cache_node *n,
-                                        struct page *page)
+static inline void add_partial(struct kmem_cache_node *n,
+                                struct page *page, int tail)
 {
         lockdep_assert_held(&n->list_lock);
+        __add_partial(n, page, tail);
+}
 
+static inline void
+__remove_partial(struct kmem_cache_node *n, struct page *page)
+{
         list_del(&page->lru);
         n->nr_partial--;
 }
 
+static inline void remove_partial(struct kmem_cache_node *n,
+                                        struct page *page)
+{
+        lockdep_assert_held(&n->list_lock);
+        __remove_partial(n, page);
+}
+
 /*
  * Remove slab from the partial list, freeze it and
  * return the pointer to the freelist.
@@ -2906,12 +2914,10 @@ static void early_kmem_cache_node_alloc(int node)
         inc_slabs_node(kmem_cache_node, node, page->objects);
 
         /*
-         * the lock is for lockdep's sake, not for any actual
-         * race protection
+         * No locks need to be taken here as it has just been
+         * initialized and there is no concurrent access.
          */
-        spin_lock(&n->list_lock);
-        add_partial(n, page, DEACTIVATE_TO_HEAD);
-        spin_unlock(&n->list_lock);
+        __add_partial(n, page, DEACTIVATE_TO_HEAD);
 }
 
 static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -3197,7 +3203,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 
         list_for_each_entry_safe(page, h, &n->partial, lru) {
                 if (!page->inuse) {
-                        remove_partial(n, page);
+                        __remove_partial(n, page);
                         discard_slab(s, page);
                 } else {
                         list_slab_objects(s, page,
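
The slub.c split into add_partial()/__add_partial() (with a matching remove_partial()/__remove_partial() pair) keeps the lockdep assertion on the normal paths while letting init- and teardown-time callers such as early_kmem_cache_node_alloc() and free_partial(), which cannot race, skip the lock entirely. A toy illustration of that locked-wrapper/unlocked-helper layering, with an explicit flag standing in for lockdep; this is not the SLUB code.

    #include <assert.h>
    #include <pthread.h>
    #include <stdio.h>

    /* Toy node: a counter guarded by list_lock once the node is published. */
    struct node {
        pthread_mutex_t list_lock;
        int lock_held;              /* debug flag standing in for lockdep */
        int nr_partial;
    };

    /* Inner helper: no locking assumptions, for callers that cannot race. */
    static void add_partial_nolock(struct node *n)
    {
        n->nr_partial++;
    }

    /* Normal path: the caller must already hold list_lock. */
    static void add_partial(struct node *n)
    {
        assert(n->lock_held);
        add_partial_nolock(n);
    }

    int main(void)
    {
        struct node n = { .list_lock = PTHREAD_MUTEX_INITIALIZER };

        /* Early init: nobody else can see the node, so skip the lock. */
        add_partial_nolock(&n);

        /* Regular use: take the lock, then call the asserting wrapper. */
        pthread_mutex_lock(&n.list_lock);
        n.lock_held = 1;
        add_partial(&n);
        n.lock_held = 0;
        pthread_mutex_unlock(&n.list_lock);

        printf("nr_partial = %d\n", n.nr_partial);
        return 0;
    }
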
diff --git a/mm/swap.c b/mm/swap.c
index b31ba67d440a..0092097b3f4c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -98,7 +98,7 @@ static void put_compound_page(struct page *page)
         }
 
         /* __split_huge_page_refcount can run under us */
-        page_head = compound_trans_head(page);
+        page_head = compound_head(page);
 
         /*
          * THP can not break up slab pages so avoid taking
@@ -253,7 +253,7 @@ bool __get_page_tail(struct page *page)
                  */
                 unsigned long flags;
                 bool got;
-                struct page *page_head = compound_trans_head(page);
+                struct page *page_head = compound_head(page);
 
                 /* Ref to put_compound_page() comment. */
                 if (!__compound_tail_refcounted(page_head)) {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 98e85e9c2b2d..e76ace30d436 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void)
         return ret;
 }
 
+static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
+
 void show_swap_cache_info(void)
 {
         printk("%lu pages in swap cache\n", total_swapcache_pages());
@@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry)
 
         page = find_get_page(swap_address_space(entry), entry.val);
 
-        if (page)
+        if (page) {
                 INC_CACHE_INFO(find_success);
+                if (TestClearPageReadahead(page))
+                        atomic_inc(&swapin_readahead_hits);
+        }
 
         INC_CACHE_INFO(find_total);
         return page;
@@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
         return found_page;
 }
 
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+        static unsigned long prev_offset;
+        unsigned int pages, max_pages, last_ra;
+        static atomic_t last_readahead_pages;
+
+        max_pages = 1 << ACCESS_ONCE(page_cluster);
+        if (max_pages <= 1)
+                return 1;
+
+        /*
+         * This heuristic has been found to work well on both sequential and
+         * random loads, swapping to hard disk or to SSD: please don't ask
+         * what the "+ 2" means, it just happens to work well, that's all.
+         */
+        pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
+        if (pages == 2) {
+                /*
+                 * We can have no readahead hits to judge by: but must not get
+                 * stuck here forever, so check for an adjacent offset instead
+                 * (and don't even bother to check whether swap type is same).
+                 */
+                if (offset != prev_offset + 1 && offset != prev_offset - 1)
+                        pages = 1;
+                prev_offset = offset;
+        } else {
+                unsigned int roundup = 4;
+                while (roundup < pages)
+                        roundup <<= 1;
+                pages = roundup;
+        }
+
+        if (pages > max_pages)
+                pages = max_pages;
+
+        /* Don't shrink readahead too fast */
+        last_ra = atomic_read(&last_readahead_pages) / 2;
+        if (pages < last_ra)
+                pages = last_ra;
+        atomic_set(&last_readahead_pages, pages);
+
+        return pages;
+}
+
 /**
  * swapin_readahead - swap in pages in hope we need them soon
  * @entry: swap entry of this memory
@@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
                         struct vm_area_struct *vma, unsigned long addr)
 {
         struct page *page;
-        unsigned long offset = swp_offset(entry);
+        unsigned long entry_offset = swp_offset(entry);
+        unsigned long offset = entry_offset;
         unsigned long start_offset, end_offset;
-        unsigned long mask = (1UL << page_cluster) - 1;
+        unsigned long mask;
         struct blk_plug plug;
 
+        mask = swapin_nr_pages(offset) - 1;
+        if (!mask)
+                goto skip;
+
         /* Read a page_cluster sized and aligned cluster around offset. */
         start_offset = offset & ~mask;
         end_offset = offset | mask;
@@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
                                                 gfp_mask, vma, addr);
                 if (!page)
                         continue;
+                if (offset != entry_offset)
+                        SetPageReadahead(page);
                 page_cache_release(page);
         }
         blk_finish_plug(&plug);
 
         lru_add_drain();        /* Push any new pages onto the LRU now */
+skip:
         return read_swap_cache_async(entry, gfp_mask, vma, addr);
 }
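
swapin_nr_pages() above turns the recent readahead hit count into a window: hits + 2, rounded up to a power of two, clamped to 1 << page_cluster, and never allowed to drop below half of the previous window. The sketch below replays just that sizing arithmetic for a range of hit counts; the atomics, the hits == 0 adjacency check, and the static state are left out.

    #include <stdio.h>

    /*
     * Window sizing only: 'hits' is what the kernel reads from
     * swapin_readahead_hits, 'last_ra' is the previous window and
     * 'max_pages' is 1 << page_cluster.
     */
    static unsigned int window_size(unsigned int hits, unsigned int last_ra,
                                    unsigned int max_pages)
    {
        unsigned int pages = hits + 2;

        if (pages > 2) {
            unsigned int roundup = 4;

            while (roundup < pages)
                roundup <<= 1;
            pages = roundup;
        }

        if (pages > max_pages)
            pages = max_pages;

        /* Don't shrink readahead too fast. */
        if (pages < last_ra / 2)
            pages = last_ra / 2;

        return pages;
    }

    int main(void)
    {
        unsigned int hits;

        /* page_cluster = 3 -> at most 8 pages; previous window was 8. */
        for (hits = 0; hits <= 6; hits++)
            printf("hits=%u -> window=%u pages\n", hits,
                   window_size(hits, 8, 8));
        return 0;
    }
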
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c6c13b050a58..4a7f7e6992b6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1923,7 +1923,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         p->swap_map = NULL;
         cluster_info = p->cluster_info;
         p->cluster_info = NULL;
-        p->flags = 0;
         frontswap_map = frontswap_map_get(p);
         spin_unlock(&p->lock);
         spin_unlock(&swap_lock);
@@ -1949,6 +1948,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                 mutex_unlock(&inode->i_mutex);
         }
         filp_close(swap_file, NULL);
+
+        /*
+         * Clear the SWP_USED flag after all resources are freed so that swapon
+         * can reuse this swap_info in alloc_swap_info() safely. It is ok to
+         * not hold p->lock after we cleared its SWP_WRITEOK.
+         */
+        spin_lock(&swap_lock);
+        p->flags = 0;
+        spin_unlock(&swap_lock);
+
         err = 0;
         atomic_inc(&proc_poll_event);
         wake_up_interruptible(&proc_poll_wait);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 196970a4541f..d4042e75f7c7 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -19,6 +19,7 @@
 #include <linux/mm.h>
 #include <linux/vmstat.h>
 #include <linux/eventfd.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/printk.h>
 #include <linux/vmpressure.h>
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 72496140ac08..def5dd2fbe61 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -851,12 +851,14 @@ const char * const vmstat_text[] = {
         "thp_zero_page_alloc",
         "thp_zero_page_alloc_failed",
 #endif
+#ifdef CONFIG_DEBUG_TLBFLUSH
 #ifdef CONFIG_SMP
         "nr_tlb_remote_flush",
         "nr_tlb_remote_flush_received",
-#endif
+#endif /* CONFIG_SMP */
         "nr_tlb_local_flush_all",
         "nr_tlb_local_flush_one",
+#endif /* CONFIG_DEBUG_TLBFLUSH */
 
 #endif /* CONFIG_VM_EVENTS_COUNTERS */
 };