aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c4
-rw-r--r--mm/huge_memory.c20
-rw-r--r--mm/ksm.c2
-rw-r--r--mm/memcontrol.c20
-rw-r--r--mm/memory-failure.c8
-rw-r--r--mm/memory.c15
-rw-r--r--mm/mprotect.c25
-rw-r--r--mm/page-writeback.c5
-rw-r--r--mm/page_alloc.c30
-rw-r--r--mm/slub.c38
-rw-r--r--mm/swap.c4
-rw-r--r--mm/swap_state.c63
-rw-r--r--mm/swapfile.c11
-rw-r--r--mm/vmpressure.c1
-rw-r--r--mm/vmstat.c4
15 files changed, 169 insertions, 81 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index d56d3c145b9f..7a13f6ac5421 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2553,8 +2553,8 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2553 if (ret > 0) { 2553 if (ret > 0) {
2554 ssize_t err; 2554 ssize_t err;
2555 2555
2556 err = generic_write_sync(file, pos, ret); 2556 err = generic_write_sync(file, iocb->ki_pos - ret, ret);
2557 if (err < 0 && ret > 0) 2557 if (err < 0)
2558 ret = err; 2558 ret = err;
2559 } 2559 }
2560 return ret; 2560 return ret;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 82166bf974e1..1546655a2d78 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1166,8 +1166,10 @@ alloc:
1166 } else { 1166 } else {
1167 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 1167 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
1168 pmd, orig_pmd, page, haddr); 1168 pmd, orig_pmd, page, haddr);
1169 if (ret & VM_FAULT_OOM) 1169 if (ret & VM_FAULT_OOM) {
1170 split_huge_page(page); 1170 split_huge_page(page);
1171 ret |= VM_FAULT_FALLBACK;
1172 }
1171 put_page(page); 1173 put_page(page);
1172 } 1174 }
1173 count_vm_event(THP_FAULT_FALLBACK); 1175 count_vm_event(THP_FAULT_FALLBACK);
@@ -1179,9 +1181,10 @@ alloc:
1179 if (page) { 1181 if (page) {
1180 split_huge_page(page); 1182 split_huge_page(page);
1181 put_page(page); 1183 put_page(page);
1182 } 1184 } else
1185 split_huge_page_pmd(vma, address, pmd);
1186 ret |= VM_FAULT_FALLBACK;
1183 count_vm_event(THP_FAULT_FALLBACK); 1187 count_vm_event(THP_FAULT_FALLBACK);
1184 ret |= VM_FAULT_OOM;
1185 goto out; 1188 goto out;
1186 } 1189 }
1187 1190
@@ -1545,6 +1548,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1545 entry = pmd_mknonnuma(entry); 1548 entry = pmd_mknonnuma(entry);
1546 entry = pmd_modify(entry, newprot); 1549 entry = pmd_modify(entry, newprot);
1547 ret = HPAGE_PMD_NR; 1550 ret = HPAGE_PMD_NR;
1551 set_pmd_at(mm, addr, pmd, entry);
1548 BUG_ON(pmd_write(entry)); 1552 BUG_ON(pmd_write(entry));
1549 } else { 1553 } else {
1550 struct page *page = pmd_page(*pmd); 1554 struct page *page = pmd_page(*pmd);
@@ -1557,16 +1561,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1557 */ 1561 */
1558 if (!is_huge_zero_page(page) && 1562 if (!is_huge_zero_page(page) &&
1559 !pmd_numa(*pmd)) { 1563 !pmd_numa(*pmd)) {
1560 entry = *pmd; 1564 pmdp_set_numa(mm, addr, pmd);
1561 entry = pmd_mknuma(entry);
1562 ret = HPAGE_PMD_NR; 1565 ret = HPAGE_PMD_NR;
1563 } 1566 }
1564 } 1567 }
1565
1566 /* Set PMD if cleared earlier */
1567 if (ret == HPAGE_PMD_NR)
1568 set_pmd_at(mm, addr, pmd, entry);
1569
1570 spin_unlock(ptl); 1568 spin_unlock(ptl);
1571 } 1569 }
1572 1570
@@ -1963,7 +1961,7 @@ out:
1963 return ret; 1961 return ret;
1964} 1962}
1965 1963
1966#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) 1964#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
1967 1965
1968int hugepage_madvise(struct vm_area_struct *vma, 1966int hugepage_madvise(struct vm_area_struct *vma,
1969 unsigned long *vm_flags, int advice) 1967 unsigned long *vm_flags, int advice)
diff --git a/mm/ksm.c b/mm/ksm.c
index aa4c7c7250c1..68710e80994a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item)
444static struct page *page_trans_compound_anon(struct page *page) 444static struct page *page_trans_compound_anon(struct page *page)
445{ 445{
446 if (PageTransCompound(page)) { 446 if (PageTransCompound(page)) {
447 struct page *head = compound_trans_head(page); 447 struct page *head = compound_head(page);
448 /* 448 /*
449 * head may actually be splitted and freed from under 449 * head may actually be splitted and freed from under
450 * us but it's ok here. 450 * us but it's ok here.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53385cd4e6f0..5b6b0039f725 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1127,8 +1127,8 @@ skip_node:
1127 * skipping css reference should be safe. 1127 * skipping css reference should be safe.
1128 */ 1128 */
1129 if (next_css) { 1129 if (next_css) {
1130 if ((next_css->flags & CSS_ONLINE) && 1130 if ((next_css == &root->css) ||
1131 (next_css == &root->css || css_tryget(next_css))) 1131 ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)))
1132 return mem_cgroup_from_css(next_css); 1132 return mem_cgroup_from_css(next_css);
1133 1133
1134 prev_css = next_css; 1134 prev_css = next_css;
@@ -1687,7 +1687,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1687 * protects memcg_name and makes sure that parallel ooms do not 1687 * protects memcg_name and makes sure that parallel ooms do not
1688 * interleave 1688 * interleave
1689 */ 1689 */
1690 static DEFINE_SPINLOCK(oom_info_lock); 1690 static DEFINE_MUTEX(oom_info_lock);
1691 struct cgroup *task_cgrp; 1691 struct cgroup *task_cgrp;
1692 struct cgroup *mem_cgrp; 1692 struct cgroup *mem_cgrp;
1693 static char memcg_name[PATH_MAX]; 1693 static char memcg_name[PATH_MAX];
@@ -1698,7 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1698 if (!p) 1698 if (!p)
1699 return; 1699 return;
1700 1700
1701 spin_lock(&oom_info_lock); 1701 mutex_lock(&oom_info_lock);
1702 rcu_read_lock(); 1702 rcu_read_lock();
1703 1703
1704 mem_cgrp = memcg->css.cgroup; 1704 mem_cgrp = memcg->css.cgroup;
@@ -1767,7 +1767,7 @@ done:
1767 1767
1768 pr_cont("\n"); 1768 pr_cont("\n");
1769 } 1769 }
1770 spin_unlock(&oom_info_lock); 1770 mutex_unlock(&oom_info_lock);
1771} 1771}
1772 1772
1773/* 1773/*
@@ -6595,6 +6595,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6595{ 6595{
6596 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6596 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6597 struct mem_cgroup_event *event, *tmp; 6597 struct mem_cgroup_event *event, *tmp;
6598 struct cgroup_subsys_state *iter;
6598 6599
6599 /* 6600 /*
6600 * Unregister events and notify userspace. 6601 * Unregister events and notify userspace.
@@ -6611,7 +6612,14 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6611 kmem_cgroup_css_offline(memcg); 6612 kmem_cgroup_css_offline(memcg);
6612 6613
6613 mem_cgroup_invalidate_reclaim_iterators(memcg); 6614 mem_cgroup_invalidate_reclaim_iterators(memcg);
6614 mem_cgroup_reparent_charges(memcg); 6615
6616 /*
6617 * This requires that offlining is serialized. Right now that is
6618 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
6619 */
6620 css_for_each_descendant_post(iter, css)
6621 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
6622
6615 mem_cgroup_destroy_all_caches(memcg); 6623 mem_cgroup_destroy_all_caches(memcg);
6616 vmpressure_cleanup(&memcg->vmpressure); 6624 vmpressure_cleanup(&memcg->vmpressure);
6617} 6625}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4f08a2d61487..90002ea43638 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -945,8 +945,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
945 * to it. Similarly, page lock is shifted. 945 * to it. Similarly, page lock is shifted.
946 */ 946 */
947 if (hpage != p) { 947 if (hpage != p) {
948 put_page(hpage); 948 if (!(flags & MF_COUNT_INCREASED)) {
949 get_page(p); 949 put_page(hpage);
950 get_page(p);
951 }
950 lock_page(p); 952 lock_page(p);
951 unlock_page(hpage); 953 unlock_page(hpage);
952 *hpagep = p; 954 *hpagep = p;
@@ -1649,7 +1651,7 @@ int soft_offline_page(struct page *page, int flags)
1649{ 1651{
1650 int ret; 1652 int ret;
1651 unsigned long pfn = page_to_pfn(page); 1653 unsigned long pfn = page_to_pfn(page);
1652 struct page *hpage = compound_trans_head(page); 1654 struct page *hpage = compound_head(page);
1653 1655
1654 if (PageHWPoison(page)) { 1656 if (PageHWPoison(page)) {
1655 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1657 pr_info("soft offline: %#lx page already poisoned\n", pfn);
diff --git a/mm/memory.c b/mm/memory.c
index be6a0c0d4ae0..22dfa617bddb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3348,6 +3348,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3348 if (ret & VM_FAULT_LOCKED) 3348 if (ret & VM_FAULT_LOCKED)
3349 unlock_page(vmf.page); 3349 unlock_page(vmf.page);
3350 ret = VM_FAULT_HWPOISON; 3350 ret = VM_FAULT_HWPOISON;
3351 page_cache_release(vmf.page);
3351 goto uncharge_out; 3352 goto uncharge_out;
3352 } 3353 }
3353 3354
@@ -3703,7 +3704,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3703 if (unlikely(is_vm_hugetlb_page(vma))) 3704 if (unlikely(is_vm_hugetlb_page(vma)))
3704 return hugetlb_fault(mm, vma, address, flags); 3705 return hugetlb_fault(mm, vma, address, flags);
3705 3706
3706retry:
3707 pgd = pgd_offset(mm, address); 3707 pgd = pgd_offset(mm, address);
3708 pud = pud_alloc(mm, pgd, address); 3708 pud = pud_alloc(mm, pgd, address);
3709 if (!pud) 3709 if (!pud)
@@ -3741,20 +3741,13 @@ retry:
3741 if (dirty && !pmd_write(orig_pmd)) { 3741 if (dirty && !pmd_write(orig_pmd)) {
3742 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3742 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3743 orig_pmd); 3743 orig_pmd);
3744 /* 3744 if (!(ret & VM_FAULT_FALLBACK))
3745 * If COW results in an oom, the huge pmd will 3745 return ret;
3746 * have been split, so retry the fault on the
3747 * pte for a smaller charge.
3748 */
3749 if (unlikely(ret & VM_FAULT_OOM))
3750 goto retry;
3751 return ret;
3752 } else { 3746 } else {
3753 huge_pmd_set_accessed(mm, vma, address, pmd, 3747 huge_pmd_set_accessed(mm, vma, address, pmd,
3754 orig_pmd, dirty); 3748 orig_pmd, dirty);
3749 return 0;
3755 } 3750 }
3756
3757 return 0;
3758 } 3751 }
3759 } 3752 }
3760 3753
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 7332c1785744..769a67a15803 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -58,36 +58,27 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
58 if (pte_numa(ptent)) 58 if (pte_numa(ptent))
59 ptent = pte_mknonnuma(ptent); 59 ptent = pte_mknonnuma(ptent);
60 ptent = pte_modify(ptent, newprot); 60 ptent = pte_modify(ptent, newprot);
61 /*
62 * Avoid taking write faults for pages we
63 * know to be dirty.
64 */
65 if (dirty_accountable && pte_dirty(ptent))
66 ptent = pte_mkwrite(ptent);
67 ptep_modify_prot_commit(mm, addr, pte, ptent);
61 updated = true; 68 updated = true;
62 } else { 69 } else {
63 struct page *page; 70 struct page *page;
64 71
65 ptent = *pte;
66 page = vm_normal_page(vma, addr, oldpte); 72 page = vm_normal_page(vma, addr, oldpte);
67 if (page && !PageKsm(page)) { 73 if (page && !PageKsm(page)) {
68 if (!pte_numa(oldpte)) { 74 if (!pte_numa(oldpte)) {
69 ptent = pte_mknuma(ptent); 75 ptep_set_numa(mm, addr, pte);
70 set_pte_at(mm, addr, pte, ptent);
71 updated = true; 76 updated = true;
72 } 77 }
73 } 78 }
74 } 79 }
75
76 /*
77 * Avoid taking write faults for pages we know to be
78 * dirty.
79 */
80 if (dirty_accountable && pte_dirty(ptent)) {
81 ptent = pte_mkwrite(ptent);
82 updated = true;
83 }
84
85 if (updated) 80 if (updated)
86 pages++; 81 pages++;
87
88 /* Only !prot_numa always clears the pte */
89 if (!prot_numa)
90 ptep_modify_prot_commit(mm, addr, pte, ptent);
91 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { 82 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
92 swp_entry_t entry = pte_to_swp_entry(oldpte); 83 swp_entry_t entry = pte_to_swp_entry(oldpte);
93 84
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2d30e2cfe804..7106cb1aca8e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2173,11 +2173,12 @@ int __set_page_dirty_nobuffers(struct page *page)
2173 if (!TestSetPageDirty(page)) { 2173 if (!TestSetPageDirty(page)) {
2174 struct address_space *mapping = page_mapping(page); 2174 struct address_space *mapping = page_mapping(page);
2175 struct address_space *mapping2; 2175 struct address_space *mapping2;
2176 unsigned long flags;
2176 2177
2177 if (!mapping) 2178 if (!mapping)
2178 return 1; 2179 return 1;
2179 2180
2180 spin_lock_irq(&mapping->tree_lock); 2181 spin_lock_irqsave(&mapping->tree_lock, flags);
2181 mapping2 = page_mapping(page); 2182 mapping2 = page_mapping(page);
2182 if (mapping2) { /* Race with truncate? */ 2183 if (mapping2) { /* Race with truncate? */
2183 BUG_ON(mapping2 != mapping); 2184 BUG_ON(mapping2 != mapping);
@@ -2186,7 +2187,7 @@ int __set_page_dirty_nobuffers(struct page *page)
2186 radix_tree_tag_set(&mapping->page_tree, 2187 radix_tree_tag_set(&mapping->page_tree,
2187 page_index(page), PAGECACHE_TAG_DIRTY); 2188 page_index(page), PAGECACHE_TAG_DIRTY);
2188 } 2189 }
2189 spin_unlock_irq(&mapping->tree_lock); 2190 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2190 if (mapping->host) { 2191 if (mapping->host) {
2191 /* !PageAnon && !swapper_space */ 2192 /* !PageAnon && !swapper_space */
2192 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 2193 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e3758a09a009..3bac76ae4b30 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order)
369 __SetPageHead(page); 369 __SetPageHead(page);
370 for (i = 1; i < nr_pages; i++) { 370 for (i = 1; i < nr_pages; i++) {
371 struct page *p = page + i; 371 struct page *p = page + i;
372 __SetPageTail(p);
373 set_page_count(p, 0); 372 set_page_count(p, 0);
374 p->first_page = page; 373 p->first_page = page;
374 /* Make sure p->first_page is always valid for PageTail() */
375 smp_wmb();
376 __SetPageTail(p);
375 } 377 }
376} 378}
377 379
@@ -1236,6 +1238,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1236 } 1238 }
1237 local_irq_restore(flags); 1239 local_irq_restore(flags);
1238} 1240}
1241static bool gfp_thisnode_allocation(gfp_t gfp_mask)
1242{
1243 return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
1244}
1245#else
1246static bool gfp_thisnode_allocation(gfp_t gfp_mask)
1247{
1248 return false;
1249}
1239#endif 1250#endif
1240 1251
1241/* 1252/*
@@ -1572,7 +1583,13 @@ again:
1572 get_pageblock_migratetype(page)); 1583 get_pageblock_migratetype(page));
1573 } 1584 }
1574 1585
1575 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1586 /*
1587 * NOTE: GFP_THISNODE allocations do not partake in the kswapd
1588 * aging protocol, so they can't be fair.
1589 */
1590 if (!gfp_thisnode_allocation(gfp_flags))
1591 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1592
1576 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1593 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1577 zone_statistics(preferred_zone, zone, gfp_flags); 1594 zone_statistics(preferred_zone, zone, gfp_flags);
1578 local_irq_restore(flags); 1595 local_irq_restore(flags);
@@ -1944,8 +1961,12 @@ zonelist_scan:
1944 * ultimately fall back to remote zones that do not 1961 * ultimately fall back to remote zones that do not
1945 * partake in the fairness round-robin cycle of this 1962 * partake in the fairness round-robin cycle of this
1946 * zonelist. 1963 * zonelist.
1964 *
1965 * NOTE: GFP_THISNODE allocations do not partake in
1966 * the kswapd aging protocol, so they can't be fair.
1947 */ 1967 */
1948 if (alloc_flags & ALLOC_WMARK_LOW) { 1968 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1969 !gfp_thisnode_allocation(gfp_mask)) {
1949 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1970 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1950 continue; 1971 continue;
1951 if (!zone_local(preferred_zone, zone)) 1972 if (!zone_local(preferred_zone, zone))
@@ -2501,8 +2522,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2501 * allowed per node queues are empty and that nodes are 2522 * allowed per node queues are empty and that nodes are
2502 * over allocated. 2523 * over allocated.
2503 */ 2524 */
2504 if (IS_ENABLED(CONFIG_NUMA) && 2525 if (gfp_thisnode_allocation(gfp_mask))
2505 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2506 goto nopage; 2526 goto nopage;
2507 2527
2508restart: 2528restart:
diff --git a/mm/slub.c b/mm/slub.c
index 7e3e0458bce4..25f14ad8f817 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1004,21 +1004,19 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
1004static void add_full(struct kmem_cache *s, 1004static void add_full(struct kmem_cache *s,
1005 struct kmem_cache_node *n, struct page *page) 1005 struct kmem_cache_node *n, struct page *page)
1006{ 1006{
1007 lockdep_assert_held(&n->list_lock);
1008
1009 if (!(s->flags & SLAB_STORE_USER)) 1007 if (!(s->flags & SLAB_STORE_USER))
1010 return; 1008 return;
1011 1009
1010 lockdep_assert_held(&n->list_lock);
1012 list_add(&page->lru, &n->full); 1011 list_add(&page->lru, &n->full);
1013} 1012}
1014 1013
1015static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) 1014static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
1016{ 1015{
1017 lockdep_assert_held(&n->list_lock);
1018
1019 if (!(s->flags & SLAB_STORE_USER)) 1016 if (!(s->flags & SLAB_STORE_USER))
1020 return; 1017 return;
1021 1018
1019 lockdep_assert_held(&n->list_lock);
1022 list_del(&page->lru); 1020 list_del(&page->lru);
1023} 1021}
1024 1022
@@ -1520,11 +1518,9 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
1520/* 1518/*
1521 * Management of partially allocated slabs. 1519 * Management of partially allocated slabs.
1522 */ 1520 */
1523static inline void add_partial(struct kmem_cache_node *n, 1521static inline void
1524 struct page *page, int tail) 1522__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
1525{ 1523{
1526 lockdep_assert_held(&n->list_lock);
1527
1528 n->nr_partial++; 1524 n->nr_partial++;
1529 if (tail == DEACTIVATE_TO_TAIL) 1525 if (tail == DEACTIVATE_TO_TAIL)
1530 list_add_tail(&page->lru, &n->partial); 1526 list_add_tail(&page->lru, &n->partial);
@@ -1532,15 +1528,27 @@ static inline void add_partial(struct kmem_cache_node *n,
1532 list_add(&page->lru, &n->partial); 1528 list_add(&page->lru, &n->partial);
1533} 1529}
1534 1530
1535static inline void remove_partial(struct kmem_cache_node *n, 1531static inline void add_partial(struct kmem_cache_node *n,
1536 struct page *page) 1532 struct page *page, int tail)
1537{ 1533{
1538 lockdep_assert_held(&n->list_lock); 1534 lockdep_assert_held(&n->list_lock);
1535 __add_partial(n, page, tail);
1536}
1539 1537
1538static inline void
1539__remove_partial(struct kmem_cache_node *n, struct page *page)
1540{
1540 list_del(&page->lru); 1541 list_del(&page->lru);
1541 n->nr_partial--; 1542 n->nr_partial--;
1542} 1543}
1543 1544
1545static inline void remove_partial(struct kmem_cache_node *n,
1546 struct page *page)
1547{
1548 lockdep_assert_held(&n->list_lock);
1549 __remove_partial(n, page);
1550}
1551
1544/* 1552/*
1545 * Remove slab from the partial list, freeze it and 1553 * Remove slab from the partial list, freeze it and
1546 * return the pointer to the freelist. 1554 * return the pointer to the freelist.
@@ -2906,12 +2914,10 @@ static void early_kmem_cache_node_alloc(int node)
2906 inc_slabs_node(kmem_cache_node, node, page->objects); 2914 inc_slabs_node(kmem_cache_node, node, page->objects);
2907 2915
2908 /* 2916 /*
2909 * the lock is for lockdep's sake, not for any actual 2917 * No locks need to be taken here as it has just been
2910 * race protection 2918 * initialized and there is no concurrent access.
2911 */ 2919 */
2912 spin_lock(&n->list_lock); 2920 __add_partial(n, page, DEACTIVATE_TO_HEAD);
2913 add_partial(n, page, DEACTIVATE_TO_HEAD);
2914 spin_unlock(&n->list_lock);
2915} 2921}
2916 2922
2917static void free_kmem_cache_nodes(struct kmem_cache *s) 2923static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -3197,7 +3203,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
3197 3203
3198 list_for_each_entry_safe(page, h, &n->partial, lru) { 3204 list_for_each_entry_safe(page, h, &n->partial, lru) {
3199 if (!page->inuse) { 3205 if (!page->inuse) {
3200 remove_partial(n, page); 3206 __remove_partial(n, page);
3201 discard_slab(s, page); 3207 discard_slab(s, page);
3202 } else { 3208 } else {
3203 list_slab_objects(s, page, 3209 list_slab_objects(s, page,
diff --git a/mm/swap.c b/mm/swap.c
index b31ba67d440a..0092097b3f4c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -98,7 +98,7 @@ static void put_compound_page(struct page *page)
98 } 98 }
99 99
100 /* __split_huge_page_refcount can run under us */ 100 /* __split_huge_page_refcount can run under us */
101 page_head = compound_trans_head(page); 101 page_head = compound_head(page);
102 102
103 /* 103 /*
104 * THP can not break up slab pages so avoid taking 104 * THP can not break up slab pages so avoid taking
@@ -253,7 +253,7 @@ bool __get_page_tail(struct page *page)
253 */ 253 */
254 unsigned long flags; 254 unsigned long flags;
255 bool got; 255 bool got;
256 struct page *page_head = compound_trans_head(page); 256 struct page *page_head = compound_head(page);
257 257
258 /* Ref to put_compound_page() comment. */ 258 /* Ref to put_compound_page() comment. */
259 if (!__compound_tail_refcounted(page_head)) { 259 if (!__compound_tail_refcounted(page_head)) {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 98e85e9c2b2d..e76ace30d436 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void)
63 return ret; 63 return ret;
64} 64}
65 65
66static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
67
66void show_swap_cache_info(void) 68void show_swap_cache_info(void)
67{ 69{
68 printk("%lu pages in swap cache\n", total_swapcache_pages()); 70 printk("%lu pages in swap cache\n", total_swapcache_pages());
@@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry)
286 288
287 page = find_get_page(swap_address_space(entry), entry.val); 289 page = find_get_page(swap_address_space(entry), entry.val);
288 290
289 if (page) 291 if (page) {
290 INC_CACHE_INFO(find_success); 292 INC_CACHE_INFO(find_success);
293 if (TestClearPageReadahead(page))
294 atomic_inc(&swapin_readahead_hits);
295 }
291 296
292 INC_CACHE_INFO(find_total); 297 INC_CACHE_INFO(find_total);
293 return page; 298 return page;
@@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
389 return found_page; 394 return found_page;
390} 395}
391 396
397static unsigned long swapin_nr_pages(unsigned long offset)
398{
399 static unsigned long prev_offset;
400 unsigned int pages, max_pages, last_ra;
401 static atomic_t last_readahead_pages;
402
403 max_pages = 1 << ACCESS_ONCE(page_cluster);
404 if (max_pages <= 1)
405 return 1;
406
407 /*
408 * This heuristic has been found to work well on both sequential and
409 * random loads, swapping to hard disk or to SSD: please don't ask
410 * what the "+ 2" means, it just happens to work well, that's all.
411 */
412 pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
413 if (pages == 2) {
414 /*
415 * We can have no readahead hits to judge by: but must not get
416 * stuck here forever, so check for an adjacent offset instead
417 * (and don't even bother to check whether swap type is same).
418 */
419 if (offset != prev_offset + 1 && offset != prev_offset - 1)
420 pages = 1;
421 prev_offset = offset;
422 } else {
423 unsigned int roundup = 4;
424 while (roundup < pages)
425 roundup <<= 1;
426 pages = roundup;
427 }
428
429 if (pages > max_pages)
430 pages = max_pages;
431
432 /* Don't shrink readahead too fast */
433 last_ra = atomic_read(&last_readahead_pages) / 2;
434 if (pages < last_ra)
435 pages = last_ra;
436 atomic_set(&last_readahead_pages, pages);
437
438 return pages;
439}
440
392/** 441/**
393 * swapin_readahead - swap in pages in hope we need them soon 442 * swapin_readahead - swap in pages in hope we need them soon
394 * @entry: swap entry of this memory 443 * @entry: swap entry of this memory
@@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
412 struct vm_area_struct *vma, unsigned long addr) 461 struct vm_area_struct *vma, unsigned long addr)
413{ 462{
414 struct page *page; 463 struct page *page;
415 unsigned long offset = swp_offset(entry); 464 unsigned long entry_offset = swp_offset(entry);
465 unsigned long offset = entry_offset;
416 unsigned long start_offset, end_offset; 466 unsigned long start_offset, end_offset;
417 unsigned long mask = (1UL << page_cluster) - 1; 467 unsigned long mask;
418 struct blk_plug plug; 468 struct blk_plug plug;
419 469
470 mask = swapin_nr_pages(offset) - 1;
471 if (!mask)
472 goto skip;
473
420 /* Read a page_cluster sized and aligned cluster around offset. */ 474 /* Read a page_cluster sized and aligned cluster around offset. */
421 start_offset = offset & ~mask; 475 start_offset = offset & ~mask;
422 end_offset = offset | mask; 476 end_offset = offset | mask;
@@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
430 gfp_mask, vma, addr); 484 gfp_mask, vma, addr);
431 if (!page) 485 if (!page)
432 continue; 486 continue;
487 if (offset != entry_offset)
488 SetPageReadahead(page);
433 page_cache_release(page); 489 page_cache_release(page);
434 } 490 }
435 blk_finish_plug(&plug); 491 blk_finish_plug(&plug);
436 492
437 lru_add_drain(); /* Push any new pages onto the LRU now */ 493 lru_add_drain(); /* Push any new pages onto the LRU now */
494skip:
438 return read_swap_cache_async(entry, gfp_mask, vma, addr); 495 return read_swap_cache_async(entry, gfp_mask, vma, addr);
439} 496}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c6c13b050a58..4a7f7e6992b6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1923,7 +1923,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1923 p->swap_map = NULL; 1923 p->swap_map = NULL;
1924 cluster_info = p->cluster_info; 1924 cluster_info = p->cluster_info;
1925 p->cluster_info = NULL; 1925 p->cluster_info = NULL;
1926 p->flags = 0;
1927 frontswap_map = frontswap_map_get(p); 1926 frontswap_map = frontswap_map_get(p);
1928 spin_unlock(&p->lock); 1927 spin_unlock(&p->lock);
1929 spin_unlock(&swap_lock); 1928 spin_unlock(&swap_lock);
@@ -1949,6 +1948,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1949 mutex_unlock(&inode->i_mutex); 1948 mutex_unlock(&inode->i_mutex);
1950 } 1949 }
1951 filp_close(swap_file, NULL); 1950 filp_close(swap_file, NULL);
1951
1952 /*
1953 * Clear the SWP_USED flag after all resources are freed so that swapon
1954 * can reuse this swap_info in alloc_swap_info() safely. It is ok to
1955 * not hold p->lock after we cleared its SWP_WRITEOK.
1956 */
1957 spin_lock(&swap_lock);
1958 p->flags = 0;
1959 spin_unlock(&swap_lock);
1960
1952 err = 0; 1961 err = 0;
1953 atomic_inc(&proc_poll_event); 1962 atomic_inc(&proc_poll_event);
1954 wake_up_interruptible(&proc_poll_wait); 1963 wake_up_interruptible(&proc_poll_wait);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 196970a4541f..d4042e75f7c7 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -19,6 +19,7 @@
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/vmstat.h> 20#include <linux/vmstat.h>
21#include <linux/eventfd.h> 21#include <linux/eventfd.h>
22#include <linux/slab.h>
22#include <linux/swap.h> 23#include <linux/swap.h>
23#include <linux/printk.h> 24#include <linux/printk.h>
24#include <linux/vmpressure.h> 25#include <linux/vmpressure.h>
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 72496140ac08..def5dd2fbe61 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -851,12 +851,14 @@ const char * const vmstat_text[] = {
851 "thp_zero_page_alloc", 851 "thp_zero_page_alloc",
852 "thp_zero_page_alloc_failed", 852 "thp_zero_page_alloc_failed",
853#endif 853#endif
854#ifdef CONFIG_DEBUG_TLBFLUSH
854#ifdef CONFIG_SMP 855#ifdef CONFIG_SMP
855 "nr_tlb_remote_flush", 856 "nr_tlb_remote_flush",
856 "nr_tlb_remote_flush_received", 857 "nr_tlb_remote_flush_received",
857#endif 858#endif /* CONFIG_SMP */
858 "nr_tlb_local_flush_all", 859 "nr_tlb_local_flush_all",
859 "nr_tlb_local_flush_one", 860 "nr_tlb_local_flush_one",
861#endif /* CONFIG_DEBUG_TLBFLUSH */
860 862
861#endif /* CONFIG_VM_EVENTS_COUNTERS */ 863#endif /* CONFIG_VM_EVENTS_COUNTERS */
862}; 864};