Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig	2
-rw-r--r--	mm/compaction.c	4
-rw-r--r--	mm/fremap.c	8
-rw-r--r--	mm/huge_memory.c	48
-rw-r--r--	mm/memcontrol.c	2
-rw-r--r--	mm/memory-failure.c	24
-rw-r--r--	mm/memory.c	2
-rw-r--r--	mm/mempolicy.c	16
-rw-r--r--	mm/migrate.c	82
-rw-r--r--	mm/mlock.c	44
-rw-r--r--	mm/mprotect.c	13
-rw-r--r--	mm/page_alloc.c	19
-rw-r--r--	mm/pgtable-generic.c	8
-rw-r--r--	mm/rmap.c	4
14 files changed, 204 insertions(+), 72 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index eb69f352401d..723bbe04a0b0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -543,7 +543,7 @@ config ZSWAP
 
 config MEM_SOFT_DIRTY
 	bool "Track memory changes"
-	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
 	select PROC_PAGE_MONITOR
 	help
 	  This option enables memory changes tracking by introducing a
diff --git a/mm/compaction.c b/mm/compaction.c
index 805165bcd3dd..f58bcd016f43 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -134,6 +134,10 @@ static void update_pageblock_skip(struct compact_control *cc,
 			bool migrate_scanner)
 {
 	struct zone *zone = cc->zone;
+
+	if (cc->ignore_skip_hint)
+		return;
+
 	if (!page)
 		return;
 
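The hunk above adds an early return so callers that set ignore_skip_hint never dirty the per-pageblock skip hints. A minimal userspace model of that guard (illustrative names, not kernel code):

#include <stdbool.h>

struct compact_model {
	bool ignore_skip_hint;
	bool skip[1024];		/* hypothetical per-pageblock skip bits */
};

static void update_skip_model(struct compact_model *cc, int block)
{
	if (cc->ignore_skip_hint)	/* mirrors the new early return */
		return;
	cc->skip[block] = true;
}

int main(void)
{
	struct compact_model cc = { .ignore_skip_hint = true };
	update_skip_model(&cc, 7);	/* no-op: hint state stays clean */
	return cc.skip[7];		/* exits 0 */
}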
diff --git a/mm/fremap.c b/mm/fremap.c
index 5bff08147768..bbc4d660221a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -208,9 +208,10 @@ get_write_lock:
 	if (mapping_cap_account_dirty(mapping)) {
 		unsigned long addr;
 		struct file *file = get_file(vma->vm_file);
+		/* mmap_region may free vma; grab the info now */
+		vm_flags = vma->vm_flags;
 
-		addr = mmap_region(file, start, size,
-				vma->vm_flags, pgoff);
+		addr = mmap_region(file, start, size, vm_flags, pgoff);
 		fput(file);
 		if (IS_ERR_VALUE(addr)) {
 			err = addr;
@@ -218,7 +219,7 @@ get_write_lock:
 			BUG_ON(addr != start);
 			err = 0;
 		}
-		goto out;
+		goto out_freed;
 	}
 	mutex_lock(&mapping->i_mmap_mutex);
 	flush_dcache_mmap_lock(mapping);
@@ -253,6 +254,7 @@ get_write_lock:
 out:
 	if (vma)
 		vm_flags = vma->vm_flags;
+out_freed:
 	if (likely(!has_write_lock))
 		up_read(&mm->mmap_sem);
 	else
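The fremap change is a use-after-free repair: mmap_region() may free the vma, so vm_flags is snapshotted before the call and the later dereference is skipped via the new out_freed label. A small userspace sketch of the same snapshot-before-call pattern (hypothetical types):

#include <assert.h>
#include <stdlib.h>

struct vma_model { unsigned long vm_flags; };

static unsigned long remap_model(struct vma_model *vma)
{
	unsigned long vm_flags = vma->vm_flags;	/* snapshot first */

	free(vma);		/* stands in for mmap_region() freeing the vma */
	return vm_flags;	/* safe: no read through the dangling pointer */
}

int main(void)
{
	struct vma_model *vma = malloc(sizeof(*vma));

	vma->vm_flags = 0x73;
	assert(remap_model(vma) == 0x73);
	return 0;
}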
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 33a5dc492810..95d1acb0f3d2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -882,6 +882,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		ret = 0;
 		goto out_unlock;
 	}
+
 	if (unlikely(pmd_trans_splitting(pmd))) {
 		/* split huge page running from under us */
 		spin_unlock(src_ptl);
@@ -1153,7 +1154,7 @@ alloc:
 		new_page = NULL;
 
 	if (unlikely(!new_page)) {
-		if (is_huge_zero_pmd(orig_pmd)) {
+		if (!page) {
 			ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
 					address, pmd, orig_pmd, haddr);
 		} else {
@@ -1180,7 +1181,7 @@ alloc:
 
 	count_vm_event(THP_FAULT_ALLOC);
 
-	if (is_huge_zero_pmd(orig_pmd))
+	if (!page)
 		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
 	else
 		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
@@ -1206,7 +1207,7 @@ alloc:
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
 		update_mmu_cache_pmd(vma, address, pmd);
-		if (is_huge_zero_pmd(orig_pmd)) {
+		if (!page) {
 			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 			put_huge_zero_page();
 		} else {
@@ -1243,6 +1244,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
 		return ERR_PTR(-EFAULT);
 
+	/* Full NUMA hinting faults to serialise migration in fault paths */
+	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+		goto out;
+
 	page = pmd_page(*pmd);
 	VM_BUG_ON(!PageHead(page));
 	if (flags & FOLL_TOUCH) {
@@ -1295,6 +1300,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(!pmd_same(pmd, *pmdp)))
 		goto out_unlock;
 
+	/*
+	 * If there are potential migrations, wait for completion and retry
+	 * without disrupting NUMA hinting information. Do not relock and
+	 * check_same as the page may no longer be mapped.
+	 */
+	if (unlikely(pmd_trans_migrating(*pmdp))) {
+		spin_unlock(ptl);
+		wait_migrate_huge_page(vma->anon_vma, pmdp);
+		goto out;
+	}
+
 	page = pmd_page(pmd);
 	BUG_ON(is_huge_zero_page(page));
 	page_nid = page_to_nid(page);
@@ -1323,23 +1339,22 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* If the page was locked, there are no parallel migrations */
 		if (page_locked)
 			goto clear_pmdnuma;
+	}
 
-		/*
-		 * Otherwise wait for potential migrations and retry. We do
-		 * relock and check_same as the page may no longer be mapped.
-		 * As the fault is being retried, do not account for it.
-		 */
+	/* Migration could have started since the pmd_trans_migrating check */
+	if (!page_locked) {
 		spin_unlock(ptl);
 		wait_on_page_locked(page);
 		page_nid = -1;
 		goto out;
 	}
 
-	/* Page is misplaced, serialise migrations and parallel THP splits */
+	/*
+	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
+	 * to serialise splits
+	 */
 	get_page(page);
 	spin_unlock(ptl);
-	if (!page_locked)
-		lock_page(page);
 	anon_vma = page_lock_anon_vma_read(page);
 
 	/* Confirm the PMD did not change while page_table_lock was released */
@@ -1351,6 +1366,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_unlock;
 	}
 
+	/* Bail if we fail to protect against THP splits for any reason */
+	if (unlikely(!anon_vma)) {
+		put_page(page);
+		page_nid = -1;
+		goto clear_pmdnuma;
+	}
+
 	/*
 	 * Migrate the THP to the requested node, returns with page unlocked
 	 * and pmd_numa cleared.
@@ -1517,6 +1539,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		ret = 1;
 		if (!prot_numa) {
 			entry = pmdp_get_and_clear(mm, addr, pmd);
+			if (pmd_numa(entry))
+				entry = pmd_mknonnuma(entry);
 			entry = pmd_modify(entry, newprot);
 			ret = HPAGE_PMD_NR;
 			BUG_ON(pmd_write(entry));
@@ -1531,7 +1555,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			 */
 			if (!is_huge_zero_page(page) &&
 			    !pmd_numa(*pmd)) {
-				entry = pmdp_get_and_clear(mm, addr, pmd);
+				entry = *pmd;
 				entry = pmd_mknuma(entry);
 				ret = HPAGE_PMD_NR;
 			}
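Several of the huge_memory.c hunks implement one pattern: a NUMA hinting fault that observes a THP migration in flight (pmd_trans_migrating(), defined in the migrate.c hunks below) must drop the pmd lock, wait, and retry rather than touch the entry. A hedged pthread sketch of that flow; the kernel re-faults instead of looping here, and waits on the page lock rather than a flag:

#include <pthread.h>
#include <stdbool.h>

struct thp_model {
	pthread_mutex_t ptl;	/* stands in for the pmd lock */
	bool migrating;		/* stands in for pmd_trans_migrating() */
};

/* Returns true once the fault was handled; caller retries on false. */
static bool numa_fault_model(struct thp_model *t)
{
	pthread_mutex_lock(&t->ptl);
	if (t->migrating) {
		pthread_mutex_unlock(&t->ptl);	/* never wait with the lock held */
		/* ... wait_on_page_locked() equivalent would sleep here ... */
		return false;			/* retry the fault from scratch */
	}
	/* ... handle the hinting fault under the lock ... */
	pthread_mutex_unlock(&t->ptl);
	return true;
}

int main(void)
{
	struct thp_model t = { PTHREAD_MUTEX_INITIALIZER, false };

	while (!numa_fault_model(&t))
		;	/* retry loop; would sleep in the kernel */
	return 0;
}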
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bf5e89457149..7f1a356153c0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -338,7 +338,7 @@ struct mem_cgroup {
 static size_t memcg_size(void)
 {
 	return sizeof(struct mem_cgroup) +
-		nr_node_ids * sizeof(struct mem_cgroup_per_node);
+		nr_node_ids * sizeof(struct mem_cgroup_per_node *);
 }
 
 /* internal only representation about the status of kmem accounting. */
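The memcontrol fix corrects a sizing bug: the trailing per-node array holds pointers, so each element must be sized as a pointer, not as the full struct. A compilable illustration of the bug class (made-up struct names):

#include <assert.h>
#include <stdlib.h>

struct node_stats { char counters[256]; };

struct container {
	int nnodes;
	struct node_stats *info[];	/* flexible array of POINTERS */
};

int main(void)
{
	int nnodes = 8;
	/* correct: room for nnodes pointers, not nnodes full structs */
	size_t right = sizeof(struct container) + nnodes * sizeof(struct node_stats *);
	size_t wrong = sizeof(struct container) + nnodes * sizeof(struct node_stats);

	assert(right < wrong);	/* the old formula over-allocated ~2KB here */
	free(malloc(right));
	return 0;
}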
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b7c171602ba1..fabe55046c1d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -938,6 +938,16 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 			BUG_ON(!PageHWPoison(p));
 			return SWAP_FAIL;
 		}
+		/*
+		 * We pinned the head page for hwpoison handling,
+		 * now we split the thp and we are interested in
+		 * the hwpoisoned raw page, so move the refcount
+		 * to it.
+		 */
+		if (hpage != p) {
+			put_page(hpage);
+			get_page(p);
+		}
 		/* THP is split, so ppage should be the real poisoned page. */
 		ppage = p;
 	}
@@ -1505,10 +1515,16 @@ static int soft_offline_huge_page(struct page *page, int flags)
 		if (ret > 0)
 			ret = -EIO;
 	} else {
-		set_page_hwpoison_huge_page(hpage);
-		dequeue_hwpoisoned_huge_page(hpage);
-		atomic_long_add(1 << compound_order(hpage),
-				&num_poisoned_pages);
+		/* overcommit hugetlb page will be freed to buddy */
+		if (PageHuge(page)) {
+			set_page_hwpoison_huge_page(hpage);
+			dequeue_hwpoisoned_huge_page(hpage);
+			atomic_long_add(1 << compound_order(hpage),
+					&num_poisoned_pages);
+		} else {
+			SetPageHWPoison(page);
+			atomic_long_inc(&num_poisoned_pages);
+		}
 	}
 	return ret;
 }
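The first memory-failure hunk moves the pin taken on the compound head over to the raw poisoned subpage once the THP has been split. A tiny userspace model of that refcount hand-off:

#include <assert.h>

struct page_model { int refcount; };

/* move the pin from the (former) head page to the raw poisoned page */
static void move_pin(struct page_model *hpage, struct page_model *p)
{
	if (hpage != p) {
		hpage->refcount--;	/* put_page(hpage) */
		p->refcount++;		/* get_page(p) */
	}
}

int main(void)
{
	struct page_model head = { .refcount = 1 }, raw = { .refcount = 0 };

	move_pin(&head, &raw);
	assert(head.refcount == 0 && raw.refcount == 1);
	return 0;
}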
diff --git a/mm/memory.c b/mm/memory.c
index 5d9025f3b3e1..6768ce9e57d2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4271,7 +4271,7 @@ void copy_user_huge_page(struct page *dst, struct page *src,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
-#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS
+#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
 bool ptlock_alloc(struct page *page)
 {
 	spinlock_t *ptl;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eca4a3129129..0cd2c4d4e270 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1197,14 +1197,16 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
 			break;
 		vma = vma->vm_next;
 	}
+
+	if (PageHuge(page)) {
+		if (vma)
+			return alloc_huge_page_noerr(vma, address, 1);
+		else
+			return NULL;
+	}
 	/*
-	 * queue_pages_range() confirms that @page belongs to some vma,
-	 * so vma shouldn't be NULL.
+	 * if !vma, alloc_page_vma() will use task or system default policy
 	 */
-	BUG_ON(!vma);
-
-	if (PageHuge(page))
-		return alloc_huge_page_noerr(vma, address, 1);
 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 }
 #else
1210#else 1212#else
@@ -1318,7 +1320,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	} else
-		putback_lru_pages(&pagelist);
+		putback_movable_pages(&pagelist);
 
 	up_write(&mm->mmap_sem);
  mpol_out:
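After the first mempolicy hunk, new_vma_page() tolerates a NULL vma for base pages (alloc_page_vma() falls back to the task or system default policy) while still requiring a vma for hugetlb pages. A sketch under those assumptions, with stand-in allocators rather than the real mempolicy API:

#include <stdbool.h>
#include <stdlib.h>

struct vma_model { int dummy; };

/* hypothetical stand-ins for alloc_huge_page_noerr()/alloc_page_vma() */
static void *alloc_huge(struct vma_model *vma) { (void)vma; return malloc(1); }
static void *alloc_base(struct vma_model *vma) { (void)vma; return malloc(1); }

static void *new_page_model(bool page_is_huge, struct vma_model *vma)
{
	if (page_is_huge)
		return vma ? alloc_huge(vma) : NULL;	/* hugetlb needs a vma */
	/* NULL vma is fine here: falls back to the default policy */
	return alloc_base(vma);
}

int main(void)
{
	void *p = new_page_model(false, NULL);	/* base page, default policy */

	free(p);
	return new_page_model(true, NULL) != NULL;	/* huge + no vma: NULL */
}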
diff --git a/mm/migrate.c b/mm/migrate.c
index bb940045fe85..9194375b2307 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,6 +36,7 @@
 #include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
 #include <linux/balloon_compaction.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 
@@ -316,14 +317,15 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
  */
 int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page,
-		struct buffer_head *head, enum migrate_mode mode)
+		struct buffer_head *head, enum migrate_mode mode,
+		int extra_count)
 {
-	int expected_count = 0;
+	int expected_count = 1 + extra_count;
 	void **pslot;
 
 	if (!mapping) {
 		/* Anonymous page without mapping */
-		if (page_count(page) != 1)
+		if (page_count(page) != expected_count)
 			return -EAGAIN;
 		return MIGRATEPAGE_SUCCESS;
 	}
@@ -333,7 +335,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
  					page_index(page));
 
-	expected_count = 2 + page_has_private(page);
+	expected_count += 1 + page_has_private(page);
 	if (page_count(page) != expected_count ||
 		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 		spin_unlock_irq(&mapping->tree_lock);
@@ -583,7 +585,7 @@ int migrate_page(struct address_space *mapping,
 
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
+	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
 
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
@@ -610,7 +612,7 @@ int buffer_migrate_page(struct address_space *mapping,
 
 	head = page_buffers(page);
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
+	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
 
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
@@ -1654,6 +1656,18 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 	return 1;
 }
 
+bool pmd_trans_migrating(pmd_t pmd)
+{
+	struct page *page = pmd_page(pmd);
+	return PageLocked(page);
+}
+
+void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
+{
+	struct page *page = pmd_page(*pmd);
+	wait_on_page_locked(page);
+}
+
 /*
  * Attempt to migrate a misplaced page to the specified destination
  * node. Caller is expected to have an elevated reference count on
@@ -1716,12 +1730,14 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 			struct page *page, int node)
 {
 	spinlock_t *ptl;
-	unsigned long haddr = address & HPAGE_PMD_MASK;
 	pg_data_t *pgdat = NODE_DATA(node);
 	int isolated = 0;
 	struct page *new_page = NULL;
 	struct mem_cgroup *memcg = NULL;
 	int page_lru = page_is_file_cache(page);
+	unsigned long mmun_start = address & HPAGE_PMD_MASK;
+	unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
+	pmd_t orig_entry;
 
 	/*
 	 * Rate-limit the amount of data that is being migrated to a node.
@@ -1744,6 +1760,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		goto out_fail;
 	}
 
+	if (mm_tlb_flush_pending(mm))
+		flush_tlb_range(vma, mmun_start, mmun_end);
+
 	/* Prepare a page as a migration target */
 	__set_page_locked(new_page);
 	SetPageSwapBacked(new_page);
@@ -1755,9 +1774,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	WARN_ON(PageLRU(new_page));
 
 	/* Recheck the target PMD */
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	ptl = pmd_lock(mm, pmd);
-	if (unlikely(!pmd_same(*pmd, entry))) {
+	if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
+fail_putback:
 		spin_unlock(ptl);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 		/* Reverse changes made by migrate_page_copy() */
 		if (TestClearPageActive(new_page))
@@ -1774,7 +1796,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		putback_lru_page(page);
 		mod_zone_page_state(page_zone(page),
 			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
-		goto out_fail;
+
+		goto out_unlock;
 	}
 
 	/*
@@ -1786,16 +1809,35 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	 */
 	mem_cgroup_prepare_migration(page, new_page, &memcg);
 
+	orig_entry = *pmd;
 	entry = mk_pmd(new_page, vma->vm_page_prot);
-	entry = pmd_mknonnuma(entry);
-	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 	entry = pmd_mkhuge(entry);
+	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 
-	pmdp_clear_flush(vma, haddr, pmd);
-	set_pmd_at(mm, haddr, pmd, entry);
-	page_add_new_anon_rmap(new_page, vma, haddr);
+	/*
+	 * Clear the old entry under pagetable lock and establish the new PTE.
+	 * Any parallel GUP will either observe the old page blocking on the
+	 * page lock, block on the page table lock or observe the new page.
+	 * The SetPageUptodate on the new page and page_add_new_anon_rmap
+	 * guarantee the copy is visible before the pagetable update.
+	 */
+	flush_cache_range(vma, mmun_start, mmun_end);
+	page_add_new_anon_rmap(new_page, vma, mmun_start);
+	pmdp_clear_flush(vma, mmun_start, pmd);
+	set_pmd_at(mm, mmun_start, pmd, entry);
+	flush_tlb_range(vma, mmun_start, mmun_end);
 	update_mmu_cache_pmd(vma, address, &entry);
+
+	if (page_count(page) != 2) {
+		set_pmd_at(mm, mmun_start, pmd, orig_entry);
+		flush_tlb_range(vma, mmun_start, mmun_end);
+		update_mmu_cache_pmd(vma, address, &entry);
+		page_remove_rmap(new_page);
+		goto fail_putback;
+	}
+
 	page_remove_rmap(page);
+
 	/*
 	 * Finish the charge transaction under the page table lock to
 	 * prevent split_huge_page() from dividing up the charge
@@ -1803,6 +1845,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	 */
 	mem_cgroup_end_migration(memcg, page, new_page, true);
 	spin_unlock(ptl);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 	unlock_page(new_page);
 	unlock_page(page);
@@ -1820,10 +1863,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 out_fail:
 	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 out_dropref:
-	entry = pmd_mknonnuma(entry);
-	set_pmd_at(mm, haddr, pmd, entry);
-	update_mmu_cache_pmd(vma, address, &entry);
+	ptl = pmd_lock(mm, pmd);
+	if (pmd_same(*pmd, entry)) {
+		entry = pmd_mknonnuma(entry);
+		set_pmd_at(mm, mmun_start, pmd, entry);
+		update_mmu_cache_pmd(vma, address, &entry);
+	}
+	spin_unlock(ptl);
 
+out_unlock:
 	unlock_page(page);
 	put_page(page);
 	return 0;
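The expected_count rework threads the caller's extra pins through migrate_page_move_mapping(): the baseline becomes 1 + extra_count, and a mapped page adds the mapping's own reference plus one if buffer_heads are attached. A worked model of that arithmetic (illustrative only):

#include <assert.h>
#include <stdbool.h>

static int expected_refs(bool mapped, bool has_private, int extra_count)
{
	int expected = 1 + extra_count;		/* caller's pin + extra pins */

	if (mapped)				/* page cache holds a ref... */
		expected += 1 + (has_private ? 1 : 0);	/* ...plus private */
	return expected;
}

int main(void)
{
	/* anonymous page, no extra pins: only the caller's reference */
	assert(expected_refs(false, false, 0) == 1);
	/* plain page cache page: caller + mapping */
	assert(expected_refs(true, false, 0) == 2);
	/* page cache page with buffer_heads: caller + mapping + private */
	assert(expected_refs(true, true, 0) == 3);
	return 0;
}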
diff --git a/mm/mlock.c b/mm/mlock.c
index d480cd6fc475..192e6eebe4f2 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -133,7 +133,10 @@ static void __munlock_isolation_failed(struct page *page)
 
 /**
  * munlock_vma_page - munlock a vma page
- * @page - page to be unlocked
+ * @page - page to be unlocked, either a normal page or THP page head
+ *
+ * returns the size of the page as a page mask (0 for normal page,
+ * HPAGE_PMD_NR - 1 for THP head page)
  *
  * called from munlock()/munmap() path with page supposedly on the LRU.
  * When we munlock a page, because the vma where we found the page is being
@@ -148,21 +151,30 @@ static void __munlock_isolation_failed(struct page *page)
  */
 unsigned int munlock_vma_page(struct page *page)
 {
-	unsigned int page_mask = 0;
+	unsigned int nr_pages;
 
 	BUG_ON(!PageLocked(page));
 
 	if (TestClearPageMlocked(page)) {
-		unsigned int nr_pages = hpage_nr_pages(page);
+		nr_pages = hpage_nr_pages(page);
 		mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
-		page_mask = nr_pages - 1;
 		if (!isolate_lru_page(page))
 			__munlock_isolated_page(page);
 		else
 			__munlock_isolation_failed(page);
+	} else {
+		nr_pages = hpage_nr_pages(page);
 	}
 
-	return page_mask;
+	/*
+	 * Regardless of the original PageMlocked flag, we determine nr_pages
+	 * after touching the flag. This leaves a possible race with a THP page
+	 * split, such that a whole THP page was munlocked, but nr_pages == 1.
+	 * Returning a smaller mask due to that is OK, the worst that can
+	 * happen is subsequent useless scanning of the former tail pages.
+	 * The NR_MLOCK accounting can however become broken.
+	 */
+	return nr_pages - 1;
 }
 
 /**
@@ -286,10 +298,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 {
 	int i;
 	int nr = pagevec_count(pvec);
-	int delta_munlocked = -nr;
+	int delta_munlocked;
 	struct pagevec pvec_putback;
 	int pgrescued = 0;
 
+	pagevec_init(&pvec_putback, 0);
+
 	/* Phase 1: page isolation */
 	spin_lock_irq(&zone->lru_lock);
 	for (i = 0; i < nr; i++) {
@@ -318,18 +332,21 @@ skip_munlock:
 			/*
 			 * We won't be munlocking this page in the next phase
 			 * but we still need to release the follow_page_mask()
-			 * pin.
+			 * pin. We cannot do it under lru_lock however. If it's
+			 * the last pin, __page_cache_release would deadlock.
 			 */
+			pagevec_add(&pvec_putback, pvec->pages[i]);
 			pvec->pages[i] = NULL;
-			put_page(page);
-			delta_munlocked++;
 		}
 	}
+	delta_munlocked = -nr + pagevec_count(&pvec_putback);
 	__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
 	spin_unlock_irq(&zone->lru_lock);
 
+	/* Now we can release pins of pages that we are not munlocking */
+	pagevec_release(&pvec_putback);
+
 	/* Phase 2: page munlock */
-	pagevec_init(&pvec_putback, 0);
 	for (i = 0; i < nr; i++) {
 		struct page *page = pvec->pages[i];
 
@@ -440,7 +457,8 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
 
 	while (start < end) {
 		struct page *page = NULL;
-		unsigned int page_mask, page_increm;
+		unsigned int page_mask;
+		unsigned long page_increm;
 		struct pagevec pvec;
 		struct zone *zone;
 		int zoneid;
@@ -490,7 +508,9 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
 				goto next;
 			}
 		}
-		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+		/* It's a bug to munlock in the middle of a THP page */
+		VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
+		page_increm = 1 + page_mask;
 		start += page_increm * PAGE_SIZE;
 next:
 		cond_resched();
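munlock_vma_page() now always returns nr_pages - 1, so munlock_vma_pages_range() advances by 1 + page_mask pages per iteration. Worked numbers, assuming x86-64 with 4K base pages:

#include <assert.h>

#define HPAGE_PMD_NR 512	/* assumption: x86-64, 4K base pages */

int main(void)
{
	unsigned int base_mask = 1 - 1;			/* normal page: nr_pages - 1 = 0 */
	unsigned int thp_mask = HPAGE_PMD_NR - 1;	/* THP head: 511 */

	assert(1 + base_mask == 1);		/* advance one page */
	assert(1 + thp_mask == HPAGE_PMD_NR);	/* skip the whole THP */
	return 0;
}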
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 26667971c824..bb53a6591aea 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -52,17 +52,21 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			pte_t ptent;
 			bool updated = false;
 
-			ptent = ptep_modify_prot_start(mm, addr, pte);
 			if (!prot_numa) {
+				ptent = ptep_modify_prot_start(mm, addr, pte);
+				if (pte_numa(ptent))
+					ptent = pte_mknonnuma(ptent);
 				ptent = pte_modify(ptent, newprot);
 				updated = true;
 			} else {
 				struct page *page;
 
+				ptent = *pte;
 				page = vm_normal_page(vma, addr, oldpte);
 				if (page) {
 					if (!pte_numa(oldpte)) {
 						ptent = pte_mknuma(ptent);
+						set_pte_at(mm, addr, pte, ptent);
 						updated = true;
 					}
 				}
@@ -79,7 +83,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 			if (updated)
 				pages++;
-			ptep_modify_prot_commit(mm, addr, pte, ptent);
+
+			/* Only !prot_numa always clears the pte */
+			if (!prot_numa)
+				ptep_modify_prot_commit(mm, addr, pte, ptent);
 		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 
@@ -181,6 +188,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
+	set_tlb_flush_pending(mm);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
@@ -192,6 +200,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 	/* Only flush the TLB if we actually modified any entries: */
 	if (pages)
 		flush_tlb_range(vma, start, end);
+	clear_tlb_flush_pending(mm);
 
 	return pages;
 }
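The set_tlb_flush_pending()/clear_tlb_flush_pending() bracket lets the THP migration path (the mm_tlb_flush_pending() check added in migrate.c above) flush on behalf of an in-flight protection change before copying the page. A userspace sketch of the handshake using a plain atomic flag; the kernel uses a per-mm flag plus explicit memory barriers:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int tlb_flush_pending;	/* per-mm in the kernel */

static void change_protection_model(void)
{
	atomic_store(&tlb_flush_pending, 1);	/* set_tlb_flush_pending() */
	/* ... modify page table entries ... */
	/* ... flush_tlb_range() once any entries changed ... */
	atomic_store(&tlb_flush_pending, 0);	/* clear_tlb_flush_pending() */
}

/* migration side: flush first if a protection change is in flight */
static bool must_flush_first(void)
{
	return atomic_load(&tlb_flush_pending) != 0;	/* mm_tlb_flush_pending() */
}

int main(void)
{
	change_protection_model();
	return must_flush_first();	/* 0: nothing pending */
}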
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 580a5f075ed0..5248fe070aa4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1816,7 +1816,7 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
 
 static bool zone_local(struct zone *local_zone, struct zone *zone)
 {
-	return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE;
+	return local_zone->node == zone->node;
 }
 
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
@@ -1913,18 +1913,17 @@ zonelist_scan:
 		 * page was allocated in should have no effect on the
 		 * time the page has in memory before being reclaimed.
 		 *
-		 * When zone_reclaim_mode is enabled, try to stay in
-		 * local zones in the fastpath.  If that fails, the
-		 * slowpath is entered, which will do another pass
-		 * starting with the local zones, but ultimately fall
-		 * back to remote zones that do not partake in the
-		 * fairness round-robin cycle of this zonelist.
+		 * Try to stay in local zones in the fastpath.  If
+		 * that fails, the slowpath is entered, which will do
+		 * another pass starting with the local zones, but
+		 * ultimately fall back to remote zones that do not
+		 * partake in the fairness round-robin cycle of this
+		 * zonelist.
 		 */
 		if (alloc_flags & ALLOC_WMARK_LOW) {
 			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
 				continue;
-			if (zone_reclaim_mode &&
-			    !zone_local(preferred_zone, zone))
+			if (!zone_local(preferred_zone, zone))
 				continue;
 		}
 		/*
@@ -2390,7 +2389,7 @@ static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
 		 * thrash fairness information for zones that are not
 		 * actually part of this zonelist's round-robin cycle.
 		 */
-		if (zone_reclaim_mode && !zone_local(preferred_zone, zone))
+		if (!zone_local(preferred_zone, zone))
 			continue;
 		mod_zone_page_state(zone, NR_ALLOC_BATCH,
 			high_wmark_pages(zone) -
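zone_local() now compares node ids directly instead of SLIT distances, so two zones are "local" to each other exactly when they live on the same node. A compilable model of the simplified check (illustrative struct, not the kernel's):

#include <assert.h>
#include <stdbool.h>

struct zone_model { int node; };

static bool zone_local_model(struct zone_model *a, struct zone_model *b)
{
	return a->node == b->node;	/* node identity, not SLIT distance */
}

int main(void)
{
	struct zone_model dma = { 0 }, normal = { 0 }, remote = { 1 };

	assert(zone_local_model(&normal, &dma));	/* same node: local */
	assert(!zone_local_model(&normal, &remote));	/* different node */
	return 0;
}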
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index cbb38545d9d6..a8b919925934 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -110,9 +110,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
 pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
 		       pte_t *ptep)
 {
+	struct mm_struct *mm = (vma)->vm_mm;
 	pte_t pte;
-	pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
-	if (pte_accessible(pte))
+	pte = ptep_get_and_clear(mm, address, ptep);
+	if (pte_accessible(mm, pte))
 		flush_tlb_page(vma, address);
 	return pte;
 }
@@ -191,6 +192,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
-	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
+	pmd_t entry = *pmdp;
+	if (pmd_numa(entry))
+		entry = pmd_mknonnuma(entry);
+	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
 	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 }
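pmdp_invalidate() must clear the NUMA bit before clearing the present bit: on x86 a NUMA hinting pmd is exactly "present clear, NUMA set", so a naive mknotpresent leaves an entry indistinguishable from one queued for a hinting fault. A bit-level model with made-up flag values (not the real x86 layout):

#include <assert.h>

#define F_PRESENT 0x1u
#define F_NUMA    0x2u		/* illustrative bit values only */

int main(void)
{
	unsigned int numa_pmd = F_NUMA;		/* PRESENT=0, NUMA=1 */

	/* naive mknotpresent: still looks like a NUMA hinting entry */
	unsigned int naive = numa_pmd & ~F_PRESENT;
	assert(naive == numa_pmd);

	/* fixed order: mknonnuma first, then mknotpresent */
	unsigned int fixed = (numa_pmd & ~F_NUMA) & ~F_PRESENT;
	assert(fixed != numa_pmd);	/* unambiguously invalidated now */
	return 0;
}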
diff --git a/mm/rmap.c b/mm/rmap.c
index 55c8b8dc9ffb..068522d8502a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -600,7 +600,11 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
 	spinlock_t *ptl;
 
 	if (unlikely(PageHuge(page))) {
+		/* when pud is not present, pte will be NULL */
 		pte = huge_pte_offset(mm, address);
+		if (!pte)
+			return NULL;
+
 		ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
 		goto check;
 	}