-rw-r--r--  MAINTAINERS                          |  6
-rw-r--r--  Makefile                             | 20
-rw-r--r--  arch/sh/lib/Makefile                 |  2
-rw-r--r--  arch/sparc/include/asm/pgtable_64.h  |  4
-rw-r--r--  arch/x86/include/asm/pgtable.h       | 11
-rw-r--r--  arch/x86/mm/gup.c                    | 13
-rw-r--r--  include/asm-generic/pgtable.h        |  2
-rw-r--r--  include/linux/migrate.h              |  9
-rw-r--r--  include/linux/mm_types.h             | 49
-rw-r--r--  include/linux/reboot.h               |  1
-rw-r--r--  kernel/fork.c                        |  1
-rw-r--r--  kernel/kexec.c                       |  1
-rw-r--r--  kernel/reboot.c                      |  2
-rw-r--r--  kernel/sched/fair.c                  |  7
-rw-r--r--  mm/Kconfig                           |  2
-rw-r--r--  mm/compaction.c                      |  4
-rw-r--r--  mm/huge_memory.c                     | 45
-rw-r--r--  mm/memory-failure.c                  | 14
-rw-r--r--  mm/mempolicy.c                       | 16
-rw-r--r--  mm/migrate.c                         | 69
-rw-r--r--  mm/mprotect.c                        | 13
-rw-r--r--  mm/page_alloc.c                      |  3
-rw-r--r--  mm/pgtable-generic.c                 |  8
-rw-r--r--  mm/rmap.c                            |  4

24 files changed, 249 insertions(+), 57 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index 49c3674294a1..52f761733bfe 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3833,6 +3833,12 @@ T: git git://linuxtv.org/media_tree.git
 S:	Maintained
 F:	drivers/media/usb/gspca/
 
+GUID PARTITION TABLE (GPT)
+M:	Davidlohr Bueso <davidlohr@hp.com>
+L:	linux-efi@vger.kernel.org
+S:	Maintained
+F:	block/partitions/efi.*
+
 STK1160 USB VIDEO CAPTURE DRIVER
 M:	Ezequiel Garcia <elezegarcia@gmail.com>
 L:	linux-media@vger.kernel.org
diff --git a/Makefile b/Makefile
index 858a147fd836..89cee8625651 100644
--- a/Makefile
+++ b/Makefile
@@ -732,19 +732,13 @@ export mod_strip_cmd
 # Select initial ramdisk compression format, default is gzip(1).
 # This shall be used by the dracut(8) tool while creating an initramfs image.
 #
-INITRD_COMPRESS=gzip
-ifeq ($(CONFIG_RD_BZIP2), y)
-	INITRD_COMPRESS=bzip2
-else ifeq ($(CONFIG_RD_LZMA), y)
-	INITRD_COMPRESS=lzma
-else ifeq ($(CONFIG_RD_XZ), y)
-	INITRD_COMPRESS=xz
-else ifeq ($(CONFIG_RD_LZO), y)
-	INITRD_COMPRESS=lzo
-else ifeq ($(CONFIG_RD_LZ4), y)
-	INITRD_COMPRESS=lz4
-endif
-export INITRD_COMPRESS
+INITRD_COMPRESS-y := gzip
+INITRD_COMPRESS-$(CONFIG_RD_BZIP2) := bzip2
+INITRD_COMPRESS-$(CONFIG_RD_LZMA) := lzma
+INITRD_COMPRESS-$(CONFIG_RD_XZ) := xz
+INITRD_COMPRESS-$(CONFIG_RD_LZO) := lzo
+INITRD_COMPRESS-$(CONFIG_RD_LZ4) := lz4
+export INITRD_COMPRESS := $(INITRD_COMPRESS-y)
 
 ifdef CONFIG_MODULE_SIG_ALL
 MODSECKEY = ./signing_key.priv
diff --git a/arch/sh/lib/Makefile b/arch/sh/lib/Makefile
index 7b95f29e3174..3baff31e58cf 100644
--- a/arch/sh/lib/Makefile
+++ b/arch/sh/lib/Makefile
@@ -6,7 +6,7 @@ lib-y = delay.o memmove.o memchr.o \
 	 checksum.o strlen.o div64.o div64-generic.o
 
 # Extracted from libgcc
-lib-y += movmem.o ashldi3.o ashrdi3.o lshrdi3.o \
+obj-y += movmem.o ashldi3.o ashrdi3.o lshrdi3.o \
 	 ashlsi3.o ashrsi3.o ashiftrt.o lshrsi3.o \
 	 udiv_qrnnd.o
 
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 8358dc144959..0f9e94537eee 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -619,7 +619,7 @@ static inline unsigned long pte_present(pte_t pte)
 }
 
 #define pte_accessible pte_accessible
-static inline unsigned long pte_accessible(pte_t a)
+static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a)
 {
 	return pte_val(a) & _PAGE_VALID;
 }
@@ -847,7 +847,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 	 * SUN4V NOTE: _PAGE_VALID is the same value in both the SUN4U
 	 * and SUN4V pte layout, so this inline test is fine.
 	 */
-	if (likely(mm != &init_mm) && pte_accessible(orig))
+	if (likely(mm != &init_mm) && pte_accessible(mm, orig))
 		tlb_batch_add(mm, addr, ptep, orig, fullmm);
 }
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 3d1999458709..bbc8b12fa443 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -452,9 +452,16 @@ static inline int pte_present(pte_t a)
 }
 
 #define pte_accessible pte_accessible
-static inline int pte_accessible(pte_t a)
+static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
 {
-	return pte_flags(a) & _PAGE_PRESENT;
+	if (pte_flags(a) & _PAGE_PRESENT)
+		return true;
+
+	if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
+		mm_tlb_flush_pending(mm))
+		return true;
+
+	return false;
 }
 
 static inline int pte_hidden(pte_t pte)
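For reference (not part of the patch): with the extra mm argument, pte_accessible() also reports a PROT_NONE/NUMA pte as still live while a batched protection change on that mm has set tlb_flush_pending but has not yet flushed the TLB, so a stale, possibly writable translation may still sit in a remote TLB. A condensed sketch of the two consumer patterns this enables, restating the mm/pgtable-generic.c and mm/migrate.c hunks further down:

	/* flush decision after clearing a pte (as in ptep_clear_flush()) */
	pte = ptep_get_and_clear(mm, address, ptep);
	if (pte_accessible(mm, pte))
		flush_tlb_page(vma, address);

	/* THP migration: flush first if a protection change is in flight */
	if (mm_tlb_flush_pending(mm))
		flush_tlb_range(vma, mmun_start, mmun_end);
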
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index dd74e46828c0..0596e8e0cc19 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -83,6 +83,12 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 		pte_t pte = gup_get_pte(ptep);
 		struct page *page;
 
+		/* Similar to the PMD case, NUMA hinting must take slow path */
+		if (pte_numa(pte)) {
+			pte_unmap(ptep);
+			return 0;
+		}
+
 		if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
 			pte_unmap(ptep);
 			return 0;
@@ -167,6 +173,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 			return 0;
 		if (unlikely(pmd_large(pmd))) {
+			/*
+			 * NUMA hinting faults need to be handled in the GUP
+			 * slowpath for accounting purposes and so that they
+			 * can be serialised against THP migration.
+			 */
+			if (pmd_numa(pmd))
+				return 0;
 			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
 				return 0;
 		} else {
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index f330d28e4d0e..b12079afbd5f 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -217,7 +217,7 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 #endif
 
 #ifndef pte_accessible
-# define pte_accessible(pte)		((void)(pte),1)
+# define pte_accessible(mm, pte)	((void)(pte), 1)
 #endif
 
 #ifndef flush_tlb_fix_spurious_fault
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index f5096b58b20d..b7717d74da7f 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -90,10 +90,19 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_NUMA_BALANCING
+extern bool pmd_trans_migrating(pmd_t pmd);
+extern void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd);
 extern int migrate_misplaced_page(struct page *page,
 				  struct vm_area_struct *vma, int node);
 extern bool migrate_ratelimited(int node);
 #else
+static inline bool pmd_trans_migrating(pmd_t pmd)
+{
+	return false;
+}
+static inline void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
+{
+}
 static inline int migrate_misplaced_page(struct page *page,
 					 struct vm_area_struct *vma, int node)
 {
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bd299418a934..ad0616f2fe2c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -443,6 +443,14 @@ struct mm_struct {
 	/* numa_scan_seq prevents two threads setting pte_numa */
 	int numa_scan_seq;
 #endif
+#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
+	/*
+	 * An operation with batched TLB flushing is going on. Anything that
+	 * can move process memory needs to flush the TLB when moving a
+	 * PROT_NONE or PROT_NUMA mapped page.
+	 */
+	bool tlb_flush_pending;
+#endif
 	struct uprobes_state uprobes_state;
 };
 
@@ -459,4 +467,45 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 	return mm->cpu_vm_mask_var;
 }
 
+#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
+/*
+ * Memory barriers to keep this state in sync are graciously provided by
+ * the page table locks, outside of which no page table modifications happen.
+ * The barriers below prevent the compiler from re-ordering the instructions
+ * around the memory barriers that are already present in the code.
+ */
+static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
+{
+	barrier();
+	return mm->tlb_flush_pending;
+}
+static inline void set_tlb_flush_pending(struct mm_struct *mm)
+{
+	mm->tlb_flush_pending = true;
+
+	/*
+	 * Guarantee that the tlb_flush_pending store does not leak into the
+	 * critical section updating the page tables
+	 */
+	smp_mb__before_spinlock();
+}
+/* Clearing is done after a TLB flush, which also provides a barrier. */
+static inline void clear_tlb_flush_pending(struct mm_struct *mm)
+{
+	barrier();
+	mm->tlb_flush_pending = false;
+}
+#else
+static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
+{
+	return false;
+}
+static inline void set_tlb_flush_pending(struct mm_struct *mm)
+{
+}
+static inline void clear_tlb_flush_pending(struct mm_struct *mm)
+{
+}
+#endif
+
 #endif /* _LINUX_MM_TYPES_H */
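For reference (not part of the patch), the new helpers are meant to be used as a bracket on the writer side and a check on the reader side; the sketch below condenses the mm/mprotect.c and arch/x86 pte_accessible() hunks elsewhere in this diff:

	/* writer: batched pte protection change, e.g. change_protection_range() */
	set_tlb_flush_pending(mm);	/* visible before any pte is rewritten */
	/* ... rewrite ptes in [start, end) under the page table lock ... */
	if (pages)
		flush_tlb_range(vma, start, end);
	clear_tlb_flush_pending(mm);	/* only after the TLB has been flushed */

	/* reader: anything that must not miss a still-cached translation */
	if (pte_accessible(mm, pte))	/* present, or PROT_NONE/NUMA with a
					 * flush still pending on this mm */
		flush_tlb_page(vma, address);
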
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 8e00f9f6f963..9e7db9e73cc1 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -43,6 +43,7 @@ extern int unregister_reboot_notifier(struct notifier_block *);
  * Architecture-specific implementations of sys_reboot commands.
  */
 
+extern void migrate_to_reboot_cpu(void);
 extern void machine_restart(char *cmd);
 extern void machine_halt(void);
 extern void machine_power_off(void);
diff --git a/kernel/fork.c b/kernel/fork.c
index 728d5be9548c..5721f0e3f2da 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -537,6 +537,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	spin_lock_init(&mm->page_table_lock);
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
+	clear_tlb_flush_pending(mm);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index d0d8fca54065..9c970167e402 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1680,6 +1680,7 @@ int kernel_kexec(void)
 	{
 		kexec_in_progress = true;
 		kernel_restart_prepare(NULL);
+		migrate_to_reboot_cpu();
 		printk(KERN_EMERG "Starting new kernel\n");
 		machine_shutdown();
 	}
diff --git a/kernel/reboot.c b/kernel/reboot.c
index f813b3474646..662c83fc16b7 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,7 +104,7 @@ int unregister_reboot_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_reboot_notifier);
 
-static void migrate_to_reboot_cpu(void)
+void migrate_to_reboot_cpu(void)
 {
 	/* The boot cpu is always logical cpu 0 */
 	int cpu = reboot_cpu;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9030da7bcb15..c7395d97e4cb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1738,6 +1738,13 @@ void task_numa_work(struct callback_head *work)
 			(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
 			continue;
 
+		/*
+		 * Skip inaccessible VMAs to avoid any confusion between
+		 * PROT_NONE and NUMA hinting ptes
+		 */
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+			continue;
+
 		do {
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
diff --git a/mm/Kconfig b/mm/Kconfig
index eb69f352401d..723bbe04a0b0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -543,7 +543,7 @@ config ZSWAP
 
 config MEM_SOFT_DIRTY
 	bool "Track memory changes"
-	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
 	select PROC_PAGE_MONITOR
 	help
 	  This option enables memory changes tracking by introducing a
diff --git a/mm/compaction.c b/mm/compaction.c
index 805165bcd3dd..f58bcd016f43 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -134,6 +134,10 @@ static void update_pageblock_skip(struct compact_control *cc,
 			bool migrate_scanner)
 {
 	struct zone *zone = cc->zone;
+
+	if (cc->ignore_skip_hint)
+		return;
+
 	if (!page)
 		return;
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 33a5dc492810..7de1bf85f683 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -882,6 +882,10 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		ret = 0;
 		goto out_unlock;
 	}
+
+	/* mmap_sem prevents this happening but warn if that changes */
+	WARN_ON(pmd_trans_migrating(pmd));
+
 	if (unlikely(pmd_trans_splitting(pmd))) {
 		/* split huge page running from under us */
 		spin_unlock(src_ptl);
@@ -1243,6 +1247,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
 		return ERR_PTR(-EFAULT);
 
+	/* Full NUMA hinting faults to serialise migration in fault paths */
+	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+		goto out;
+
 	page = pmd_page(*pmd);
 	VM_BUG_ON(!PageHead(page));
 	if (flags & FOLL_TOUCH) {
@@ -1295,6 +1303,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(!pmd_same(pmd, *pmdp)))
 		goto out_unlock;
 
+	/*
+	 * If there are potential migrations, wait for completion and retry
+	 * without disrupting NUMA hinting information. Do not relock and
+	 * check_same as the page may no longer be mapped.
+	 */
+	if (unlikely(pmd_trans_migrating(*pmdp))) {
+		spin_unlock(ptl);
+		wait_migrate_huge_page(vma->anon_vma, pmdp);
+		goto out;
+	}
+
 	page = pmd_page(pmd);
 	BUG_ON(is_huge_zero_page(page));
 	page_nid = page_to_nid(page);
@@ -1323,23 +1342,22 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* If the page was locked, there are no parallel migrations */
 		if (page_locked)
 			goto clear_pmdnuma;
+	}
 
-		/*
-		 * Otherwise wait for potential migrations and retry. We do
-		 * relock and check_same as the page may no longer be mapped.
-		 * As the fault is being retried, do not account for it.
-		 */
+	/* Migration could have started since the pmd_trans_migrating check */
+	if (!page_locked) {
 		spin_unlock(ptl);
 		wait_on_page_locked(page);
 		page_nid = -1;
 		goto out;
 	}
 
-	/* Page is misplaced, serialise migrations and parallel THP splits */
+	/*
+	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
+	 * to serialises splits
+	 */
 	get_page(page);
 	spin_unlock(ptl);
-	if (!page_locked)
-		lock_page(page);
 	anon_vma = page_lock_anon_vma_read(page);
 
 	/* Confirm the PMD did not change while page_table_lock was released */
@@ -1351,6 +1369,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_unlock;
 	}
 
+	/* Bail if we fail to protect against THP splits for any reason */
+	if (unlikely(!anon_vma)) {
+		put_page(page);
+		page_nid = -1;
+		goto clear_pmdnuma;
+	}
+
 	/*
 	 * Migrate the THP to the requested node, returns with page unlocked
 	 * and pmd_numa cleared.
@@ -1517,6 +1542,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		ret = 1;
 		if (!prot_numa) {
 			entry = pmdp_get_and_clear(mm, addr, pmd);
+			if (pmd_numa(entry))
+				entry = pmd_mknonnuma(entry);
 			entry = pmd_modify(entry, newprot);
 			ret = HPAGE_PMD_NR;
 			BUG_ON(pmd_write(entry));
@@ -1531,7 +1558,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			 */
 			if (!is_huge_zero_page(page) &&
 			    !pmd_numa(*pmd)) {
-				entry = pmdp_get_and_clear(mm, addr, pmd);
+				entry = *pmd;
 				entry = pmd_mknuma(entry);
 				ret = HPAGE_PMD_NR;
 			}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b7c171602ba1..db08af92c6fc 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1505,10 +1505,16 @@ static int soft_offline_huge_page(struct page *page, int flags)
 		if (ret > 0)
 			ret = -EIO;
 	} else {
-		set_page_hwpoison_huge_page(hpage);
-		dequeue_hwpoisoned_huge_page(hpage);
-		atomic_long_add(1 << compound_order(hpage),
-				&num_poisoned_pages);
+		/* overcommit hugetlb page will be freed to buddy */
+		if (PageHuge(page)) {
+			set_page_hwpoison_huge_page(hpage);
+			dequeue_hwpoisoned_huge_page(hpage);
+			atomic_long_add(1 << compound_order(hpage),
+					&num_poisoned_pages);
+		} else {
+			SetPageHWPoison(page);
+			atomic_long_inc(&num_poisoned_pages);
+		}
 	}
 	return ret;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eca4a3129129..0cd2c4d4e270 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1197,14 +1197,16 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
 			break;
 		vma = vma->vm_next;
 	}
+
+	if (PageHuge(page)) {
+		if (vma)
+			return alloc_huge_page_noerr(vma, address, 1);
+		else
+			return NULL;
+	}
 	/*
-	 * queue_pages_range() confirms that @page belongs to some vma,
-	 * so vma shouldn't be NULL.
+	 * if !vma, alloc_page_vma() will use task or system default policy
 	 */
-	BUG_ON(!vma);
-
-	if (PageHuge(page))
-		return alloc_huge_page_noerr(vma, address, 1);
 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 }
 #else
@@ -1318,7 +1320,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	} else
-		putback_lru_pages(&pagelist);
+		putback_movable_pages(&pagelist);
 
 	up_write(&mm->mmap_sem);
  mpol_out:
diff --git a/mm/migrate.c b/mm/migrate.c
index bb940045fe85..e9b710201335 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,6 +36,7 @@
 #include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
 #include <linux/balloon_compaction.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 
@@ -1654,6 +1655,18 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 	return 1;
 }
 
+bool pmd_trans_migrating(pmd_t pmd)
+{
+	struct page *page = pmd_page(pmd);
+	return PageLocked(page);
+}
+
+void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
+{
+	struct page *page = pmd_page(*pmd);
+	wait_on_page_locked(page);
+}
+
 /*
  * Attempt to migrate a misplaced page to the specified destination
  * node. Caller is expected to have an elevated reference count on
@@ -1716,12 +1729,14 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 				struct page *page, int node)
 {
 	spinlock_t *ptl;
-	unsigned long haddr = address & HPAGE_PMD_MASK;
 	pg_data_t *pgdat = NODE_DATA(node);
 	int isolated = 0;
 	struct page *new_page = NULL;
 	struct mem_cgroup *memcg = NULL;
 	int page_lru = page_is_file_cache(page);
+	unsigned long mmun_start = address & HPAGE_PMD_MASK;
+	unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
+	pmd_t orig_entry;
 
 	/*
 	 * Rate-limit the amount of data that is being migrated to a node.
@@ -1744,6 +1759,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		goto out_fail;
 	}
 
+	if (mm_tlb_flush_pending(mm))
+		flush_tlb_range(vma, mmun_start, mmun_end);
+
 	/* Prepare a page as a migration target */
 	__set_page_locked(new_page);
 	SetPageSwapBacked(new_page);
@@ -1755,9 +1773,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	WARN_ON(PageLRU(new_page));
 
 	/* Recheck the target PMD */
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	ptl = pmd_lock(mm, pmd);
-	if (unlikely(!pmd_same(*pmd, entry))) {
+	if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
+fail_putback:
 		spin_unlock(ptl);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 		/* Reverse changes made by migrate_page_copy() */
 		if (TestClearPageActive(new_page))
@@ -1774,7 +1795,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		putback_lru_page(page);
 		mod_zone_page_state(page_zone(page),
 			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
-		goto out_fail;
+
+		goto out_unlock;
 	}
 
 	/*
@@ -1786,16 +1808,35 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	 */
 	mem_cgroup_prepare_migration(page, new_page, &memcg);
 
+	orig_entry = *pmd;
 	entry = mk_pmd(new_page, vma->vm_page_prot);
-	entry = pmd_mknonnuma(entry);
-	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 	entry = pmd_mkhuge(entry);
+	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 
-	pmdp_clear_flush(vma, haddr, pmd);
-	set_pmd_at(mm, haddr, pmd, entry);
-	page_add_new_anon_rmap(new_page, vma, haddr);
+	/*
+	 * Clear the old entry under pagetable lock and establish the new PTE.
+	 * Any parallel GUP will either observe the old page blocking on the
+	 * page lock, block on the page table lock or observe the new page.
+	 * The SetPageUptodate on the new page and page_add_new_anon_rmap
+	 * guarantee the copy is visible before the pagetable update.
+	 */
+	flush_cache_range(vma, mmun_start, mmun_end);
+	page_add_new_anon_rmap(new_page, vma, mmun_start);
+	pmdp_clear_flush(vma, mmun_start, pmd);
+	set_pmd_at(mm, mmun_start, pmd, entry);
+	flush_tlb_range(vma, mmun_start, mmun_end);
 	update_mmu_cache_pmd(vma, address, &entry);
+
+	if (page_count(page) != 2) {
+		set_pmd_at(mm, mmun_start, pmd, orig_entry);
+		flush_tlb_range(vma, mmun_start, mmun_end);
+		update_mmu_cache_pmd(vma, address, &entry);
+		page_remove_rmap(new_page);
+		goto fail_putback;
+	}
+
 	page_remove_rmap(page);
+
 	/*
 	 * Finish the charge transaction under the page table lock to
 	 * prevent split_huge_page() from dividing up the charge
@@ -1803,6 +1844,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	 */
 	mem_cgroup_end_migration(memcg, page, new_page, true);
 	spin_unlock(ptl);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 	unlock_page(new_page);
 	unlock_page(page);
@@ -1820,10 +1862,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 out_fail:
 	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 out_dropref:
-	entry = pmd_mknonnuma(entry);
-	set_pmd_at(mm, haddr, pmd, entry);
-	update_mmu_cache_pmd(vma, address, &entry);
+	ptl = pmd_lock(mm, pmd);
+	if (pmd_same(*pmd, entry)) {
+		entry = pmd_mknonnuma(entry);
+		set_pmd_at(mm, mmun_start, pmd, entry);
+		update_mmu_cache_pmd(vma, address, &entry);
+	}
+	spin_unlock(ptl);
 
+out_unlock:
 	unlock_page(page);
 	put_page(page);
 	return 0;
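For reference (not part of the patch), the page lock on the THP is the only state the fault path needs in order to notice and wait out a migration in progress; the sketch below condenses the do_huge_pmd_numa_page() hunk above and the two helpers added here:

	/* fault side, under the pmd lock */
	if (unlikely(pmd_trans_migrating(*pmdp))) {		/* PageLocked(pmd_page(*pmdp)) */
		spin_unlock(ptl);
		wait_migrate_huge_page(vma->anon_vma, pmdp);	/* wait_on_page_locked() */
		goto out;					/* retry the fault later */
	}

	/* migration side: the old page stays locked across the pmd switch in
	 * migrate_misplaced_transhuge_page(), so a racing fault either sees
	 * the lock and waits, or already sees the new pmd. */
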
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 26667971c824..bb53a6591aea 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -52,17 +52,21 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			pte_t ptent;
 			bool updated = false;
 
-			ptent = ptep_modify_prot_start(mm, addr, pte);
 			if (!prot_numa) {
+				ptent = ptep_modify_prot_start(mm, addr, pte);
+				if (pte_numa(ptent))
+					ptent = pte_mknonnuma(ptent);
 				ptent = pte_modify(ptent, newprot);
 				updated = true;
 			} else {
 				struct page *page;
 
+				ptent = *pte;
 				page = vm_normal_page(vma, addr, oldpte);
 				if (page) {
 					if (!pte_numa(oldpte)) {
 						ptent = pte_mknuma(ptent);
+						set_pte_at(mm, addr, pte, ptent);
 						updated = true;
 					}
 				}
@@ -79,7 +83,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 			if (updated)
 				pages++;
-			ptep_modify_prot_commit(mm, addr, pte, ptent);
+
+			/* Only !prot_numa always clears the pte */
+			if (!prot_numa)
+				ptep_modify_prot_commit(mm, addr, pte, ptent);
 		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 
@@ -181,6 +188,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
+	set_tlb_flush_pending(mm);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
@@ -192,6 +200,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 	/* Only flush the TLB if we actually modified any entries: */
 	if (pages)
 		flush_tlb_range(vma, start, end);
+	clear_tlb_flush_pending(mm);
 
 	return pages;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 580a5f075ed0..f861d0257e90 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1920,7 +1920,8 @@ zonelist_scan:
 		 * back to remote zones that do not partake in the
 		 * fairness round-robin cycle of this zonelist.
 		 */
-		if (alloc_flags & ALLOC_WMARK_LOW) {
+		if ((alloc_flags & ALLOC_WMARK_LOW) &&
+		    (gfp_mask & GFP_MOVABLE_MASK)) {
 			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
 				continue;
 			if (zone_reclaim_mode &&
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index cbb38545d9d6..a8b919925934 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -110,9 +110,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
 pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
 		       pte_t *ptep)
 {
+	struct mm_struct *mm = (vma)->vm_mm;
 	pte_t pte;
-	pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
-	if (pte_accessible(pte))
+	pte = ptep_get_and_clear(mm, address, ptep);
+	if (pte_accessible(mm, pte))
 		flush_tlb_page(vma, address);
 	return pte;
 }
@@ -191,6 +192,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
+	pmd_t entry = *pmdp;
+	if (pmd_numa(entry))
+		entry = pmd_mknonnuma(entry);
 	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
 	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 55c8b8dc9ffb..068522d8502a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -600,7 +600,11 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
 	spinlock_t *ptl;
 
 	if (unlikely(PageHuge(page))) {
+		/* when pud is not present, pte will be NULL */
 		pte = huge_pte_offset(mm, address);
+		if (!pte)
+			return NULL;
+
 		ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
 		goto check;
 	}