author     Linus Torvalds <torvalds@linux-foundation.org>  2017-08-03 17:58:13 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-08-03 17:58:13 -0400
commit     995d03ae266d3f3bec8844d01b6c3ded19b9cc1c (patch)
tree       a98a9fa566b55def6d742578cd078afb21a44ed6
parent     8d3fe85f07a99cbf82f12c486414eda1d186bfa1 (diff)
parent     19ec8e48582670c021e998b9deb88e39a842ff45 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
 "15 fixes"

[ This does not merge the "fortify: use WARN instead of BUG for now"
  patch, which needs a bit of extra work to build cleanly with all
  configurations. Arnd is on it. - Linus ]

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  ocfs2: don't clear SGID when inheriting ACLs
  mm: allow page_cache_get_speculative in interrupt context
  userfaultfd: non-cooperative: flush event_wqh at release time
  ipc: add missing container_of()s for randstruct
  cpuset: fix a deadlock due to incomplete patching of cpusets_enabled()
  userfaultfd_zeropage: return -ENOSPC in case mm has gone
  mm: take memory hotplug lock within numa_zonelist_order_handler()
  mm/page_io.c: fix oops during block io poll in swapin path
  zram: do not free pool->size_class
  kthread: fix documentation build warning
  kasan: avoid -Wmaybe-uninitialized warning
  userfaultfd: non-cooperative: notify about unmap of destination during mremap
  mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries
  pid: kill pidhash_size in pidhash_init()
  mm/hugetlb.c: __get_user_pages ignores certain follow_hugetlb_page errors
-rw-r--r--  fs/ocfs2/acl.c             | 24
-rw-r--r--  fs/userfaultfd.c           |  5
-rw-r--r--  include/linux/cpuset.h     | 19
-rw-r--r--  include/linux/kthread.h    |  2
-rw-r--r--  include/linux/mm_types.h   |  4
-rw-r--r--  include/linux/pagemap.h    |  2
-rw-r--r--  ipc/msg.c                  |  3
-rw-r--r--  ipc/sem.c                  |  3
-rw-r--r--  ipc/shm.c                  |  4
-rw-r--r--  kernel/cgroup/cpuset.c     |  1
-rw-r--r--  kernel/pid.c               |  3
-rw-r--r--  mm/hugetlb.c               |  9
-rw-r--r--  mm/internal.h              |  5
-rw-r--r--  mm/kasan/report.c          |  1
-rw-r--r--  mm/madvise.c               |  1
-rw-r--r--  mm/memory.c                |  1
-rw-r--r--  mm/mprotect.c              |  1
-rw-r--r--  mm/mremap.c                |  8
-rw-r--r--  mm/page_alloc.c            |  2
-rw-r--r--  mm/page_io.c               |  7
-rw-r--r--  mm/rmap.c                  | 36
-rw-r--r--  mm/zsmalloc.c              |  1
22 files changed, 109 insertions, 33 deletions
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index dc22ba8c710f..e50a387959bf 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -240,18 +240,6 @@ int ocfs2_set_acl(handle_t *handle,
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
-		if (acl) {
-			umode_t mode;
-
-			ret = posix_acl_update_mode(inode, &mode, &acl);
-			if (ret)
-				return ret;
-
-			ret = ocfs2_acl_set_mode(inode, di_bh,
-						 handle, mode);
-			if (ret)
-				return ret;
-		}
 		break;
 	case ACL_TYPE_DEFAULT:
 		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
@@ -289,7 +277,19 @@ int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
 	if (had_lock < 0)
 		return had_lock;
+	if (type == ACL_TYPE_ACCESS && acl) {
+		umode_t mode;
+
+		status = posix_acl_update_mode(inode, &mode, &acl);
+		if (status)
+			goto unlock;
+
+		status = ocfs2_acl_set_mode(inode, bh, NULL, mode);
+		if (status)
+			goto unlock;
+	}
 	status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
+unlock:
 	ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
 	brelse(bh);
 	return status;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index cadcd12a3d35..06ea26b8c996 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -854,6 +854,9 @@ wakeup:
 	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
 	spin_unlock(&ctx->fault_pending_wqh.lock);
 
+	/* Flush pending events that may still wait on event_wqh */
+	wake_up_all(&ctx->event_wqh);
+
 	wake_up_poll(&ctx->fd_wqh, POLLHUP);
 	userfaultfd_ctx_put(ctx);
 	return 0;
@@ -1643,6 +1646,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
 				     uffdio_zeropage.range.len);
 		mmput(ctx->mm);
+	} else {
+		return -ENOSPC;
 	}
 	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
 		return -EFAULT;
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 119a3f9604b0..898cfe2eeb42 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -18,6 +18,19 @@
 
 #ifdef CONFIG_CPUSETS
 
+/*
+ * Static branch rewrites can happen in an arbitrary order for a given
+ * key. In code paths where we need to loop with read_mems_allowed_begin() and
+ * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need
+ * to ensure that begin() always gets rewritten before retry() in the
+ * disabled -> enabled transition. If not, then if local irqs are disabled
+ * around the loop, we can deadlock since retry() would always be
+ * comparing the latest value of the mems_allowed seqcount against 0 as
+ * begin() still would see cpusets_enabled() as false. The enabled -> disabled
+ * transition should happen in reverse order for the same reasons (want to stop
+ * looking at real value of mems_allowed.sequence in retry() first).
+ */
+extern struct static_key_false cpusets_pre_enable_key;
 extern struct static_key_false cpusets_enabled_key;
 static inline bool cpusets_enabled(void)
 {
@@ -32,12 +45,14 @@ static inline int nr_cpusets(void)
 
 static inline void cpuset_inc(void)
 {
+	static_branch_inc(&cpusets_pre_enable_key);
 	static_branch_inc(&cpusets_enabled_key);
 }
 
 static inline void cpuset_dec(void)
 {
 	static_branch_dec(&cpusets_enabled_key);
+	static_branch_dec(&cpusets_pre_enable_key);
 }
 
 extern int cpuset_init(void);
@@ -115,7 +130,7 @@ extern void cpuset_print_current_mems_allowed(void);
  */
 static inline unsigned int read_mems_allowed_begin(void)
 {
-	if (!cpusets_enabled())
+	if (!static_branch_unlikely(&cpusets_pre_enable_key))
 		return 0;
 
 	return read_seqcount_begin(&current->mems_allowed_seq);
@@ -129,7 +144,7 @@ static inline unsigned int read_mems_allowed_begin(void)
  */
 static inline bool read_mems_allowed_retry(unsigned int seq)
 {
-	if (!cpusets_enabled())
+	if (!static_branch_unlikely(&cpusets_enabled_key))
 		return false;
 
 	return read_seqcount_retry(&current->mems_allowed_seq, seq);
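
The ordering requirement described in the new comment can be illustrated with a small userspace model. This is only a sketch, not kernel code: the two booleans stand in for cpusets_pre_enable_key and cpusets_enabled_key, the plain counter stands in for current->mems_allowed_seq, and the helper names mirror the kernel functions purely for readability.

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for the two static keys and the mems_allowed seqcount. */
static bool pre_enable_key;	/* gates the model's read_mems_allowed_begin() */
static bool enabled_key;	/* gates the model's read_mems_allowed_retry() */
static unsigned int mems_allowed_seq = 2;	/* some non-zero live value */

static unsigned int read_mems_allowed_begin(void)
{
	return pre_enable_key ? mems_allowed_seq : 0;
}

static bool read_mems_allowed_retry(unsigned int seq)
{
	/* true means "the sequence moved, retry the loop" */
	return enabled_key ? (mems_allowed_seq != seq) : false;
}

int main(void)
{
	unsigned int seq;

	/*
	 * The window the patch closes: retry() already sees its key enabled
	 * while begin() still sees cpusets disabled, so begin() returns 0 and
	 * retry() keeps comparing the live sequence against 0 -> the caller
	 * loop never terminates (a deadlock if local irqs are disabled around
	 * it, since the branch patching can then never complete).
	 */
	pre_enable_key = false;
	enabled_key = true;
	seq = read_mems_allowed_begin();
	printf("mid-transition: begin()=%u retry()=%d (would spin)\n",
	       seq, read_mems_allowed_retry(seq));

	/*
	 * With cpusets_pre_enable_key always flipped first, begin() can never
	 * lag behind retry(), so the snapshot and the check stay consistent.
	 */
	pre_enable_key = true;
	seq = read_mems_allowed_begin();
	printf("after fixed ordering: begin()=%u retry()=%d\n",
	       seq, read_mems_allowed_retry(seq));
	return 0;
}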
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 4fec8b775895..82e197eeac91 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -15,7 +15,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
  * @threadfn: the function to run in the thread
  * @data: data pointer for @threadfn()
  * @namefmt: printf-style format string for the thread name
- * @...: arguments for @namefmt.
+ * @arg...: arguments for @namefmt.
  *
  * This macro will create a kthread on the current node, leaving it in
  * the stopped state. This is just a helper for kthread_create_on_node();
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ff151814a02d..7f384bb62d8e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -495,6 +495,10 @@ struct mm_struct {
 	 */
 	bool tlb_flush_pending;
 #endif
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	/* See flush_tlb_batched_pending() */
+	bool tlb_flush_batched;
+#endif
 	struct uprobes_state uprobes_state;
 #ifdef CONFIG_HUGETLB_PAGE
 	atomic_long_t hugetlb_usage;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index baa9344dcd10..79b36f57c3ba 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -163,8 +163,6 @@ void release_pages(struct page **pages, int nr, bool cold);
  */
 static inline int page_cache_get_speculative(struct page *page)
 {
-	VM_BUG_ON(in_interrupt());
-
 #ifdef CONFIG_TINY_RCU
 # ifdef CONFIG_PREEMPT_COUNT
 	VM_BUG_ON(!in_atomic() && !irqs_disabled());
diff --git a/ipc/msg.c b/ipc/msg.c
index 5b25e0755656..2c38f10d1483 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -1034,7 +1034,8 @@ void msg_exit_ns(struct ipc_namespace *ns)
 static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
 {
 	struct user_namespace *user_ns = seq_user_ns(s);
-	struct msg_queue *msq = it;
+	struct kern_ipc_perm *ipcp = it;
+	struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
 
 	seq_printf(s,
 		   "%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n",
diff --git a/ipc/sem.c b/ipc/sem.c
index 9e70cd7a17da..38371e93bfa5 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -2179,7 +2179,8 @@ void exit_sem(struct task_struct *tsk)
 static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 {
 	struct user_namespace *user_ns = seq_user_ns(s);
-	struct sem_array *sma = it;
+	struct kern_ipc_perm *ipcp = it;
+	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
 	time_t sem_otime;
 
 	/*
diff --git a/ipc/shm.c b/ipc/shm.c
index 28a444861a8f..8828b4c3a190 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1380,9 +1380,11 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
 static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 {
 	struct user_namespace *user_ns = seq_user_ns(s);
-	struct shmid_kernel *shp = it;
+	struct kern_ipc_perm *ipcp = it;
+	struct shmid_kernel *shp;
 	unsigned long rss = 0, swp = 0;
 
+	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
 	shm_add_rss_swap(shp, &rss, &swp);
 
 #if BITS_PER_LONG <= 32
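
The three ipc changes above replace a direct cast of the iterator cookie with container_of(), which stays correct even when structure layout randomization moves the kern_ipc_perm member away from offset zero. Below is a self-contained userspace sketch of the same idiom; the struct and field names are made up for illustration, and only the container_of()/offsetof() pattern matches what the kernel macro does.

#include <stddef.h>
#include <stdio.h>

/* What the kernel's container_of() reduces to for this use. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/*
 * Illustrative only: pretend the permission member is NOT the first field,
 * as a randomized layout may arrange it. A plain cast from &q->perm back to
 * struct toy_queue * would then point at the wrong address.
 */
struct toy_perm { int key; };
struct toy_queue {
	long qnum;		/* anything placed before the member */
	struct toy_perm perm;	/* the embedded member the iterator hands out */
	long bytes;
};

int main(void)
{
	struct toy_queue q = { .qnum = 3, .perm = { .key = 42 }, .bytes = 128 };
	struct toy_perm *ipcp = &q.perm;	/* what the void *it would carry */
	struct toy_queue *back = container_of(ipcp, struct toy_queue, perm);

	/* back == &q regardless of where 'perm' sits inside the struct. */
	printf("qnum=%ld key=%d bytes=%ld\n", back->qnum, back->perm.key, back->bytes);
	return 0;
}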
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ca8376e5008c..8d5151688504 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -63,6 +63,7 @@
 #include <linux/cgroup.h>
 #include <linux/wait.h>
 
+DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
 
 /* See "Frequency meter" comments, below. */
diff --git a/kernel/pid.c b/kernel/pid.c
index 731c4e528f4e..c69c30d827e5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -575,13 +575,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
  */
 void __init pidhash_init(void)
 {
-	unsigned int pidhash_size;
-
 	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
 					   HASH_EARLY | HASH_SMALL | HASH_ZERO,
 					   &pidhash_shift, NULL,
 					   0, 4096);
-	pidhash_size = 1U << pidhash_shift;
 }
 
 void __init pidmap_init(void)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc48ee783dd9..a1a0ac0ad6f6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4078,6 +4078,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vaddr = *position;
 	unsigned long remainder = *nr_pages;
 	struct hstate *h = hstate_vma(vma);
+	int err = -EFAULT;
 
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
@@ -4154,11 +4155,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			}
 			ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
 			if (ret & VM_FAULT_ERROR) {
-				int err = vm_fault_to_errno(ret, flags);
-
-				if (err)
-					return err;
-
+				err = vm_fault_to_errno(ret, flags);
 				remainder = 0;
 				break;
 			}
@@ -4213,7 +4210,7 @@ same_page:
 	 */
 	*position = vaddr;
 
-	return i ? i : -EFAULT;
+	return i ? i : err;
 }
 
 #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
diff --git a/mm/internal.h b/mm/internal.h
index 24d88f084705..4ef49fc55e58 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -498,6 +498,7 @@ extern struct workqueue_struct *mm_percpu_wq;
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 void try_to_unmap_flush(void);
 void try_to_unmap_flush_dirty(void);
+void flush_tlb_batched_pending(struct mm_struct *mm);
 #else
 static inline void try_to_unmap_flush(void)
 {
@@ -505,7 +506,9 @@ static inline void try_to_unmap_flush(void)
 static inline void try_to_unmap_flush_dirty(void)
 {
 }
-
+static inline void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+}
 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 
 extern const struct trace_print_flags pageflag_names[];
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 04bb1d3eb9ec..6bcfb01ba038 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -401,6 +401,7 @@ void kasan_report(unsigned long addr, size_t size,
 	disable_trace_on_warning();
 
 	info.access_addr = (void *)addr;
+	info.first_bad_addr = (void *)addr;
 	info.access_size = size;
 	info.is_write = is_write;
 	info.ip = ip;
diff --git a/mm/madvise.c b/mm/madvise.c
index 9976852f1e1c..47d8d8a25eae 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -320,6 +320,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 
 	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		ptent = *pte;
diff --git a/mm/memory.c b/mm/memory.c
index 0e517be91a89..f65beaad319b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1197,6 +1197,7 @@ again:
 	init_rss_vec(rss);
 	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	pte = start_pte;
+	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 1a8c9ca83e48..4180ad8cc9c5 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -64,6 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	    atomic_read(&vma->vm_mm->mm_users) == 1)
 		target_node = numa_node_id();
 
+	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 	do {
 		oldpte = *pte;
diff --git a/mm/mremap.c b/mm/mremap.c
index cd8a1b199ef9..3f23715d3c69 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -152,6 +152,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	new_ptl = pte_lockptr(mm, new_pmd);
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
@@ -428,6 +429,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		unsigned long new_addr, unsigned long new_len, bool *locked,
 		struct vm_userfaultfd_ctx *uf,
+		struct list_head *uf_unmap_early,
 		struct list_head *uf_unmap)
 {
 	struct mm_struct *mm = current->mm;
@@ -446,7 +448,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if (addr + old_len > new_addr && new_addr + new_len > addr)
 		goto out;
 
-	ret = do_munmap(mm, new_addr, new_len, NULL);
+	ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
 	if (ret)
 		goto out;
 
@@ -514,6 +516,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	unsigned long charged = 0;
 	bool locked = false;
 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
+	LIST_HEAD(uf_unmap_early);
 	LIST_HEAD(uf_unmap);
 
 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
@@ -541,7 +544,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
 	if (flags & MREMAP_FIXED) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
-				&locked, &uf, &uf_unmap);
+				&locked, &uf, &uf_unmap_early, &uf_unmap);
 		goto out;
 	}
 
@@ -621,6 +624,7 @@ out:
 	up_write(&current->mm->mmap_sem);
 	if (locked && new_len > old_len)
 		mm_populate(new_addr + old_len, new_len - old_len);
+	userfaultfd_unmap_complete(mm, &uf_unmap_early);
 	mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
 	userfaultfd_unmap_complete(mm, &uf_unmap);
 	return ret;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d30e914afb6..fc32aa81f359 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4891,9 +4891,11 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
 				NUMA_ZONELIST_ORDER_LEN);
 			user_zonelist_order = oldval;
 		} else if (oldval != user_zonelist_order) {
+			mem_hotplug_begin();
 			mutex_lock(&zonelists_mutex);
 			build_all_zonelists(NULL, NULL);
 			mutex_unlock(&zonelists_mutex);
+			mem_hotplug_done();
 		}
 	}
 out:
diff --git a/mm/page_io.c b/mm/page_io.c
index b6c4ac388209..5f61b54ee1f3 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -22,6 +22,7 @@
 #include <linux/frontswap.h>
 #include <linux/blkdev.h>
 #include <linux/uio.h>
+#include <linux/sched/task.h>
 #include <asm/pgtable.h>
 
 static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -136,6 +137,7 @@ out:
 	WRITE_ONCE(bio->bi_private, NULL);
 	bio_put(bio);
 	wake_up_process(waiter);
+	put_task_struct(waiter);
 }
 
 int generic_swapfile_activate(struct swap_info_struct *sis,
@@ -378,6 +380,11 @@ int swap_readpage(struct page *page, bool do_poll)
 		goto out;
 	}
 	bdev = bio->bi_bdev;
+	/*
+	 * Keep this task valid during swap readpage because the oom killer may
+	 * attempt to access it in the page fault retry time check.
+	 */
+	get_task_struct(current);
 	bio->bi_private = current;
 	bio_set_op_attrs(bio, REQ_OP_READ, 0);
 	count_vm_event(PSWPIN);
diff --git a/mm/rmap.c b/mm/rmap.c
index ced14f1af6dc..c8993c63eb25 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -605,6 +605,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
 	tlb_ubc->flush_required = true;
 
 	/*
+	 * Ensure compiler does not re-order the setting of tlb_flush_batched
+	 * before the PTE is cleared.
+	 */
+	barrier();
+	mm->tlb_flush_batched = true;
+
+	/*
 	 * If the PTE was dirty then it's best to assume it's writable. The
 	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
 	 * before the page is queued for IO.
@@ -631,6 +638,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
 
 	return should_defer;
 }
+
+/*
+ * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the page. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and mumap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+	if (mm->tlb_flush_batched) {
+		flush_tlb_mm(mm);
+
+		/*
+		 * Do not allow the compiler to re-order the clearing of
+		 * tlb_flush_batched before the tlb is flushed.
+		 */
+		barrier();
+		mm->tlb_flush_batched = false;
+	}
+}
 #else
 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
 {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 013eea76685e..308acb9d814b 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2453,7 +2453,6 @@ void zs_destroy_pool(struct zs_pool *pool)
 	}
 
 	destroy_cache(pool);
-	kfree(pool->size_class);
 	kfree(pool->name);
 	kfree(pool);
 }