author     Linus Torvalds <torvalds@linux-foundation.org>  2017-01-24 19:54:39 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-01-24 19:54:39 -0500
commit     883af14e67e8b8702b5560aa64c888c0cd0bd66c (patch)
tree       74e3a6b53f5fad9f7848ab1b9f6921b7012940a4
parent     0263d4ebd94b36280608e296cba39b924b6e832b (diff)
parent     aab45453ff5c77200c6da4ac909f7a4392aed17e (diff)
Merge branch 'akpm' (patches from Andrew)
Merge fixes from Andrew Morton:
 "26 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (26 commits)
  MAINTAINERS: add Dan Streetman to zbud maintainers
  MAINTAINERS: add Dan Streetman to zswap maintainers
  mm: do not export ioremap_page_range symbol for external module
  mn10300: fix build error of missing fpu_save()
  romfs: use different way to generate fsid for BLOCK or MTD
  frv: add missing atomic64 operations
  mm, page_alloc: fix premature OOM when racing with cpuset mems update
  mm, page_alloc: move cpuset seqcount checking to slowpath
  mm, page_alloc: fix fast-path race with cpuset update or removal
  mm, page_alloc: fix check for NULL preferred_zone
  kernel/panic.c: add missing \n
  fbdev: color map copying bounds checking
  frv: add atomic64_add_unless()
  mm/mempolicy.c: do not put mempolicy before using its nodemask
  radix-tree: fix private list warnings
  Documentation/filesystems/proc.txt: add VmPin
  mm, memcg: do not retry precharge charges
  proc: add a schedule point in proc_pid_readdir()
  mm: alloc_contig: re-allow CMA to compact FS pages
  mm/slub.c: trace free objects at KERN_INFO
  ...
-rw-r--r--  Documentation/filesystems/proc.txt    |  5
-rw-r--r--  MAINTAINERS                           |  2
-rw-r--r--  arch/frv/include/asm/atomic.h         | 35
-rw-r--r--  arch/mn10300/include/asm/switch_to.h  |  2
-rw-r--r--  drivers/base/memory.c                 |  4
-rw-r--r--  drivers/memstick/core/memstick.c      |  2
-rw-r--r--  drivers/video/fbdev/core/fbcmap.c     | 26
-rw-r--r--  fs/Kconfig                            |  1
-rw-r--r--  fs/dax.c                              |  2
-rw-r--r--  fs/ext2/Kconfig                       |  1
-rw-r--r--  fs/ext4/Kconfig                       |  1
-rw-r--r--  fs/proc/base.c                        |  2
-rw-r--r--  fs/romfs/super.c                      | 23
-rw-r--r--  fs/userfaultfd.c                      | 37
-rw-r--r--  include/linux/memory_hotplug.h        |  4
-rw-r--r--  include/linux/mmzone.h                |  6
-rw-r--r--  include/linux/nmi.h                   |  1
-rw-r--r--  kernel/panic.c                        |  2
-rw-r--r--  kernel/watchdog.c                     |  9
-rw-r--r--  kernel/watchdog_hld.c                 |  3
-rw-r--r--  lib/ioremap.c                         |  1
-rw-r--r--  lib/radix-tree.c                      |  2
-rw-r--r--  mm/huge_memory.c                      | 18
-rw-r--r--  mm/memcontrol.c                       |  4
-rw-r--r--  mm/memory_hotplug.c                   | 28
-rw-r--r--  mm/mempolicy.c                        |  2
-rw-r--r--  mm/page_alloc.c                       | 69
-rw-r--r--  mm/slub.c                             | 23
28 files changed, 237 insertions, 78 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 72624a16b792..c94b4675d021 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -212,10 +212,11 @@ asynchronous manner and the value may not be very precise. To see a precise
 snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
 It's slow but very precise.

-Table 1-2: Contents of the status files (as of 4.1)
+Table 1-2: Contents of the status files (as of 4.8)
 ..............................................................................
  Field                       Content
  Name                        filename of the executable
+ Umask                       file mode creation mask
  State                       state (R is running, S is sleeping, D is sleeping
                              in an uninterruptible wait, Z is zombie,
                              T is traced or stopped)
@@ -226,7 +227,6 @@ Table 1-2: Contents of the status files (as of 4.1)
  TracerPid                   PID of process tracing this process (0 if not)
  Uid                         Real, effective, saved set, and file system UIDs
  Gid                         Real, effective, saved set, and file system GIDs
- Umask                       file mode creation mask
  FDSize                      number of file descriptor slots currently allocated
  Groups                      supplementary group list
  NStgid                      descendant namespace thread group ID hierarchy
@@ -236,6 +236,7 @@ Table 1-2: Contents of the status files (as of 4.1)
  VmPeak                      peak virtual memory size
  VmSize                      total program size
  VmLck                       locked memory size
+ VmPin                       pinned memory size
  VmHWM                       peak resident set size ("high water mark")
  VmRSS                       size of memory portions. It contains the three
                              following parts (VmRSS = RssAnon + RssFile + RssShmem)
diff --git a/MAINTAINERS b/MAINTAINERS
index 795942555b4d..50e6f7c561d8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13625,6 +13625,7 @@ F: drivers/net/hamradio/z8530.h

 ZBUD COMPRESSED PAGE ALLOCATOR
 M:      Seth Jennings <sjenning@redhat.com>
+M:      Dan Streetman <ddstreet@ieee.org>
 L:      linux-mm@kvack.org
 S:      Maintained
 F:      mm/zbud.c
@@ -13680,6 +13681,7 @@ F: Documentation/vm/zsmalloc.txt

 ZSWAP COMPRESSED SWAP CACHING
 M:      Seth Jennings <sjenning@redhat.com>
+M:      Dan Streetman <ddstreet@ieee.org>
 L:      linux-mm@kvack.org
 S:      Maintained
 F:      mm/zswap.c
diff --git a/arch/frv/include/asm/atomic.h b/arch/frv/include/asm/atomic.h
index 1c2a5e264fc7..e93c9494503a 100644
--- a/arch/frv/include/asm/atomic.h
+++ b/arch/frv/include/asm/atomic.h
@@ -139,7 +139,7 @@ static inline void atomic64_dec(atomic64_t *v)
 #define atomic64_sub_and_test(i,v)      (atomic64_sub_return((i), (v)) == 0)
 #define atomic64_dec_and_test(v)        (atomic64_dec_return((v)) == 0)
 #define atomic64_inc_and_test(v)        (atomic64_inc_return((v)) == 0)
-
+#define atomic64_inc_not_zero(v)        atomic64_add_unless((v), 1, 0)

 #define atomic_cmpxchg(v, old, new)     (cmpxchg(&(v)->counter, old, new))
 #define atomic_xchg(v, new)             (xchg(&(v)->counter, new))
@@ -161,6 +161,39 @@ static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
         return c;
 }

+static inline int atomic64_add_unless(atomic64_t *v, long long i, long long u)
+{
+        long long c, old;
+
+        c = atomic64_read(v);
+        for (;;) {
+                if (unlikely(c == u))
+                        break;
+                old = atomic64_cmpxchg(v, c, c + i);
+                if (likely(old == c))
+                        break;
+                c = old;
+        }
+        return c != u;
+}
+
+static inline long long atomic64_dec_if_positive(atomic64_t *v)
+{
+        long long c, old, dec;
+
+        c = atomic64_read(v);
+        for (;;) {
+                dec = c - 1;
+                if (unlikely(dec < 0))
+                        break;
+                old = atomic64_cmpxchg((v), c, dec);
+                if (likely(old == c))
+                        break;
+                c = old;
+        }
+        return dec;
+}
+
 #define ATOMIC_OP(op)                                                   \
 static inline int atomic_fetch_##op(int i, atomic_t *v)                 \
 {                                                                       \
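For readers unfamiliar with the pattern, the atomic64_add_unless() added above is a classic compare-and-swap retry loop. A standalone C11 sketch of the same idea follows; the userspace atomics and names here are illustrative, not the kernel API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Add "i" to *v unless *v already equals "u"; returns true if the add
 * happened.  Same retry shape as the kernel loop above, built on C11
 * atomics instead of atomic64_cmpxchg(). */
static bool add_unless(_Atomic long long *v, long long i, long long u)
{
        long long c = atomic_load(v);

        for (;;) {
                if (c == u)
                        return false;   /* hit the excluded value */
                /* on failure, c is refreshed with the current value */
                if (atomic_compare_exchange_weak(v, &c, c + i))
                        return true;    /* our update won the race */
        }
}

int main(void)
{
        _Atomic long long counter = 1;

        printf("%d\n", add_unless(&counter, 1, 0));     /* 1: now 2 */
        atomic_store(&counter, 0);
        printf("%d\n", add_unless(&counter, 1, 0));     /* 0: left at 0 */
        return 0;
}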
diff --git a/arch/mn10300/include/asm/switch_to.h b/arch/mn10300/include/asm/switch_to.h
index 393d311735c8..67e333aa7629 100644
--- a/arch/mn10300/include/asm/switch_to.h
+++ b/arch/mn10300/include/asm/switch_to.h
@@ -16,7 +16,7 @@
 struct task_struct;
 struct thread_struct;

-#if !defined(CONFIG_LAZY_SAVE_FPU)
+#if defined(CONFIG_FPU) && !defined(CONFIG_LAZY_SAVE_FPU)
 struct fpu_state_struct;
 extern asmlinkage void fpu_save(struct fpu_state_struct *);
 #define switch_fpu(prev, next)                                          \
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 8ab8ea1253e6..dacb6a8418aa 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -408,14 +408,14 @@ static ssize_t show_valid_zones(struct device *dev,
         sprintf(buf, "%s", zone->name);

         /* MMOP_ONLINE_KERNEL */
-        zone_shift = zone_can_shift(start_pfn, nr_pages, ZONE_NORMAL);
+        zone_can_shift(start_pfn, nr_pages, ZONE_NORMAL, &zone_shift);
         if (zone_shift) {
                 strcat(buf, " ");
                 strcat(buf, (zone + zone_shift)->name);
         }

         /* MMOP_ONLINE_MOVABLE */
-        zone_shift = zone_can_shift(start_pfn, nr_pages, ZONE_MOVABLE);
+        zone_can_shift(start_pfn, nr_pages, ZONE_MOVABLE, &zone_shift);
         if (zone_shift) {
                 strcat(buf, " ");
                 strcat(buf, (zone + zone_shift)->name);
diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c
index a0547dbf9806..76382c858c35 100644
--- a/drivers/memstick/core/memstick.c
+++ b/drivers/memstick/core/memstick.c
@@ -330,7 +330,7 @@ static int h_memstick_read_dev_id(struct memstick_dev *card,
         struct ms_id_register id_reg;

         if (!(*mrq)) {
-                memstick_init_req(&card->current_mrq, MS_TPC_READ_REG, NULL,
+                memstick_init_req(&card->current_mrq, MS_TPC_READ_REG, &id_reg,
                                   sizeof(struct ms_id_register));
                 *mrq = &card->current_mrq;
                 return 0;
diff --git a/drivers/video/fbdev/core/fbcmap.c b/drivers/video/fbdev/core/fbcmap.c
index f89245b8ba8e..68a113594808 100644
--- a/drivers/video/fbdev/core/fbcmap.c
+++ b/drivers/video/fbdev/core/fbcmap.c
@@ -163,17 +163,18 @@ void fb_dealloc_cmap(struct fb_cmap *cmap)

 int fb_copy_cmap(const struct fb_cmap *from, struct fb_cmap *to)
 {
-        int tooff = 0, fromoff = 0;
-        int size;
+        unsigned int tooff = 0, fromoff = 0;
+        size_t size;

         if (to->start > from->start)
                 fromoff = to->start - from->start;
         else
                 tooff = from->start - to->start;
-        size = to->len - tooff;
-        if (size > (int) (from->len - fromoff))
-                size = from->len - fromoff;
-        if (size <= 0)
+        if (fromoff >= from->len || tooff >= to->len)
+                return -EINVAL;
+
+        size = min_t(size_t, to->len - tooff, from->len - fromoff);
+        if (size == 0)
                 return -EINVAL;
         size *= sizeof(u16);

@@ -187,17 +188,18 @@ int fb_copy_cmap(const struct fb_cmap *from, struct fb_cmap *to)

 int fb_cmap_to_user(const struct fb_cmap *from, struct fb_cmap_user *to)
 {
-        int tooff = 0, fromoff = 0;
-        int size;
+        unsigned int tooff = 0, fromoff = 0;
+        size_t size;

         if (to->start > from->start)
                 fromoff = to->start - from->start;
         else
                 tooff = from->start - to->start;
-        size = to->len - tooff;
-        if (size > (int) (from->len - fromoff))
-                size = from->len - fromoff;
-        if (size <= 0)
+        if (fromoff >= from->len || tooff >= to->len)
+                return -EINVAL;
+
+        size = min_t(size_t, to->len - tooff, from->len - fromoff);
+        if (size == 0)
                 return -EINVAL;
         size *= sizeof(u16);

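The fbcmap change above replaces signed length arithmetic with explicit offset checks plus a minimum of the two remaining lengths. A standalone sketch of that bounds logic, with purely illustrative names (not the fbdev API):

#include <stdio.h>

/* Compute how many 16-bit entries may be copied between two colormaps, or -1
 * if the ranges do not overlap.  Mirrors the checks added above: reject
 * out-of-range offsets first, then take the minimum of the two remainders,
 * all in unsigned arithmetic. */
static long copy_len(unsigned int to_start, unsigned int to_len,
                     unsigned int from_start, unsigned int from_len)
{
        unsigned int tooff = 0, fromoff = 0, a, b;

        if (to_start > from_start)
                fromoff = to_start - from_start;
        else
                tooff = from_start - to_start;

        if (fromoff >= from_len || tooff >= to_len)
                return -1;              /* nothing to copy */

        a = to_len - tooff;
        b = from_len - fromoff;
        return a < b ? a : b;           /* min of the two remainders */
}

int main(void)
{
        /* destination entirely below the source: the old signed math misbehaved here */
        printf("%ld\n", copy_len(0, 16, 1000, 16));     /* -1: rejected */
        printf("%ld\n", copy_len(0, 256, 240, 32));     /* 16: overlapping entries */
        return 0;
}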
diff --git a/fs/Kconfig b/fs/Kconfig
index c2a377cdda2b..83eab52fb3f6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -38,6 +38,7 @@ config FS_DAX
         bool "Direct Access (DAX) support"
         depends on MMU
         depends on !(ARM || MIPS || SPARC)
+        select FS_IOMAP
         help
           Direct Access (DAX) can be used on memory-backed block devices.
           If the block device supports DAX and the filesystem supports DAX,
diff --git a/fs/dax.c b/fs/dax.c
index ddcddfeaa03b..3af2da5e64ce 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -990,7 +990,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);

-#ifdef CONFIG_FS_IOMAP
 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 {
         return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
@@ -1428,4 +1427,3 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
 #endif /* CONFIG_FS_DAX_PMD */
-#endif /* CONFIG_FS_IOMAP */
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 36bea5adcaba..c634874e12d9 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,6 +1,5 @@
 config EXT2_FS
         tristate "Second extended fs support"
-        select FS_IOMAP if FS_DAX
         help
           Ext2 is a standard Linux file system for hard disks.

diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 7b90691e98c4..e38039fd96ff 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,7 +37,6 @@ config EXT4_FS
         select CRC16
         select CRYPTO
         select CRYPTO_CRC32C
-        select FS_IOMAP if FS_DAX
         help
           This is the next generation of the ext3 filesystem.

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8e7e61b28f31..87c9a9aacda3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3179,6 +3179,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
              iter.tgid += 1, iter = next_tgid(ns, iter)) {
                 char name[PROC_NUMBUF];
                 int len;
+
+                cond_resched();
                 if (!has_pid_permissions(ns, iter.task, 2))
                         continue;

diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index d0f8a38dfafa..0186fe6d39f3 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -74,6 +74,7 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/uaccess.h>
+#include <linux/major.h>
 #include "internal.h"

 static struct kmem_cache *romfs_inode_cachep;
@@ -416,7 +417,22 @@ static void romfs_destroy_inode(struct inode *inode)
 static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
         struct super_block *sb = dentry->d_sb;
-        u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+        u64 id = 0;
+
+        /* When calling huge_encode_dev(),
+         * use sb->s_bdev->bd_dev when,
+         *   - CONFIG_ROMFS_ON_BLOCK defined
+         * use sb->s_dev when,
+         *   - CONFIG_ROMFS_ON_BLOCK undefined and
+         *   - CONFIG_ROMFS_ON_MTD defined
+         * leave id as 0 when,
+         *   - CONFIG_ROMFS_ON_BLOCK undefined and
+         *   - CONFIG_ROMFS_ON_MTD undefined
+         */
+        if (sb->s_bdev)
+                id = huge_encode_dev(sb->s_bdev->bd_dev);
+        else if (sb->s_dev)
+                id = huge_encode_dev(sb->s_dev);

         buf->f_type = ROMFS_MAGIC;
         buf->f_namelen = ROMFS_MAXFN;
@@ -489,6 +505,11 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
         sb->s_flags |= MS_RDONLY | MS_NOATIME;
         sb->s_op = &romfs_super_ops;

+#ifdef CONFIG_ROMFS_ON_MTD
+        /* Use same dev ID from the underlying mtdblock device */
+        if (sb->s_mtd)
+                sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, sb->s_mtd->index);
+#endif
         /* read the image superblock and check it */
         rsb = kmalloc(512, GFP_KERNEL);
         if (!rsb)
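The romfs change above derives a non-zero fsid for MTD-backed mounts by encoding a device number from MTD_BLOCK_MAJOR and the mtd index. A userspace sketch of the same major/minor encoding, using makedev() in place of the kernel's MKDEV(); the major value 31 matches MTD_BLOCK_MAJOR in the kernel headers, and the rest is illustrative:

#include <stdio.h>
#include <sys/types.h>
#include <sys/sysmacros.h>      /* makedev(), major(), minor() */

int main(void)
{
        unsigned int mtd_block_major = 31;      /* MTD_BLOCK_MAJOR in the kernel */
        unsigned int mtd_index = 2;             /* hypothetical /dev/mtdblock2 */
        dev_t dev = makedev(mtd_block_major, mtd_index);

        /* romfs_statfs() then feeds this number to huge_encode_dev() for f_fsid */
        printf("dev=%#lx major=%u minor=%u\n",
               (unsigned long)dev, major(dev), minor(dev));
        return 0;
}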
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index d96e2f30084b..43953e03c356 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -63,6 +63,7 @@ struct userfaultfd_wait_queue {
         struct uffd_msg msg;
         wait_queue_t wq;
         struct userfaultfd_ctx *ctx;
+        bool waken;
 };

 struct userfaultfd_wake_range {
@@ -86,6 +87,12 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
         if (len && (start > uwq->msg.arg.pagefault.address ||
                     start + len <= uwq->msg.arg.pagefault.address))
                 goto out;
+        WRITE_ONCE(uwq->waken, true);
+        /*
+         * The implicit smp_mb__before_spinlock in try_to_wake_up()
+         * renders uwq->waken visible to other CPUs before the task is
+         * waken.
+         */
         ret = wake_up_state(wq->private, mode);
         if (ret)
                 /*
@@ -264,6 +271,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
         struct userfaultfd_wait_queue uwq;
         int ret;
         bool must_wait, return_to_userland;
+        long blocking_state;

         BUG_ON(!rwsem_is_locked(&mm->mmap_sem));

@@ -334,10 +342,13 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
         uwq.wq.private = current;
         uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
         uwq.ctx = ctx;
+        uwq.waken = false;

         return_to_userland =
                 (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
                 (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
+        blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
+                         TASK_KILLABLE;

         spin_lock(&ctx->fault_pending_wqh.lock);
         /*
@@ -350,8 +361,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
          * following the spin_unlock to happen before the list_add in
          * __add_wait_queue.
          */
-        set_current_state(return_to_userland ? TASK_INTERRUPTIBLE :
-                          TASK_KILLABLE);
+        set_current_state(blocking_state);
         spin_unlock(&ctx->fault_pending_wqh.lock);

         must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
@@ -364,6 +374,29 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
                 wake_up_poll(&ctx->fd_wqh, POLLIN);
                 schedule();
                 ret |= VM_FAULT_MAJOR;
+
+                /*
+                 * False wakeups can orginate even from rwsem before
+                 * up_read() however userfaults will wait either for a
+                 * targeted wakeup on the specific uwq waitqueue from
+                 * wake_userfault() or for signals or for uffd
+                 * release.
+                 */
+                while (!READ_ONCE(uwq.waken)) {
+                        /*
+                         * This needs the full smp_store_mb()
+                         * guarantee as the state write must be
+                         * visible to other CPUs before reading
+                         * uwq.waken from other CPUs.
+                         */
+                        set_current_state(blocking_state);
+                        if (READ_ONCE(uwq.waken) ||
+                            READ_ONCE(ctx->released) ||
+                            (return_to_userland ? signal_pending(current) :
+                             fatal_signal_pending(current)))
+                                break;
+                        schedule();
+                }
         }

         __set_current_state(TASK_RUNNING);
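The loop added above exists because a sleeper can be woken without its event having fired, so it must re-check its own flag before proceeding. A userspace analogue using POSIX condition variables (illustrative only; the kernel code uses set_current_state()/schedule() rather than condvars):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool waken;

static void *waker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        waken = true;                   /* the targeted wakeup sets the flag */
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, waker, NULL);
        pthread_mutex_lock(&lock);
        while (!waken)                  /* tolerate spurious wakeups */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        puts("woken by a real event");
        return 0;
}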
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 01033fadea47..c1784c0b4f35 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -284,7 +284,7 @@ extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
                 unsigned long map_offset);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
                                           unsigned long pnum);
-extern int zone_can_shift(unsigned long pfn, unsigned long nr_pages,
-                          enum zone_type target);
+extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
+                           enum zone_type target, int *zone_shift);

 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 36d9896fbc1e..f4aac87adcc3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -972,12 +972,16 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
  * @zonelist - The zonelist to search for a suitable zone
  * @highest_zoneidx - The zone index of the highest zone to return
  * @nodes - An optional nodemask to filter the zonelist with
- * @zone - The first suitable zone found is returned via this parameter
+ * @return - Zoneref pointer for the first suitable zone found (see below)
  *
  * This function returns the first zone at or below a given zone index that is
  * within the allowed nodemask. The zoneref returned is a cursor that can be
  * used to iterate the zonelist with next_zones_zonelist by advancing it by
  * one before calling.
+ *
+ * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
+ * never NULL). This may happen either genuinely, or due to concurrent nodemask
+ * update due to cpuset modification.
  */
 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                                         enum zone_type highest_zoneidx,
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index aacca824a6ae..0a3fadc32693 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -110,6 +110,7 @@ extern int watchdog_user_enabled;
 extern int watchdog_thresh;
 extern unsigned long watchdog_enabled;
 extern unsigned long *watchdog_cpumask_bits;
+extern atomic_t watchdog_park_in_progress;
 #ifdef CONFIG_SMP
 extern int sysctl_softlockup_all_cpu_backtrace;
 extern int sysctl_hardlockup_all_cpu_backtrace;
diff --git a/kernel/panic.c b/kernel/panic.c
index 901c4fb46002..08aa88dde7de 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -249,7 +249,7 @@ void panic(const char *fmt, ...)
                  * Delay timeout seconds before rebooting the machine.
                  * We can't use the "normal" timers since we just panicked.
                  */
-                pr_emerg("Rebooting in %d seconds..", panic_timeout);
+                pr_emerg("Rebooting in %d seconds..\n", panic_timeout);

                 for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
                         touch_nmi_watchdog();
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d4b0fa01cae3..63177be0159e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -49,6 +49,8 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 #define for_each_watchdog_cpu(cpu) \
         for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)

+atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
+
 /*
  * The 'watchdog_running' variable is set to 1 when the watchdog threads
  * are registered/started and is set to 0 when the watchdog threads are
@@ -260,6 +262,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
         int duration;
         int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;

+        if (atomic_read(&watchdog_park_in_progress) != 0)
+                return HRTIMER_NORESTART;
+
         /* kick the hardlockup detector */
         watchdog_interrupt_count();

@@ -467,12 +472,16 @@ static int watchdog_park_threads(void)
 {
         int cpu, ret = 0;

+        atomic_set(&watchdog_park_in_progress, 1);
+
         for_each_watchdog_cpu(cpu) {
                 ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
                 if (ret)
                         break;
         }

+        atomic_set(&watchdog_park_in_progress, 0);
+
         return ret;
 }

diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 84016c8aee6b..12b8dd640786 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -84,6 +84,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
         /* Ensure the watchdog never gets throttled */
         event->hw.interrupts = 0;

+        if (atomic_read(&watchdog_park_in_progress) != 0)
+                return;
+
         if (__this_cpu_read(watchdog_nmi_touch) == true) {
                 __this_cpu_write(watchdog_nmi_touch, false);
                 return;
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 86c8911b0e3a..a3e14ce92a56 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -144,4 +144,3 @@ int ioremap_page_range(unsigned long addr,

         return err;
 }
-EXPORT_SYMBOL_GPL(ioremap_page_range);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 0b92d605fb69..84812a9fb16f 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -769,7 +769,7 @@ static void radix_tree_free_nodes(struct radix_tree_node *node)
                         struct radix_tree_node *old = child;
                         offset = child->offset + 1;
                         child = child->parent;
-                        WARN_ON_ONCE(!list_empty(&node->private_list));
+                        WARN_ON_ONCE(!list_empty(&old->private_list));
                         radix_tree_node_free(old);
                         if (old == entry_to_node(node))
                                 return;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9a6bd6c8d55a..5f3ad65c85de 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -783,6 +783,12 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,

         assert_spin_locked(pmd_lockptr(mm, pmd));

+        /*
+         * When we COW a devmap PMD entry, we split it into PTEs, so we should
+         * not be in this function with `flags & FOLL_COW` set.
+         */
+        WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
+
         if (flags & FOLL_WRITE && !pmd_write(*pmd))
                 return NULL;

@@ -1128,6 +1134,16 @@ out_unlock:
         return ret;
 }

+/*
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
+ */
+static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+{
+        return pmd_write(pmd) ||
+               ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+}
+
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                    unsigned long addr,
                                    pmd_t *pmd,
@@ -1138,7 +1154,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,

         assert_spin_locked(pmd_lockptr(mm, pmd));

-        if (flags & FOLL_WRITE && !pmd_write(*pmd))
+        if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
                 goto out;

         /* Avoid dumping huge zero page */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a63a8f832664..b822e158b319 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4353,9 +4353,9 @@ static int mem_cgroup_do_precharge(unsigned long count)
                 return ret;
         }

-        /* Try charges one by one with reclaim */
+        /* Try charges one by one with reclaim, but do not retry */
         while (count--) {
-                ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
+                ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
                 if (ret)
                         return ret;
                 mc.precharge++;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e43142c15631..ca2723d47338 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1033,36 +1033,39 @@ static void node_states_set_node(int node, struct memory_notify *arg)
         node_set_state(node, N_MEMORY);
 }

-int zone_can_shift(unsigned long pfn, unsigned long nr_pages,
-                   enum zone_type target)
+bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
+                    enum zone_type target, int *zone_shift)
 {
         struct zone *zone = page_zone(pfn_to_page(pfn));
         enum zone_type idx = zone_idx(zone);
         int i;

+        *zone_shift = 0;
+
         if (idx < target) {
                 /* pages must be at end of current zone */
                 if (pfn + nr_pages != zone_end_pfn(zone))
-                        return 0;
+                        return false;

                 /* no zones in use between current zone and target */
                 for (i = idx + 1; i < target; i++)
                         if (zone_is_initialized(zone - idx + i))
-                                return 0;
+                                return false;
         }

         if (target < idx) {
                 /* pages must be at beginning of current zone */
                 if (pfn != zone->zone_start_pfn)
-                        return 0;
+                        return false;

                 /* no zones in use between current zone and target */
                 for (i = target + 1; i < idx; i++)
                         if (zone_is_initialized(zone - idx + i))
-                                return 0;
+                                return false;
         }

-        return target - idx;
+        *zone_shift = target - idx;
+        return true;
 }

 /* Must be protected by mem_hotplug_begin() */
@@ -1089,10 +1092,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
             !can_online_high_movable(zone))
                 return -EINVAL;

-        if (online_type == MMOP_ONLINE_KERNEL)
-                zone_shift = zone_can_shift(pfn, nr_pages, ZONE_NORMAL);
-        else if (online_type == MMOP_ONLINE_MOVABLE)
-                zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE);
+        if (online_type == MMOP_ONLINE_KERNEL) {
+                if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
+                        return -EINVAL;
+        } else if (online_type == MMOP_ONLINE_MOVABLE) {
+                if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
+                        return -EINVAL;
+        }

         zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
         if (!zone)
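The zone_can_shift() conversion above is a common C API fix: an int return cannot distinguish a legitimate shift of 0 from "no shift allowed", so validity moves to a bool return and the value to an out-parameter. A minimal sketch of the pattern with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

/* Report whether a shift is permitted via the return value and hand the
 * shift itself back through an out-parameter, so a valid shift of 0 is no
 * longer confused with failure. */
static bool compute_shift(int current_idx, int target_idx, int *shift)
{
        *shift = 0;
        if (current_idx < 0 || target_idx < 0)
                return false;           /* cannot shift at all */

        *shift = target_idx - current_idx;
        return true;                    /* a shift of 0 is still a success */
}

int main(void)
{
        int shift;

        if (compute_shift(2, 2, &shift))
                printf("ok, shift=%d\n", shift);        /* ok, shift=0 */
        if (!compute_shift(-1, 2, &shift))
                printf("rejected\n");
        return 0;
}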
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2e346645eb80..1e7873e40c9a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2017,8 +2017,8 @@ retry_cpuset:

         nmask = policy_nodemask(gfp, pol);
         zl = policy_zonelist(gfp, pol, node);
-        mpol_cond_put(pol);
         page = __alloc_pages_nodemask(gfp, order, zl, nmask);
+        mpol_cond_put(pol);
 out:
         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
                 goto retry_cpuset;
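The one-line move above matters because nmask points into the mempolicy, so dropping the reference before the allocation can leave the nodemask pointing at freed memory. An illustrative refcount sketch of the "use, then put" rule (not the mempolicy API):

#include <stdio.h>
#include <stdlib.h>

struct policy {
        int refcount;
        int nodemask;                   /* stands in for the real nodemask */
};

static void policy_put(struct policy *p)
{
        if (--p->refcount == 0)
                free(p);                /* nodemask storage dies with the object */
}

int main(void)
{
        struct policy *pol = malloc(sizeof(*pol));
        int *nmask;

        if (!pol)
                return 1;
        pol->refcount = 1;
        pol->nodemask = 0x3;
        nmask = &pol->nodemask;         /* borrowed pointer into the object */

        printf("allocating from nodes %#x\n", *nmask);  /* use first... */
        policy_put(pol);                                /* ...then drop the reference */
        return 0;
}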
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d604d2596b7b..f3e0c69a97b7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3523,12 +3523,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         struct page *page = NULL;
         unsigned int alloc_flags;
         unsigned long did_some_progress;
-        enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
+        enum compact_priority compact_priority;
         enum compact_result compact_result;
-        int compaction_retries = 0;
-        int no_progress_loops = 0;
+        int compaction_retries;
+        int no_progress_loops;
         unsigned long alloc_start = jiffies;
         unsigned int stall_timeout = 10 * HZ;
+        unsigned int cpuset_mems_cookie;

         /*
          * In the slowpath, we sanity check order to avoid ever trying to
@@ -3549,6 +3550,23 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
                 gfp_mask &= ~__GFP_ATOMIC;

+retry_cpuset:
+        compaction_retries = 0;
+        no_progress_loops = 0;
+        compact_priority = DEF_COMPACT_PRIORITY;
+        cpuset_mems_cookie = read_mems_allowed_begin();
+        /*
+         * We need to recalculate the starting point for the zonelist iterator
+         * because we might have used different nodemask in the fast path, or
+         * there was a cpuset modification and we are retrying - otherwise we
+         * could end up iterating over non-eligible zones endlessly.
+         */
+        ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+                                        ac->high_zoneidx, ac->nodemask);
+        if (!ac->preferred_zoneref->zone)
+                goto nopage;
+
+
         /*
          * The fast path uses conservative alloc_flags to succeed only until
          * kswapd needs to be woken up, and to avoid the cost of setting up
@@ -3708,6 +3726,13 @@ retry:
                                  &compaction_retries))
                 goto retry;

+        /*
+         * It's possible we raced with cpuset update so the OOM would be
+         * premature (see below the nopage: label for full explanation).
+         */
+        if (read_mems_allowed_retry(cpuset_mems_cookie))
+                goto retry_cpuset;
+
         /* Reclaim has failed us, start killing things */
         page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
         if (page)
@@ -3720,6 +3745,16 @@ retry:
         }

 nopage:
+        /*
+         * When updating a task's mems_allowed or mempolicy nodemask, it is
+         * possible to race with parallel threads in such a way that our
+         * allocation can fail while the mask is being updated. If we are about
+         * to fail, check if the cpuset changed during allocation and if so,
+         * retry.
+         */
+        if (read_mems_allowed_retry(cpuset_mems_cookie))
+                goto retry_cpuset;
+
         warn_alloc(gfp_mask,
                         "page allocation failure: order:%u", order);
 got_pg:
@@ -3734,7 +3769,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                         struct zonelist *zonelist, nodemask_t *nodemask)
 {
         struct page *page;
-        unsigned int cpuset_mems_cookie;
         unsigned int alloc_flags = ALLOC_WMARK_LOW;
         gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
         struct alloc_context ac = {
@@ -3771,9 +3805,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
         if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
                 alloc_flags |= ALLOC_CMA;

-retry_cpuset:
-        cpuset_mems_cookie = read_mems_allowed_begin();
-
         /* Dirty zone balancing only done in the fast path */
         ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);

@@ -3784,8 +3815,13 @@ retry_cpuset:
          */
         ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
                                         ac.high_zoneidx, ac.nodemask);
-        if (!ac.preferred_zoneref) {
+        if (!ac.preferred_zoneref->zone) {
                 page = NULL;
+                /*
+                 * This might be due to race with cpuset_current_mems_allowed
+                 * update, so make sure we retry with original nodemask in the
+                 * slow path.
+                 */
                 goto no_zone;
         }

@@ -3794,6 +3830,7 @@ retry_cpuset:
         if (likely(page))
                 goto out;

+no_zone:
         /*
          * Runtime PM, block IO and its error handling path can deadlock
          * because I/O on the device might not complete.
@@ -3805,21 +3842,10 @@ retry_cpuset:
          * Restore the original nodemask if it was potentially replaced with
          * &cpuset_current_mems_allowed to optimize the fast-path attempt.
          */
-        if (cpusets_enabled())
+        if (unlikely(ac.nodemask != nodemask))
                 ac.nodemask = nodemask;
-        page = __alloc_pages_slowpath(alloc_mask, order, &ac);

-no_zone:
-        /*
-         * When updating a task's mems_allowed, it is possible to race with
-         * parallel threads in such a way that an allocation can fail while
-         * the mask is being updated. If a page allocation is about to fail,
-         * check if the cpuset changed during allocation and if so, retry.
-         */
-        if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
-                alloc_mask = gfp_mask;
-                goto retry_cpuset;
-        }
+        page = __alloc_pages_slowpath(alloc_mask, order, &ac);

 out:
         if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
@@ -7248,6 +7274,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
                 .zone = page_zone(pfn_to_page(start)),
                 .mode = MIGRATE_SYNC,
                 .ignore_skip_hint = true,
+                .gfp_mask = GFP_KERNEL,
         };
         INIT_LIST_HEAD(&cc.migratepages);

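The retry_cpuset logic above is a seqcount-style pattern: take a cookie before the attempt and redo the attempt if a concurrent cpuset/nodemask update bumped it. A simplified userspace sketch of the cookie idea (the kernel's read_mems_allowed_begin()/read_mems_allowed_retry() additionally handle in-progress writers and memory ordering; names below are illustrative):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int mems_seq;           /* bumped by the "writer" */
static int mems_allowed = 0xf;                  /* stands in for the nodemask */

static unsigned int read_begin(void)
{
        return atomic_load(&mems_seq);
}

static int read_retry(unsigned int cookie)
{
        return atomic_load(&mems_seq) != cookie;        /* a writer raced with us */
}

int main(void)
{
        unsigned int cookie;
        int snapshot;

        do {
                cookie = read_begin();
                snapshot = mems_allowed;        /* the "allocation attempt" */
        } while (read_retry(cookie));

        printf("consistent snapshot: %#x\n", snapshot);
        return 0;
}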
diff --git a/mm/slub.c b/mm/slub.c
index 067598a00849..7aa6f433f4de 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -496,10 +496,11 @@ static inline int check_valid_pointer(struct kmem_cache *s,
         return 1;
 }

-static void print_section(char *text, u8 *addr, unsigned int length)
+static void print_section(char *level, char *text, u8 *addr,
+                          unsigned int length)
 {
         metadata_access_enable();
-        print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
+        print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
                         length, 1);
         metadata_access_disable();
 }
@@ -636,14 +637,15 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
                p, p - addr, get_freepointer(s, p));

         if (s->flags & SLAB_RED_ZONE)
-                print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
+                print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
+                              s->red_left_pad);
         else if (p > addr + 16)
-                print_section("Bytes b4 ", p - 16, 16);
+                print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);

-        print_section("Object ", p, min_t(unsigned long, s->object_size,
-                                PAGE_SIZE));
+        print_section(KERN_ERR, "Object ", p,
+                      min_t(unsigned long, s->object_size, PAGE_SIZE));
         if (s->flags & SLAB_RED_ZONE)
-                print_section("Redzone ", p + s->object_size,
+                print_section(KERN_ERR, "Redzone ", p + s->object_size,
                         s->inuse - s->object_size);

         if (s->offset)
@@ -658,7 +660,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)

         if (off != size_from_object(s))
                 /* Beginning of the filler is the free pointer */
-                print_section("Padding ", p + off, size_from_object(s) - off);
+                print_section(KERN_ERR, "Padding ", p + off,
+                              size_from_object(s) - off);

         dump_stack();
 }
@@ -820,7 +823,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
         end--;

         slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
-        print_section("Padding ", end - remainder, remainder);
+        print_section(KERN_ERR, "Padding ", end - remainder, remainder);

         restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
         return 0;
@@ -973,7 +976,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
                         page->freelist);

         if (!alloc)
-                print_section("Object ", (void *)object,
+                print_section(KERN_INFO, "Object ", (void *)object,
                         s->object_size);

         dump_stack();
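The slub change above threads the log level through print_section() so routine traces can be emitted at KERN_INFO instead of KERN_ERR. A userspace sketch of the same idea, passing the destination into a small hex-dump helper (illustrative names only, not the kernel's print_hex_dump()):

#include <stddef.h>
#include <stdio.h>

/* Dump "len" bytes of "buf" to the caller-chosen stream, 16 per line. */
static void hex_dump(FILE *out, const char *prefix,
                     const unsigned char *buf, size_t len)
{
        size_t i;

        for (i = 0; i < len; i++) {
                if (i % 16 == 0)
                        fprintf(out, "%s%s%04zx: ", i ? "\n" : "", prefix, i);
                fprintf(out, "%02x ", buf[i]);
        }
        fputc('\n', out);
}

int main(void)
{
        unsigned char obj[24] = { 0x6b, 0x6b, 0x6b, 0xa5 };     /* poison-like bytes */

        hex_dump(stdout, "Object ", obj, sizeof(obj));  /* routine, informational */
        hex_dump(stderr, "Redzone ", obj, 8);           /* error path */
        return 0;
}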