aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-block-zram11
-rw-r--r--Documentation/blockdev/zram.txt74
-rw-r--r--arch/arm64/include/asm/cache.h6
-rw-r--r--drivers/block/zram/zram_drv.c90
-rw-r--r--drivers/block/zram/zram_drv.h5
-rw-r--r--fs/hugetlbfs/inode.c61
-rw-r--r--include/linux/mmzone.h6
-rw-r--r--kernel/fork.c1
-rw-r--r--mm/hugetlb.c81
-rw-r--r--mm/kasan/common.c65
-rw-r--r--mm/memory-failure.c16
-rw-r--r--mm/memory.c26
-rw-r--r--mm/migrate.c13
-rw-r--r--mm/page_alloc.c8
-rw-r--r--mm/rmap.c4
-rw-r--r--mm/slab.c6
-rw-r--r--mm/slub.c2
-rw-r--r--mm/usercopy.c9
-rw-r--r--mm/userfaultfd.c11
-rw-r--r--mm/util.c2
-rw-r--r--tools/vm/page_owner_sort.c4
21 files changed, 289 insertions, 212 deletions
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 9d2339a485c8..14b2bf2e5105 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -122,11 +122,18 @@ Description:
122 statistics (bd_count, bd_reads, bd_writes) in a format 122 statistics (bd_count, bd_reads, bd_writes) in a format
123 similar to block layer statistics file format. 123 similar to block layer statistics file format.
124 124
125What: /sys/block/zram<id>/writeback_limit_enable
126Date: November 2018
127Contact: Minchan Kim <minchan@kernel.org>
128Description:
129 The writeback_limit_enable file is read-write and specifies
130 enable of writeback_limit feature. "1" means enable the feature.
131 No limit "0" is the initial state.
132
125What: /sys/block/zram<id>/writeback_limit 133What: /sys/block/zram<id>/writeback_limit
126Date: November 2018 134Date: November 2018
127Contact: Minchan Kim <minchan@kernel.org> 135Contact: Minchan Kim <minchan@kernel.org>
128Description: 136Description:
129 The writeback_limit file is read-write and specifies the maximum 137 The writeback_limit file is read-write and specifies the maximum
130 amount of writeback ZRAM can do. The limit could be changed 138 amount of writeback ZRAM can do. The limit could be changed
131 in run time and "0" means disable the limit. 139 in run time.
132 No limit is the initial state.
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 436c5e98e1b6..4df0ce271085 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -156,22 +156,23 @@ Per-device statistics are exported as various nodes under /sys/block/zram<id>/
156A brief description of exported device attributes. For more details please 156A brief description of exported device attributes. For more details please
157read Documentation/ABI/testing/sysfs-block-zram. 157read Documentation/ABI/testing/sysfs-block-zram.
158 158
159Name access description 159Name access description
160---- ------ ----------- 160---- ------ -----------
161disksize RW show and set the device's disk size 161disksize RW show and set the device's disk size
162initstate RO shows the initialization state of the device 162initstate RO shows the initialization state of the device
163reset WO trigger device reset 163reset WO trigger device reset
164mem_used_max WO reset the `mem_used_max' counter (see later) 164mem_used_max WO reset the `mem_used_max' counter (see later)
165mem_limit WO specifies the maximum amount of memory ZRAM can use 165mem_limit WO specifies the maximum amount of memory ZRAM can use
166 to store the compressed data 166 to store the compressed data
167writeback_limit WO specifies the maximum amount of write IO zram can 167writeback_limit WO specifies the maximum amount of write IO zram can
168 write out to backing device as 4KB unit 168 write out to backing device as 4KB unit
169max_comp_streams RW the number of possible concurrent compress operations 169writeback_limit_enable RW show and set writeback_limit feature
170comp_algorithm RW show and change the compression algorithm 170max_comp_streams RW the number of possible concurrent compress operations
171compact WO trigger memory compaction 171comp_algorithm RW show and change the compression algorithm
172debug_stat RO this file is used for zram debugging purposes 172compact WO trigger memory compaction
173backing_dev RW set up backend storage for zram to write out 173debug_stat RO this file is used for zram debugging purposes
174idle WO mark allocated slot as idle 174backing_dev RW set up backend storage for zram to write out
175idle WO mark allocated slot as idle
175 176
176 177
177User space is advised to use the following files to read the device statistics. 178User space is advised to use the following files to read the device statistics.
@@ -280,32 +281,51 @@ With the command, zram writeback idle pages from memory to the storage.
280If there are lots of write IO with flash device, potentially, it has 281If there are lots of write IO with flash device, potentially, it has
281flash wearout problem so that admin needs to design write limitation 282flash wearout problem so that admin needs to design write limitation
282to guarantee storage health for entire product life. 283to guarantee storage health for entire product life.
283To overcome the concern, zram supports "writeback_limit". 284
284The "writeback_limit"'s default value is 0 so that it doesn't limit 285To overcome the concern, zram supports "writeback_limit" feature.
285any writeback. If admin want to measure writeback count in a certain 286The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
286period, he could know it via /sys/block/zram0/bd_stat's 3rd column. 287any writeback. IOW, if admin want to apply writeback budget, he should
288enable writeback_limit_enable via
289
290 $ echo 1 > /sys/block/zramX/writeback_limit_enable
291
292Once writeback_limit_enable is set, zram doesn't allow any writeback
293until admin set the budget via /sys/block/zramX/writeback_limit.
294
295(If admin doesn't enable writeback_limit_enable, writeback_limit's value
296assigned via /sys/block/zramX/writeback_limit is meaningless.)
287 297
288If admin want to limit writeback as per-day 400M, he could do it 298If admin want to limit writeback as per-day 400M, he could do it
289like below. 299like below.
290 300
291 MB_SHIFT=20 301 $ MB_SHIFT=20
292 4K_SHIFT=12 302 $ 4K_SHIFT=12
293 echo $((400<<MB_SHIFT>>4K_SHIFT)) > \ 303 $ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
294 /sys/block/zram0/writeback_limit. 304 /sys/block/zram0/writeback_limit.
305 $ echo 1 > /sys/block/zram0/writeback_limit_enable
295 306
296If admin want to allow further write again, he could do it like below 307If admin want to allow further write again once the budget is exhausted,
308he could do it like below
297 309
298 echo 0 > /sys/block/zram0/writeback_limit 310 $ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
311 /sys/block/zram0/writeback_limit
299 312
300If admin want to see remaining writeback budget since he set, 313If admin want to see remaining writeback budget since he set,
301 314
302 cat /sys/block/zram0/writeback_limit 315 $ cat /sys/block/zramX/writeback_limit
316
317If admin want to disable writeback limit, he could do
318
319 $ echo 0 > /sys/block/zramX/writeback_limit_enable
303 320
304The writeback_limit count will reset whenever you reset zram(e.g., 321The writeback_limit count will reset whenever you reset zram(e.g.,
305system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of 322system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of
306writeback happened until you reset the zram to allocate extra writeback 323writeback happened until you reset the zram to allocate extra writeback
307budget in next setting is user's job. 324budget in next setting is user's job.
308 325
326If admin want to measure writeback count in a certain period, he could
327know it via /sys/block/zram0/bd_stat's 3rd column.
328
309= memory tracking 329= memory tracking
310 330
311With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the 331With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 13dd42c3ad4e..eb43e09c1980 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -58,6 +58,12 @@
58 */ 58 */
59#define ARCH_DMA_MINALIGN (128) 59#define ARCH_DMA_MINALIGN (128)
60 60
61#ifdef CONFIG_KASAN_SW_TAGS
62#define ARCH_SLAB_MINALIGN (1ULL << KASAN_SHADOW_SCALE_SHIFT)
63#else
64#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
65#endif
66
61#ifndef __ASSEMBLY__ 67#ifndef __ASSEMBLY__
62 68
63#include <linux/bitops.h> 69#include <linux/bitops.h>
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 33c5cc879f24..04ca65912638 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -316,11 +316,9 @@ static ssize_t idle_store(struct device *dev,
316 * See the comment in writeback_store. 316 * See the comment in writeback_store.
317 */ 317 */
318 zram_slot_lock(zram, index); 318 zram_slot_lock(zram, index);
319 if (!zram_allocated(zram, index) || 319 if (zram_allocated(zram, index) &&
320 zram_test_flag(zram, index, ZRAM_UNDER_WB)) 320 !zram_test_flag(zram, index, ZRAM_UNDER_WB))
321 goto next; 321 zram_set_flag(zram, index, ZRAM_IDLE);
322 zram_set_flag(zram, index, ZRAM_IDLE);
323next:
324 zram_slot_unlock(zram, index); 322 zram_slot_unlock(zram, index);
325 } 323 }
326 324
@@ -330,6 +328,41 @@ next:
330} 328}
331 329
332#ifdef CONFIG_ZRAM_WRITEBACK 330#ifdef CONFIG_ZRAM_WRITEBACK
331static ssize_t writeback_limit_enable_store(struct device *dev,
332 struct device_attribute *attr, const char *buf, size_t len)
333{
334 struct zram *zram = dev_to_zram(dev);
335 u64 val;
336 ssize_t ret = -EINVAL;
337
338 if (kstrtoull(buf, 10, &val))
339 return ret;
340
341 down_read(&zram->init_lock);
342 spin_lock(&zram->wb_limit_lock);
343 zram->wb_limit_enable = val;
344 spin_unlock(&zram->wb_limit_lock);
345 up_read(&zram->init_lock);
346 ret = len;
347
348 return ret;
349}
350
351static ssize_t writeback_limit_enable_show(struct device *dev,
352 struct device_attribute *attr, char *buf)
353{
354 bool val;
355 struct zram *zram = dev_to_zram(dev);
356
357 down_read(&zram->init_lock);
358 spin_lock(&zram->wb_limit_lock);
359 val = zram->wb_limit_enable;
360 spin_unlock(&zram->wb_limit_lock);
361 up_read(&zram->init_lock);
362
363 return scnprintf(buf, PAGE_SIZE, "%d\n", val);
364}
365
333static ssize_t writeback_limit_store(struct device *dev, 366static ssize_t writeback_limit_store(struct device *dev,
334 struct device_attribute *attr, const char *buf, size_t len) 367 struct device_attribute *attr, const char *buf, size_t len)
335{ 368{
@@ -341,9 +374,9 @@ static ssize_t writeback_limit_store(struct device *dev,
341 return ret; 374 return ret;
342 375
343 down_read(&zram->init_lock); 376 down_read(&zram->init_lock);
344 atomic64_set(&zram->stats.bd_wb_limit, val); 377 spin_lock(&zram->wb_limit_lock);
345 if (val == 0) 378 zram->bd_wb_limit = val;
346 zram->stop_writeback = false; 379 spin_unlock(&zram->wb_limit_lock);
347 up_read(&zram->init_lock); 380 up_read(&zram->init_lock);
348 ret = len; 381 ret = len;
349 382
@@ -357,7 +390,9 @@ static ssize_t writeback_limit_show(struct device *dev,
357 struct zram *zram = dev_to_zram(dev); 390 struct zram *zram = dev_to_zram(dev);
358 391
359 down_read(&zram->init_lock); 392 down_read(&zram->init_lock);
360 val = atomic64_read(&zram->stats.bd_wb_limit); 393 spin_lock(&zram->wb_limit_lock);
394 val = zram->bd_wb_limit;
395 spin_unlock(&zram->wb_limit_lock);
361 up_read(&zram->init_lock); 396 up_read(&zram->init_lock);
362 397
363 return scnprintf(buf, PAGE_SIZE, "%llu\n", val); 398 return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
@@ -588,8 +623,8 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
588 return 1; 623 return 1;
589} 624}
590 625
591#define HUGE_WRITEBACK 0x1 626#define HUGE_WRITEBACK 1
592#define IDLE_WRITEBACK 0x2 627#define IDLE_WRITEBACK 2
593 628
594static ssize_t writeback_store(struct device *dev, 629static ssize_t writeback_store(struct device *dev,
595 struct device_attribute *attr, const char *buf, size_t len) 630 struct device_attribute *attr, const char *buf, size_t len)
@@ -602,7 +637,7 @@ static ssize_t writeback_store(struct device *dev,
602 struct page *page; 637 struct page *page;
603 ssize_t ret, sz; 638 ssize_t ret, sz;
604 char mode_buf[8]; 639 char mode_buf[8];
605 unsigned long mode = -1UL; 640 int mode = -1;
606 unsigned long blk_idx = 0; 641 unsigned long blk_idx = 0;
607 642
608 sz = strscpy(mode_buf, buf, sizeof(mode_buf)); 643 sz = strscpy(mode_buf, buf, sizeof(mode_buf));
@@ -618,7 +653,7 @@ static ssize_t writeback_store(struct device *dev,
618 else if (!strcmp(mode_buf, "huge")) 653 else if (!strcmp(mode_buf, "huge"))
619 mode = HUGE_WRITEBACK; 654 mode = HUGE_WRITEBACK;
620 655
621 if (mode == -1UL) 656 if (mode == -1)
622 return -EINVAL; 657 return -EINVAL;
623 658
624 down_read(&zram->init_lock); 659 down_read(&zram->init_lock);
@@ -645,10 +680,13 @@ static ssize_t writeback_store(struct device *dev,
645 bvec.bv_len = PAGE_SIZE; 680 bvec.bv_len = PAGE_SIZE;
646 bvec.bv_offset = 0; 681 bvec.bv_offset = 0;
647 682
648 if (zram->stop_writeback) { 683 spin_lock(&zram->wb_limit_lock);
684 if (zram->wb_limit_enable && !zram->bd_wb_limit) {
685 spin_unlock(&zram->wb_limit_lock);
649 ret = -EIO; 686 ret = -EIO;
650 break; 687 break;
651 } 688 }
689 spin_unlock(&zram->wb_limit_lock);
652 690
653 if (!blk_idx) { 691 if (!blk_idx) {
654 blk_idx = alloc_block_bdev(zram); 692 blk_idx = alloc_block_bdev(zram);
@@ -667,10 +705,11 @@ static ssize_t writeback_store(struct device *dev,
667 zram_test_flag(zram, index, ZRAM_UNDER_WB)) 705 zram_test_flag(zram, index, ZRAM_UNDER_WB))
668 goto next; 706 goto next;
669 707
670 if ((mode & IDLE_WRITEBACK && 708 if (mode == IDLE_WRITEBACK &&
671 !zram_test_flag(zram, index, ZRAM_IDLE)) && 709 !zram_test_flag(zram, index, ZRAM_IDLE))
672 (mode & HUGE_WRITEBACK && 710 goto next;
673 !zram_test_flag(zram, index, ZRAM_HUGE))) 711 if (mode == HUGE_WRITEBACK &&
712 !zram_test_flag(zram, index, ZRAM_HUGE))
674 goto next; 713 goto next;
675 /* 714 /*
676 * Clearing ZRAM_UNDER_WB is duty of caller. 715 * Clearing ZRAM_UNDER_WB is duty of caller.
@@ -732,11 +771,10 @@ static ssize_t writeback_store(struct device *dev,
732 zram_set_element(zram, index, blk_idx); 771 zram_set_element(zram, index, blk_idx);
733 blk_idx = 0; 772 blk_idx = 0;
734 atomic64_inc(&zram->stats.pages_stored); 773 atomic64_inc(&zram->stats.pages_stored);
735 if (atomic64_add_unless(&zram->stats.bd_wb_limit, 774 spin_lock(&zram->wb_limit_lock);
736 -1 << (PAGE_SHIFT - 12), 0)) { 775 if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
737 if (atomic64_read(&zram->stats.bd_wb_limit) == 0) 776 zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
738 zram->stop_writeback = true; 777 spin_unlock(&zram->wb_limit_lock);
739 }
740next: 778next:
741 zram_slot_unlock(zram, index); 779 zram_slot_unlock(zram, index);
742 } 780 }
@@ -1812,6 +1850,7 @@ static DEVICE_ATTR_RW(comp_algorithm);
1812static DEVICE_ATTR_RW(backing_dev); 1850static DEVICE_ATTR_RW(backing_dev);
1813static DEVICE_ATTR_WO(writeback); 1851static DEVICE_ATTR_WO(writeback);
1814static DEVICE_ATTR_RW(writeback_limit); 1852static DEVICE_ATTR_RW(writeback_limit);
1853static DEVICE_ATTR_RW(writeback_limit_enable);
1815#endif 1854#endif
1816 1855
1817static struct attribute *zram_disk_attrs[] = { 1856static struct attribute *zram_disk_attrs[] = {
@@ -1828,6 +1867,7 @@ static struct attribute *zram_disk_attrs[] = {
1828 &dev_attr_backing_dev.attr, 1867 &dev_attr_backing_dev.attr,
1829 &dev_attr_writeback.attr, 1868 &dev_attr_writeback.attr,
1830 &dev_attr_writeback_limit.attr, 1869 &dev_attr_writeback_limit.attr,
1870 &dev_attr_writeback_limit_enable.attr,
1831#endif 1871#endif
1832 &dev_attr_io_stat.attr, 1872 &dev_attr_io_stat.attr,
1833 &dev_attr_mm_stat.attr, 1873 &dev_attr_mm_stat.attr,
@@ -1867,7 +1907,9 @@ static int zram_add(void)
1867 device_id = ret; 1907 device_id = ret;
1868 1908
1869 init_rwsem(&zram->init_lock); 1909 init_rwsem(&zram->init_lock);
1870 1910#ifdef CONFIG_ZRAM_WRITEBACK
1911 spin_lock_init(&zram->wb_limit_lock);
1912#endif
1871 queue = blk_alloc_queue(GFP_KERNEL); 1913 queue = blk_alloc_queue(GFP_KERNEL);
1872 if (!queue) { 1914 if (!queue) {
1873 pr_err("Error allocating disk queue for device %d\n", 1915 pr_err("Error allocating disk queue for device %d\n",
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 4bd3afd15e83..f2fd46daa760 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -86,7 +86,6 @@ struct zram_stats {
86 atomic64_t bd_count; /* no. of pages in backing device */ 86 atomic64_t bd_count; /* no. of pages in backing device */
87 atomic64_t bd_reads; /* no. of reads from backing device */ 87 atomic64_t bd_reads; /* no. of reads from backing device */
88 atomic64_t bd_writes; /* no. of writes from backing device */ 88 atomic64_t bd_writes; /* no. of writes from backing device */
89 atomic64_t bd_wb_limit; /* writeback limit of backing device */
90#endif 89#endif
91}; 90};
92 91
@@ -114,8 +113,10 @@ struct zram {
114 */ 113 */
115 bool claim; /* Protected by bdev->bd_mutex */ 114 bool claim; /* Protected by bdev->bd_mutex */
116 struct file *backing_dev; 115 struct file *backing_dev;
117 bool stop_writeback;
118#ifdef CONFIG_ZRAM_WRITEBACK 116#ifdef CONFIG_ZRAM_WRITEBACK
117 spinlock_t wb_limit_lock;
118 bool wb_limit_enable;
119 u64 bd_wb_limit;
119 struct block_device *bdev; 120 struct block_device *bdev;
120 unsigned int old_block_size; 121 unsigned int old_block_size;
121 unsigned long *bitmap; 122 unsigned long *bitmap;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a2fcea5f8225..32920a10100e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -383,16 +383,17 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
383 * truncation is indicated by end of range being LLONG_MAX 383 * truncation is indicated by end of range being LLONG_MAX
384 * In this case, we first scan the range and release found pages. 384 * In this case, we first scan the range and release found pages.
385 * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv 385 * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
386 * maps and global counts. 386 * maps and global counts. Page faults can not race with truncation
387 * in this routine. hugetlb_no_page() prevents page faults in the
388 * truncated range. It checks i_size before allocation, and again after
389 * with the page table lock for the page held. The same lock must be
390 * acquired to unmap a page.
387 * hole punch is indicated if end is not LLONG_MAX 391 * hole punch is indicated if end is not LLONG_MAX
388 * In the hole punch case we scan the range and release found pages. 392 * In the hole punch case we scan the range and release found pages.
389 * Only when releasing a page is the associated region/reserv map 393 * Only when releasing a page is the associated region/reserv map
390 * deleted. The region/reserv map for ranges without associated 394 * deleted. The region/reserv map for ranges without associated
391 * pages are not modified. 395 * pages are not modified. Page faults can race with hole punch.
392 * 396 * This is indicated if we find a mapped page.
393 * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
394 * races with page faults.
395 *
396 * Note: If the passed end of range value is beyond the end of file, but 397 * Note: If the passed end of range value is beyond the end of file, but
397 * not LLONG_MAX this routine still performs a hole punch operation. 398 * not LLONG_MAX this routine still performs a hole punch operation.
398 */ 399 */
@@ -422,14 +423,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
422 423
423 for (i = 0; i < pagevec_count(&pvec); ++i) { 424 for (i = 0; i < pagevec_count(&pvec); ++i) {
424 struct page *page = pvec.pages[i]; 425 struct page *page = pvec.pages[i];
426 u32 hash;
425 427
426 index = page->index; 428 index = page->index;
429 hash = hugetlb_fault_mutex_hash(h, current->mm,
430 &pseudo_vma,
431 mapping, index, 0);
432 mutex_lock(&hugetlb_fault_mutex_table[hash]);
433
427 /* 434 /*
428 * A mapped page is impossible as callers should unmap 435 * If page is mapped, it was faulted in after being
429 * all references before calling. And, i_mmap_rwsem 436 * unmapped in caller. Unmap (again) now after taking
430 * prevents the creation of additional mappings. 437 * the fault mutex. The mutex will prevent faults
438 * until we finish removing the page.
439 *
440 * This race can only happen in the hole punch case.
441 * Getting here in a truncate operation is a bug.
431 */ 442 */
432 VM_BUG_ON(page_mapped(page)); 443 if (unlikely(page_mapped(page))) {
444 BUG_ON(truncate_op);
445
446 i_mmap_lock_write(mapping);
447 hugetlb_vmdelete_list(&mapping->i_mmap,
448 index * pages_per_huge_page(h),
449 (index + 1) * pages_per_huge_page(h));
450 i_mmap_unlock_write(mapping);
451 }
433 452
434 lock_page(page); 453 lock_page(page);
435 /* 454 /*
@@ -451,6 +470,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
451 } 470 }
452 471
453 unlock_page(page); 472 unlock_page(page);
473 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
454 } 474 }
455 huge_pagevec_release(&pvec); 475 huge_pagevec_release(&pvec);
456 cond_resched(); 476 cond_resched();
@@ -462,20 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
462 482
463static void hugetlbfs_evict_inode(struct inode *inode) 483static void hugetlbfs_evict_inode(struct inode *inode)
464{ 484{
465 struct address_space *mapping = inode->i_mapping;
466 struct resv_map *resv_map; 485 struct resv_map *resv_map;
467 486
468 /*
469 * The vfs layer guarantees that there are no other users of this
470 * inode. Therefore, it would be safe to call remove_inode_hugepages
471 * without holding i_mmap_rwsem. We acquire and hold here to be
472 * consistent with other callers. Since there will be no contention
473 * on the semaphore, overhead is negligible.
474 */
475 i_mmap_lock_write(mapping);
476 remove_inode_hugepages(inode, 0, LLONG_MAX); 487 remove_inode_hugepages(inode, 0, LLONG_MAX);
477 i_mmap_unlock_write(mapping);
478
479 resv_map = (struct resv_map *)inode->i_mapping->private_data; 488 resv_map = (struct resv_map *)inode->i_mapping->private_data;
480 /* root inode doesn't have the resv_map, so we should check it */ 489 /* root inode doesn't have the resv_map, so we should check it */
481 if (resv_map) 490 if (resv_map)
@@ -496,8 +505,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
496 i_mmap_lock_write(mapping); 505 i_mmap_lock_write(mapping);
497 if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) 506 if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
498 hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); 507 hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
499 remove_inode_hugepages(inode, offset, LLONG_MAX);
500 i_mmap_unlock_write(mapping); 508 i_mmap_unlock_write(mapping);
509 remove_inode_hugepages(inode, offset, LLONG_MAX);
501 return 0; 510 return 0;
502} 511}
503 512
@@ -531,8 +540,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
531 hugetlb_vmdelete_list(&mapping->i_mmap, 540 hugetlb_vmdelete_list(&mapping->i_mmap,
532 hole_start >> PAGE_SHIFT, 541 hole_start >> PAGE_SHIFT,
533 hole_end >> PAGE_SHIFT); 542 hole_end >> PAGE_SHIFT);
534 remove_inode_hugepages(inode, hole_start, hole_end);
535 i_mmap_unlock_write(mapping); 543 i_mmap_unlock_write(mapping);
544 remove_inode_hugepages(inode, hole_start, hole_end);
536 inode_unlock(inode); 545 inode_unlock(inode);
537 } 546 }
538 547
@@ -615,11 +624,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
615 /* addr is the offset within the file (zero based) */ 624 /* addr is the offset within the file (zero based) */
616 addr = index * hpage_size; 625 addr = index * hpage_size;
617 626
618 /* 627 /* mutex taken here, fault path and hole punch */
619 * fault mutex taken here, protects against fault path
620 * and hole punch. inode_lock previously taken protects
621 * against truncation.
622 */
623 hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, 628 hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
624 index, addr); 629 index, addr);
625 mutex_lock(&hugetlb_fault_mutex_table[hash]); 630 mutex_lock(&hugetlb_fault_mutex_table[hash]);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cc4a507d7ca4..842f9189537b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -520,6 +520,12 @@ enum pgdat_flags {
520 PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ 520 PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */
521}; 521};
522 522
523enum zone_flags {
524 ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks.
525 * Cleared when kswapd is woken.
526 */
527};
528
523static inline unsigned long zone_managed_pages(struct zone *zone) 529static inline unsigned long zone_managed_pages(struct zone *zone)
524{ 530{
525 return (unsigned long)atomic_long_read(&zone->managed_pages); 531 return (unsigned long)atomic_long_read(&zone->managed_pages);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7f49be94eba9..b69248e6f0e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -217,6 +217,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
217 memset(s->addr, 0, THREAD_SIZE); 217 memset(s->addr, 0, THREAD_SIZE);
218 218
219 tsk->stack_vm_area = s; 219 tsk->stack_vm_area = s;
220 tsk->stack = s->addr;
220 return s->addr; 221 return s->addr;
221 } 222 }
222 223
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 745088810965..df2e7dd5ff17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3238,7 +3238,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3238 struct page *ptepage; 3238 struct page *ptepage;
3239 unsigned long addr; 3239 unsigned long addr;
3240 int cow; 3240 int cow;
3241 struct address_space *mapping = vma->vm_file->f_mapping;
3242 struct hstate *h = hstate_vma(vma); 3241 struct hstate *h = hstate_vma(vma);
3243 unsigned long sz = huge_page_size(h); 3242 unsigned long sz = huge_page_size(h);
3244 struct mmu_notifier_range range; 3243 struct mmu_notifier_range range;
@@ -3250,23 +3249,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3250 mmu_notifier_range_init(&range, src, vma->vm_start, 3249 mmu_notifier_range_init(&range, src, vma->vm_start,
3251 vma->vm_end); 3250 vma->vm_end);
3252 mmu_notifier_invalidate_range_start(&range); 3251 mmu_notifier_invalidate_range_start(&range);
3253 } else {
3254 /*
3255 * For shared mappings i_mmap_rwsem must be held to call
3256 * huge_pte_alloc, otherwise the returned ptep could go
3257 * away if part of a shared pmd and another thread calls
3258 * huge_pmd_unshare.
3259 */
3260 i_mmap_lock_read(mapping);
3261 } 3252 }
3262 3253
3263 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 3254 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
3264 spinlock_t *src_ptl, *dst_ptl; 3255 spinlock_t *src_ptl, *dst_ptl;
3265
3266 src_pte = huge_pte_offset(src, addr, sz); 3256 src_pte = huge_pte_offset(src, addr, sz);
3267 if (!src_pte) 3257 if (!src_pte)
3268 continue; 3258 continue;
3269
3270 dst_pte = huge_pte_alloc(dst, addr, sz); 3259 dst_pte = huge_pte_alloc(dst, addr, sz);
3271 if (!dst_pte) { 3260 if (!dst_pte) {
3272 ret = -ENOMEM; 3261 ret = -ENOMEM;
@@ -3337,8 +3326,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3337 3326
3338 if (cow) 3327 if (cow)
3339 mmu_notifier_invalidate_range_end(&range); 3328 mmu_notifier_invalidate_range_end(&range);
3340 else
3341 i_mmap_unlock_read(mapping);
3342 3329
3343 return ret; 3330 return ret;
3344} 3331}
@@ -3755,16 +3742,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
3755 } 3742 }
3756 3743
3757 /* 3744 /*
3758 * We can not race with truncation due to holding i_mmap_rwsem. 3745 * Use page lock to guard against racing truncation
3759 * Check once here for faults beyond end of file. 3746 * before we get page_table_lock.
3760 */ 3747 */
3761 size = i_size_read(mapping->host) >> huge_page_shift(h);
3762 if (idx >= size)
3763 goto out;
3764
3765retry: 3748retry:
3766 page = find_lock_page(mapping, idx); 3749 page = find_lock_page(mapping, idx);
3767 if (!page) { 3750 if (!page) {
3751 size = i_size_read(mapping->host) >> huge_page_shift(h);
3752 if (idx >= size)
3753 goto out;
3754
3768 /* 3755 /*
3769 * Check for page in userfault range 3756 * Check for page in userfault range
3770 */ 3757 */
@@ -3784,18 +3771,14 @@ retry:
3784 }; 3771 };
3785 3772
3786 /* 3773 /*
3787 * hugetlb_fault_mutex and i_mmap_rwsem must be 3774 * hugetlb_fault_mutex must be dropped before
3788 * dropped before handling userfault. Reacquire 3775 * handling userfault. Reacquire after handling
3789 * after handling fault to make calling code simpler. 3776 * fault to make calling code simpler.
3790 */ 3777 */
3791 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, 3778 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
3792 idx, haddr); 3779 idx, haddr);
3793 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 3780 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
3794 i_mmap_unlock_read(mapping);
3795
3796 ret = handle_userfault(&vmf, VM_UFFD_MISSING); 3781 ret = handle_userfault(&vmf, VM_UFFD_MISSING);
3797
3798 i_mmap_lock_read(mapping);
3799 mutex_lock(&hugetlb_fault_mutex_table[hash]); 3782 mutex_lock(&hugetlb_fault_mutex_table[hash]);
3800 goto out; 3783 goto out;
3801 } 3784 }
@@ -3854,6 +3837,9 @@ retry:
3854 } 3837 }
3855 3838
3856 ptl = huge_pte_lock(h, mm, ptep); 3839 ptl = huge_pte_lock(h, mm, ptep);
3840 size = i_size_read(mapping->host) >> huge_page_shift(h);
3841 if (idx >= size)
3842 goto backout;
3857 3843
3858 ret = 0; 3844 ret = 0;
3859 if (!huge_pte_none(huge_ptep_get(ptep))) 3845 if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3940,11 +3926,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3940 3926
3941 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 3927 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
3942 if (ptep) { 3928 if (ptep) {
3943 /*
3944 * Since we hold no locks, ptep could be stale. That is
3945 * OK as we are only making decisions based on content and
3946 * not actually modifying content here.
3947 */
3948 entry = huge_ptep_get(ptep); 3929 entry = huge_ptep_get(ptep);
3949 if (unlikely(is_hugetlb_entry_migration(entry))) { 3930 if (unlikely(is_hugetlb_entry_migration(entry))) {
3950 migration_entry_wait_huge(vma, mm, ptep); 3931 migration_entry_wait_huge(vma, mm, ptep);
@@ -3952,33 +3933,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3952 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 3933 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
3953 return VM_FAULT_HWPOISON_LARGE | 3934 return VM_FAULT_HWPOISON_LARGE |
3954 VM_FAULT_SET_HINDEX(hstate_index(h)); 3935 VM_FAULT_SET_HINDEX(hstate_index(h));
3936 } else {
3937 ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
3938 if (!ptep)
3939 return VM_FAULT_OOM;
3955 } 3940 }
3956 3941
3957 /*
3958 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
3959 * until finished with ptep. This serves two purposes:
3960 * 1) It prevents huge_pmd_unshare from being called elsewhere
3961 * and making the ptep no longer valid.
3962 * 2) It synchronizes us with file truncation.
3963 *
3964 * ptep could have already be assigned via huge_pte_offset. That
3965 * is OK, as huge_pte_alloc will return the same value unless
3966 * something changed.
3967 */
3968 mapping = vma->vm_file->f_mapping; 3942 mapping = vma->vm_file->f_mapping;
3969 i_mmap_lock_read(mapping); 3943 idx = vma_hugecache_offset(h, vma, haddr);
3970 ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
3971 if (!ptep) {
3972 i_mmap_unlock_read(mapping);
3973 return VM_FAULT_OOM;
3974 }
3975 3944
3976 /* 3945 /*
3977 * Serialize hugepage allocation and instantiation, so that we don't 3946 * Serialize hugepage allocation and instantiation, so that we don't
3978 * get spurious allocation failures if two CPUs race to instantiate 3947 * get spurious allocation failures if two CPUs race to instantiate
3979 * the same page in the page cache. 3948 * the same page in the page cache.
3980 */ 3949 */
3981 idx = vma_hugecache_offset(h, vma, haddr);
3982 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); 3950 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
3983 mutex_lock(&hugetlb_fault_mutex_table[hash]); 3951 mutex_lock(&hugetlb_fault_mutex_table[hash]);
3984 3952
@@ -4066,7 +4034,6 @@ out_ptl:
4066 } 4034 }
4067out_mutex: 4035out_mutex:
4068 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 4036 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
4069 i_mmap_unlock_read(mapping);
4070 /* 4037 /*
4071 * Generally it's safe to hold refcount during waiting page lock. But 4038 * Generally it's safe to hold refcount during waiting page lock. But
4072 * here we just wait to defer the next page fault to avoid busy loop and 4039 * here we just wait to defer the next page fault to avoid busy loop and
@@ -4671,12 +4638,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
4671 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() 4638 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
4672 * and returns the corresponding pte. While this is not necessary for the 4639 * and returns the corresponding pte. While this is not necessary for the
4673 * !shared pmd case because we can allocate the pmd later as well, it makes the 4640 * !shared pmd case because we can allocate the pmd later as well, it makes the
4674 * code much cleaner. 4641 * code much cleaner. pmd allocation is essential for the shared case because
4675 * 4642 * pud has to be populated inside the same i_mmap_rwsem section - otherwise
4676 * This routine must be called with i_mmap_rwsem held in at least read mode. 4643 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
4677 * For hugetlbfs, this prevents removal of any page table entries associated 4644 * bad pmd for sharing.
4678 * with the address space. This is important as we are setting up sharing
4679 * based on existing page table entries (mappings).
4680 */ 4645 */
4681pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 4646pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
4682{ 4647{
@@ -4693,6 +4658,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
4693 if (!vma_shareable(vma, addr)) 4658 if (!vma_shareable(vma, addr))
4694 return (pte_t *)pmd_alloc(mm, pud, addr); 4659 return (pte_t *)pmd_alloc(mm, pud, addr);
4695 4660
4661 i_mmap_lock_write(mapping);
4696 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 4662 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
4697 if (svma == vma) 4663 if (svma == vma)
4698 continue; 4664 continue;
@@ -4722,6 +4688,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
4722 spin_unlock(ptl); 4688 spin_unlock(ptl);
4723out: 4689out:
4724 pte = (pte_t *)pmd_alloc(mm, pud, addr); 4690 pte = (pte_t *)pmd_alloc(mm, pud, addr);
4691 i_mmap_unlock_write(mapping);
4725 return pte; 4692 return pte;
4726} 4693}
4727 4694
@@ -4732,7 +4699,7 @@ out:
4732 * indicated by page_count > 1, unmap is achieved by clearing pud and 4699 * indicated by page_count > 1, unmap is achieved by clearing pud and
4733 * decrementing the ref count. If count == 1, the pte page is not shared. 4700 * decrementing the ref count. If count == 1, the pte page is not shared.
4734 * 4701 *
4735 * Called with page table lock held and i_mmap_rwsem held in write mode. 4702 * called with page table lock held.
4736 * 4703 *
4737 * returns: 1 successfully unmapped a shared pte page 4704 * returns: 1 successfully unmapped a shared pte page
4738 * 0 the underlying pte page is not shared, or it is the last user 4705 * 0 the underlying pte page is not shared, or it is the last user
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 03d5d1374ca7..73c9cbfdedf4 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -298,8 +298,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
298 return; 298 return;
299 } 299 }
300 300
301 cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE);
302
303 *flags |= SLAB_KASAN; 301 *flags |= SLAB_KASAN;
304} 302}
305 303
@@ -349,28 +347,43 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
349} 347}
350 348
351/* 349/*
352 * Since it's desirable to only call object contructors once during slab 350 * This function assigns a tag to an object considering the following:
353 * allocation, we preassign tags to all such objects. Also preassign tags for 351 * 1. A cache might have a constructor, which might save a pointer to a slab
354 * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports. 352 * object somewhere (e.g. in the object itself). We preassign a tag for
355 * For SLAB allocator we can't preassign tags randomly since the freelist is 353 * each object in caches with constructors during slab creation and reuse
356 * stored as an array of indexes instead of a linked list. Assign tags based 354 * the same tag each time a particular object is allocated.
357 * on objects indexes, so that objects that are next to each other get 355 * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be
358 * different tags. 356 * accessed after being freed. We preassign tags for objects in these
359 * After a tag is assigned, the object always gets allocated with the same tag. 357 * caches as well.
360 * The reason is that we can't change tags for objects with constructors on 358 * 3. For SLAB allocator we can't preassign tags randomly since the freelist
361 * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor 359 * is stored as an array of indexes instead of a linked list. Assign tags
362 * code can save the pointer to the object somewhere (e.g. in the object 360 * based on objects indexes, so that objects that are next to each other
363 * itself). Then if we retag it, the old saved pointer will become invalid. 361 * get different tags.
364 */ 362 */
365static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new) 363static u8 assign_tag(struct kmem_cache *cache, const void *object,
364 bool init, bool krealloc)
366{ 365{
366 /* Reuse the same tag for krealloc'ed objects. */
367 if (krealloc)
368 return get_tag(object);
369
370 /*
371 * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
372 * set, assign a tag when the object is being allocated (init == false).
373 */
367 if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) 374 if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
368 return new ? KASAN_TAG_KERNEL : random_tag(); 375 return init ? KASAN_TAG_KERNEL : random_tag();
369 376
377 /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
370#ifdef CONFIG_SLAB 378#ifdef CONFIG_SLAB
379 /* For SLAB assign tags based on the object index in the freelist. */
371 return (u8)obj_to_index(cache, virt_to_page(object), (void *)object); 380 return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
372#else 381#else
373 return new ? random_tag() : get_tag(object); 382 /*
383 * For SLUB assign a random tag during slab creation, otherwise reuse
384 * the already assigned tag.
385 */
386 return init ? random_tag() : get_tag(object);
374#endif 387#endif
375} 388}
376 389
@@ -386,7 +399,8 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
386 __memset(alloc_info, 0, sizeof(*alloc_info)); 399 __memset(alloc_info, 0, sizeof(*alloc_info));
387 400
388 if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) 401 if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
389 object = set_tag(object, assign_tag(cache, object, true)); 402 object = set_tag(object,
403 assign_tag(cache, object, true, false));
390 404
391 return (void *)object; 405 return (void *)object;
392} 406}
@@ -452,8 +466,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
452 return __kasan_slab_free(cache, object, ip, true); 466 return __kasan_slab_free(cache, object, ip, true);
453} 467}
454 468
455void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, 469static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
456 size_t size, gfp_t flags) 470 size_t size, gfp_t flags, bool krealloc)
457{ 471{
458 unsigned long redzone_start; 472 unsigned long redzone_start;
459 unsigned long redzone_end; 473 unsigned long redzone_end;
@@ -471,7 +485,7 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
471 KASAN_SHADOW_SCALE_SIZE); 485 KASAN_SHADOW_SCALE_SIZE);
472 486
473 if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) 487 if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
474 tag = assign_tag(cache, object, false); 488 tag = assign_tag(cache, object, false, krealloc);
475 489
476 /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ 490 /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
477 kasan_unpoison_shadow(set_tag(object, tag), size); 491 kasan_unpoison_shadow(set_tag(object, tag), size);
@@ -483,6 +497,12 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
483 497
484 return set_tag(object, tag); 498 return set_tag(object, tag);
485} 499}
500
501void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
502 size_t size, gfp_t flags)
503{
504 return __kasan_kmalloc(cache, object, size, flags, false);
505}
486EXPORT_SYMBOL(kasan_kmalloc); 506EXPORT_SYMBOL(kasan_kmalloc);
487 507
488void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, 508void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
@@ -522,7 +542,8 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
522 if (unlikely(!PageSlab(page))) 542 if (unlikely(!PageSlab(page)))
523 return kasan_kmalloc_large(object, size, flags); 543 return kasan_kmalloc_large(object, size, flags);
524 else 544 else
525 return kasan_kmalloc(page->slab_cache, object, size, flags); 545 return __kasan_kmalloc(page->slab_cache, object, size,
546 flags, true);
526} 547}
527 548
528void kasan_poison_kfree(void *ptr, unsigned long ip) 549void kasan_poison_kfree(void *ptr, unsigned long ip)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6379fff1a5ff..7c72f2a95785 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
966 enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 966 enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
967 struct address_space *mapping; 967 struct address_space *mapping;
968 LIST_HEAD(tokill); 968 LIST_HEAD(tokill);
969 bool unmap_success = true; 969 bool unmap_success;
970 int kill = 1, forcekill; 970 int kill = 1, forcekill;
971 struct page *hpage = *hpagep; 971 struct page *hpage = *hpagep;
972 bool mlocked = PageMlocked(hpage); 972 bool mlocked = PageMlocked(hpage);
@@ -1028,19 +1028,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
1028 if (kill) 1028 if (kill)
1029 collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); 1029 collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
1030 1030
1031 if (!PageHuge(hpage)) { 1031 unmap_success = try_to_unmap(hpage, ttu);
1032 unmap_success = try_to_unmap(hpage, ttu);
1033 } else if (mapping) {
1034 /*
1035 * For hugetlb pages, try_to_unmap could potentially call
1036 * huge_pmd_unshare. Because of this, take semaphore in
1037 * write mode here and set TTU_RMAP_LOCKED to indicate we
1038 * have taken the lock at this higer level.
1039 */
1040 i_mmap_lock_write(mapping);
1041 unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
1042 i_mmap_unlock_write(mapping);
1043 }
1044 if (!unmap_success) 1032 if (!unmap_success)
1045 pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", 1033 pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
1046 pfn, page_mapcount(hpage)); 1034 pfn, page_mapcount(hpage));
diff --git a/mm/memory.c b/mm/memory.c
index a52663c0612d..e11ca9dd823f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2994,6 +2994,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
2994 struct vm_area_struct *vma = vmf->vma; 2994 struct vm_area_struct *vma = vmf->vma;
2995 vm_fault_t ret; 2995 vm_fault_t ret;
2996 2996
2997 /*
2998 * Preallocate pte before we take page_lock because this might lead to
2999 * deadlocks for memcg reclaim which waits for pages under writeback:
3000 * lock_page(A)
3001 * SetPageWriteback(A)
3002 * unlock_page(A)
3003 * lock_page(B)
3004 * lock_page(B)
3005 * pte_alloc_one
3006 * shrink_page_list
3007 * wait_on_page_writeback(A)
3008 * SetPageWriteback(B)
3009 * unlock_page(B)
3010 * # flush A, B to clear the writeback
3011 */
3012 if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
3013 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3014 if (!vmf->prealloc_pte)
3015 return VM_FAULT_OOM;
3016 smp_wmb(); /* See comment in __pte_alloc() */
3017 }
3018
2997 ret = vma->vm_ops->fault(vmf); 3019 ret = vma->vm_ops->fault(vmf);
2998 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | 3020 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
2999 VM_FAULT_DONE_COW))) 3021 VM_FAULT_DONE_COW)))
@@ -4077,8 +4099,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4077 goto out; 4099 goto out;
4078 4100
4079 if (range) { 4101 if (range) {
4080 range->start = address & PAGE_MASK; 4102 mmu_notifier_range_init(range, mm, address & PAGE_MASK,
4081 range->end = range->start + PAGE_SIZE; 4103 (address & PAGE_MASK) + PAGE_SIZE);
4082 mmu_notifier_invalidate_range_start(range); 4104 mmu_notifier_invalidate_range_start(range);
4083 } 4105 }
4084 ptep = pte_offset_map_lock(mm, pmd, address, ptlp); 4106 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
diff --git a/mm/migrate.c b/mm/migrate.c
index ccf8966caf6f..a16b15090df3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1324,19 +1324,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1324 goto put_anon; 1324 goto put_anon;
1325 1325
1326 if (page_mapped(hpage)) { 1326 if (page_mapped(hpage)) {
1327 struct address_space *mapping = page_mapping(hpage);
1328
1329 /*
1330 * try_to_unmap could potentially call huge_pmd_unshare.
1331 * Because of this, take semaphore in write mode here and
1332 * set TTU_RMAP_LOCKED to let lower levels know we have
1333 * taken the lock.
1334 */
1335 i_mmap_lock_write(mapping);
1336 try_to_unmap(hpage, 1327 try_to_unmap(hpage,
1337 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| 1328 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1338 TTU_RMAP_LOCKED);
1339 i_mmap_unlock_write(mapping);
1340 page_was_mapped = 1; 1329 page_was_mapped = 1;
1341 } 1330 }
1342 1331
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cde5dac6229a..d295c9bc01a8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2214,7 +2214,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
2214 */ 2214 */
2215 boost_watermark(zone); 2215 boost_watermark(zone);
2216 if (alloc_flags & ALLOC_KSWAPD) 2216 if (alloc_flags & ALLOC_KSWAPD)
2217 wakeup_kswapd(zone, 0, 0, zone_idx(zone)); 2217 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
2218 2218
2219 /* We are not allowed to try stealing from the whole block */ 2219 /* We are not allowed to try stealing from the whole block */
2220 if (!whole_block) 2220 if (!whole_block)
@@ -3102,6 +3102,12 @@ struct page *rmqueue(struct zone *preferred_zone,
3102 local_irq_restore(flags); 3102 local_irq_restore(flags);
3103 3103
3104out: 3104out:
3105 /* Separate test+clear to avoid unnecessary atomics */
3106 if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
3107 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3108 wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3109 }
3110
3105 VM_BUG_ON_PAGE(page && bad_range(zone, page), page); 3111 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
3106 return page; 3112 return page;
3107 3113
diff --git a/mm/rmap.c b/mm/rmap.c
index 21a26cf51114..68a1a5b869a5 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,7 +25,6 @@
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) 26 * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
27 * mapping->i_mmap_rwsem 27 * mapping->i_mmap_rwsem
28 * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
29 * anon_vma->rwsem 28 * anon_vma->rwsem
30 * mm->page_table_lock or pte_lock 29 * mm->page_table_lock or pte_lock
31 * zone_lru_lock (in mark_page_accessed, isolate_lru_page) 30 * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -1379,9 +1378,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1379 /* 1378 /*
1380 * If sharing is possible, start and end will be adjusted 1379 * If sharing is possible, start and end will be adjusted
1381 * accordingly. 1380 * accordingly.
1382 *
1383 * If called for a huge page, caller must hold i_mmap_rwsem
1384 * in write mode as it is possible to call huge_pmd_unshare.
1385 */ 1381 */
1386 adjust_range_if_pmd_sharing_possible(vma, &range.start, 1382 adjust_range_if_pmd_sharing_possible(vma, &range.start,
1387 &range.end); 1383 &range.end);
diff --git a/mm/slab.c b/mm/slab.c
index 73fe23e649c9..78eb8c5bf4e4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -666,8 +666,10 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
666 struct alien_cache *alc = NULL; 666 struct alien_cache *alc = NULL;
667 667
668 alc = kmalloc_node(memsize, gfp, node); 668 alc = kmalloc_node(memsize, gfp, node);
669 init_arraycache(&alc->ac, entries, batch); 669 if (alc) {
670 spin_lock_init(&alc->lock); 670 init_arraycache(&alc->ac, entries, batch);
671 spin_lock_init(&alc->lock);
672 }
671 return alc; 673 return alc;
672} 674}
673 675
diff --git a/mm/slub.c b/mm/slub.c
index 36c0befeebd8..1e3d0ec4e200 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3846,6 +3846,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
3846 unsigned int offset; 3846 unsigned int offset;
3847 size_t object_size; 3847 size_t object_size;
3848 3848
3849 ptr = kasan_reset_tag(ptr);
3850
3849 /* Find object and usable object size. */ 3851 /* Find object and usable object size. */
3850 s = page->slab_cache; 3852 s = page->slab_cache;
3851 3853
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 852eb4e53f06..14faadcedd06 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -247,7 +247,8 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
247/* 247/*
248 * Validates that the given object is: 248 * Validates that the given object is:
249 * - not bogus address 249 * - not bogus address
250 * - known-safe heap or stack object 250 * - fully contained by stack (or stack frame, when available)
251 * - fully within SLAB object (or object whitelist area, when available)
251 * - not in kernel text 252 * - not in kernel text
252 */ 253 */
253void __check_object_size(const void *ptr, unsigned long n, bool to_user) 254void __check_object_size(const void *ptr, unsigned long n, bool to_user)
@@ -262,9 +263,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
262 /* Check for invalid addresses. */ 263 /* Check for invalid addresses. */
263 check_bogus_address((const unsigned long)ptr, n, to_user); 264 check_bogus_address((const unsigned long)ptr, n, to_user);
264 265
265 /* Check for bad heap object. */
266 check_heap_object(ptr, n, to_user);
267
268 /* Check for bad stack object. */ 266 /* Check for bad stack object. */
269 switch (check_stack_object(ptr, n)) { 267 switch (check_stack_object(ptr, n)) {
270 case NOT_STACK: 268 case NOT_STACK:
@@ -282,6 +280,9 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
282 usercopy_abort("process stack", NULL, to_user, 0, n); 280 usercopy_abort("process stack", NULL, to_user, 0, n);
283 } 281 }
284 282
283 /* Check for bad heap object. */
284 check_heap_object(ptr, n, to_user);
285
285 /* Check for object in kernel to avoid text exposure. */ 286 /* Check for object in kernel to avoid text exposure. */
286 check_kernel_text_object((const unsigned long)ptr, n, to_user); 287 check_kernel_text_object((const unsigned long)ptr, n, to_user);
287} 288}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 065c1ce191c4..d59b5a73dfb3 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -267,14 +267,10 @@ retry:
267 VM_BUG_ON(dst_addr & ~huge_page_mask(h)); 267 VM_BUG_ON(dst_addr & ~huge_page_mask(h));
268 268
269 /* 269 /*
270 * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. 270 * Serialize via hugetlb_fault_mutex
271 * i_mmap_rwsem ensures the dst_pte remains valid even
272 * in the case of shared pmds. fault mutex prevents
273 * races with other faulting threads.
274 */ 271 */
275 mapping = dst_vma->vm_file->f_mapping;
276 i_mmap_lock_read(mapping);
277 idx = linear_page_index(dst_vma, dst_addr); 272 idx = linear_page_index(dst_vma, dst_addr);
273 mapping = dst_vma->vm_file->f_mapping;
278 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, 274 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
279 idx, dst_addr); 275 idx, dst_addr);
280 mutex_lock(&hugetlb_fault_mutex_table[hash]); 276 mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -283,7 +279,6 @@ retry:
283 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); 279 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
284 if (!dst_pte) { 280 if (!dst_pte) {
285 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 281 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
286 i_mmap_unlock_read(mapping);
287 goto out_unlock; 282 goto out_unlock;
288 } 283 }
289 284
@@ -291,7 +286,6 @@ retry:
291 dst_pteval = huge_ptep_get(dst_pte); 286 dst_pteval = huge_ptep_get(dst_pte);
292 if (!huge_pte_none(dst_pteval)) { 287 if (!huge_pte_none(dst_pteval)) {
293 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 288 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
294 i_mmap_unlock_read(mapping);
295 goto out_unlock; 289 goto out_unlock;
296 } 290 }
297 291
@@ -299,7 +293,6 @@ retry:
299 dst_addr, src_addr, &page); 293 dst_addr, src_addr, &page);
300 294
301 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 295 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
302 i_mmap_unlock_read(mapping);
303 vm_alloc_shared = vm_shared; 296 vm_alloc_shared = vm_shared;
304 297
305 cond_resched(); 298 cond_resched();
diff --git a/mm/util.c b/mm/util.c
index 4df23d64aac7..1ea055138043 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -478,7 +478,7 @@ bool page_mapped(struct page *page)
478 return true; 478 return true;
479 if (PageHuge(page)) 479 if (PageHuge(page))
480 return false; 480 return false;
481 for (i = 0; i < hpage_nr_pages(page); i++) { 481 for (i = 0; i < (1 << compound_order(page)); i++) {
482 if (atomic_read(&page[i]._mapcount) >= 0) 482 if (atomic_read(&page[i]._mapcount) >= 0)
483 return true; 483 return true;
484 } 484 }
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index 18fc112b65cd..d3a8755c039c 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -5,7 +5,9 @@
5 * Example use: 5 * Example use:
6 * cat /sys/kernel/debug/page_owner > page_owner_full.txt 6 * cat /sys/kernel/debug/page_owner > page_owner_full.txt
7 * grep -v ^PFN page_owner_full.txt > page_owner.txt 7 * grep -v ^PFN page_owner_full.txt > page_owner.txt
8 * ./sort page_owner.txt sorted_page_owner.txt 8 * ./page_owner_sort page_owner.txt sorted_page_owner.txt
9 *
10 * See Documentation/vm/page_owner.rst
9*/ 11*/
10 12
11#include <stdio.h> 13#include <stdio.h>