-rw-r--r-- | Documentation/ABI/testing/sysfs-block-zram | 11
-rw-r--r-- | Documentation/blockdev/zram.txt            | 74
-rw-r--r-- | arch/arm64/include/asm/cache.h             |  6
-rw-r--r-- | drivers/block/zram/zram_drv.c              | 90
-rw-r--r-- | drivers/block/zram/zram_drv.h              |  5
-rw-r--r-- | fs/hugetlbfs/inode.c                       | 61
-rw-r--r-- | include/linux/mmzone.h                     |  6
-rw-r--r-- | kernel/fork.c                              |  1
-rw-r--r-- | mm/hugetlb.c                               | 81
-rw-r--r-- | mm/kasan/common.c                          | 65
-rw-r--r-- | mm/memory-failure.c                        | 16
-rw-r--r-- | mm/memory.c                                | 26
-rw-r--r-- | mm/migrate.c                               | 13
-rw-r--r-- | mm/page_alloc.c                            |  8
-rw-r--r-- | mm/rmap.c                                  |  4
-rw-r--r-- | mm/slab.c                                  |  6
-rw-r--r-- | mm/slub.c                                  |  2
-rw-r--r-- | mm/usercopy.c                              |  9
-rw-r--r-- | mm/userfaultfd.c                           | 11
-rw-r--r-- | mm/util.c                                  |  2
-rw-r--r-- | tools/vm/page_owner_sort.c                 |  4
21 files changed, 289 insertions, 212 deletions
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 9d2339a485c8..14b2bf2e5105 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -122,11 +122,18 @@ Description: | |||
122 | statistics (bd_count, bd_reads, bd_writes) in a format | 122 | statistics (bd_count, bd_reads, bd_writes) in a format |
123 | similar to block layer statistics file format. | 123 | similar to block layer statistics file format. |
124 | 124 | ||
125 | What: /sys/block/zram<id>/writeback_limit_enable | ||
126 | Date: November 2018 | ||
127 | Contact: Minchan Kim <minchan@kernel.org> | ||
128 | Description: | ||
129 | The writeback_limit_enable file is read-write and specifies | ||
130 | enabling of the writeback_limit feature. "1" means enable the feature. | ||
131 | "0" (no limit) is the initial state. | ||
132 | |||
125 | What: /sys/block/zram<id>/writeback_limit | 133 | What: /sys/block/zram<id>/writeback_limit |
126 | Date: November 2018 | 134 | Date: November 2018 |
127 | Contact: Minchan Kim <minchan@kernel.org> | 135 | Contact: Minchan Kim <minchan@kernel.org> |
128 | Description: | 136 | Description: |
129 | The writeback_limit file is read-write and specifies the maximum | 137 | The writeback_limit file is read-write and specifies the maximum |
130 | amount of writeback ZRAM can do. The limit could be changed | 138 | amount of writeback ZRAM can do. The limit could be changed |
131 | in run time and "0" means disable the limit. | 139 | in run time. |
132 | No limit is the initial state. | ||
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 436c5e98e1b6..4df0ce271085 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -156,22 +156,23 @@ Per-device statistics are exported as various nodes under /sys/block/zram<id>/ | |||
156 | A brief description of exported device attributes. For more details please | 156 | A brief description of exported device attributes. For more details please |
157 | read Documentation/ABI/testing/sysfs-block-zram. | 157 | read Documentation/ABI/testing/sysfs-block-zram. |
158 | 158 | ||
159 | Name access description | 159 | Name access description |
160 | ---- ------ ----------- | 160 | ---- ------ ----------- |
161 | disksize RW show and set the device's disk size | 161 | disksize RW show and set the device's disk size |
162 | initstate RO shows the initialization state of the device | 162 | initstate RO shows the initialization state of the device |
163 | reset WO trigger device reset | 163 | reset WO trigger device reset |
164 | mem_used_max WO reset the `mem_used_max' counter (see later) | 164 | mem_used_max WO reset the `mem_used_max' counter (see later) |
165 | mem_limit WO specifies the maximum amount of memory ZRAM can use | 165 | mem_limit WO specifies the maximum amount of memory ZRAM can use |
166 | to store the compressed data | 166 | to store the compressed data |
167 | writeback_limit WO specifies the maximum amount of write IO zram can | 167 | writeback_limit WO specifies the maximum amount of write IO zram can |
168 | write out to backing device as 4KB unit | 168 | write out to backing device as 4KB unit |
169 | max_comp_streams RW the number of possible concurrent compress operations | 169 | writeback_limit_enable RW show and set writeback_limit feature |
170 | comp_algorithm RW show and change the compression algorithm | 170 | max_comp_streams RW the number of possible concurrent compress operations |
171 | compact WO trigger memory compaction | 171 | comp_algorithm RW show and change the compression algorithm |
172 | debug_stat RO this file is used for zram debugging purposes | 172 | compact WO trigger memory compaction |
173 | backing_dev RW set up backend storage for zram to write out | 173 | debug_stat RO this file is used for zram debugging purposes |
174 | idle WO mark allocated slot as idle | 174 | backing_dev RW set up backend storage for zram to write out |
175 | idle WO mark allocated slot as idle | ||
175 | 176 | ||
176 | 177 | ||
177 | User space is advised to use the following files to read the device statistics. | 178 | User space is advised to use the following files to read the device statistics. |
@@ -280,32 +281,51 @@ With the command, zram writeback idle pages from memory to the storage. | |||
280 | If there are lots of write IO with flash device, potentially, it has | 281 | If there are lots of write IO with flash device, potentially, it has |
281 | flash wearout problem so that admin needs to design write limitation | 282 | flash wearout problem so that admin needs to design write limitation |
282 | to guarantee storage health for entire product life. | 283 | to guarantee storage health for entire product life. |
283 | To overcome the concern, zram supports "writeback_limit". | 284 | |
284 | The "writeback_limit"'s default value is 0 so that it doesn't limit | 285 | To overcome the concern, zram supports "writeback_limit" feature. |
285 | any writeback. If admin want to measure writeback count in a certain | 286 | The "writeback_limit_enable"'s default value is 0 so that it doesn't limit |
286 | period, he could know it via /sys/block/zram0/bd_stat's 3rd column. | 287 | any writeback. IOW, if admin want to apply writeback budget, he should |
288 | enable writeback_limit_enable via | ||
289 | |||
290 | $ echo 1 > /sys/block/zramX/writeback_limit_enable | ||
291 | |||
292 | Once writeback_limit_enable is set, zram doesn't allow any writeback | ||
293 | until admin set the budget via /sys/block/zramX/writeback_limit. | ||
294 | |||
295 | (If admin doesn't enable writeback_limit_enable, writeback_limit's value | ||
296 | assigned via /sys/block/zramX/writeback_limit is meaningless.) | ||
287 | 297 | ||
288 | If admin want to limit writeback as per-day 400M, he could do it | 298 | If admin want to limit writeback as per-day 400M, he could do it |
289 | like below. | 299 | like below. |
290 | 300 | ||
291 | MB_SHIFT=20 | 301 | $ MB_SHIFT=20 |
292 | 4K_SHIFT=12 | 302 | $ 4K_SHIFT=12 |
293 | echo $((400<<MB_SHIFT>>4K_SHIFT)) > \ | 303 | $ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \ |
294 | /sys/block/zram0/writeback_limit. | 304 | /sys/block/zram0/writeback_limit. |
305 | $ echo 1 > /sys/block/zram0/writeback_limit_enable | ||
295 | 306 | ||
296 | If admin want to allow further write again, he could do it like below | 307 | If admin want to allow further write again once the budget is exhausted, |
308 | he could do it like below | ||
297 | 309 | ||
298 | echo 0 > /sys/block/zram0/writeback_limit | 310 | $ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \ |
311 | /sys/block/zram0/writeback_limit | ||
299 | 312 | ||
300 | If admin want to see remaining writeback budget since he set, | 313 | If admin want to see remaining writeback budget since he set, |
301 | 314 | ||
302 | cat /sys/block/zram0/writeback_limit | 315 | $ cat /sys/block/zramX/writeback_limit |
316 | |||
317 | If admin want to disable writeback limit, he could do | ||
318 | |||
319 | $ echo 0 > /sys/block/zramX/writeback_limit_enable | ||
303 | 320 | ||
304 | The writeback_limit count will reset whenever you reset zram(e.g., | 321 | The writeback_limit count will reset whenever you reset zram(e.g., |
305 | system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of | 322 | system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of |
306 | writeback happened until you reset the zram to allocate extra writeback | 323 | writeback happened until you reset the zram to allocate extra writeback |
307 | budget in next setting is user's job. | 324 | budget in next setting is user's job. |
308 | 325 | ||
326 | If admin want to measure writeback count in a certain period, he could | ||
327 | know it via /sys/block/zram0/bd_stat's 3rd column. | ||
328 | |||
309 | = memory tracking | 329 | = memory tracking |
310 | 330 | ||
311 | With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the | 331 | With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the |
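The zram.txt hunks above describe the new writeback budget entirely in terms of sysfs writes. As a hedged illustration (not part of the patch), the standalone C helper below applies a 400MB budget to zram0 the same way the documented shell commands do; the device name and the 400MB figure are assumptions.

#include <stdio.h>
#include <stdlib.h>

/* Write a single numeric value to a sysfs attribute. */
static int write_sysfs(const char *path, unsigned long long val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%llu\n", val);
	return fclose(f);
}

int main(void)
{
	/* The budget is given in 4KB units: 400MB -> (400 << 20) >> 12. */
	unsigned long long budget_4k = (400ULL << 20) >> 12;

	if (write_sysfs("/sys/block/zram0/writeback_limit", budget_4k) ||
	    write_sysfs("/sys/block/zram0/writeback_limit_enable", 1))
		return EXIT_FAILURE;
	return EXIT_SUCCESS;
}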
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 13dd42c3ad4e..eb43e09c1980 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -58,6 +58,12 @@ | |||
58 | */ | 58 | */ |
59 | #define ARCH_DMA_MINALIGN (128) | 59 | #define ARCH_DMA_MINALIGN (128) |
60 | 60 | ||
61 | #ifdef CONFIG_KASAN_SW_TAGS | ||
62 | #define ARCH_SLAB_MINALIGN (1ULL << KASAN_SHADOW_SCALE_SHIFT) | ||
63 | #else | ||
64 | #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) | ||
65 | #endif | ||
66 | |||
61 | #ifndef __ASSEMBLY__ | 67 | #ifndef __ASSEMBLY__ |
62 | 68 | ||
63 | #include <linux/bitops.h> | 69 | #include <linux/bitops.h> |
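The new ARCH_SLAB_MINALIGN above ties slab object alignment to the KASAN tag granule so that no two objects share one granule. A minimal sketch of the effect, assuming a 16-byte (1 << 4) granule as with arm64 software tags; the macro and helper names here are illustrative, not kernel API.

#include <stddef.h>

#define KASAN_TAG_GRANULE_SHIFT 4			/* assumption: 16-byte granule */
#define SLAB_MINALIGN (1UL << KASAN_TAG_GRANULE_SHIFT)

/* Round an object size up so each object occupies whole tag granules. */
static size_t align_obj_size(size_t size)
{
	return (size + SLAB_MINALIGN - 1) & ~(SLAB_MINALIGN - 1);
}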
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 33c5cc879f24..04ca65912638 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -316,11 +316,9 @@ static ssize_t idle_store(struct device *dev, | |||
316 | * See the comment in writeback_store. | 316 | * See the comment in writeback_store. |
317 | */ | 317 | */ |
318 | zram_slot_lock(zram, index); | 318 | zram_slot_lock(zram, index); |
319 | if (!zram_allocated(zram, index) || | 319 | if (zram_allocated(zram, index) && |
320 | zram_test_flag(zram, index, ZRAM_UNDER_WB)) | 320 | !zram_test_flag(zram, index, ZRAM_UNDER_WB)) |
321 | goto next; | 321 | zram_set_flag(zram, index, ZRAM_IDLE); |
322 | zram_set_flag(zram, index, ZRAM_IDLE); | ||
323 | next: | ||
324 | zram_slot_unlock(zram, index); | 322 | zram_slot_unlock(zram, index); |
325 | } | 323 | } |
326 | 324 | ||
@@ -330,6 +328,41 @@ next: | |||
330 | } | 328 | } |
331 | 329 | ||
332 | #ifdef CONFIG_ZRAM_WRITEBACK | 330 | #ifdef CONFIG_ZRAM_WRITEBACK |
331 | static ssize_t writeback_limit_enable_store(struct device *dev, | ||
332 | struct device_attribute *attr, const char *buf, size_t len) | ||
333 | { | ||
334 | struct zram *zram = dev_to_zram(dev); | ||
335 | u64 val; | ||
336 | ssize_t ret = -EINVAL; | ||
337 | |||
338 | if (kstrtoull(buf, 10, &val)) | ||
339 | return ret; | ||
340 | |||
341 | down_read(&zram->init_lock); | ||
342 | spin_lock(&zram->wb_limit_lock); | ||
343 | zram->wb_limit_enable = val; | ||
344 | spin_unlock(&zram->wb_limit_lock); | ||
345 | up_read(&zram->init_lock); | ||
346 | ret = len; | ||
347 | |||
348 | return ret; | ||
349 | } | ||
350 | |||
351 | static ssize_t writeback_limit_enable_show(struct device *dev, | ||
352 | struct device_attribute *attr, char *buf) | ||
353 | { | ||
354 | bool val; | ||
355 | struct zram *zram = dev_to_zram(dev); | ||
356 | |||
357 | down_read(&zram->init_lock); | ||
358 | spin_lock(&zram->wb_limit_lock); | ||
359 | val = zram->wb_limit_enable; | ||
360 | spin_unlock(&zram->wb_limit_lock); | ||
361 | up_read(&zram->init_lock); | ||
362 | |||
363 | return scnprintf(buf, PAGE_SIZE, "%d\n", val); | ||
364 | } | ||
365 | |||
333 | static ssize_t writeback_limit_store(struct device *dev, | 366 | static ssize_t writeback_limit_store(struct device *dev, |
334 | struct device_attribute *attr, const char *buf, size_t len) | 367 | struct device_attribute *attr, const char *buf, size_t len) |
335 | { | 368 | { |
@@ -341,9 +374,9 @@ static ssize_t writeback_limit_store(struct device *dev, | |||
341 | return ret; | 374 | return ret; |
342 | 375 | ||
343 | down_read(&zram->init_lock); | 376 | down_read(&zram->init_lock); |
344 | atomic64_set(&zram->stats.bd_wb_limit, val); | 377 | spin_lock(&zram->wb_limit_lock); |
345 | if (val == 0) | 378 | zram->bd_wb_limit = val; |
346 | zram->stop_writeback = false; | 379 | spin_unlock(&zram->wb_limit_lock); |
347 | up_read(&zram->init_lock); | 380 | up_read(&zram->init_lock); |
348 | ret = len; | 381 | ret = len; |
349 | 382 | ||
@@ -357,7 +390,9 @@ static ssize_t writeback_limit_show(struct device *dev, | |||
357 | struct zram *zram = dev_to_zram(dev); | 390 | struct zram *zram = dev_to_zram(dev); |
358 | 391 | ||
359 | down_read(&zram->init_lock); | 392 | down_read(&zram->init_lock); |
360 | val = atomic64_read(&zram->stats.bd_wb_limit); | 393 | spin_lock(&zram->wb_limit_lock); |
394 | val = zram->bd_wb_limit; | ||
395 | spin_unlock(&zram->wb_limit_lock); | ||
361 | up_read(&zram->init_lock); | 396 | up_read(&zram->init_lock); |
362 | 397 | ||
363 | return scnprintf(buf, PAGE_SIZE, "%llu\n", val); | 398 | return scnprintf(buf, PAGE_SIZE, "%llu\n", val); |
@@ -588,8 +623,8 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, | |||
588 | return 1; | 623 | return 1; |
589 | } | 624 | } |
590 | 625 | ||
591 | #define HUGE_WRITEBACK 0x1 | 626 | #define HUGE_WRITEBACK 1 |
592 | #define IDLE_WRITEBACK 0x2 | 627 | #define IDLE_WRITEBACK 2 |
593 | 628 | ||
594 | static ssize_t writeback_store(struct device *dev, | 629 | static ssize_t writeback_store(struct device *dev, |
595 | struct device_attribute *attr, const char *buf, size_t len) | 630 | struct device_attribute *attr, const char *buf, size_t len) |
@@ -602,7 +637,7 @@ static ssize_t writeback_store(struct device *dev, | |||
602 | struct page *page; | 637 | struct page *page; |
603 | ssize_t ret, sz; | 638 | ssize_t ret, sz; |
604 | char mode_buf[8]; | 639 | char mode_buf[8]; |
605 | unsigned long mode = -1UL; | 640 | int mode = -1; |
606 | unsigned long blk_idx = 0; | 641 | unsigned long blk_idx = 0; |
607 | 642 | ||
608 | sz = strscpy(mode_buf, buf, sizeof(mode_buf)); | 643 | sz = strscpy(mode_buf, buf, sizeof(mode_buf)); |
@@ -618,7 +653,7 @@ static ssize_t writeback_store(struct device *dev, | |||
618 | else if (!strcmp(mode_buf, "huge")) | 653 | else if (!strcmp(mode_buf, "huge")) |
619 | mode = HUGE_WRITEBACK; | 654 | mode = HUGE_WRITEBACK; |
620 | 655 | ||
621 | if (mode == -1UL) | 656 | if (mode == -1) |
622 | return -EINVAL; | 657 | return -EINVAL; |
623 | 658 | ||
624 | down_read(&zram->init_lock); | 659 | down_read(&zram->init_lock); |
@@ -645,10 +680,13 @@ static ssize_t writeback_store(struct device *dev, | |||
645 | bvec.bv_len = PAGE_SIZE; | 680 | bvec.bv_len = PAGE_SIZE; |
646 | bvec.bv_offset = 0; | 681 | bvec.bv_offset = 0; |
647 | 682 | ||
648 | if (zram->stop_writeback) { | 683 | spin_lock(&zram->wb_limit_lock); |
684 | if (zram->wb_limit_enable && !zram->bd_wb_limit) { | ||
685 | spin_unlock(&zram->wb_limit_lock); | ||
649 | ret = -EIO; | 686 | ret = -EIO; |
650 | break; | 687 | break; |
651 | } | 688 | } |
689 | spin_unlock(&zram->wb_limit_lock); | ||
652 | 690 | ||
653 | if (!blk_idx) { | 691 | if (!blk_idx) { |
654 | blk_idx = alloc_block_bdev(zram); | 692 | blk_idx = alloc_block_bdev(zram); |
@@ -667,10 +705,11 @@ static ssize_t writeback_store(struct device *dev, | |||
667 | zram_test_flag(zram, index, ZRAM_UNDER_WB)) | 705 | zram_test_flag(zram, index, ZRAM_UNDER_WB)) |
668 | goto next; | 706 | goto next; |
669 | 707 | ||
670 | if ((mode & IDLE_WRITEBACK && | 708 | if (mode == IDLE_WRITEBACK && |
671 | !zram_test_flag(zram, index, ZRAM_IDLE)) && | 709 | !zram_test_flag(zram, index, ZRAM_IDLE)) |
672 | (mode & HUGE_WRITEBACK && | 710 | goto next; |
673 | !zram_test_flag(zram, index, ZRAM_HUGE))) | 711 | if (mode == HUGE_WRITEBACK && |
712 | !zram_test_flag(zram, index, ZRAM_HUGE)) | ||
674 | goto next; | 713 | goto next; |
675 | /* | 714 | /* |
676 | * Clearing ZRAM_UNDER_WB is duty of caller. | 715 | * Clearing ZRAM_UNDER_WB is duty of caller. |
@@ -732,11 +771,10 @@ static ssize_t writeback_store(struct device *dev, | |||
732 | zram_set_element(zram, index, blk_idx); | 771 | zram_set_element(zram, index, blk_idx); |
733 | blk_idx = 0; | 772 | blk_idx = 0; |
734 | atomic64_inc(&zram->stats.pages_stored); | 773 | atomic64_inc(&zram->stats.pages_stored); |
735 | if (atomic64_add_unless(&zram->stats.bd_wb_limit, | 774 | spin_lock(&zram->wb_limit_lock); |
736 | -1 << (PAGE_SHIFT - 12), 0)) { | 775 | if (zram->wb_limit_enable && zram->bd_wb_limit > 0) |
737 | if (atomic64_read(&zram->stats.bd_wb_limit) == 0) | 776 | zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12); |
738 | zram->stop_writeback = true; | 777 | spin_unlock(&zram->wb_limit_lock); |
739 | } | ||
740 | next: | 778 | next: |
741 | zram_slot_unlock(zram, index); | 779 | zram_slot_unlock(zram, index); |
742 | } | 780 | } |
@@ -1812,6 +1850,7 @@ static DEVICE_ATTR_RW(comp_algorithm); | |||
1812 | static DEVICE_ATTR_RW(backing_dev); | 1850 | static DEVICE_ATTR_RW(backing_dev); |
1813 | static DEVICE_ATTR_WO(writeback); | 1851 | static DEVICE_ATTR_WO(writeback); |
1814 | static DEVICE_ATTR_RW(writeback_limit); | 1852 | static DEVICE_ATTR_RW(writeback_limit); |
1853 | static DEVICE_ATTR_RW(writeback_limit_enable); | ||
1815 | #endif | 1854 | #endif |
1816 | 1855 | ||
1817 | static struct attribute *zram_disk_attrs[] = { | 1856 | static struct attribute *zram_disk_attrs[] = { |
@@ -1828,6 +1867,7 @@ static struct attribute *zram_disk_attrs[] = { | |||
1828 | &dev_attr_backing_dev.attr, | 1867 | &dev_attr_backing_dev.attr, |
1829 | &dev_attr_writeback.attr, | 1868 | &dev_attr_writeback.attr, |
1830 | &dev_attr_writeback_limit.attr, | 1869 | &dev_attr_writeback_limit.attr, |
1870 | &dev_attr_writeback_limit_enable.attr, | ||
1831 | #endif | 1871 | #endif |
1832 | &dev_attr_io_stat.attr, | 1872 | &dev_attr_io_stat.attr, |
1833 | &dev_attr_mm_stat.attr, | 1873 | &dev_attr_mm_stat.attr, |
@@ -1867,7 +1907,9 @@ static int zram_add(void) | |||
1867 | device_id = ret; | 1907 | device_id = ret; |
1868 | 1908 | ||
1869 | init_rwsem(&zram->init_lock); | 1909 | init_rwsem(&zram->init_lock); |
1870 | 1910 | #ifdef CONFIG_ZRAM_WRITEBACK | |
1911 | spin_lock_init(&zram->wb_limit_lock); | ||
1912 | #endif | ||
1871 | queue = blk_alloc_queue(GFP_KERNEL); | 1913 | queue = blk_alloc_queue(GFP_KERNEL); |
1872 | if (!queue) { | 1914 | if (!queue) { |
1873 | pr_err("Error allocating disk queue for device %d\n", | 1915 | pr_err("Error allocating disk queue for device %d\n", |
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 4bd3afd15e83..f2fd46daa760 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -86,7 +86,6 @@ struct zram_stats { | |||
86 | atomic64_t bd_count; /* no. of pages in backing device */ | 86 | atomic64_t bd_count; /* no. of pages in backing device */ |
87 | atomic64_t bd_reads; /* no. of reads from backing device */ | 87 | atomic64_t bd_reads; /* no. of reads from backing device */ |
88 | atomic64_t bd_writes; /* no. of writes from backing device */ | 88 | atomic64_t bd_writes; /* no. of writes from backing device */ |
89 | atomic64_t bd_wb_limit; /* writeback limit of backing device */ | ||
90 | #endif | 89 | #endif |
91 | }; | 90 | }; |
92 | 91 | ||
@@ -114,8 +113,10 @@ struct zram { | |||
114 | */ | 113 | */ |
115 | bool claim; /* Protected by bdev->bd_mutex */ | 114 | bool claim; /* Protected by bdev->bd_mutex */ |
116 | struct file *backing_dev; | 115 | struct file *backing_dev; |
117 | bool stop_writeback; | ||
118 | #ifdef CONFIG_ZRAM_WRITEBACK | 116 | #ifdef CONFIG_ZRAM_WRITEBACK |
117 | spinlock_t wb_limit_lock; | ||
118 | bool wb_limit_enable; | ||
119 | u64 bd_wb_limit; | ||
119 | struct block_device *bdev; | 120 | struct block_device *bdev; |
120 | unsigned int old_block_size; | 121 | unsigned int old_block_size; |
121 | unsigned long *bitmap; | 122 | unsigned long *bitmap; |
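The zram_drv.c and zram_drv.h hunks above move the budget from an atomic counter to a plain u64 guarded by wb_limit_lock, accounted in 4KB units. A hedged userspace-style sketch of that bookkeeping follows; it combines the check and the consume for brevity, the struct and function names are hypothetical, and PAGE_SHIFT is assumed to be 12.

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SHIFT 12				/* assumption: 4KB pages */

/* Hypothetical mirror of the fields the patch adds to struct zram. */
struct wb_budget {
	bool enabled;				/* writeback_limit_enable */
	uint64_t limit_4k;			/* remaining budget, 4KB units */
};

/* Returns true if one more page may be written back, consuming budget. */
static bool wb_budget_consume(struct wb_budget *b)
{
	if (!b->enabled)
		return true;			/* limiting disabled: always allow */
	if (!b->limit_4k)
		return false;			/* budget exhausted: refuse writeback */
	b->limit_4k -= 1ULL << (PAGE_SHIFT - 12);
	return true;
}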
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a2fcea5f8225..32920a10100e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -383,16 +383,17 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) | |||
383 | * truncation is indicated by end of range being LLONG_MAX | 383 | * truncation is indicated by end of range being LLONG_MAX |
384 | * In this case, we first scan the range and release found pages. | 384 | * In this case, we first scan the range and release found pages. |
385 | * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv | 385 | * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv |
386 | * maps and global counts. | 386 | * maps and global counts. Page faults can not race with truncation |
387 | * in this routine. hugetlb_no_page() prevents page faults in the | ||
388 | * truncated range. It checks i_size before allocation, and again after | ||
389 | * with the page table lock for the page held. The same lock must be | ||
390 | * acquired to unmap a page. | ||
387 | * hole punch is indicated if end is not LLONG_MAX | 391 | * hole punch is indicated if end is not LLONG_MAX |
388 | * In the hole punch case we scan the range and release found pages. | 392 | * In the hole punch case we scan the range and release found pages. |
389 | * Only when releasing a page is the associated region/reserv map | 393 | * Only when releasing a page is the associated region/reserv map |
390 | * deleted. The region/reserv map for ranges without associated | 394 | * deleted. The region/reserv map for ranges without associated |
391 | * pages are not modified. | 395 | * pages are not modified. Page faults can race with hole punch. |
392 | * | 396 | * This is indicated if we find a mapped page. |
393 | * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent | ||
394 | * races with page faults. | ||
395 | * | ||
396 | * Note: If the passed end of range value is beyond the end of file, but | 397 | * Note: If the passed end of range value is beyond the end of file, but |
397 | * not LLONG_MAX this routine still performs a hole punch operation. | 398 | * not LLONG_MAX this routine still performs a hole punch operation. |
398 | */ | 399 | */ |
@@ -422,14 +423,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, | |||
422 | 423 | ||
423 | for (i = 0; i < pagevec_count(&pvec); ++i) { | 424 | for (i = 0; i < pagevec_count(&pvec); ++i) { |
424 | struct page *page = pvec.pages[i]; | 425 | struct page *page = pvec.pages[i]; |
426 | u32 hash; | ||
425 | 427 | ||
426 | index = page->index; | 428 | index = page->index; |
429 | hash = hugetlb_fault_mutex_hash(h, current->mm, | ||
430 | &pseudo_vma, | ||
431 | mapping, index, 0); | ||
432 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | ||
433 | |||
427 | /* | 434 | /* |
428 | * A mapped page is impossible as callers should unmap | 435 | * If page is mapped, it was faulted in after being |
429 | * all references before calling. And, i_mmap_rwsem | 436 | * unmapped in caller. Unmap (again) now after taking |
430 | * prevents the creation of additional mappings. | 437 | * the fault mutex. The mutex will prevent faults |
438 | * until we finish removing the page. | ||
439 | * | ||
440 | * This race can only happen in the hole punch case. | ||
441 | * Getting here in a truncate operation is a bug. | ||
431 | */ | 442 | */ |
432 | VM_BUG_ON(page_mapped(page)); | 443 | if (unlikely(page_mapped(page))) { |
444 | BUG_ON(truncate_op); | ||
445 | |||
446 | i_mmap_lock_write(mapping); | ||
447 | hugetlb_vmdelete_list(&mapping->i_mmap, | ||
448 | index * pages_per_huge_page(h), | ||
449 | (index + 1) * pages_per_huge_page(h)); | ||
450 | i_mmap_unlock_write(mapping); | ||
451 | } | ||
433 | 452 | ||
434 | lock_page(page); | 453 | lock_page(page); |
435 | /* | 454 | /* |
@@ -451,6 +470,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, | |||
451 | } | 470 | } |
452 | 471 | ||
453 | unlock_page(page); | 472 | unlock_page(page); |
473 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | ||
454 | } | 474 | } |
455 | huge_pagevec_release(&pvec); | 475 | huge_pagevec_release(&pvec); |
456 | cond_resched(); | 476 | cond_resched(); |
@@ -462,20 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, | |||
462 | 482 | ||
463 | static void hugetlbfs_evict_inode(struct inode *inode) | 483 | static void hugetlbfs_evict_inode(struct inode *inode) |
464 | { | 484 | { |
465 | struct address_space *mapping = inode->i_mapping; | ||
466 | struct resv_map *resv_map; | 485 | struct resv_map *resv_map; |
467 | 486 | ||
468 | /* | ||
469 | * The vfs layer guarantees that there are no other users of this | ||
470 | * inode. Therefore, it would be safe to call remove_inode_hugepages | ||
471 | * without holding i_mmap_rwsem. We acquire and hold here to be | ||
472 | * consistent with other callers. Since there will be no contention | ||
473 | * on the semaphore, overhead is negligible. | ||
474 | */ | ||
475 | i_mmap_lock_write(mapping); | ||
476 | remove_inode_hugepages(inode, 0, LLONG_MAX); | 487 | remove_inode_hugepages(inode, 0, LLONG_MAX); |
477 | i_mmap_unlock_write(mapping); | ||
478 | |||
479 | resv_map = (struct resv_map *)inode->i_mapping->private_data; | 488 | resv_map = (struct resv_map *)inode->i_mapping->private_data; |
480 | /* root inode doesn't have the resv_map, so we should check it */ | 489 | /* root inode doesn't have the resv_map, so we should check it */ |
481 | if (resv_map) | 490 | if (resv_map) |
@@ -496,8 +505,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) | |||
496 | i_mmap_lock_write(mapping); | 505 | i_mmap_lock_write(mapping); |
497 | if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) | 506 | if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) |
498 | hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); | 507 | hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); |
499 | remove_inode_hugepages(inode, offset, LLONG_MAX); | ||
500 | i_mmap_unlock_write(mapping); | 508 | i_mmap_unlock_write(mapping); |
509 | remove_inode_hugepages(inode, offset, LLONG_MAX); | ||
501 | return 0; | 510 | return 0; |
502 | } | 511 | } |
503 | 512 | ||
@@ -531,8 +540,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) | |||
531 | hugetlb_vmdelete_list(&mapping->i_mmap, | 540 | hugetlb_vmdelete_list(&mapping->i_mmap, |
532 | hole_start >> PAGE_SHIFT, | 541 | hole_start >> PAGE_SHIFT, |
533 | hole_end >> PAGE_SHIFT); | 542 | hole_end >> PAGE_SHIFT); |
534 | remove_inode_hugepages(inode, hole_start, hole_end); | ||
535 | i_mmap_unlock_write(mapping); | 543 | i_mmap_unlock_write(mapping); |
544 | remove_inode_hugepages(inode, hole_start, hole_end); | ||
536 | inode_unlock(inode); | 545 | inode_unlock(inode); |
537 | } | 546 | } |
538 | 547 | ||
@@ -615,11 +624,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, | |||
615 | /* addr is the offset within the file (zero based) */ | 624 | /* addr is the offset within the file (zero based) */ |
616 | addr = index * hpage_size; | 625 | addr = index * hpage_size; |
617 | 626 | ||
618 | /* | 627 | /* mutex taken here, fault path and hole punch */ |
619 | * fault mutex taken here, protects against fault path | ||
620 | * and hole punch. inode_lock previously taken protects | ||
621 | * against truncation. | ||
622 | */ | ||
623 | hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, | 628 | hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, |
624 | index, addr); | 629 | index, addr); |
625 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | 630 | mutex_lock(&hugetlb_fault_mutex_table[hash]); |
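The hugetlbfs changes above drop the i_mmap_rwsem serialization and instead rely on the hashed hugetlb_fault_mutex_table, so a fault and a hole punch on the same (mapping, index) serialize on one slot. A hedged, userspace-flavored sketch of that hashed-mutex pattern; the table size, hash mix, and helper names are assumptions, not the kernel's.

#include <pthread.h>
#include <stdint.h>

#define NR_FAULT_MUTEXES 64			/* assumption: table size */

static pthread_mutex_t fault_mutex_table[NR_FAULT_MUTEXES];

static void fault_mutex_init(void)
{
	for (int i = 0; i < NR_FAULT_MUTEXES; i++)
		pthread_mutex_init(&fault_mutex_table[i], NULL);
}

/* Any mix of (mapping, index) works; this one is purely illustrative. */
static unsigned int fault_hash(const void *mapping, uint64_t index)
{
	uint64_t m = (uint64_t)(uintptr_t)mapping;

	return (unsigned int)((m ^ (index * 0x9e3779b97f4a7c15ULL)) %
			      NR_FAULT_MUTEXES);
}

/*
 * A fault-in path and a hole-punch path that target the same (mapping, index)
 * hash to the same slot, so they serialize without one global lock.
 */
static void lock_page_index(const void *mapping, uint64_t index)
{
	pthread_mutex_lock(&fault_mutex_table[fault_hash(mapping, index)]);
}

static void unlock_page_index(const void *mapping, uint64_t index)
{
	pthread_mutex_unlock(&fault_mutex_table[fault_hash(mapping, index)]);
}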
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cc4a507d7ca4..842f9189537b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -520,6 +520,12 @@ enum pgdat_flags { | |||
520 | PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ | 520 | PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ |
521 | }; | 521 | }; |
522 | 522 | ||
523 | enum zone_flags { | ||
524 | ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. | ||
525 | * Cleared when kswapd is woken. | ||
526 | */ | ||
527 | }; | ||
528 | |||
523 | static inline unsigned long zone_managed_pages(struct zone *zone) | 529 | static inline unsigned long zone_managed_pages(struct zone *zone) |
524 | { | 530 | { |
525 | return (unsigned long)atomic_long_read(&zone->managed_pages); | 531 | return (unsigned long)atomic_long_read(&zone->managed_pages); |
diff --git a/kernel/fork.c b/kernel/fork.c
index 7f49be94eba9..b69248e6f0e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -217,6 +217,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | |||
217 | memset(s->addr, 0, THREAD_SIZE); | 217 | memset(s->addr, 0, THREAD_SIZE); |
218 | 218 | ||
219 | tsk->stack_vm_area = s; | 219 | tsk->stack_vm_area = s; |
220 | tsk->stack = s->addr; | ||
220 | return s->addr; | 221 | return s->addr; |
221 | } | 222 | } |
222 | 223 | ||
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 745088810965..df2e7dd5ff17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3238,7 +3238,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
3238 | struct page *ptepage; | 3238 | struct page *ptepage; |
3239 | unsigned long addr; | 3239 | unsigned long addr; |
3240 | int cow; | 3240 | int cow; |
3241 | struct address_space *mapping = vma->vm_file->f_mapping; | ||
3242 | struct hstate *h = hstate_vma(vma); | 3241 | struct hstate *h = hstate_vma(vma); |
3243 | unsigned long sz = huge_page_size(h); | 3242 | unsigned long sz = huge_page_size(h); |
3244 | struct mmu_notifier_range range; | 3243 | struct mmu_notifier_range range; |
@@ -3250,23 +3249,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
3250 | mmu_notifier_range_init(&range, src, vma->vm_start, | 3249 | mmu_notifier_range_init(&range, src, vma->vm_start, |
3251 | vma->vm_end); | 3250 | vma->vm_end); |
3252 | mmu_notifier_invalidate_range_start(&range); | 3251 | mmu_notifier_invalidate_range_start(&range); |
3253 | } else { | ||
3254 | /* | ||
3255 | * For shared mappings i_mmap_rwsem must be held to call | ||
3256 | * huge_pte_alloc, otherwise the returned ptep could go | ||
3257 | * away if part of a shared pmd and another thread calls | ||
3258 | * huge_pmd_unshare. | ||
3259 | */ | ||
3260 | i_mmap_lock_read(mapping); | ||
3261 | } | 3252 | } |
3262 | 3253 | ||
3263 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { | 3254 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { |
3264 | spinlock_t *src_ptl, *dst_ptl; | 3255 | spinlock_t *src_ptl, *dst_ptl; |
3265 | |||
3266 | src_pte = huge_pte_offset(src, addr, sz); | 3256 | src_pte = huge_pte_offset(src, addr, sz); |
3267 | if (!src_pte) | 3257 | if (!src_pte) |
3268 | continue; | 3258 | continue; |
3269 | |||
3270 | dst_pte = huge_pte_alloc(dst, addr, sz); | 3259 | dst_pte = huge_pte_alloc(dst, addr, sz); |
3271 | if (!dst_pte) { | 3260 | if (!dst_pte) { |
3272 | ret = -ENOMEM; | 3261 | ret = -ENOMEM; |
@@ -3337,8 +3326,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
3337 | 3326 | ||
3338 | if (cow) | 3327 | if (cow) |
3339 | mmu_notifier_invalidate_range_end(&range); | 3328 | mmu_notifier_invalidate_range_end(&range); |
3340 | else | ||
3341 | i_mmap_unlock_read(mapping); | ||
3342 | 3329 | ||
3343 | return ret; | 3330 | return ret; |
3344 | } | 3331 | } |
@@ -3755,16 +3742,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, | |||
3755 | } | 3742 | } |
3756 | 3743 | ||
3757 | /* | 3744 | /* |
3758 | * We can not race with truncation due to holding i_mmap_rwsem. | 3745 | * Use page lock to guard against racing truncation |
3759 | * Check once here for faults beyond end of file. | 3746 | * before we get page_table_lock. |
3760 | */ | 3747 | */ |
3761 | size = i_size_read(mapping->host) >> huge_page_shift(h); | ||
3762 | if (idx >= size) | ||
3763 | goto out; | ||
3764 | |||
3765 | retry: | 3748 | retry: |
3766 | page = find_lock_page(mapping, idx); | 3749 | page = find_lock_page(mapping, idx); |
3767 | if (!page) { | 3750 | if (!page) { |
3751 | size = i_size_read(mapping->host) >> huge_page_shift(h); | ||
3752 | if (idx >= size) | ||
3753 | goto out; | ||
3754 | |||
3768 | /* | 3755 | /* |
3769 | * Check for page in userfault range | 3756 | * Check for page in userfault range |
3770 | */ | 3757 | */ |
@@ -3784,18 +3771,14 @@ retry: | |||
3784 | }; | 3771 | }; |
3785 | 3772 | ||
3786 | /* | 3773 | /* |
3787 | * hugetlb_fault_mutex and i_mmap_rwsem must be | 3774 | * hugetlb_fault_mutex must be dropped before |
3788 | * dropped before handling userfault. Reacquire | 3775 | * handling userfault. Reacquire after handling |
3789 | * after handling fault to make calling code simpler. | 3776 | * fault to make calling code simpler. |
3790 | */ | 3777 | */ |
3791 | hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, | 3778 | hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, |
3792 | idx, haddr); | 3779 | idx, haddr); |
3793 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 3780 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
3794 | i_mmap_unlock_read(mapping); | ||
3795 | |||
3796 | ret = handle_userfault(&vmf, VM_UFFD_MISSING); | 3781 | ret = handle_userfault(&vmf, VM_UFFD_MISSING); |
3797 | |||
3798 | i_mmap_lock_read(mapping); | ||
3799 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | 3782 | mutex_lock(&hugetlb_fault_mutex_table[hash]); |
3800 | goto out; | 3783 | goto out; |
3801 | } | 3784 | } |
@@ -3854,6 +3837,9 @@ retry: | |||
3854 | } | 3837 | } |
3855 | 3838 | ||
3856 | ptl = huge_pte_lock(h, mm, ptep); | 3839 | ptl = huge_pte_lock(h, mm, ptep); |
3840 | size = i_size_read(mapping->host) >> huge_page_shift(h); | ||
3841 | if (idx >= size) | ||
3842 | goto backout; | ||
3857 | 3843 | ||
3858 | ret = 0; | 3844 | ret = 0; |
3859 | if (!huge_pte_none(huge_ptep_get(ptep))) | 3845 | if (!huge_pte_none(huge_ptep_get(ptep))) |
@@ -3940,11 +3926,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3940 | 3926 | ||
3941 | ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); | 3927 | ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); |
3942 | if (ptep) { | 3928 | if (ptep) { |
3943 | /* | ||
3944 | * Since we hold no locks, ptep could be stale. That is | ||
3945 | * OK as we are only making decisions based on content and | ||
3946 | * not actually modifying content here. | ||
3947 | */ | ||
3948 | entry = huge_ptep_get(ptep); | 3929 | entry = huge_ptep_get(ptep); |
3949 | if (unlikely(is_hugetlb_entry_migration(entry))) { | 3930 | if (unlikely(is_hugetlb_entry_migration(entry))) { |
3950 | migration_entry_wait_huge(vma, mm, ptep); | 3931 | migration_entry_wait_huge(vma, mm, ptep); |
@@ -3952,33 +3933,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3952 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 3933 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) |
3953 | return VM_FAULT_HWPOISON_LARGE | | 3934 | return VM_FAULT_HWPOISON_LARGE | |
3954 | VM_FAULT_SET_HINDEX(hstate_index(h)); | 3935 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
3936 | } else { | ||
3937 | ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); | ||
3938 | if (!ptep) | ||
3939 | return VM_FAULT_OOM; | ||
3955 | } | 3940 | } |
3956 | 3941 | ||
3957 | /* | ||
3958 | * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold | ||
3959 | * until finished with ptep. This serves two purposes: | ||
3960 | * 1) It prevents huge_pmd_unshare from being called elsewhere | ||
3961 | * and making the ptep no longer valid. | ||
3962 | * 2) It synchronizes us with file truncation. | ||
3963 | * | ||
3964 | * ptep could have already be assigned via huge_pte_offset. That | ||
3965 | * is OK, as huge_pte_alloc will return the same value unless | ||
3966 | * something changed. | ||
3967 | */ | ||
3968 | mapping = vma->vm_file->f_mapping; | 3942 | mapping = vma->vm_file->f_mapping; |
3969 | i_mmap_lock_read(mapping); | 3943 | idx = vma_hugecache_offset(h, vma, haddr); |
3970 | ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); | ||
3971 | if (!ptep) { | ||
3972 | i_mmap_unlock_read(mapping); | ||
3973 | return VM_FAULT_OOM; | ||
3974 | } | ||
3975 | 3944 | ||
3976 | /* | 3945 | /* |
3977 | * Serialize hugepage allocation and instantiation, so that we don't | 3946 | * Serialize hugepage allocation and instantiation, so that we don't |
3978 | * get spurious allocation failures if two CPUs race to instantiate | 3947 | * get spurious allocation failures if two CPUs race to instantiate |
3979 | * the same page in the page cache. | 3948 | * the same page in the page cache. |
3980 | */ | 3949 | */ |
3981 | idx = vma_hugecache_offset(h, vma, haddr); | ||
3982 | hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); | 3950 | hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); |
3983 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | 3951 | mutex_lock(&hugetlb_fault_mutex_table[hash]); |
3984 | 3952 | ||
@@ -4066,7 +4034,6 @@ out_ptl: | |||
4066 | } | 4034 | } |
4067 | out_mutex: | 4035 | out_mutex: |
4068 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 4036 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
4069 | i_mmap_unlock_read(mapping); | ||
4070 | /* | 4037 | /* |
4071 | * Generally it's safe to hold refcount during waiting page lock. But | 4038 | * Generally it's safe to hold refcount during waiting page lock. But |
4072 | * here we just wait to defer the next page fault to avoid busy loop and | 4039 | * here we just wait to defer the next page fault to avoid busy loop and |
@@ -4671,12 +4638,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, | |||
4671 | * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() | 4638 | * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() |
4672 | * and returns the corresponding pte. While this is not necessary for the | 4639 | * and returns the corresponding pte. While this is not necessary for the |
4673 | * !shared pmd case because we can allocate the pmd later as well, it makes the | 4640 | * !shared pmd case because we can allocate the pmd later as well, it makes the |
4674 | * code much cleaner. | 4641 | * code much cleaner. pmd allocation is essential for the shared case because |
4675 | * | 4642 | * pud has to be populated inside the same i_mmap_rwsem section - otherwise |
4676 | * This routine must be called with i_mmap_rwsem held in at least read mode. | 4643 | * racing tasks could either miss the sharing (see huge_pte_offset) or select a |
4677 | * For hugetlbfs, this prevents removal of any page table entries associated | 4644 | * bad pmd for sharing. |
4678 | * with the address space. This is important as we are setting up sharing | ||
4679 | * based on existing page table entries (mappings). | ||
4680 | */ | 4645 | */ |
4681 | pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | 4646 | pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) |
4682 | { | 4647 | { |
@@ -4693,6 +4658,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
4693 | if (!vma_shareable(vma, addr)) | 4658 | if (!vma_shareable(vma, addr)) |
4694 | return (pte_t *)pmd_alloc(mm, pud, addr); | 4659 | return (pte_t *)pmd_alloc(mm, pud, addr); |
4695 | 4660 | ||
4661 | i_mmap_lock_write(mapping); | ||
4696 | vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { | 4662 | vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { |
4697 | if (svma == vma) | 4663 | if (svma == vma) |
4698 | continue; | 4664 | continue; |
@@ -4722,6 +4688,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
4722 | spin_unlock(ptl); | 4688 | spin_unlock(ptl); |
4723 | out: | 4689 | out: |
4724 | pte = (pte_t *)pmd_alloc(mm, pud, addr); | 4690 | pte = (pte_t *)pmd_alloc(mm, pud, addr); |
4691 | i_mmap_unlock_write(mapping); | ||
4725 | return pte; | 4692 | return pte; |
4726 | } | 4693 | } |
4727 | 4694 | ||
@@ -4732,7 +4699,7 @@ out: | |||
4732 | * indicated by page_count > 1, unmap is achieved by clearing pud and | 4699 | * indicated by page_count > 1, unmap is achieved by clearing pud and |
4733 | * decrementing the ref count. If count == 1, the pte page is not shared. | 4700 | * decrementing the ref count. If count == 1, the pte page is not shared. |
4734 | * | 4701 | * |
4735 | * Called with page table lock held and i_mmap_rwsem held in write mode. | 4702 | * called with page table lock held. |
4736 | * | 4703 | * |
4737 | * returns: 1 successfully unmapped a shared pte page | 4704 | * returns: 1 successfully unmapped a shared pte page |
4738 | * 0 the underlying pte page is not shared, or it is the last user | 4705 | * 0 the underlying pte page is not shared, or it is the last user |
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 03d5d1374ca7..73c9cbfdedf4 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -298,8 +298,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, | |||
298 | return; | 298 | return; |
299 | } | 299 | } |
300 | 300 | ||
301 | cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE); | ||
302 | |||
303 | *flags |= SLAB_KASAN; | 301 | *flags |= SLAB_KASAN; |
304 | } | 302 | } |
305 | 303 | ||
@@ -349,28 +347,43 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) | |||
349 | } | 347 | } |
350 | 348 | ||
351 | /* | 349 | /* |
352 | * Since it's desirable to only call object contructors once during slab | 350 | * This function assigns a tag to an object considering the following: |
353 | * allocation, we preassign tags to all such objects. Also preassign tags for | 351 | * 1. A cache might have a constructor, which might save a pointer to a slab |
354 | * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports. | 352 | * object somewhere (e.g. in the object itself). We preassign a tag for |
355 | * For SLAB allocator we can't preassign tags randomly since the freelist is | 353 | * each object in caches with constructors during slab creation and reuse |
356 | * stored as an array of indexes instead of a linked list. Assign tags based | 354 | * the same tag each time a particular object is allocated. |
357 | * on objects indexes, so that objects that are next to each other get | 355 | * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be |
358 | * different tags. | 356 | * accessed after being freed. We preassign tags for objects in these |
359 | * After a tag is assigned, the object always gets allocated with the same tag. | 357 | * caches as well. |
360 | * The reason is that we can't change tags for objects with constructors on | 358 | * 3. For SLAB allocator we can't preassign tags randomly since the freelist |
361 | * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor | 359 | * is stored as an array of indexes instead of a linked list. Assign tags |
362 | * code can save the pointer to the object somewhere (e.g. in the object | 360 | * based on objects indexes, so that objects that are next to each other |
363 | * itself). Then if we retag it, the old saved pointer will become invalid. | 361 | * get different tags. |
364 | */ | 362 | */ |
365 | static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new) | 363 | static u8 assign_tag(struct kmem_cache *cache, const void *object, |
364 | bool init, bool krealloc) | ||
366 | { | 365 | { |
366 | /* Reuse the same tag for krealloc'ed objects. */ | ||
367 | if (krealloc) | ||
368 | return get_tag(object); | ||
369 | |||
370 | /* | ||
371 | * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU | ||
372 | * set, assign a tag when the object is being allocated (init == false). | ||
373 | */ | ||
367 | if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) | 374 | if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) |
368 | return new ? KASAN_TAG_KERNEL : random_tag(); | 375 | return init ? KASAN_TAG_KERNEL : random_tag(); |
369 | 376 | ||
377 | /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ | ||
370 | #ifdef CONFIG_SLAB | 378 | #ifdef CONFIG_SLAB |
379 | /* For SLAB assign tags based on the object index in the freelist. */ | ||
371 | return (u8)obj_to_index(cache, virt_to_page(object), (void *)object); | 380 | return (u8)obj_to_index(cache, virt_to_page(object), (void *)object); |
372 | #else | 381 | #else |
373 | return new ? random_tag() : get_tag(object); | 382 | /* |
383 | * For SLUB assign a random tag during slab creation, otherwise reuse | ||
384 | * the already assigned tag. | ||
385 | */ | ||
386 | return init ? random_tag() : get_tag(object); | ||
374 | #endif | 387 | #endif |
375 | } | 388 | } |
376 | 389 | ||
@@ -386,7 +399,8 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, | |||
386 | __memset(alloc_info, 0, sizeof(*alloc_info)); | 399 | __memset(alloc_info, 0, sizeof(*alloc_info)); |
387 | 400 | ||
388 | if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) | 401 | if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) |
389 | object = set_tag(object, assign_tag(cache, object, true)); | 402 | object = set_tag(object, |
403 | assign_tag(cache, object, true, false)); | ||
390 | 404 | ||
391 | return (void *)object; | 405 | return (void *)object; |
392 | } | 406 | } |
@@ -452,8 +466,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) | |||
452 | return __kasan_slab_free(cache, object, ip, true); | 466 | return __kasan_slab_free(cache, object, ip, true); |
453 | } | 467 | } |
454 | 468 | ||
455 | void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, | 469 | static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, |
456 | size_t size, gfp_t flags) | 470 | size_t size, gfp_t flags, bool krealloc) |
457 | { | 471 | { |
458 | unsigned long redzone_start; | 472 | unsigned long redzone_start; |
459 | unsigned long redzone_end; | 473 | unsigned long redzone_end; |
@@ -471,7 +485,7 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, | |||
471 | KASAN_SHADOW_SCALE_SIZE); | 485 | KASAN_SHADOW_SCALE_SIZE); |
472 | 486 | ||
473 | if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) | 487 | if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) |
474 | tag = assign_tag(cache, object, false); | 488 | tag = assign_tag(cache, object, false, krealloc); |
475 | 489 | ||
476 | /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ | 490 | /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ |
477 | kasan_unpoison_shadow(set_tag(object, tag), size); | 491 | kasan_unpoison_shadow(set_tag(object, tag), size); |
@@ -483,6 +497,12 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, | |||
483 | 497 | ||
484 | return set_tag(object, tag); | 498 | return set_tag(object, tag); |
485 | } | 499 | } |
500 | |||
501 | void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, | ||
502 | size_t size, gfp_t flags) | ||
503 | { | ||
504 | return __kasan_kmalloc(cache, object, size, flags, false); | ||
505 | } | ||
486 | EXPORT_SYMBOL(kasan_kmalloc); | 506 | EXPORT_SYMBOL(kasan_kmalloc); |
487 | 507 | ||
488 | void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, | 508 | void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, |
@@ -522,7 +542,8 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags) | |||
522 | if (unlikely(!PageSlab(page))) | 542 | if (unlikely(!PageSlab(page))) |
523 | return kasan_kmalloc_large(object, size, flags); | 543 | return kasan_kmalloc_large(object, size, flags); |
524 | else | 544 | else |
525 | return kasan_kmalloc(page->slab_cache, object, size, flags); | 545 | return __kasan_kmalloc(page->slab_cache, object, size, |
546 | flags, true); | ||
526 | } | 547 | } |
527 | 548 | ||
528 | void kasan_poison_kfree(void *ptr, unsigned long ip) | 549 | void kasan_poison_kfree(void *ptr, unsigned long ip) |
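The kasan/common.c hunks above are all about which tag an object receives; the tag itself lives in the top byte of the pointer (arm64 Top Byte Ignore). A hedged sketch of that encoding, using simplified stand-ins for the set_tag()/get_tag() helpers referenced in the hunk and assuming 64-bit pointers.

#include <stdint.h>

#define TAG_SHIFT 56				/* tag lives in the top byte */

/* Illustrative only: replace any existing top-byte tag with a new one. */
static void *tag_pointer(void *ptr, uint8_t tag)
{
	uintptr_t p = (uintptr_t)ptr & ~((uintptr_t)0xff << TAG_SHIFT);

	return (void *)(p | ((uintptr_t)tag << TAG_SHIFT));
}

/* Recover the tag from a tagged pointer. */
static uint8_t pointer_tag(const void *ptr)
{
	return (uint8_t)((uintptr_t)ptr >> TAG_SHIFT);
}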
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6379fff1a5ff..7c72f2a95785 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
966 | enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | 966 | enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
967 | struct address_space *mapping; | 967 | struct address_space *mapping; |
968 | LIST_HEAD(tokill); | 968 | LIST_HEAD(tokill); |
969 | bool unmap_success = true; | 969 | bool unmap_success; |
970 | int kill = 1, forcekill; | 970 | int kill = 1, forcekill; |
971 | struct page *hpage = *hpagep; | 971 | struct page *hpage = *hpagep; |
972 | bool mlocked = PageMlocked(hpage); | 972 | bool mlocked = PageMlocked(hpage); |
@@ -1028,19 +1028,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
1028 | if (kill) | 1028 | if (kill) |
1029 | collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); | 1029 | collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); |
1030 | 1030 | ||
1031 | if (!PageHuge(hpage)) { | 1031 | unmap_success = try_to_unmap(hpage, ttu); |
1032 | unmap_success = try_to_unmap(hpage, ttu); | ||
1033 | } else if (mapping) { | ||
1034 | /* | ||
1035 | * For hugetlb pages, try_to_unmap could potentially call | ||
1036 | * huge_pmd_unshare. Because of this, take semaphore in | ||
1037 | * write mode here and set TTU_RMAP_LOCKED to indicate we | ||
1038 | * have taken the lock at this higer level. | ||
1039 | */ | ||
1040 | i_mmap_lock_write(mapping); | ||
1041 | unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); | ||
1042 | i_mmap_unlock_write(mapping); | ||
1043 | } | ||
1044 | if (!unmap_success) | 1032 | if (!unmap_success) |
1045 | pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", | 1033 | pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", |
1046 | pfn, page_mapcount(hpage)); | 1034 | pfn, page_mapcount(hpage)); |
diff --git a/mm/memory.c b/mm/memory.c
index a52663c0612d..e11ca9dd823f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2994,6 +2994,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) | |||
2994 | struct vm_area_struct *vma = vmf->vma; | 2994 | struct vm_area_struct *vma = vmf->vma; |
2995 | vm_fault_t ret; | 2995 | vm_fault_t ret; |
2996 | 2996 | ||
2997 | /* | ||
2998 | * Preallocate pte before we take page_lock because this might lead to | ||
2999 | * deadlocks for memcg reclaim which waits for pages under writeback: | ||
3000 | * lock_page(A) | ||
3001 | * SetPageWriteback(A) | ||
3002 | * unlock_page(A) | ||
3003 | * lock_page(B) | ||
3004 | * lock_page(B) | ||
3005 | * pte_alloc_pne | ||
3006 | * shrink_page_list | ||
3007 | * wait_on_page_writeback(A) | ||
3008 | * SetPageWriteback(B) | ||
3009 | * unlock_page(B) | ||
3010 | * # flush A, B to clear the writeback | ||
3011 | */ | ||
3012 | if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { | ||
3013 | vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); | ||
3014 | if (!vmf->prealloc_pte) | ||
3015 | return VM_FAULT_OOM; | ||
3016 | smp_wmb(); /* See comment in __pte_alloc() */ | ||
3017 | } | ||
3018 | |||
2997 | ret = vma->vm_ops->fault(vmf); | 3019 | ret = vma->vm_ops->fault(vmf); |
2998 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | | 3020 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | |
2999 | VM_FAULT_DONE_COW))) | 3021 | VM_FAULT_DONE_COW))) |
@@ -4077,8 +4099,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, | |||
4077 | goto out; | 4099 | goto out; |
4078 | 4100 | ||
4079 | if (range) { | 4101 | if (range) { |
4080 | range->start = address & PAGE_MASK; | 4102 | mmu_notifier_range_init(range, mm, address & PAGE_MASK, |
4081 | range->end = range->start + PAGE_SIZE; | 4103 | (address & PAGE_MASK) + PAGE_SIZE); |
4082 | mmu_notifier_invalidate_range_start(range); | 4104 | mmu_notifier_invalidate_range_start(range); |
4083 | } | 4105 | } |
4084 | ptep = pte_offset_map_lock(mm, pmd, address, ptlp); | 4106 | ptep = pte_offset_map_lock(mm, pmd, address, ptlp); |
diff --git a/mm/migrate.c b/mm/migrate.c
index ccf8966caf6f..a16b15090df3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1324,19 +1324,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1324 | goto put_anon; | 1324 | goto put_anon; |
1325 | 1325 | ||
1326 | if (page_mapped(hpage)) { | 1326 | if (page_mapped(hpage)) { |
1327 | struct address_space *mapping = page_mapping(hpage); | ||
1328 | |||
1329 | /* | ||
1330 | * try_to_unmap could potentially call huge_pmd_unshare. | ||
1331 | * Because of this, take semaphore in write mode here and | ||
1332 | * set TTU_RMAP_LOCKED to let lower levels know we have | ||
1333 | * taken the lock. | ||
1334 | */ | ||
1335 | i_mmap_lock_write(mapping); | ||
1336 | try_to_unmap(hpage, | 1327 | try_to_unmap(hpage, |
1337 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| | 1328 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); |
1338 | TTU_RMAP_LOCKED); | ||
1339 | i_mmap_unlock_write(mapping); | ||
1340 | page_was_mapped = 1; | 1329 | page_was_mapped = 1; |
1341 | } | 1330 | } |
1342 | 1331 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cde5dac6229a..d295c9bc01a8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2214,7 +2214,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, | |||
2214 | */ | 2214 | */ |
2215 | boost_watermark(zone); | 2215 | boost_watermark(zone); |
2216 | if (alloc_flags & ALLOC_KSWAPD) | 2216 | if (alloc_flags & ALLOC_KSWAPD) |
2217 | wakeup_kswapd(zone, 0, 0, zone_idx(zone)); | 2217 | set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); |
2218 | 2218 | ||
2219 | /* We are not allowed to try stealing from the whole block */ | 2219 | /* We are not allowed to try stealing from the whole block */ |
2220 | if (!whole_block) | 2220 | if (!whole_block) |
@@ -3102,6 +3102,12 @@ struct page *rmqueue(struct zone *preferred_zone, | |||
3102 | local_irq_restore(flags); | 3102 | local_irq_restore(flags); |
3103 | 3103 | ||
3104 | out: | 3104 | out: |
3105 | /* Separate test+clear to avoid unnecessary atomics */ | ||
3106 | if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { | ||
3107 | clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); | ||
3108 | wakeup_kswapd(zone, 0, 0, zone_idx(zone)); | ||
3109 | } | ||
3110 | |||
3105 | VM_BUG_ON_PAGE(page && bad_range(zone, page), page); | 3111 | VM_BUG_ON_PAGE(page && bad_range(zone, page), page); |
3106 | return page; | 3112 | return page; |
3107 | 3113 | ||
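The rmqueue() hunk above notes "Separate test+clear to avoid unnecessary atomics": the boosted-watermark flag is read cheaply first and only cleared (with kswapd woken) when it is actually set. A hedged C11 sketch of that pattern with hypothetical names, not the kernel's test_bit()/clear_bit() helpers.

#include <stdatomic.h>
#include <stdbool.h>

/* Only pay for an atomic RMW when the (rarely set) flag is actually set. */
static bool test_and_consume_flag(atomic_ulong *flags, unsigned int bit)
{
	unsigned long mask = 1UL << bit;

	if (!(atomic_load_explicit(flags, memory_order_relaxed) & mask))
		return false;			/* common case: plain load only */
	atomic_fetch_and_explicit(flags, ~mask, memory_order_relaxed);
	return true;				/* caller would wake kswapd here */
}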
diff --git a/mm/rmap.c b/mm/rmap.c
@@ -25,7 +25,6 @@
25 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
26 | * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) | 26 | * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) |
27 | * mapping->i_mmap_rwsem | 27 | * mapping->i_mmap_rwsem |
28 | * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) | ||
29 | * anon_vma->rwsem | 28 | * anon_vma->rwsem |
30 | * mm->page_table_lock or pte_lock | 29 | * mm->page_table_lock or pte_lock |
31 | * zone_lru_lock (in mark_page_accessed, isolate_lru_page) | 30 | * zone_lru_lock (in mark_page_accessed, isolate_lru_page) |
@@ -1379,9 +1378,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1379 | /* | 1378 | /* |
1380 | * If sharing is possible, start and end will be adjusted | 1379 | * If sharing is possible, start and end will be adjusted |
1381 | * accordingly. | 1380 | * accordingly. |
1382 | * | ||
1383 | * If called for a huge page, caller must hold i_mmap_rwsem | ||
1384 | * in write mode as it is possible to call huge_pmd_unshare. | ||
1385 | */ | 1381 | */ |
1386 | adjust_range_if_pmd_sharing_possible(vma, &range.start, | 1382 | adjust_range_if_pmd_sharing_possible(vma, &range.start, |
1387 | &range.end); | 1383 | &range.end); |
diff --git a/mm/slab.c b/mm/slab.c
@@ -666,8 +666,10 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
666 | struct alien_cache *alc = NULL; | 666 | struct alien_cache *alc = NULL; |
667 | 667 | ||
668 | alc = kmalloc_node(memsize, gfp, node); | 668 | alc = kmalloc_node(memsize, gfp, node); |
669 | init_arraycache(&alc->ac, entries, batch); | 669 | if (alc) { |
670 | spin_lock_init(&alc->lock); | 670 | init_arraycache(&alc->ac, entries, batch); |
671 | spin_lock_init(&alc->lock); | ||
672 | } | ||
671 | return alc; | 673 | return alc; |
672 | } | 674 | } |
673 | 675 | ||
diff --git a/mm/slub.c b/mm/slub.c
@@ -3846,6 +3846,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
3846 | unsigned int offset; | 3846 | unsigned int offset; |
3847 | size_t object_size; | 3847 | size_t object_size; |
3848 | 3848 | ||
3849 | ptr = kasan_reset_tag(ptr); | ||
3850 | |||
3849 | /* Find object and usable object size. */ | 3851 | /* Find object and usable object size. */ |
3850 | s = page->slab_cache; | 3852 | s = page->slab_cache; |
3851 | 3853 | ||
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 852eb4e53f06..14faadcedd06 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -247,7 +247,8 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); | |||
247 | /* | 247 | /* |
248 | * Validates that the given object is: | 248 | * Validates that the given object is: |
249 | * - not bogus address | 249 | * - not bogus address |
250 | * - known-safe heap or stack object | 250 | * - fully contained by stack (or stack frame, when available) |
251 | * - fully within SLAB object (or object whitelist area, when available) | ||
251 | * - not in kernel text | 252 | * - not in kernel text |
252 | */ | 253 | */ |
253 | void __check_object_size(const void *ptr, unsigned long n, bool to_user) | 254 | void __check_object_size(const void *ptr, unsigned long n, bool to_user) |
@@ -262,9 +263,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) | |||
262 | /* Check for invalid addresses. */ | 263 | /* Check for invalid addresses. */ |
263 | check_bogus_address((const unsigned long)ptr, n, to_user); | 264 | check_bogus_address((const unsigned long)ptr, n, to_user); |
264 | 265 | ||
265 | /* Check for bad heap object. */ | ||
266 | check_heap_object(ptr, n, to_user); | ||
267 | |||
268 | /* Check for bad stack object. */ | 266 | /* Check for bad stack object. */ |
269 | switch (check_stack_object(ptr, n)) { | 267 | switch (check_stack_object(ptr, n)) { |
270 | case NOT_STACK: | 268 | case NOT_STACK: |
@@ -282,6 +280,9 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) | |||
282 | usercopy_abort("process stack", NULL, to_user, 0, n); | 280 | usercopy_abort("process stack", NULL, to_user, 0, n); |
283 | } | 281 | } |
284 | 282 | ||
283 | /* Check for bad heap object. */ | ||
284 | check_heap_object(ptr, n, to_user); | ||
285 | |||
285 | /* Check for object in kernel to avoid text exposure. */ | 286 | /* Check for object in kernel to avoid text exposure. */ |
286 | check_kernel_text_object((const unsigned long)ptr, n, to_user); | 287 | check_kernel_text_object((const unsigned long)ptr, n, to_user); |
287 | } | 288 | } |
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 065c1ce191c4..d59b5a73dfb3 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -267,14 +267,10 @@ retry: | |||
267 | VM_BUG_ON(dst_addr & ~huge_page_mask(h)); | 267 | VM_BUG_ON(dst_addr & ~huge_page_mask(h)); |
268 | 268 | ||
269 | /* | 269 | /* |
270 | * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. | 270 | * Serialize via hugetlb_fault_mutex |
271 | * i_mmap_rwsem ensures the dst_pte remains valid even | ||
272 | * in the case of shared pmds. fault mutex prevents | ||
273 | * races with other faulting threads. | ||
274 | */ | 271 | */ |
275 | mapping = dst_vma->vm_file->f_mapping; | ||
276 | i_mmap_lock_read(mapping); | ||
277 | idx = linear_page_index(dst_vma, dst_addr); | 272 | idx = linear_page_index(dst_vma, dst_addr); |
273 | mapping = dst_vma->vm_file->f_mapping; | ||
278 | hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, | 274 | hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, |
279 | idx, dst_addr); | 275 | idx, dst_addr); |
280 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | 276 | mutex_lock(&hugetlb_fault_mutex_table[hash]); |
@@ -283,7 +279,6 @@ retry: | |||
283 | dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); | 279 | dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); |
284 | if (!dst_pte) { | 280 | if (!dst_pte) { |
285 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 281 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
286 | i_mmap_unlock_read(mapping); | ||
287 | goto out_unlock; | 282 | goto out_unlock; |
288 | } | 283 | } |
289 | 284 | ||
@@ -291,7 +286,6 @@ retry: | |||
291 | dst_pteval = huge_ptep_get(dst_pte); | 286 | dst_pteval = huge_ptep_get(dst_pte); |
292 | if (!huge_pte_none(dst_pteval)) { | 287 | if (!huge_pte_none(dst_pteval)) { |
293 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 288 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
294 | i_mmap_unlock_read(mapping); | ||
295 | goto out_unlock; | 289 | goto out_unlock; |
296 | } | 290 | } |
297 | 291 | ||
@@ -299,7 +293,6 @@ retry: | |||
299 | dst_addr, src_addr, &page); | 293 | dst_addr, src_addr, &page); |
300 | 294 | ||
301 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 295 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
302 | i_mmap_unlock_read(mapping); | ||
303 | vm_alloc_shared = vm_shared; | 296 | vm_alloc_shared = vm_shared; |
304 | 297 | ||
305 | cond_resched(); | 298 | cond_resched(); |
diff --git a/mm/util.c b/mm/util.c
@@ -478,7 +478,7 @@ bool page_mapped(struct page *page)
478 | return true; | 478 | return true; |
479 | if (PageHuge(page)) | 479 | if (PageHuge(page)) |
480 | return false; | 480 | return false; |
481 | for (i = 0; i < hpage_nr_pages(page); i++) { | 481 | for (i = 0; i < (1 << compound_order(page)); i++) { |
482 | if (atomic_read(&page[i]._mapcount) >= 0) | 482 | if (atomic_read(&page[i]._mapcount) >= 0) |
483 | return true; | 483 | return true; |
484 | } | 484 | } |
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index 18fc112b65cd..d3a8755c039c 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -5,7 +5,9 @@ | |||
5 | * Example use: | 5 | * Example use: |
6 | * cat /sys/kernel/debug/page_owner > page_owner_full.txt | 6 | * cat /sys/kernel/debug/page_owner > page_owner_full.txt |
7 | * grep -v ^PFN page_owner_full.txt > page_owner.txt | 7 | * grep -v ^PFN page_owner_full.txt > page_owner.txt |
8 | * ./sort page_owner.txt sorted_page_owner.txt | 8 | * ./page_owner_sort page_owner.txt sorted_page_owner.txt |
9 | * | ||
10 | * See Documentation/vm/page_owner.rst | ||
9 | */ | 11 | */ |
10 | 12 | ||
11 | #include <stdio.h> | 13 | #include <stdio.h> |