author    Minchan Kim <minchan@kernel.org>  2018-12-28 03:36:47 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2018-12-28 15:11:49 -0500
commit    a939888ec38bf1f33e4a903056677e92a4844244 (patch)
tree      c1895d12365563d62ad0f9906a8ab5f6ed2f0cf7 /drivers/block/zram
parent    e82592c4fd7eafe8dec12a70436e93e3afb28556 (diff)
zram: support idle/huge page writeback
Add a new feature "zram idle/huge page writeback".  In the zram-swap use
case, zram usually has many idle/huge swap pages, and it is pointless to
keep them in memory (i.e., in zram).

To solve this problem, this feature introduces idle/huge page writeback
to the backing device; the goal is to save more memory space on embedded
systems.

The normal sequence for using the idle/huge page writeback feature is as
follows:

while (1) {
	# mark allocated zram slots as idle
	echo all > /sys/block/zram0/idle
	# leave the system working for several hours
	# If some blocks on zram see no access in that time,
	# they remain IDLE-marked pages.

	echo "idle" > /sys/block/zram0/writeback
	or/and
	echo "huge" > /sys/block/zram0/writeback
	# write the IDLE or/and huge marked slots to the backing device
	# and free the memory.
}

Per the discussion at
https://lore.kernel.org/lkml/20181122065926.GG3441@jagdpanzerIV/T/#u,
this patch removes the direct incompressible page writeback feature
(d2afd25114f4 ("zram: write incompressible pages to backing device")).

Concerns from Sergey:
== &< ==
"IDLE writeback" is superior to "incompressible writeback".

"Incompressible writeback" is completely unpredictable and uncontrollable;
it depends on data patterns and compression algorithms, while "IDLE
writeback" is predictable.

I even suspect that, *ideally*, we can remove "incompressible writeback".
"IDLE pages" is a superset which also includes "incompressible" pages.
So, technically, we can still do "incompressible writeback" from the
"IDLE writeback" path, but a much more reasonable one, based on a page
idling period.

I understand that you want to keep "direct incompressible writeback"
around.  ZRAM is especially popular on devices which do suffer from flash
wearout, so I can see the "incompressible writeback" path becoming dead
code, long term.
== &< ==

Concerns from Minchan:
== &< ==
My concern is that if we enable CONFIG_ZRAM_WRITEBACK in this
implementation, both hugepage and idlepage writeback will be turned on.
However, some users want to enable only idlepage writeback, so we would
need to introduce an on/off knob for hugepage writeback, or a new
CONFIG_ZRAM_IDLEPAGE_WRITEBACK for that use case.  I don't want to make
it complicated *if possible*.

Long term, I imagine we need to make the VM aware of a swap hierarchy a
little different from the current one.  For example, a first,
high-priority swap device could return -EIO or -ENOCOMP and swap would
fall back to the next lower-priority swap device.  With that, hugepage
writeback would work transparently.

So we could regard it as a regression that incompressible pages no
longer go to backing storage automatically.  Instead, the user should
trigger it manually via "echo huge > /sys/block/zramX/writeback".
== &< ==

Link: http://lkml.kernel.org/r/20181127055429.251614-6-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Joey Pabalinas <joeypabalinas@gmail.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
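[Editor's note] For readers who want to try the new knobs end to end, a minimal setup sketch follows. It is illustrative only: the partition name, the disksize, and the use of zram0 as swap are assumptions, not part of this patch; see Documentation/blockdev/zram.txt for the authoritative sequence.

# Build with CONFIG_ZRAM=y and CONFIG_ZRAM_WRITEBACK=y, then:
modprobe zram num_devices=1

# The backing device must be configured before disksize is set.
echo /dev/sdb1 > /sys/block/zram0/backing_dev   # assumed spare partition
echo 1G > /sys/block/zram0/disksize

mkswap /dev/zram0
swapon /dev/zram0

# Later, mark the currently allocated slots idle ...
echo all > /sys/block/zram0/idle

# ... and, after some hours, push still-idle and huge (incompressible)
# slots out to the backing device to free zram memory.
echo idle > /sys/block/zram0/writeback
echo huge > /sys/block/zram0/writeback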
Diffstat (limited to 'drivers/block/zram')
-rw-r--r--  drivers/block/zram/Kconfig      5
-rw-r--r--  drivers/block/zram/zram_drv.c   247
-rw-r--r--  drivers/block/zram/zram_drv.h   1
3 files changed, 178 insertions, 75 deletions
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index fcd055457364..1ffc64770643 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -15,7 +15,7 @@ config ZRAM
 	  See Documentation/blockdev/zram.txt for more information.
 
 config ZRAM_WRITEBACK
-	bool "Write back incompressible page to backing device"
+	bool "Write back incompressible or idle page to backing device"
 	depends on ZRAM
 	help
 	  With incompressible page, there is no memory saving to keep it
@@ -23,6 +23,9 @@ config ZRAM_WRITEBACK
 	  For this feature, admin should set up backing device via
 	  /sys/block/zramX/backing_dev.
 
+	  With /sys/block/zramX/{idle,writeback}, application could ask
+	  idle page's writeback to the backing device to save in memory.
+
 	  See Documentation/blockdev/zram.txt for more information.
 
 config ZRAM_MEMORY_TRACKING
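[Editor's note] As a quick orientation before the driver changes below: the new writeback node accepts only the literal strings "idle" and "huge", and it refuses to run without a configured backing device. A small sketch of the expected userspace-visible behaviour, assuming zram0 is already initialized (the error strings are the usual strerror() texts for the returned error codes, not output taken from the patch itself):

# No backing_dev configured: writeback_store returns -ENODEV.
echo idle > /sys/block/zram0/writeback   # -> write error: No such device

# Unknown mode string: returns -EINVAL.
echo foo > /sys/block/zram0/writeback    # -> write error: Invalid argument

# With a backing device present, the two supported modes:
echo idle > /sys/block/zram0/writeback
echo huge > /sys/block/zram0/writeback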
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 180613b478a6..6b5a886c8f32 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -52,6 +52,9 @@ static unsigned int num_devices = 1;
 static size_t huge_class_size;
 
 static void zram_free_page(struct zram *zram, size_t index);
+static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
+				u32 index, int offset, struct bio *bio);
+
 
 static int zram_slot_trylock(struct zram *zram, u32 index)
 {
@@ -73,13 +76,6 @@ static inline bool init_done(struct zram *zram)
 	return zram->disksize;
 }
 
-static inline bool zram_allocated(struct zram *zram, u32 index)
-{
-
-	return (zram->table[index].flags >> (ZRAM_FLAG_SHIFT + 1)) ||
-			zram->table[index].handle;
-}
-
 static inline struct zram *dev_to_zram(struct device *dev)
 {
 	return (struct zram *)dev_to_disk(dev)->private_data;
@@ -138,6 +134,13 @@ static void zram_set_obj_size(struct zram *zram,
 	zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size;
 }
 
+static inline bool zram_allocated(struct zram *zram, u32 index)
+{
+	return zram_get_obj_size(zram, index) ||
+			zram_test_flag(zram, index, ZRAM_SAME) ||
+			zram_test_flag(zram, index, ZRAM_WB);
+}
+
 #if PAGE_SIZE != 4096
 static inline bool is_partial_io(struct bio_vec *bvec)
 {
@@ -308,10 +311,14 @@ static ssize_t idle_store(struct device *dev,
 	}
 
 	for (index = 0; index < nr_pages; index++) {
+		/*
+		 * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race.
+		 * See the comment in writeback_store.
+		 */
 		zram_slot_lock(zram, index);
-		if (!zram_allocated(zram, index))
+		if (!zram_allocated(zram, index) ||
+				zram_test_flag(zram, index, ZRAM_UNDER_WB))
 			goto next;
-
 		zram_set_flag(zram, index, ZRAM_IDLE);
 next:
 		zram_slot_unlock(zram, index);
@@ -546,6 +553,158 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
 	return 1;
 }
 
+#define HUGE_WRITEBACK 0x1
+#define IDLE_WRITEBACK 0x2
+
+static ssize_t writeback_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
+	unsigned long index;
+	struct bio bio;
+	struct bio_vec bio_vec;
+	struct page *page;
+	ssize_t ret, sz;
+	char mode_buf[8];
+	unsigned long mode = -1UL;
+	unsigned long blk_idx = 0;
+
+	sz = strscpy(mode_buf, buf, sizeof(mode_buf));
+	if (sz <= 0)
+		return -EINVAL;
+
+	/* ignore trailing newline */
+	if (mode_buf[sz - 1] == '\n')
+		mode_buf[sz - 1] = 0x00;
+
+	if (!strcmp(mode_buf, "idle"))
+		mode = IDLE_WRITEBACK;
+	else if (!strcmp(mode_buf, "huge"))
+		mode = HUGE_WRITEBACK;
+
+	if (mode == -1UL)
+		return -EINVAL;
+
+	down_read(&zram->init_lock);
+	if (!init_done(zram)) {
+		ret = -EINVAL;
+		goto release_init_lock;
+	}
+
+	if (!zram->backing_dev) {
+		ret = -ENODEV;
+		goto release_init_lock;
+	}
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		ret = -ENOMEM;
+		goto release_init_lock;
+	}
+
+	for (index = 0; index < nr_pages; index++) {
+		struct bio_vec bvec;
+
+		bvec.bv_page = page;
+		bvec.bv_len = PAGE_SIZE;
+		bvec.bv_offset = 0;
+
+		if (!blk_idx) {
+			blk_idx = alloc_block_bdev(zram);
+			if (!blk_idx) {
+				ret = -ENOSPC;
+				break;
+			}
+		}
+
+		zram_slot_lock(zram, index);
+		if (!zram_allocated(zram, index))
+			goto next;
+
+		if (zram_test_flag(zram, index, ZRAM_WB) ||
+				zram_test_flag(zram, index, ZRAM_SAME) ||
+				zram_test_flag(zram, index, ZRAM_UNDER_WB))
+			goto next;
+
+		if ((mode & IDLE_WRITEBACK &&
+			  !zram_test_flag(zram, index, ZRAM_IDLE)) &&
+		    (mode & HUGE_WRITEBACK &&
+			  !zram_test_flag(zram, index, ZRAM_HUGE)))
+			goto next;
+		/*
+		 * Clearing ZRAM_UNDER_WB is duty of caller.
+		 * IOW, zram_free_page never clear it.
+		 */
+		zram_set_flag(zram, index, ZRAM_UNDER_WB);
+		/* Need for hugepage writeback racing */
+		zram_set_flag(zram, index, ZRAM_IDLE);
+		zram_slot_unlock(zram, index);
+		if (zram_bvec_read(zram, &bvec, index, 0, NULL)) {
+			zram_slot_lock(zram, index);
+			zram_clear_flag(zram, index, ZRAM_UNDER_WB);
+			zram_clear_flag(zram, index, ZRAM_IDLE);
+			zram_slot_unlock(zram, index);
+			continue;
+		}
+
+		bio_init(&bio, &bio_vec, 1);
+		bio_set_dev(&bio, zram->bdev);
+		bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
+		bio.bi_opf = REQ_OP_WRITE | REQ_SYNC;
+
+		bio_add_page(&bio, bvec.bv_page, bvec.bv_len,
+				bvec.bv_offset);
+		/*
+		 * XXX: A single page IO would be inefficient for write
+		 * but it would be not bad as starter.
+		 */
+		ret = submit_bio_wait(&bio);
+		if (ret) {
+			zram_slot_lock(zram, index);
+			zram_clear_flag(zram, index, ZRAM_UNDER_WB);
+			zram_clear_flag(zram, index, ZRAM_IDLE);
+			zram_slot_unlock(zram, index);
+			continue;
+		}
+
+		/*
+		 * We released zram_slot_lock so need to check if the slot was
+		 * changed. If there is freeing for the slot, we can catch it
+		 * easily by zram_allocated.
+		 * A subtle case is the slot is freed/reallocated/marked as
+		 * ZRAM_IDLE again. To close the race, idle_store doesn't
+		 * mark ZRAM_IDLE once it found the slot was ZRAM_UNDER_WB.
+		 * Thus, we could close the race by checking ZRAM_IDLE bit.
+		 */
+		zram_slot_lock(zram, index);
+		if (!zram_allocated(zram, index) ||
+			  !zram_test_flag(zram, index, ZRAM_IDLE)) {
+			zram_clear_flag(zram, index, ZRAM_UNDER_WB);
+			zram_clear_flag(zram, index, ZRAM_IDLE);
+			goto next;
+		}
+
+		zram_free_page(zram, index);
+		zram_clear_flag(zram, index, ZRAM_UNDER_WB);
+		zram_set_flag(zram, index, ZRAM_WB);
+		zram_set_element(zram, index, blk_idx);
+		blk_idx = 0;
+		atomic64_inc(&zram->stats.pages_stored);
+next:
+		zram_slot_unlock(zram, index);
+	}
+
+	if (blk_idx)
+		free_block_bdev(zram, blk_idx);
+	ret = len;
+	__free_page(page);
+release_init_lock:
+	up_read(&zram->init_lock);
+
+	return ret;
+}
+
 struct zram_work {
 	struct work_struct work;
 	struct zram *zram;
@@ -603,57 +762,8 @@ static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
 	else
 		return read_from_bdev_async(zram, bvec, entry, parent);
 }
-
-static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
-			u32 index, struct bio *parent,
-			unsigned long *pentry)
-{
-	struct bio *bio;
-	unsigned long entry;
-
-	bio = bio_alloc(GFP_ATOMIC, 1);
-	if (!bio)
-		return -ENOMEM;
-
-	entry = alloc_block_bdev(zram);
-	if (!entry) {
-		bio_put(bio);
-		return -ENOSPC;
-	}
-
-	bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
-	bio_set_dev(bio, zram->bdev);
-	if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len,
-					bvec->bv_offset)) {
-		bio_put(bio);
-		free_block_bdev(zram, entry);
-		return -EIO;
-	}
-
-	if (!parent) {
-		bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
-		bio->bi_end_io = zram_page_end_io;
-	} else {
-		bio->bi_opf = parent->bi_opf;
-		bio_chain(bio, parent);
-	}
-
-	submit_bio(bio);
-	*pentry = entry;
-
-	return 0;
-}
-
 #else
 static inline void reset_bdev(struct zram *zram) {};
-static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
-			u32 index, struct bio *parent,
-			unsigned long *pentry)
-
-{
-	return -EIO;
-}
-
 static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
 			unsigned long entry, struct bio *parent, bool sync)
 {
@@ -1006,7 +1116,8 @@ out:
 	atomic64_dec(&zram->stats.pages_stored);
 	zram_set_handle(zram, index, 0);
 	zram_set_obj_size(zram, index, 0);
-	WARN_ON_ONCE(zram->table[index].flags & ~(1UL << ZRAM_LOCK));
+	WARN_ON_ONCE(zram->table[index].flags &
+			~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB));
 }
 
 static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
@@ -1115,7 +1226,6 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
 	struct page *page = bvec->bv_page;
 	unsigned long element = 0;
 	enum zram_pageflags flags = 0;
-	bool allow_wb = true;
 
 	mem = kmap_atomic(page);
 	if (page_same_filled(mem, &element)) {
@@ -1140,21 +1250,8 @@ compress_again:
 		return ret;
 	}
 
-	if (unlikely(comp_len >= huge_class_size)) {
+	if (comp_len >= huge_class_size)
 		comp_len = PAGE_SIZE;
-		if (zram->backing_dev && allow_wb) {
-			zcomp_stream_put(zram->comp);
-			ret = write_to_bdev(zram, bvec, index, bio, &element);
-			if (!ret) {
-				flags = ZRAM_WB;
-				ret = 1;
-				goto out;
-			}
-			allow_wb = false;
-			goto compress_again;
-		}
-	}
-
 	/*
 	 * handle allocation has 2 paths:
 	 * a) fast path is executed with preemption disabled (for
@@ -1643,6 +1740,7 @@ static DEVICE_ATTR_RW(max_comp_streams);
 static DEVICE_ATTR_RW(comp_algorithm);
 #ifdef CONFIG_ZRAM_WRITEBACK
 static DEVICE_ATTR_RW(backing_dev);
+static DEVICE_ATTR_WO(writeback);
 #endif
 
 static struct attribute *zram_disk_attrs[] = {
@@ -1657,6 +1755,7 @@ static struct attribute *zram_disk_attrs[] = {
 	&dev_attr_comp_algorithm.attr,
 #ifdef CONFIG_ZRAM_WRITEBACK
 	&dev_attr_backing_dev.attr,
+	&dev_attr_writeback.attr,
 #endif
 	&dev_attr_io_stat.attr,
 	&dev_attr_mm_stat.attr,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index a84611b97867..1ad74f030b6d 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -47,6 +47,7 @@ enum zram_pageflags {
 	ZRAM_LOCK = ZRAM_FLAG_SHIFT,
 	ZRAM_SAME,	/* Page consists the same element */
 	ZRAM_WB,	/* page is stored on backing_device */
+	ZRAM_UNDER_WB,	/* page is under writeback */
 	ZRAM_HUGE,	/* Incompressible page */
 	ZRAM_IDLE,	/* not accessed page since last idle marking */
 