| -rw-r--r-- | Documentation/00-INDEX | 4 | ||||
| -rw-r--r-- | Documentation/admin-guide/md.rst | 5 | ||||
| -rw-r--r-- | Documentation/md/md-cluster.txt (renamed from Documentation/md-cluster.txt) | 0 | ||||
| -rw-r--r-- | Documentation/md/raid5-cache.txt | 109 | ||||
| -rw-r--r-- | block/bio.c | 61 | ||||
| -rw-r--r-- | drivers/md/faulty.c | 2 | ||||
| -rw-r--r-- | drivers/md/linear.c | 41 | ||||
| -rw-r--r-- | drivers/md/linear.h | 1 | ||||
| -rw-r--r-- | drivers/md/md.c | 22 | ||||
| -rw-r--r-- | drivers/md/md.h | 9 | ||||
| -rw-r--r-- | drivers/md/multipath.c | 1 | ||||
| -rw-r--r-- | drivers/md/raid0.c | 1 | ||||
| -rw-r--r-- | drivers/md/raid1.c | 596 | ||||
| -rw-r--r-- | drivers/md/raid1.h | 58 | ||||
| -rw-r--r-- | drivers/md/raid10.c | 11 | ||||
| -rw-r--r-- | drivers/md/raid5-cache.c | 225 | ||||
| -rw-r--r-- | drivers/md/raid5.c | 129 | ||||
| -rw-r--r-- | drivers/md/raid5.h | 7 | ||||
| -rw-r--r-- | include/linux/bio.h | 11 | ||||
| -rw-r--r-- | lib/radix-tree.c | 1 |
20 files changed, 942 insertions, 352 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index c8a8eb1a2b11..793acf999e9e 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX | |||
| @@ -270,8 +270,8 @@ m68k/ | |||
| 270 | - directory with info about Linux on Motorola 68k architecture. | 270 | - directory with info about Linux on Motorola 68k architecture. |
| 271 | mailbox.txt | 271 | mailbox.txt |
| 272 | - How to write drivers for the common mailbox framework (IPC). | 272 | - How to write drivers for the common mailbox framework (IPC). |
| 273 | md-cluster.txt | 273 | md/ |
| 274 | - info on shared-device RAID MD cluster. | 274 | - directory with info about Linux Software RAID |
| 275 | media/ | 275 | media/ |
| 276 | - info on media drivers: uAPI, kAPI and driver documentation. | 276 | - info on media drivers: uAPI, kAPI and driver documentation. |
| 277 | memory-barriers.txt | 277 | memory-barriers.txt |
diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index e449fb5f277c..1e61bf50595c 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst | |||
| @@ -725,3 +725,8 @@ These currently include: | |||
| 725 | to 1. Setting this to 0 disables bypass accounting and | 725 | to 1. Setting this to 0 disables bypass accounting and |
| 726 | requires preread stripes to wait until all full-width stripe- | 726 | requires preread stripes to wait until all full-width stripe- |
| 727 | writes are complete. Valid values are 0 to stripe_cache_size. | 727 | writes are complete. Valid values are 0 to stripe_cache_size. |
| 728 | |||
| 729 | journal_mode (currently raid5 only) | ||
| 730 | The cache mode for raid5. raid5 could include an extra disk for | ||
| 731 | caching. The mode can be either "write-through" or "write-back". The | ||
| 732 | default is "write-through". | ||
diff --git a/Documentation/md-cluster.txt b/Documentation/md/md-cluster.txt index 38883276d31c..38883276d31c 100644 --- a/Documentation/md-cluster.txt +++ b/Documentation/md/md-cluster.txt | |||
diff --git a/Documentation/md/raid5-cache.txt b/Documentation/md/raid5-cache.txt new file mode 100644 index 000000000000..2b210f295786 --- /dev/null +++ b/Documentation/md/raid5-cache.txt | |||
| @@ -0,0 +1,109 @@ | |||
| 1 | RAID5 cache | ||
| 2 | |||
| 3 | RAID 4/5/6 could include an extra disk for data cache besides normal RAID | ||
| 4 | disks. The role of RAID disks isn't changed with the cache disk. The cache disk | ||
| 5 | caches data to the RAID disks. The cache can be in write-through (supported | ||
| 6 | since 4.4) or write-back mode (supported since 4.10). mdadm (supported since | ||
| 7 | 3.4) has a new option '--write-journal' to create an array with a cache. Please | ||
| 8 | refer to the mdadm manual for details. By default (when the RAID array starts), the cache is | ||
| 9 | in write-through mode. A user can switch it to write-back mode by: | ||
| 10 | |||
| 11 | echo "write-back" > /sys/block/md0/md/journal_mode | ||
| 12 | |||
| 13 | And switch it back to write-through mode by: | ||
| 14 | |||
| 15 | echo "write-through" > /sys/block/md0/md/journal_mode | ||
| 16 | |||
| 17 | In both modes, all writes to the array will hit the cache disk first. This means | ||
| 18 | the cache disk must be fast and able to sustain the array's full write load. | ||
| 19 | |||
| 20 | ------------------------------------- | ||
| 21 | write-through mode: | ||
| 22 | |||
| 23 | This mode mainly fixes the 'write hole' issue. For RAID 4/5/6 array, an unclean | ||
| 24 | shutdown can leave data in some stripes in an inconsistent state, e.g., data | ||
| 25 | and parity don't match. The reason is that a stripe write involves several RAID | ||
| 26 | disks and it's possible the writes don't hit all RAID disks yet before the | ||
| 27 | unclean shutdown. We call an array degraded if it has inconsistent data. MD | ||
| 28 | tries to resync the array to bring it back to normal state. But before the | ||
| 29 | resync completes, any system crash still risks real data | ||
| 30 | corruption in the RAID array. This problem is called 'write hole'. | ||
| 31 | |||
| 32 | The write-through cache will cache all data on cache disk first. After the data | ||
| 33 | is safe on the cache disk, the data will be flushed onto RAID disks. The | ||
| 34 | two-step write will guarantee MD can recover correct data after unclean | ||
| 35 | shutdown even if the array is degraded. Thus the cache can close the 'write hole'. | ||
| 36 | |||
| 37 | In write-through mode, MD reports IO completion to upper layer (usually | ||
| 38 | filesystems) after the data is safe on RAID disks, so cache disk failure | ||
| 39 | doesn't cause data loss. Of course cache disk failure means the array is | ||
| 40 | exposed to 'write hole' again. | ||
| 41 | |||
| 42 | In write-through mode, the cache disk isn't required to be big. Several | ||
| 43 | hundred megabytes are enough. | ||
| 44 | |||
| 45 | -------------------------------------- | ||
| 46 | write-back mode: | ||
| 47 | |||
| 48 | write-back mode fixes the 'write hole' issue too, since all write data is | ||
| 49 | cached on cache disk. But the main goal of 'write-back' cache is to speed up | ||
| 50 | write. If a write crosses all RAID disks of a stripe, we call it full-stripe | ||
| 51 | write. For non-full-stripe writes, MD must read old data before the new parity | ||
| 52 | can be calculated. These synchronous reads hurt write throughput. Some writes | ||
| 53 | which are sequential but not dispatched at the same time will suffer from this | ||
| 54 | overhead too. Write-back cache will aggregate the data and flush the data to | ||
| 55 | RAID disks only after the data becomes a full stripe write. This will | ||
| 56 | completely avoid the overhead, so it's very helpful for some workloads. A | ||
| 57 | typical workload which does sequential write followed by fsync is an example. | ||
| 58 | |||
| 59 | In write-back mode, MD reports IO completion to upper layer (usually | ||
| 60 | filesystems) right after the data hits the cache disk. The data is flushed to RAID | ||
| 61 | disks later after specific conditions are met. So a cache disk failure will cause | ||
| 62 | data loss. | ||
| 63 | |||
| 64 | In write-back mode, MD also caches data in memory. The memory cache includes | ||
| 65 | the same data stored on cache disk, so a power loss doesn't cause data loss. | ||
| 66 | The memory cache size has a performance impact on the array. A larger size is | ||
| 67 | recommended. A user can configure the size by: | ||
| 68 | |||
| 69 | echo "2048" > /sys/block/md0/md/stripe_cache_size | ||
| 70 | |||
| 71 | A cache disk that is too small will make the write aggregation less efficient | ||
| 72 | in this mode, depending on the workload. It's recommended to use a cache disk | ||
| 73 | of at least several gigabytes in write-back mode. | ||
| 74 | |||
| 75 | -------------------------------------- | ||
| 76 | The implementation: | ||
| 77 | |||
| 78 | The write-through and write-back cache use the same disk format. The cache disk | ||
| 79 | is organized as a simple write log. The log consists of 'meta data' and 'data' | ||
| 80 | pairs. The meta data describes the data. It also includes checksum and sequence | ||
| 81 | ID for recovery identification. Data can be IO data or parity data. Data is | ||
| 82 | checksummed too. The checksum is stored in the meta data ahead of the data. The | ||
| 83 | checksum is an optimization because MD can write meta data and data freely | ||
| 84 | without worrying about the order. The MD superblock has a field pointing to the | ||
| 85 | valid meta data at the log head. | ||
| 86 | |||
| 87 | The log implementation is pretty straightforward. The difficult part is the | ||
| 88 | order in which MD writes data to cache disk and RAID disks. Specifically, in | ||
| 89 | write-through mode, MD calculates parity for IO data, writes both IO data and | ||
| 90 | parity to the log, writes the data and parity to RAID disks after the data and | ||
| 91 | parity are settled down in the log, and finally the IO is finished. Reads just read | ||
| 92 | from the RAID disks as usual. | ||
| 93 | |||
| 94 | In write-back mode, MD writes IO data to the log and reports IO completion. The | ||
| 95 | data is also fully cached in memory at that time, which means read must query | ||
| 96 | memory cache. If some conditions are met, MD will flush the data to RAID disks. | ||
| 97 | MD will calculate parity for the data and write parity into the log. After this | ||
| 98 | is finished, MD will write both data and parity into RAID disks, then MD can | ||
| 99 | release the memory cache. The flush conditions could be: the stripe becomes a full | ||
| 100 | stripe write, free cache disk space is low, or free in-kernel memory cache space | ||
| 101 | is low. | ||
| 102 | |||
| 103 | After an unclean shutdown, MD does recovery. MD reads all meta data and data | ||
| 104 | from the log. The sequence ID and checksum will help us detect corrupted meta | ||
| 105 | data and data. If MD finds a stripe with data and valid parities (1 parity for | ||
| 106 | raid4/5 and 2 for raid6), MD will write the data and parities to RAID disks. If | ||
| 107 | parities are incomplete, they are discarded. If part of the data is corrupted, | ||
| 108 | it is discarded too. MD then loads the valid data and writes it to the RAID disks | ||
| 109 | in the normal way. | ||
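To make the log layout described above more concrete, here is a purely illustrative C sketch of a log record header. It only mirrors the concepts in this file (checksum, sequence ID, superblock pointing at the log head); the kernel's actual on-disk layout is defined by its own structures (e.g. struct r5l_meta_block) and may differ in field names, sizes and ordering.

/*
 * Illustrative sketch only -- NOT the kernel's on-disk format.
 * Each log record is a meta block followed by the data/parity it
 * describes. The sequence ID and checksum let recovery distinguish
 * valid records from stale or torn ones, and the MD superblock
 * records where the valid log head starts.
 */
#include <stdint.h>

struct example_r5cache_meta {
	uint32_t magic;      /* identifies a meta block */
	uint32_t checksum;   /* covers this meta block; data carries its own */
	uint64_t seq;        /* monotonically increasing sequence ID */
	uint64_t position;   /* sector of this meta block within the log */
	uint32_t meta_size;  /* bytes used by this meta block */
	/* followed by payload descriptors: data or parity, target sector, length */
};

During recovery, MD starts from the log head recorded in the superblock and only accepts records whose checksums verify and whose sequence IDs line up, which is the 'recovery identification' role described above.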
diff --git a/block/bio.c b/block/bio.c index 4b564d0c3e29..5eec5e08417f 100644 --- a/block/bio.c +++ b/block/bio.c | |||
| @@ -625,21 +625,20 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) | |||
| 625 | } | 625 | } |
| 626 | EXPORT_SYMBOL(bio_clone_fast); | 626 | EXPORT_SYMBOL(bio_clone_fast); |
| 627 | 627 | ||
| 628 | /** | 628 | static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, |
| 629 | * bio_clone_bioset - clone a bio | 629 | struct bio_set *bs, int offset, |
| 630 | * @bio_src: bio to clone | 630 | int size) |
| 631 | * @gfp_mask: allocation priority | ||
| 632 | * @bs: bio_set to allocate from | ||
| 633 | * | ||
| 634 | * Clone bio. Caller will own the returned bio, but not the actual data it | ||
| 635 | * points to. Reference count of returned bio will be one. | ||
| 636 | */ | ||
| 637 | struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, | ||
| 638 | struct bio_set *bs) | ||
| 639 | { | 631 | { |
| 640 | struct bvec_iter iter; | 632 | struct bvec_iter iter; |
| 641 | struct bio_vec bv; | 633 | struct bio_vec bv; |
| 642 | struct bio *bio; | 634 | struct bio *bio; |
| 635 | struct bvec_iter iter_src = bio_src->bi_iter; | ||
| 636 | |||
| 637 | /* for supporting partial clone */ | ||
| 638 | if (offset || size != bio_src->bi_iter.bi_size) { | ||
| 639 | bio_advance_iter(bio_src, &iter_src, offset); | ||
| 640 | iter_src.bi_size = size; | ||
| 641 | } | ||
| 643 | 642 | ||
| 644 | /* | 643 | /* |
| 645 | * Pre immutable biovecs, __bio_clone() used to just do a memcpy from | 644 | * Pre immutable biovecs, __bio_clone() used to just do a memcpy from |
| @@ -663,7 +662,8 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, | |||
| 663 | * __bio_clone_fast() anyways. | 662 | * __bio_clone_fast() anyways. |
| 664 | */ | 663 | */ |
| 665 | 664 | ||
| 666 | bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); | 665 | bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src, |
| 666 | &iter_src), bs); | ||
| 667 | if (!bio) | 667 | if (!bio) |
| 668 | return NULL; | 668 | return NULL; |
| 669 | bio->bi_bdev = bio_src->bi_bdev; | 669 | bio->bi_bdev = bio_src->bi_bdev; |
| @@ -680,7 +680,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, | |||
| 680 | bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; | 680 | bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; |
| 681 | break; | 681 | break; |
| 682 | default: | 682 | default: |
| 683 | bio_for_each_segment(bv, bio_src, iter) | 683 | __bio_for_each_segment(bv, bio_src, iter, iter_src) |
| 684 | bio->bi_io_vec[bio->bi_vcnt++] = bv; | 684 | bio->bi_io_vec[bio->bi_vcnt++] = bv; |
| 685 | break; | 685 | break; |
| 686 | } | 686 | } |
| @@ -699,9 +699,44 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, | |||
| 699 | 699 | ||
| 700 | return bio; | 700 | return bio; |
| 701 | } | 701 | } |
| 702 | |||
| 703 | /** | ||
| 704 | * bio_clone_bioset - clone a bio | ||
| 705 | * @bio_src: bio to clone | ||
| 706 | * @gfp_mask: allocation priority | ||
| 707 | * @bs: bio_set to allocate from | ||
| 708 | * | ||
| 709 | * Clone bio. Caller will own the returned bio, but not the actual data it | ||
| 710 | * points to. Reference count of returned bio will be one. | ||
| 711 | */ | ||
| 712 | struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, | ||
| 713 | struct bio_set *bs) | ||
| 714 | { | ||
| 715 | return __bio_clone_bioset(bio_src, gfp_mask, bs, 0, | ||
| 716 | bio_src->bi_iter.bi_size); | ||
| 717 | } | ||
| 702 | EXPORT_SYMBOL(bio_clone_bioset); | 718 | EXPORT_SYMBOL(bio_clone_bioset); |
| 703 | 719 | ||
| 704 | /** | 720 | /** |
| 721 | * bio_clone_bioset_partial - clone a partial bio | ||
| 722 | * @bio_src: bio to clone | ||
| 723 | * @gfp_mask: allocation priority | ||
| 724 | * @bs: bio_set to allocate from | ||
| 725 | * @offset: offset (in bytes) to start cloning from | ||
| 726 | * @size: size (in bytes) of the cloned bio | ||
| 727 | * | ||
| 728 | * Clone bio. Caller will own the returned bio, but not the actual data it | ||
| 729 | * points to. Reference count of returned bio will be one. | ||
| 730 | */ | ||
| 731 | struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask, | ||
| 732 | struct bio_set *bs, int offset, | ||
| 733 | int size) | ||
| 734 | { | ||
| 735 | return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size); | ||
| 736 | } | ||
| 737 | EXPORT_SYMBOL(bio_clone_bioset_partial); | ||
| 738 | |||
| 739 | /** | ||
| 705 | * bio_add_pc_page - attempt to add page to bio | 740 | * bio_add_pc_page - attempt to add page to bio |
| 706 | * @q: the target queue | 741 | * @q: the target queue |
| 707 | * @bio: destination bio | 742 | * @bio: destination bio |
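As a usage illustration of the new helper (not code from this patch), the sketch below clones only part of a source bio. The offset and size arguments are byte counts, which is why the raid1 hunks later in this patch pass offset << 9 and max_sectors << 9; the split_write_clone() wrapper and its error handling here are hypothetical.

/*
 * Hypothetical caller of bio_clone_bioset_partial() -- a sketch, not
 * code from this series. 'skip' and 'len' are sector counts and are
 * converted to the byte offsets the helper expects.
 */
static struct bio *split_write_clone(struct bio *src, struct bio_set *bs,
				     sector_t skip, sector_t len)
{
	struct bio *clone;

	clone = bio_clone_bioset_partial(src, GFP_NOIO, bs,
					 skip << 9, len << 9);
	if (!clone)
		return NULL;		/* bio_alloc_bioset() failed */

	/* The clone references the source's pages; the caller still owns 'src'. */
	return clone;
}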
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 685aa2d77e25..b0536cfd8e17 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c | |||
| @@ -214,7 +214,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio) | |||
| 214 | } | 214 | } |
| 215 | } | 215 | } |
| 216 | if (failit) { | 216 | if (failit) { |
| 217 | struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); | 217 | struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); |
| 218 | 218 | ||
| 219 | b->bi_bdev = conf->rdev->bdev; | 219 | b->bi_bdev = conf->rdev->bdev; |
| 220 | b->bi_private = bio; | 220 | b->bi_private = bio; |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index f1c7bbac31a5..3e38e0207a3e 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
| @@ -53,18 +53,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) | |||
| 53 | return conf->disks + lo; | 53 | return conf->disks + lo; |
| 54 | } | 54 | } |
| 55 | 55 | ||
| 56 | /* | ||
| 57 | * In linear_congested() conf->raid_disks is used as a copy of | ||
| 58 | * mddev->raid_disks to iterate conf->disks[]. Because conf->raid_disks | ||
| 59 | * and conf->disks[] are both created in linear_conf(), they are always | ||
| 60 | * consistent with each other, while mddev->raid_disks may not be. | ||
| 61 | */ | ||
| 56 | static int linear_congested(struct mddev *mddev, int bits) | 62 | static int linear_congested(struct mddev *mddev, int bits) |
| 57 | { | 63 | { |
| 58 | struct linear_conf *conf; | 64 | struct linear_conf *conf; |
| 59 | int i, ret = 0; | 65 | int i, ret = 0; |
| 60 | 66 | ||
| 61 | conf = mddev->private; | 67 | rcu_read_lock(); |
| 68 | conf = rcu_dereference(mddev->private); | ||
| 62 | 69 | ||
| 63 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { | 70 | for (i = 0; i < conf->raid_disks && !ret ; i++) { |
| 64 | struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); | 71 | struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); |
| 65 | ret |= bdi_congested(q->backing_dev_info, bits); | 72 | ret |= bdi_congested(q->backing_dev_info, bits); |
| 66 | } | 73 | } |
| 67 | 74 | ||
| 75 | rcu_read_unlock(); | ||
| 68 | return ret; | 76 | return ret; |
| 69 | } | 77 | } |
| 70 | 78 | ||
| @@ -144,6 +152,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) | |||
| 144 | conf->disks[i-1].end_sector + | 152 | conf->disks[i-1].end_sector + |
| 145 | conf->disks[i].rdev->sectors; | 153 | conf->disks[i].rdev->sectors; |
| 146 | 154 | ||
| 155 | /* | ||
| 156 | * conf->raid_disks is a copy of mddev->raid_disks. The reason to | ||
| 157 | * keep a copy of mddev->raid_disks in struct linear_conf is that | ||
| 158 | * mddev->raid_disks may not match the number of pointers in | ||
| 159 | * conf->disks[] when it is updated in linear_add() and used to | ||
| 160 | * iterate the old conf->disks[] array in linear_congested(). | ||
| 161 | * Here conf->raid_disks is always consistent with the number of | ||
| 162 | * pointers in the conf->disks[] array, and mddev->private is | ||
| 163 | * updated with rcu_assign_pointer() in linear_add(), so such a | ||
| 164 | * race can be avoided. | ||
| 165 | */ | ||
| 166 | conf->raid_disks = raid_disks; | ||
| 167 | |||
| 147 | return conf; | 168 | return conf; |
| 148 | 169 | ||
| 149 | out: | 170 | out: |
| @@ -196,15 +217,24 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) | |||
| 196 | if (!newconf) | 217 | if (!newconf) |
| 197 | return -ENOMEM; | 218 | return -ENOMEM; |
| 198 | 219 | ||
| 220 | /* newconf->raid_disks already keeps a copy of the increased | ||
| 221 | * value of mddev->raid_disks, WARN_ONCE() is just used to make | ||
| 222 | * sure of this. It is possible that oldconf is still referenced | ||
| 223 | * in linear_congested(), therefore kfree_rcu() is used to free | ||
| 224 | * oldconf only after no one references it anymore. | ||
| 225 | */ | ||
| 199 | mddev_suspend(mddev); | 226 | mddev_suspend(mddev); |
| 200 | oldconf = mddev->private; | 227 | oldconf = rcu_dereference_protected(mddev->private, |
| 228 | lockdep_is_held(&mddev->reconfig_mutex)); | ||
| 201 | mddev->raid_disks++; | 229 | mddev->raid_disks++; |
| 202 | mddev->private = newconf; | 230 | WARN_ONCE(mddev->raid_disks != newconf->raid_disks, |
| 231 | "copied raid_disks doesn't match mddev->raid_disks"); | ||
| 232 | rcu_assign_pointer(mddev->private, newconf); | ||
| 203 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); | 233 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); |
| 204 | set_capacity(mddev->gendisk, mddev->array_sectors); | 234 | set_capacity(mddev->gendisk, mddev->array_sectors); |
| 205 | mddev_resume(mddev); | 235 | mddev_resume(mddev); |
| 206 | revalidate_disk(mddev->gendisk); | 236 | revalidate_disk(mddev->gendisk); |
| 207 | kfree(oldconf); | 237 | kfree_rcu(oldconf, rcu); |
| 208 | return 0; | 238 | return 0; |
| 209 | } | 239 | } |
| 210 | 240 | ||
| @@ -262,6 +292,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) | |||
| 262 | trace_block_bio_remap(bdev_get_queue(split->bi_bdev), | 292 | trace_block_bio_remap(bdev_get_queue(split->bi_bdev), |
| 263 | split, disk_devt(mddev->gendisk), | 293 | split, disk_devt(mddev->gendisk), |
| 264 | bio_sector); | 294 | bio_sector); |
| 295 | mddev_check_writesame(mddev, split); | ||
| 265 | generic_make_request(split); | 296 | generic_make_request(split); |
| 266 | } | 297 | } |
| 267 | } while (split != bio); | 298 | } while (split != bio); |
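The linear.c changes above follow the standard RCU publish/read pattern. The sketch below distills that pattern with simplified, made-up names (struct my_conf, my_congested(), check_disk_congested()); it is not the driver code itself, just the shape the patch gives it: readers dereference mddev->private under rcu_read_lock() and trust only the conf's own raid_disks copy, while the updater publishes a new conf with rcu_assign_pointer() and defers freeing the old one with kfree_rcu().

/* RCU pattern distilled from the linear.c changes; names are made up. */
struct my_conf {
	struct rcu_head rcu;
	int raid_disks;				/* private copy matching disks[] */
	struct dev_info disks[0];
};

static int my_congested(struct mddev *mddev, int bits)
{
	struct my_conf *conf;
	int i, ret = 0;

	rcu_read_lock();
	conf = rcu_dereference(mddev->private);
	/* iterate with the conf's own copy, never mddev->raid_disks */
	for (i = 0; i < conf->raid_disks && !ret; i++)
		ret |= check_disk_congested(&conf->disks[i], bits);
	rcu_read_unlock();
	return ret;
}

static void my_publish(struct mddev *mddev, struct my_conf *newconf)
{
	struct my_conf *oldconf =
		rcu_dereference_protected(mddev->private,
				lockdep_is_held(&mddev->reconfig_mutex));

	rcu_assign_pointer(mddev->private, newconf);	/* publish new conf */
	kfree_rcu(oldconf, rcu);	/* freed only after all readers finish */
}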
diff --git a/drivers/md/linear.h b/drivers/md/linear.h index b685ddd7d7f7..8d392e6098b3 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h | |||
| @@ -10,6 +10,7 @@ struct linear_conf | |||
| 10 | { | 10 | { |
| 11 | struct rcu_head rcu; | 11 | struct rcu_head rcu; |
| 12 | sector_t array_sectors; | 12 | sector_t array_sectors; |
| 13 | int raid_disks; /* a copy of mddev->raid_disks */ | ||
| 13 | struct dev_info disks[0]; | 14 | struct dev_info disks[0]; |
| 14 | }; | 15 | }; |
| 15 | #endif | 16 | #endif |
diff --git a/drivers/md/md.c b/drivers/md/md.c index ba485dcf1064..985374f20e2e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
| @@ -190,16 +190,6 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, | |||
| 190 | } | 190 | } |
| 191 | EXPORT_SYMBOL_GPL(bio_alloc_mddev); | 191 | EXPORT_SYMBOL_GPL(bio_alloc_mddev); |
| 192 | 192 | ||
| 193 | struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | ||
| 194 | struct mddev *mddev) | ||
| 195 | { | ||
| 196 | if (!mddev || !mddev->bio_set) | ||
| 197 | return bio_clone(bio, gfp_mask); | ||
| 198 | |||
| 199 | return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); | ||
| 200 | } | ||
| 201 | EXPORT_SYMBOL_GPL(bio_clone_mddev); | ||
| 202 | |||
| 203 | /* | 193 | /* |
| 204 | * We have a system wide 'event count' that is incremented | 194 | * We have a system wide 'event count' that is incremented |
| 205 | * on any 'interesting' event, and readers of /proc/mdstat | 195 | * on any 'interesting' event, and readers of /proc/mdstat |
| @@ -5228,8 +5218,11 @@ int md_run(struct mddev *mddev) | |||
| 5228 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 5218 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
| 5229 | } | 5219 | } |
| 5230 | 5220 | ||
| 5231 | if (mddev->bio_set == NULL) | 5221 | if (mddev->bio_set == NULL) { |
| 5232 | mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); | 5222 | mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); |
| 5223 | if (!mddev->bio_set) | ||
| 5224 | return -ENOMEM; | ||
| 5225 | } | ||
| 5233 | 5226 | ||
| 5234 | spin_lock(&pers_lock); | 5227 | spin_lock(&pers_lock); |
| 5235 | pers = find_pers(mddev->level, mddev->clevel); | 5228 | pers = find_pers(mddev->level, mddev->clevel); |
| @@ -8980,7 +8973,14 @@ static __exit void md_exit(void) | |||
| 8980 | 8973 | ||
| 8981 | for_each_mddev(mddev, tmp) { | 8974 | for_each_mddev(mddev, tmp) { |
| 8982 | export_array(mddev); | 8975 | export_array(mddev); |
| 8976 | mddev->ctime = 0; | ||
| 8983 | mddev->hold_active = 0; | 8977 | mddev->hold_active = 0; |
| 8978 | /* | ||
| 8979 | * for_each_mddev() will call mddev_put() at the end of each | ||
| 8980 | * iteration. As the mddev is now fully clear, this will | ||
| 8981 | * schedule the mddev for destruction by a workqueue, and the | ||
| 8982 | * destroy_workqueue() below will wait for that to complete. | ||
| 8983 | */ | ||
| 8984 | } | 8984 | } |
| 8985 | destroy_workqueue(md_misc_wq); | 8985 | destroy_workqueue(md_misc_wq); |
| 8986 | destroy_workqueue(md_wq); | 8986 | destroy_workqueue(md_wq); |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 2a514036a83d..b8859cbf84b6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
| @@ -673,8 +673,6 @@ extern void md_rdev_clear(struct md_rdev *rdev); | |||
| 673 | 673 | ||
| 674 | extern void mddev_suspend(struct mddev *mddev); | 674 | extern void mddev_suspend(struct mddev *mddev); |
| 675 | extern void mddev_resume(struct mddev *mddev); | 675 | extern void mddev_resume(struct mddev *mddev); |
| 676 | extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | ||
| 677 | struct mddev *mddev); | ||
| 678 | extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, | 676 | extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, |
| 679 | struct mddev *mddev); | 677 | struct mddev *mddev); |
| 680 | 678 | ||
| @@ -710,4 +708,11 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, | |||
| 710 | { | 708 | { |
| 711 | mddev->flags &= ~unsupported_flags; | 709 | mddev->flags &= ~unsupported_flags; |
| 712 | } | 710 | } |
| 711 | |||
| 712 | static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) | ||
| 713 | { | ||
| 714 | if (bio_op(bio) == REQ_OP_WRITE_SAME && | ||
| 715 | !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) | ||
| 716 | mddev->queue->limits.max_write_same_sectors = 0; | ||
| 717 | } | ||
| 713 | #endif /* _MD_MD_H */ | 718 | #endif /* _MD_MD_H */ |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index d457afa672d5..79a12b59250b 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
| @@ -138,6 +138,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) | |||
| 138 | mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; | 138 | mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; |
| 139 | mp_bh->bio.bi_end_io = multipath_end_request; | 139 | mp_bh->bio.bi_end_io = multipath_end_request; |
| 140 | mp_bh->bio.bi_private = mp_bh; | 140 | mp_bh->bio.bi_private = mp_bh; |
| 141 | mddev_check_writesame(mddev, &mp_bh->bio); | ||
| 141 | generic_make_request(&mp_bh->bio); | 142 | generic_make_request(&mp_bh->bio); |
| 142 | return; | 143 | return; |
| 143 | } | 144 | } |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d6585239bff2..93347ca7c7a6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
| @@ -503,6 +503,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) | |||
| 503 | trace_block_bio_remap(bdev_get_queue(split->bi_bdev), | 503 | trace_block_bio_remap(bdev_get_queue(split->bi_bdev), |
| 504 | split, disk_devt(mddev->gendisk), | 504 | split, disk_devt(mddev->gendisk), |
| 505 | bio_sector); | 505 | bio_sector); |
| 506 | mddev_check_writesame(mddev, split); | ||
| 506 | generic_make_request(split); | 507 | generic_make_request(split); |
| 507 | } | 508 | } |
| 508 | } while (split != bio); | 509 | } while (split != bio); |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 830ff2b20346..7453d94eeed7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
| @@ -71,9 +71,8 @@ | |||
| 71 | */ | 71 | */ |
| 72 | static int max_queued_requests = 1024; | 72 | static int max_queued_requests = 1024; |
| 73 | 73 | ||
| 74 | static void allow_barrier(struct r1conf *conf, sector_t start_next_window, | 74 | static void allow_barrier(struct r1conf *conf, sector_t sector_nr); |
| 75 | sector_t bi_sector); | 75 | static void lower_barrier(struct r1conf *conf, sector_t sector_nr); |
| 76 | static void lower_barrier(struct r1conf *conf); | ||
| 77 | 76 | ||
| 78 | #define raid1_log(md, fmt, args...) \ | 77 | #define raid1_log(md, fmt, args...) \ |
| 79 | do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) | 78 | do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) |
| @@ -100,7 +99,6 @@ static void r1bio_pool_free(void *r1_bio, void *data) | |||
| 100 | #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) | 99 | #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) |
| 101 | #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) | 100 | #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) |
| 102 | #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) | 101 | #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) |
| 103 | #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) | ||
| 104 | 102 | ||
| 105 | static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) | 103 | static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) |
| 106 | { | 104 | { |
| @@ -205,6 +203,7 @@ static void free_r1bio(struct r1bio *r1_bio) | |||
| 205 | static void put_buf(struct r1bio *r1_bio) | 203 | static void put_buf(struct r1bio *r1_bio) |
| 206 | { | 204 | { |
| 207 | struct r1conf *conf = r1_bio->mddev->private; | 205 | struct r1conf *conf = r1_bio->mddev->private; |
| 206 | sector_t sect = r1_bio->sector; | ||
| 208 | int i; | 207 | int i; |
| 209 | 208 | ||
| 210 | for (i = 0; i < conf->raid_disks * 2; i++) { | 209 | for (i = 0; i < conf->raid_disks * 2; i++) { |
| @@ -215,7 +214,7 @@ static void put_buf(struct r1bio *r1_bio) | |||
| 215 | 214 | ||
| 216 | mempool_free(r1_bio, conf->r1buf_pool); | 215 | mempool_free(r1_bio, conf->r1buf_pool); |
| 217 | 216 | ||
| 218 | lower_barrier(conf); | 217 | lower_barrier(conf, sect); |
| 219 | } | 218 | } |
| 220 | 219 | ||
| 221 | static void reschedule_retry(struct r1bio *r1_bio) | 220 | static void reschedule_retry(struct r1bio *r1_bio) |
| @@ -223,10 +222,12 @@ static void reschedule_retry(struct r1bio *r1_bio) | |||
| 223 | unsigned long flags; | 222 | unsigned long flags; |
| 224 | struct mddev *mddev = r1_bio->mddev; | 223 | struct mddev *mddev = r1_bio->mddev; |
| 225 | struct r1conf *conf = mddev->private; | 224 | struct r1conf *conf = mddev->private; |
| 225 | int idx; | ||
| 226 | 226 | ||
| 227 | idx = sector_to_idx(r1_bio->sector); | ||
| 227 | spin_lock_irqsave(&conf->device_lock, flags); | 228 | spin_lock_irqsave(&conf->device_lock, flags); |
| 228 | list_add(&r1_bio->retry_list, &conf->retry_list); | 229 | list_add(&r1_bio->retry_list, &conf->retry_list); |
| 229 | conf->nr_queued ++; | 230 | atomic_inc(&conf->nr_queued[idx]); |
| 230 | spin_unlock_irqrestore(&conf->device_lock, flags); | 231 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 231 | 232 | ||
| 232 | wake_up(&conf->wait_barrier); | 233 | wake_up(&conf->wait_barrier); |
| @@ -243,7 +244,6 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
| 243 | struct bio *bio = r1_bio->master_bio; | 244 | struct bio *bio = r1_bio->master_bio; |
| 244 | int done; | 245 | int done; |
| 245 | struct r1conf *conf = r1_bio->mddev->private; | 246 | struct r1conf *conf = r1_bio->mddev->private; |
| 246 | sector_t start_next_window = r1_bio->start_next_window; | ||
| 247 | sector_t bi_sector = bio->bi_iter.bi_sector; | 247 | sector_t bi_sector = bio->bi_iter.bi_sector; |
| 248 | 248 | ||
| 249 | if (bio->bi_phys_segments) { | 249 | if (bio->bi_phys_segments) { |
| @@ -269,7 +269,7 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
| 269 | * Wake up any possible resync thread that waits for the device | 269 | * Wake up any possible resync thread that waits for the device |
| 270 | * to go idle. | 270 | * to go idle. |
| 271 | */ | 271 | */ |
| 272 | allow_barrier(conf, start_next_window, bi_sector); | 272 | allow_barrier(conf, bi_sector); |
| 273 | } | 273 | } |
| 274 | } | 274 | } |
| 275 | 275 | ||
| @@ -517,6 +517,25 @@ static void raid1_end_write_request(struct bio *bio) | |||
| 517 | bio_put(to_put); | 517 | bio_put(to_put); |
| 518 | } | 518 | } |
| 519 | 519 | ||
| 520 | static sector_t align_to_barrier_unit_end(sector_t start_sector, | ||
| 521 | sector_t sectors) | ||
| 522 | { | ||
| 523 | sector_t len; | ||
| 524 | |||
| 525 | WARN_ON(sectors == 0); | ||
| 526 | /* | ||
| 527 | * len is the number of sectors from start_sector to end of the | ||
| 528 | * barrier unit which start_sector belongs to. | ||
| 529 | */ | ||
| 530 | len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) - | ||
| 531 | start_sector; | ||
| 532 | |||
| 533 | if (len > sectors) | ||
| 534 | len = sectors; | ||
| 535 | |||
| 536 | return len; | ||
| 537 | } | ||
| 538 | |||
| 520 | /* | 539 | /* |
| 521 | * This routine returns the disk from which the requested read should | 540 | * This routine returns the disk from which the requested read should |
| 522 | * be done. There is a per-array 'next expected sequential IO' sector | 541 | * be done. There is a per-array 'next expected sequential IO' sector |
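As a worked example of align_to_barrier_unit_end() above (assuming a barrier unit of 2^17 = 131072 sectors, i.e. 64MB; the real constant lives in raid1.h and is not shown in this hunk): for start_sector = 130000 and sectors = 10000, round_up(130001, 131072) = 131072, so len = 131072 - 130000 = 1072 and the request is clipped to 1072 sectors, keeping it inside its barrier unit. A request that already fits within its unit gets its own length back unchanged.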
| @@ -813,168 +832,228 @@ static void flush_pending_writes(struct r1conf *conf) | |||
| 813 | */ | 832 | */ |
| 814 | static void raise_barrier(struct r1conf *conf, sector_t sector_nr) | 833 | static void raise_barrier(struct r1conf *conf, sector_t sector_nr) |
| 815 | { | 834 | { |
| 835 | int idx = sector_to_idx(sector_nr); | ||
| 836 | |||
| 816 | spin_lock_irq(&conf->resync_lock); | 837 | spin_lock_irq(&conf->resync_lock); |
| 817 | 838 | ||
| 818 | /* Wait until no block IO is waiting */ | 839 | /* Wait until no block IO is waiting */ |
| 819 | wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, | 840 | wait_event_lock_irq(conf->wait_barrier, |
| 841 | !atomic_read(&conf->nr_waiting[idx]), | ||
| 820 | conf->resync_lock); | 842 | conf->resync_lock); |
| 821 | 843 | ||
| 822 | /* block any new IO from starting */ | 844 | /* block any new IO from starting */ |
| 823 | conf->barrier++; | 845 | atomic_inc(&conf->barrier[idx]); |
| 824 | conf->next_resync = sector_nr; | 846 | /* |
| 847 | * In raise_barrier() we firstly increase conf->barrier[idx] then | ||
| 848 | * check conf->nr_pending[idx]. In _wait_barrier() we firstly | ||
| 849 | * increase conf->nr_pending[idx] then check conf->barrier[idx]. | ||
| 850 | * A memory barrier here makes sure conf->nr_pending[idx] won't | ||
| 851 | * be fetched before conf->barrier[idx] is increased. Otherwise | ||
| 852 | * there will be a race between raise_barrier() and _wait_barrier(). | ||
| 853 | */ | ||
| 854 | smp_mb__after_atomic(); | ||
| 825 | 855 | ||
| 826 | /* For these conditions we must wait: | 856 | /* For these conditions we must wait: |
| 827 | * A: while the array is in frozen state | 857 | * A: while the array is in frozen state |
| 828 | * B: while barrier >= RESYNC_DEPTH, meaning resync reach | 858 | * B: while conf->nr_pending[idx] is not 0, meaning regular I/O |
| 829 | * the max count which allowed. | 859 | * exists in the corresponding I/O barrier bucket. |
| 830 | * C: next_resync + RESYNC_SECTORS > start_next_window, meaning | 860 | * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches |
| 831 | * next resync will reach to the window which normal bios are | 861 | * the max resync count allowed on the current I/O barrier bucket. |
| 832 | * handling. | ||
| 833 | * D: while there are any active requests in the current window. | ||
| 834 | */ | 862 | */ |
| 835 | wait_event_lock_irq(conf->wait_barrier, | 863 | wait_event_lock_irq(conf->wait_barrier, |
| 836 | !conf->array_frozen && | 864 | !conf->array_frozen && |
| 837 | conf->barrier < RESYNC_DEPTH && | 865 | !atomic_read(&conf->nr_pending[idx]) && |
| 838 | conf->current_window_requests == 0 && | 866 | atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH, |
| 839 | (conf->start_next_window >= | ||
| 840 | conf->next_resync + RESYNC_SECTORS), | ||
| 841 | conf->resync_lock); | 867 | conf->resync_lock); |
| 842 | 868 | ||
| 843 | conf->nr_pending++; | 869 | atomic_inc(&conf->nr_pending[idx]); |
| 844 | spin_unlock_irq(&conf->resync_lock); | 870 | spin_unlock_irq(&conf->resync_lock); |
| 845 | } | 871 | } |
| 846 | 872 | ||
| 847 | static void lower_barrier(struct r1conf *conf) | 873 | static void lower_barrier(struct r1conf *conf, sector_t sector_nr) |
| 848 | { | 874 | { |
| 849 | unsigned long flags; | 875 | int idx = sector_to_idx(sector_nr); |
| 850 | BUG_ON(conf->barrier <= 0); | 876 | |
| 851 | spin_lock_irqsave(&conf->resync_lock, flags); | 877 | BUG_ON(atomic_read(&conf->barrier[idx]) <= 0); |
| 852 | conf->barrier--; | 878 | |
| 853 | conf->nr_pending--; | 879 | atomic_dec(&conf->barrier[idx]); |
| 854 | spin_unlock_irqrestore(&conf->resync_lock, flags); | 880 | atomic_dec(&conf->nr_pending[idx]); |
| 855 | wake_up(&conf->wait_barrier); | 881 | wake_up(&conf->wait_barrier); |
| 856 | } | 882 | } |
| 857 | 883 | ||
| 858 | static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) | 884 | static void _wait_barrier(struct r1conf *conf, int idx) |
| 859 | { | 885 | { |
| 860 | bool wait = false; | 886 | /* |
| 887 | * We need to increase conf->nr_pending[idx] very early here, | ||
| 888 | * then raise_barrier() can be blocked when it waits for | ||
| 889 | * conf->nr_pending[idx] to be 0. Then we can avoid holding | ||
| 890 | * conf->resync_lock when there is no barrier raised in the same | ||
| 891 | * barrier unit bucket. Also, if the array is frozen, I/O | ||
| 892 | * should be blocked until the array is unfrozen. | ||
| 893 | */ | ||
| 894 | atomic_inc(&conf->nr_pending[idx]); | ||
| 895 | /* | ||
| 896 | * In _wait_barrier() we firstly increase conf->nr_pending[idx], then | ||
| 897 | * check conf->barrier[idx]. In raise_barrier() we firstly increase | ||
| 898 | * conf->barrier[idx], then check conf->nr_pending[idx]. A memory | ||
| 899 | * barrier is necessary here to make sure conf->barrier[idx] won't be | ||
| 900 | * fetched before conf->nr_pending[idx] is increased. Otherwise there | ||
| 901 | * will be a race between _wait_barrier() and raise_barrier(). | ||
| 902 | */ | ||
| 903 | smp_mb__after_atomic(); | ||
| 861 | 904 | ||
| 862 | if (conf->array_frozen || !bio) | 905 | /* |
| 863 | wait = true; | 906 | * Don't worry about checking two atomic_t variables at the same time |
| 864 | else if (conf->barrier && bio_data_dir(bio) == WRITE) { | 907 | * here. If, while we check conf->barrier[idx], the array is |
| 865 | if ((conf->mddev->curr_resync_completed | 908 | * frozen (conf->array_frozen is 1), and conf->barrier[idx] is |
| 866 | >= bio_end_sector(bio)) || | 909 | * 0, it is safe to return and make the I/O continue. Because the |
| 867 | (conf->start_next_window + NEXT_NORMALIO_DISTANCE | 910 | * array is frozen, all I/O returned here will eventually complete |
| 868 | <= bio->bi_iter.bi_sector)) | 911 | * or be queued, no race will happen. See code comment in |
| 869 | wait = false; | 912 | * freeze_array(). |
| 870 | else | 913 | */ |
| 871 | wait = true; | 914 | if (!READ_ONCE(conf->array_frozen) && |
| 872 | } | 915 | !atomic_read(&conf->barrier[idx])) |
| 916 | return; | ||
| 873 | 917 | ||
| 874 | return wait; | 918 | /* |
| 919 | * After holding conf->resync_lock, conf->nr_pending[idx] | ||
| 920 | * should be decreased before waiting for barrier to drop. | ||
| 921 | * Otherwise, we may encounter a race condition because | ||
| 922 | * raise_barrer() might be waiting for conf->nr_pending[idx] | ||
| 923 | * to be 0 at same time. | ||
| 924 | */ | ||
| 925 | spin_lock_irq(&conf->resync_lock); | ||
| 926 | atomic_inc(&conf->nr_waiting[idx]); | ||
| 927 | atomic_dec(&conf->nr_pending[idx]); | ||
| 928 | /* | ||
| 929 | * In case freeze_array() is waiting for | ||
| 930 | * get_unqueued_pending() == extra | ||
| 931 | */ | ||
| 932 | wake_up(&conf->wait_barrier); | ||
| 933 | /* Wait for the barrier in same barrier unit bucket to drop. */ | ||
| 934 | wait_event_lock_irq(conf->wait_barrier, | ||
| 935 | !conf->array_frozen && | ||
| 936 | !atomic_read(&conf->barrier[idx]), | ||
| 937 | conf->resync_lock); | ||
| 938 | atomic_inc(&conf->nr_pending[idx]); | ||
| 939 | atomic_dec(&conf->nr_waiting[idx]); | ||
| 940 | spin_unlock_irq(&conf->resync_lock); | ||
| 875 | } | 941 | } |
| 876 | 942 | ||
| 877 | static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) | 943 | static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) |
| 878 | { | 944 | { |
| 879 | sector_t sector = 0; | 945 | int idx = sector_to_idx(sector_nr); |
| 880 | 946 | ||
| 881 | spin_lock_irq(&conf->resync_lock); | 947 | /* |
| 882 | if (need_to_wait_for_sync(conf, bio)) { | 948 | * Very similar to _wait_barrier(). The difference is, for read |
| 883 | conf->nr_waiting++; | 949 | * I/O we don't need wait for sync I/O, but if the whole array |
| 884 | /* Wait for the barrier to drop. | 950 | * is frozen, the read I/O still has to wait until the array is |
| 885 | * However if there are already pending | 951 | * unfrozen. Since there is no ordering requirement with |
| 886 | * requests (preventing the barrier from | 952 | * conf->barrier[idx] here, memory barrier is unnecessary as well. |
| 887 | * rising completely), and the | 953 | */ |
| 888 | * per-process bio queue isn't empty, | 954 | atomic_inc(&conf->nr_pending[idx]); |
| 889 | * then don't wait, as we need to empty | ||
| 890 | * that queue to allow conf->start_next_window | ||
| 891 | * to increase. | ||
| 892 | */ | ||
| 893 | raid1_log(conf->mddev, "wait barrier"); | ||
| 894 | wait_event_lock_irq(conf->wait_barrier, | ||
| 895 | !conf->array_frozen && | ||
| 896 | (!conf->barrier || | ||
| 897 | ((conf->start_next_window < | ||
| 898 | conf->next_resync + RESYNC_SECTORS) && | ||
| 899 | current->bio_list && | ||
| 900 | !bio_list_empty(current->bio_list))), | ||
| 901 | conf->resync_lock); | ||
| 902 | conf->nr_waiting--; | ||
| 903 | } | ||
| 904 | |||
| 905 | if (bio && bio_data_dir(bio) == WRITE) { | ||
| 906 | if (bio->bi_iter.bi_sector >= conf->next_resync) { | ||
| 907 | if (conf->start_next_window == MaxSector) | ||
| 908 | conf->start_next_window = | ||
| 909 | conf->next_resync + | ||
| 910 | NEXT_NORMALIO_DISTANCE; | ||
| 911 | |||
| 912 | if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) | ||
| 913 | <= bio->bi_iter.bi_sector) | ||
| 914 | conf->next_window_requests++; | ||
| 915 | else | ||
| 916 | conf->current_window_requests++; | ||
| 917 | sector = conf->start_next_window; | ||
| 918 | } | ||
| 919 | } | ||
| 920 | 955 | ||
| 921 | conf->nr_pending++; | 956 | if (!READ_ONCE(conf->array_frozen)) |
| 957 | return; | ||
| 958 | |||
| 959 | spin_lock_irq(&conf->resync_lock); | ||
| 960 | atomic_inc(&conf->nr_waiting[idx]); | ||
| 961 | atomic_dec(&conf->nr_pending[idx]); | ||
| 962 | /* | ||
| 963 | * In case freeze_array() is waiting for | ||
| 964 | * get_unqueued_pending() == extra | ||
| 965 | */ | ||
| 966 | wake_up(&conf->wait_barrier); | ||
| 967 | /* Wait for array to be unfrozen */ | ||
| 968 | wait_event_lock_irq(conf->wait_barrier, | ||
| 969 | !conf->array_frozen, | ||
| 970 | conf->resync_lock); | ||
| 971 | atomic_inc(&conf->nr_pending[idx]); | ||
| 972 | atomic_dec(&conf->nr_waiting[idx]); | ||
| 922 | spin_unlock_irq(&conf->resync_lock); | 973 | spin_unlock_irq(&conf->resync_lock); |
| 923 | return sector; | ||
| 924 | } | 974 | } |
| 925 | 975 | ||
| 926 | static void allow_barrier(struct r1conf *conf, sector_t start_next_window, | 976 | static void wait_barrier(struct r1conf *conf, sector_t sector_nr) |
| 927 | sector_t bi_sector) | ||
| 928 | { | 977 | { |
| 929 | unsigned long flags; | 978 | int idx = sector_to_idx(sector_nr); |
| 930 | 979 | ||
| 931 | spin_lock_irqsave(&conf->resync_lock, flags); | 980 | _wait_barrier(conf, idx); |
| 932 | conf->nr_pending--; | 981 | } |
| 933 | if (start_next_window) { | 982 | |
| 934 | if (start_next_window == conf->start_next_window) { | 983 | static void wait_all_barriers(struct r1conf *conf) |
| 935 | if (conf->start_next_window + NEXT_NORMALIO_DISTANCE | 984 | { |
| 936 | <= bi_sector) | 985 | int idx; |
| 937 | conf->next_window_requests--; | 986 | |
| 938 | else | 987 | for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) |
| 939 | conf->current_window_requests--; | 988 | _wait_barrier(conf, idx); |
| 940 | } else | 989 | } |
| 941 | conf->current_window_requests--; | 990 | |
| 942 | 991 | static void _allow_barrier(struct r1conf *conf, int idx) | |
| 943 | if (!conf->current_window_requests) { | 992 | { |
| 944 | if (conf->next_window_requests) { | 993 | atomic_dec(&conf->nr_pending[idx]); |
| 945 | conf->current_window_requests = | ||
| 946 | conf->next_window_requests; | ||
| 947 | conf->next_window_requests = 0; | ||
| 948 | conf->start_next_window += | ||
| 949 | NEXT_NORMALIO_DISTANCE; | ||
| 950 | } else | ||
| 951 | conf->start_next_window = MaxSector; | ||
| 952 | } | ||
| 953 | } | ||
| 954 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
| 955 | wake_up(&conf->wait_barrier); | 994 | wake_up(&conf->wait_barrier); |
| 956 | } | 995 | } |
| 957 | 996 | ||
| 997 | static void allow_barrier(struct r1conf *conf, sector_t sector_nr) | ||
| 998 | { | ||
| 999 | int idx = sector_to_idx(sector_nr); | ||
| 1000 | |||
| 1001 | _allow_barrier(conf, idx); | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | static void allow_all_barriers(struct r1conf *conf) | ||
| 1005 | { | ||
| 1006 | int idx; | ||
| 1007 | |||
| 1008 | for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) | ||
| 1009 | _allow_barrier(conf, idx); | ||
| 1010 | } | ||
| 1011 | |||
| 1012 | /* conf->resync_lock should be held */ | ||
| 1013 | static int get_unqueued_pending(struct r1conf *conf) | ||
| 1014 | { | ||
| 1015 | int idx, ret; | ||
| 1016 | |||
| 1017 | for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++) | ||
| 1018 | ret += atomic_read(&conf->nr_pending[idx]) - | ||
| 1019 | atomic_read(&conf->nr_queued[idx]); | ||
| 1020 | |||
| 1021 | return ret; | ||
| 1022 | } | ||
| 1023 | |||
| 958 | static void freeze_array(struct r1conf *conf, int extra) | 1024 | static void freeze_array(struct r1conf *conf, int extra) |
| 959 | { | 1025 | { |
| 960 | /* stop syncio and normal IO and wait for everything to | 1026 | /* Stop sync I/O and normal I/O and wait for everything to |
| 961 | * go quite. | 1027 | * go quiet. |
| 962 | * We wait until nr_pending match nr_queued+extra | 1028 | * This is called in two situations: |
| 963 | * This is called in the context of one normal IO request | 1029 | * 1) management command handlers (reshape, remove disk, quiesce). |
| 964 | * that has failed. Thus any sync request that might be pending | 1030 | * 2) one normal I/O request failed. |
| 965 | * will be blocked by nr_pending, and we need to wait for | 1031 | |
| 966 | * pending IO requests to complete or be queued for re-try. | 1032 | * After array_frozen is set to 1, new sync IO will be blocked at |
| 967 | * Thus the number queued (nr_queued) plus this request (extra) | 1033 | * raise_barrier(), and new normal I/O will be blocked at _wait_barrier() |
| 968 | * must match the number of pending IOs (nr_pending) before | 1034 | * or wait_read_barrier(). The flying I/Os will either complete or be |
| 969 | * we continue. | 1035 | * queued. When everything goes quiet, there are only queued I/Os left. |
| 1036 | |||
| 1037 | * Every flying I/O contributes to a conf->nr_pending[idx], idx is the | ||
| 1038 | * barrier bucket index which this I/O request hits. When all sync and | ||
| 1039 | * normal I/O are queued, sum of all conf->nr_pending[] will match sum | ||
| 1040 | * of all conf->nr_queued[]. But normal I/O failure is an exception, | ||
| 1041 | * in handle_read_error(), we may call freeze_array() before trying to | ||
| 1042 | * fix the read error. In this case, the error read I/O is not queued, | ||
| 1043 | * so get_unqueued_pending() == 1. | ||
| 1044 | * | ||
| 1045 | * Therefore before this function returns, we need to wait until | ||
| 1046 | * get_unqueued_pendings(conf) gets equal to extra. For | ||
| 1047 | * normal I/O context, extra is 1, in rested situations extra is 0. | ||
| 970 | */ | 1047 | * normal I/O context, extra is 1; in other situations, extra is 0. |
| 971 | spin_lock_irq(&conf->resync_lock); | 1049 | spin_lock_irq(&conf->resync_lock); |
| 972 | conf->array_frozen = 1; | 1050 | conf->array_frozen = 1; |
| 973 | raid1_log(conf->mddev, "wait freeze"); | 1051 | raid1_log(conf->mddev, "wait freeze"); |
| 974 | wait_event_lock_irq_cmd(conf->wait_barrier, | 1052 | wait_event_lock_irq_cmd( |
| 975 | conf->nr_pending == conf->nr_queued+extra, | 1053 | conf->wait_barrier, |
| 976 | conf->resync_lock, | 1054 | get_unqueued_pending(conf) == extra, |
| 977 | flush_pending_writes(conf)); | 1055 | conf->resync_lock, |
| 1056 | flush_pending_writes(conf)); | ||
| 978 | spin_unlock_irq(&conf->resync_lock); | 1057 | spin_unlock_irq(&conf->resync_lock); |
| 979 | } | 1058 | } |
| 980 | static void unfreeze_array(struct r1conf *conf) | 1059 | static void unfreeze_array(struct r1conf *conf) |
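The per-bucket counters used above (nr_pending[idx], nr_waiting[idx], nr_queued[idx], barrier[idx]) are indexed by sector_to_idx(), whose definition lives in raid1.h and is not part of this hunk. The sketch below only illustrates the idea with assumed example names and values: the array is divided into fixed-size barrier units, and each unit hashes to one of a fixed number of buckets, so regular I/O and resync in different units rarely contend on the same counters.

/*
 * Assumed-value sketch of the barrier bucket mapping; the real
 * BARRIER_* constants and sector_to_idx() are defined in raid1.h.
 */
#include <linux/hash.h>
#include <linux/types.h>

#define EX_BARRIER_UNIT_SECTOR_BITS	17	/* 2^17 sectors = 64MB per unit (assumed) */
#define EX_BARRIER_BUCKETS_NR_BITS	10	/* 1024 buckets (assumed) */
#define EX_BARRIER_BUCKETS_NR		(1 << EX_BARRIER_BUCKETS_NR_BITS)

static inline int ex_sector_to_idx(sector_t sector)
{
	/* every sector inside one barrier unit maps to the same bucket */
	return hash_long(sector >> EX_BARRIER_UNIT_SECTOR_BITS,
			 EX_BARRIER_BUCKETS_NR_BITS);
}

With such a mapping, raise_barrier(conf, sector) and wait_barrier(conf, sector) for requests in different barrier units normally touch different buckets, replacing the single sliding resync window (start_next_window, current_window_requests, ...) that the removed code maintained.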
| @@ -982,8 +1061,8 @@ static void unfreeze_array(struct r1conf *conf) | |||
| 982 | /* reverse the effect of the freeze */ | 1061 | /* reverse the effect of the freeze */ |
| 983 | spin_lock_irq(&conf->resync_lock); | 1062 | spin_lock_irq(&conf->resync_lock); |
| 984 | conf->array_frozen = 0; | 1063 | conf->array_frozen = 0; |
| 985 | wake_up(&conf->wait_barrier); | ||
| 986 | spin_unlock_irq(&conf->resync_lock); | 1064 | spin_unlock_irq(&conf->resync_lock); |
| 1065 | wake_up(&conf->wait_barrier); | ||
| 987 | } | 1066 | } |
| 988 | 1067 | ||
| 989 | /* duplicate the data pages for behind I/O | 1068 | /* duplicate the data pages for behind I/O |
| @@ -1070,11 +1149,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
| 1070 | kfree(plug); | 1149 | kfree(plug); |
| 1071 | } | 1150 | } |
| 1072 | 1151 | ||
| 1073 | static void raid1_read_request(struct mddev *mddev, struct bio *bio, | 1152 | static inline struct r1bio * |
| 1074 | struct r1bio *r1_bio) | 1153 | alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled) |
| 1154 | { | ||
| 1155 | struct r1conf *conf = mddev->private; | ||
| 1156 | struct r1bio *r1_bio; | ||
| 1157 | |||
| 1158 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
| 1159 | |||
| 1160 | r1_bio->master_bio = bio; | ||
| 1161 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | ||
| 1162 | r1_bio->state = 0; | ||
| 1163 | r1_bio->mddev = mddev; | ||
| 1164 | r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; | ||
| 1165 | |||
| 1166 | return r1_bio; | ||
| 1167 | } | ||
| 1168 | |||
| 1169 | static void raid1_read_request(struct mddev *mddev, struct bio *bio) | ||
| 1075 | { | 1170 | { |
| 1076 | struct r1conf *conf = mddev->private; | 1171 | struct r1conf *conf = mddev->private; |
| 1077 | struct raid1_info *mirror; | 1172 | struct raid1_info *mirror; |
| 1173 | struct r1bio *r1_bio; | ||
| 1078 | struct bio *read_bio; | 1174 | struct bio *read_bio; |
| 1079 | struct bitmap *bitmap = mddev->bitmap; | 1175 | struct bitmap *bitmap = mddev->bitmap; |
| 1080 | const int op = bio_op(bio); | 1176 | const int op = bio_op(bio); |
| @@ -1083,8 +1179,29 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, | |||
| 1083 | int max_sectors; | 1179 | int max_sectors; |
| 1084 | int rdisk; | 1180 | int rdisk; |
| 1085 | 1181 | ||
| 1086 | wait_barrier(conf, bio); | 1182 | /* |
| 1183 | * Still need barrier for READ in case that whole | ||
| 1184 | * array is frozen. | ||
| 1185 | */ | ||
| 1186 | wait_read_barrier(conf, bio->bi_iter.bi_sector); | ||
| 1187 | |||
| 1188 | r1_bio = alloc_r1bio(mddev, bio, 0); | ||
| 1087 | 1189 | ||
| 1190 | /* | ||
| 1191 | * We might need to issue multiple reads to different | ||
| 1192 | * devices if there are bad blocks around, so we keep | ||
| 1193 | * track of the number of reads in bio->bi_phys_segments. | ||
| 1194 | * If this is 0, there is only one r1_bio and no locking | ||
| 1195 | * will be needed when requests complete. If it is | ||
| 1196 | * non-zero, then it is the number of not-completed requests. | ||
| 1197 | */ | ||
| 1198 | bio->bi_phys_segments = 0; | ||
| 1199 | bio_clear_flag(bio, BIO_SEG_VALID); | ||
| 1200 | |||
| 1201 | /* | ||
| 1202 | * make_request() can abort the operation when read-ahead is being | ||
| 1203 | * used and no empty request is available. | ||
| 1204 | */ | ||
| 1088 | read_again: | 1205 | read_again: |
| 1089 | rdisk = read_balance(conf, r1_bio, &max_sectors); | 1206 | rdisk = read_balance(conf, r1_bio, &max_sectors); |
| 1090 | 1207 | ||
| @@ -1106,9 +1223,8 @@ read_again: | |||
| 1106 | atomic_read(&bitmap->behind_writes) == 0); | 1223 | atomic_read(&bitmap->behind_writes) == 0); |
| 1107 | } | 1224 | } |
| 1108 | r1_bio->read_disk = rdisk; | 1225 | r1_bio->read_disk = rdisk; |
| 1109 | r1_bio->start_next_window = 0; | ||
| 1110 | 1226 | ||
| 1111 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1227 | read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); |
| 1112 | bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, | 1228 | bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, |
| 1113 | max_sectors); | 1229 | max_sectors); |
| 1114 | 1230 | ||
| @@ -1151,22 +1267,16 @@ read_again: | |||
| 1151 | */ | 1267 | */ |
| 1152 | reschedule_retry(r1_bio); | 1268 | reschedule_retry(r1_bio); |
| 1153 | 1269 | ||
| 1154 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | 1270 | r1_bio = alloc_r1bio(mddev, bio, sectors_handled); |
| 1155 | |||
| 1156 | r1_bio->master_bio = bio; | ||
| 1157 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | ||
| 1158 | r1_bio->state = 0; | ||
| 1159 | r1_bio->mddev = mddev; | ||
| 1160 | r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; | ||
| 1161 | goto read_again; | 1271 | goto read_again; |
| 1162 | } else | 1272 | } else |
| 1163 | generic_make_request(read_bio); | 1273 | generic_make_request(read_bio); |
| 1164 | } | 1274 | } |
| 1165 | 1275 | ||
| 1166 | static void raid1_write_request(struct mddev *mddev, struct bio *bio, | 1276 | static void raid1_write_request(struct mddev *mddev, struct bio *bio) |
| 1167 | struct r1bio *r1_bio) | ||
| 1168 | { | 1277 | { |
| 1169 | struct r1conf *conf = mddev->private; | 1278 | struct r1conf *conf = mddev->private; |
| 1279 | struct r1bio *r1_bio; | ||
| 1170 | int i, disks; | 1280 | int i, disks; |
| 1171 | struct bitmap *bitmap = mddev->bitmap; | 1281 | struct bitmap *bitmap = mddev->bitmap; |
| 1172 | unsigned long flags; | 1282 | unsigned long flags; |
| @@ -1176,7 +1286,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1176 | int first_clone; | 1286 | int first_clone; |
| 1177 | int sectors_handled; | 1287 | int sectors_handled; |
| 1178 | int max_sectors; | 1288 | int max_sectors; |
| 1179 | sector_t start_next_window; | ||
| 1180 | 1289 | ||
| 1181 | /* | 1290 | /* |
| 1182 | * Register the new request and wait if the reconstruction | 1291 | * Register the new request and wait if the reconstruction |
| @@ -1212,7 +1321,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1212 | } | 1321 | } |
| 1213 | finish_wait(&conf->wait_barrier, &w); | 1322 | finish_wait(&conf->wait_barrier, &w); |
| 1214 | } | 1323 | } |
| 1215 | start_next_window = wait_barrier(conf, bio); | 1324 | wait_barrier(conf, bio->bi_iter.bi_sector); |
| 1325 | |||
| 1326 | r1_bio = alloc_r1bio(mddev, bio, 0); | ||
| 1327 | |||
| 1328 | /* We might need to issue multiple writes to different | ||
| 1329 | * devices if there are bad blocks around, so we keep | ||
| 1330 | * track of the number of writes in bio->bi_phys_segments. | ||
| 1331 | * If this is 0, there is only one r1_bio and no locking | ||
| 1332 | * will be needed when requests complete. If it is | ||
| 1333 | * non-zero, then it is the number of not-completed requests. | ||
| 1334 | */ | ||
| 1335 | bio->bi_phys_segments = 0; | ||
| 1336 | bio_clear_flag(bio, BIO_SEG_VALID); | ||
| 1216 | 1337 | ||
| 1217 | if (conf->pending_count >= max_queued_requests) { | 1338 | if (conf->pending_count >= max_queued_requests) { |
| 1218 | md_wakeup_thread(mddev->thread); | 1339 | md_wakeup_thread(mddev->thread); |
| @@ -1233,7 +1354,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1233 | 1354 | ||
| 1234 | disks = conf->raid_disks * 2; | 1355 | disks = conf->raid_disks * 2; |
| 1235 | retry_write: | 1356 | retry_write: |
| 1236 | r1_bio->start_next_window = start_next_window; | ||
| 1237 | blocked_rdev = NULL; | 1357 | blocked_rdev = NULL; |
| 1238 | rcu_read_lock(); | 1358 | rcu_read_lock(); |
| 1239 | max_sectors = r1_bio->sectors; | 1359 | max_sectors = r1_bio->sectors; |
| @@ -1300,25 +1420,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1300 | if (unlikely(blocked_rdev)) { | 1420 | if (unlikely(blocked_rdev)) { |
| 1301 | /* Wait for this device to become unblocked */ | 1421 | /* Wait for this device to become unblocked */ |
| 1302 | int j; | 1422 | int j; |
| 1303 | sector_t old = start_next_window; | ||
| 1304 | 1423 | ||
| 1305 | for (j = 0; j < i; j++) | 1424 | for (j = 0; j < i; j++) |
| 1306 | if (r1_bio->bios[j]) | 1425 | if (r1_bio->bios[j]) |
| 1307 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); | 1426 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); |
| 1308 | r1_bio->state = 0; | 1427 | r1_bio->state = 0; |
| 1309 | allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); | 1428 | allow_barrier(conf, bio->bi_iter.bi_sector); |
| 1310 | raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); | 1429 | raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); |
| 1311 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1430 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
| 1312 | start_next_window = wait_barrier(conf, bio); | 1431 | wait_barrier(conf, bio->bi_iter.bi_sector); |
| 1313 | /* | ||
| 1314 | * We must make sure the multi r1bios of bio have | ||
| 1315 | * the same value of bi_phys_segments | ||
| 1316 | */ | ||
| 1317 | if (bio->bi_phys_segments && old && | ||
| 1318 | old != start_next_window) | ||
| 1319 | /* Wait for the former r1bio(s) to complete */ | ||
| 1320 | wait_event(conf->wait_barrier, | ||
| 1321 | bio->bi_phys_segments == 1); | ||
| 1322 | goto retry_write; | 1432 | goto retry_write; |
| 1323 | } | 1433 | } |
| 1324 | 1434 | ||
| @@ -1341,13 +1451,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1341 | 1451 | ||
| 1342 | first_clone = 1; | 1452 | first_clone = 1; |
| 1343 | for (i = 0; i < disks; i++) { | 1453 | for (i = 0; i < disks; i++) { |
| 1344 | struct bio *mbio; | 1454 | struct bio *mbio = NULL; |
| 1455 | sector_t offset; | ||
| 1345 | if (!r1_bio->bios[i]) | 1456 | if (!r1_bio->bios[i]) |
| 1346 | continue; | 1457 | continue; |
| 1347 | 1458 | ||
| 1348 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1459 | offset = r1_bio->sector - bio->bi_iter.bi_sector; |
| 1349 | bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, | ||
| 1350 | max_sectors); | ||
| 1351 | 1460 | ||
| 1352 | if (first_clone) { | 1461 | if (first_clone) { |
| 1353 | /* do behind I/O ? | 1462 | /* do behind I/O ? |
| @@ -1357,8 +1466,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1357 | if (bitmap && | 1466 | if (bitmap && |
| 1358 | (atomic_read(&bitmap->behind_writes) | 1467 | (atomic_read(&bitmap->behind_writes) |
| 1359 | < mddev->bitmap_info.max_write_behind) && | 1468 | < mddev->bitmap_info.max_write_behind) && |
| 1360 | !waitqueue_active(&bitmap->behind_wait)) | 1469 | !waitqueue_active(&bitmap->behind_wait)) { |
| 1470 | mbio = bio_clone_bioset_partial(bio, GFP_NOIO, | ||
| 1471 | mddev->bio_set, | ||
| 1472 | offset << 9, | ||
| 1473 | max_sectors << 9); | ||
| 1361 | alloc_behind_pages(mbio, r1_bio); | 1474 | alloc_behind_pages(mbio, r1_bio); |
| 1475 | } | ||
| 1362 | 1476 | ||
| 1363 | bitmap_startwrite(bitmap, r1_bio->sector, | 1477 | bitmap_startwrite(bitmap, r1_bio->sector, |
| 1364 | r1_bio->sectors, | 1478 | r1_bio->sectors, |
| @@ -1366,6 +1480,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1366 | &r1_bio->state)); | 1480 | &r1_bio->state)); |
| 1367 | first_clone = 0; | 1481 | first_clone = 0; |
| 1368 | } | 1482 | } |
| 1483 | |||
| 1484 | if (!mbio) { | ||
| 1485 | if (r1_bio->behind_bvecs) | ||
| 1486 | mbio = bio_clone_bioset_partial(bio, GFP_NOIO, | ||
| 1487 | mddev->bio_set, | ||
| 1488 | offset << 9, | ||
| 1489 | max_sectors << 9); | ||
| 1490 | else { | ||
| 1491 | mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); | ||
| 1492 | bio_trim(mbio, offset, max_sectors); | ||
| 1493 | } | ||
| 1494 | } | ||
| 1495 | |||
| 1369 | if (r1_bio->behind_bvecs) { | 1496 | if (r1_bio->behind_bvecs) { |
| 1370 | struct bio_vec *bvec; | 1497 | struct bio_vec *bvec; |
| 1371 | int j; | 1498 | int j; |
| @@ -1385,8 +1512,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1385 | conf->mirrors[i].rdev->data_offset); | 1512 | conf->mirrors[i].rdev->data_offset); |
| 1386 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1513 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
| 1387 | mbio->bi_end_io = raid1_end_write_request; | 1514 | mbio->bi_end_io = raid1_end_write_request; |
| 1388 | mbio->bi_opf = bio_op(bio) | | 1515 | mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); |
| 1389 | (bio->bi_opf & (REQ_SYNC | REQ_PREFLUSH | REQ_FUA)); | ||
| 1390 | if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && | 1516 | if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && |
| 1391 | !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && | 1517 | !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && |
| 1392 | conf->raid_disks - mddev->degraded > 1) | 1518 | conf->raid_disks - mddev->degraded > 1) |
| @@ -1427,12 +1553,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1427 | /* We need another r1_bio. It has already been counted | 1553 | /* We need another r1_bio. It has already been counted |
| 1428 | * in bio->bi_phys_segments | 1554 | * in bio->bi_phys_segments |
| 1429 | */ | 1555 | */ |
| 1430 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | 1556 | r1_bio = alloc_r1bio(mddev, bio, sectors_handled); |
| 1431 | r1_bio->master_bio = bio; | ||
| 1432 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | ||
| 1433 | r1_bio->state = 0; | ||
| 1434 | r1_bio->mddev = mddev; | ||
| 1435 | r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; | ||
| 1436 | goto retry_write; | 1557 | goto retry_write; |
| 1437 | } | 1558 | } |
| 1438 | 1559 | ||
| @@ -1444,36 +1565,30 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1444 | 1565 | ||
| 1445 | static void raid1_make_request(struct mddev *mddev, struct bio *bio) | 1566 | static void raid1_make_request(struct mddev *mddev, struct bio *bio) |
| 1446 | { | 1567 | { |
| 1447 | struct r1conf *conf = mddev->private; | 1568 | struct bio *split; |
| 1448 | struct r1bio *r1_bio; | 1569 | sector_t sectors; |
| 1449 | 1570 | ||
| 1450 | /* | 1571 | if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { |
| 1451 | * make_request() can abort the operation when read-ahead is being | 1572 | md_flush_request(mddev, bio); |
| 1452 | * used and no empty request is available. | 1573 | return; |
| 1453 | * | 1574 | } |
| 1454 | */ | ||
| 1455 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
| 1456 | |||
| 1457 | r1_bio->master_bio = bio; | ||
| 1458 | r1_bio->sectors = bio_sectors(bio); | ||
| 1459 | r1_bio->state = 0; | ||
| 1460 | r1_bio->mddev = mddev; | ||
| 1461 | r1_bio->sector = bio->bi_iter.bi_sector; | ||
| 1462 | 1575 | ||
| 1463 | /* | 1576 | /* if bio exceeds barrier unit boundary, split it */ |
| 1464 | * We might need to issue multiple reads to different devices if there | 1577 | do { |
| 1465 | * are bad blocks around, so we keep track of the number of reads in | 1578 | sectors = align_to_barrier_unit_end( |
| 1466 | * bio->bi_phys_segments. If this is 0, there is only one r1_bio and | 1579 | bio->bi_iter.bi_sector, bio_sectors(bio)); |
| 1467 | * no locking will be needed when requests complete. If it is | 1580 | if (sectors < bio_sectors(bio)) { |
| 1468 | * non-zero, then it is the number of not-completed requests. | 1581 | split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); |
| 1469 | */ | 1582 | bio_chain(split, bio); |
| 1470 | bio->bi_phys_segments = 0; | 1583 | } else { |
| 1471 | bio_clear_flag(bio, BIO_SEG_VALID); | 1584 | split = bio; |
| 1585 | } | ||
| 1472 | 1586 | ||
| 1473 | if (bio_data_dir(bio) == READ) | 1587 | if (bio_data_dir(split) == READ) |
| 1474 | raid1_read_request(mddev, bio, r1_bio); | 1588 | raid1_read_request(mddev, split); |
| 1475 | else | 1589 | else |
| 1476 | raid1_write_request(mddev, bio, r1_bio); | 1590 | raid1_write_request(mddev, split); |
| 1591 | } while (split != bio); | ||
| 1477 | } | 1592 | } |
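The split loop above leans on align_to_barrier_unit_end() (introduced elsewhere in this patch) to cap each submission at the end of the 64MB barrier unit that contains bio->bi_iter.bi_sector. Below is a minimal user-space sketch of that arithmetic, assuming the BARRIER_UNIT_SECTOR_SIZE from raid1.h and writing the kernel's round_up() out by hand; it is illustrative only, not the driver code.

#include <stdint.h>
#include <stdio.h>

#define BARRIER_UNIT_SECTOR_BITS 17
#define BARRIER_UNIT_SECTOR_SIZE (1ULL << BARRIER_UNIT_SECTOR_BITS) /* 64MB in 512B sectors */

typedef uint64_t sector_t;

static sector_t align_to_barrier_unit_end(sector_t start_sector, sector_t sectors)
{
        /* sectors left from start_sector to the end of its barrier unit */
        sector_t len = ((start_sector + BARRIER_UNIT_SECTOR_SIZE) &
                        ~(BARRIER_UNIT_SECTOR_SIZE - 1)) - start_sector;

        return len < sectors ? len : sectors;
}

int main(void)
{
        sector_t sector = BARRIER_UNIT_SECTOR_SIZE - 8; /* 8 sectors before a boundary */
        sector_t remaining = 1024;

        while (remaining) {
                sector_t chunk = align_to_barrier_unit_end(sector, remaining);

                printf("submit %llu sectors at %llu\n",
                       (unsigned long long)chunk, (unsigned long long)sector);
                sector += chunk;
                remaining -= chunk;
        }
        return 0;
}

With a bio starting 8 sectors before a unit boundary, the loop submits 8 sectors and then the remaining 1016, so no single request ever spans two barrier buckets.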
| 1478 | 1593 | ||
| 1479 | static void raid1_status(struct seq_file *seq, struct mddev *mddev) | 1594 | static void raid1_status(struct seq_file *seq, struct mddev *mddev) |
| @@ -1564,19 +1679,11 @@ static void print_conf(struct r1conf *conf) | |||
| 1564 | 1679 | ||
| 1565 | static void close_sync(struct r1conf *conf) | 1680 | static void close_sync(struct r1conf *conf) |
| 1566 | { | 1681 | { |
| 1567 | wait_barrier(conf, NULL); | 1682 | wait_all_barriers(conf); |
| 1568 | allow_barrier(conf, 0, 0); | 1683 | allow_all_barriers(conf); |
| 1569 | 1684 | ||
| 1570 | mempool_destroy(conf->r1buf_pool); | 1685 | mempool_destroy(conf->r1buf_pool); |
| 1571 | conf->r1buf_pool = NULL; | 1686 | conf->r1buf_pool = NULL; |
| 1572 | |||
| 1573 | spin_lock_irq(&conf->resync_lock); | ||
| 1574 | conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE; | ||
| 1575 | conf->start_next_window = MaxSector; | ||
| 1576 | conf->current_window_requests += | ||
| 1577 | conf->next_window_requests; | ||
| 1578 | conf->next_window_requests = 0; | ||
| 1579 | spin_unlock_irq(&conf->resync_lock); | ||
| 1580 | } | 1687 | } |
| 1581 | 1688 | ||
| 1582 | static int raid1_spare_active(struct mddev *mddev) | 1689 | static int raid1_spare_active(struct mddev *mddev) |
| @@ -2273,7 +2380,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) | |||
| 2273 | 2380 | ||
| 2274 | wbio->bi_vcnt = vcnt; | 2381 | wbio->bi_vcnt = vcnt; |
| 2275 | } else { | 2382 | } else { |
| 2276 | wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); | 2383 | wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, |
| 2384 | mddev->bio_set); | ||
| 2277 | } | 2385 | } |
| 2278 | 2386 | ||
| 2279 | bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); | 2387 | bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); |
| @@ -2323,8 +2431,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio | |||
| 2323 | 2431 | ||
| 2324 | static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) | 2432 | static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) |
| 2325 | { | 2433 | { |
| 2326 | int m; | 2434 | int m, idx; |
| 2327 | bool fail = false; | 2435 | bool fail = false; |
| 2436 | |||
| 2328 | for (m = 0; m < conf->raid_disks * 2 ; m++) | 2437 | for (m = 0; m < conf->raid_disks * 2 ; m++) |
| 2329 | if (r1_bio->bios[m] == IO_MADE_GOOD) { | 2438 | if (r1_bio->bios[m] == IO_MADE_GOOD) { |
| 2330 | struct md_rdev *rdev = conf->mirrors[m].rdev; | 2439 | struct md_rdev *rdev = conf->mirrors[m].rdev; |
| @@ -2350,8 +2459,14 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) | |||
| 2350 | if (fail) { | 2459 | if (fail) { |
| 2351 | spin_lock_irq(&conf->device_lock); | 2460 | spin_lock_irq(&conf->device_lock); |
| 2352 | list_add(&r1_bio->retry_list, &conf->bio_end_io_list); | 2461 | list_add(&r1_bio->retry_list, &conf->bio_end_io_list); |
| 2353 | conf->nr_queued++; | 2462 | idx = sector_to_idx(r1_bio->sector); |
| 2463 | atomic_inc(&conf->nr_queued[idx]); | ||
| 2354 | spin_unlock_irq(&conf->device_lock); | 2464 | spin_unlock_irq(&conf->device_lock); |
| 2465 | /* | ||
| 2466 | * In case freeze_array() is waiting for the condition | ||
| 2467 | * get_unqueued_pending() == extra to become true. | ||
| 2468 | */ | ||
| 2469 | wake_up(&conf->wait_barrier); | ||
| 2355 | md_wakeup_thread(conf->mddev->thread); | 2470 | md_wakeup_thread(conf->mddev->thread); |
| 2356 | } else { | 2471 | } else { |
| 2357 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) | 2472 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) |
| @@ -2411,7 +2526,8 @@ read_more: | |||
| 2411 | const unsigned long do_sync | 2526 | const unsigned long do_sync |
| 2412 | = r1_bio->master_bio->bi_opf & REQ_SYNC; | 2527 | = r1_bio->master_bio->bi_opf & REQ_SYNC; |
| 2413 | r1_bio->read_disk = disk; | 2528 | r1_bio->read_disk = disk; |
| 2414 | bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); | 2529 | bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, |
| 2530 | mddev->bio_set); | ||
| 2415 | bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, | 2531 | bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, |
| 2416 | max_sectors); | 2532 | max_sectors); |
| 2417 | r1_bio->bios[r1_bio->read_disk] = bio; | 2533 | r1_bio->bios[r1_bio->read_disk] = bio; |
| @@ -2445,15 +2561,8 @@ read_more: | |||
| 2445 | generic_make_request(bio); | 2561 | generic_make_request(bio); |
| 2446 | bio = NULL; | 2562 | bio = NULL; |
| 2447 | 2563 | ||
| 2448 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | 2564 | r1_bio = alloc_r1bio(mddev, mbio, sectors_handled); |
| 2449 | |||
| 2450 | r1_bio->master_bio = mbio; | ||
| 2451 | r1_bio->sectors = bio_sectors(mbio) - sectors_handled; | ||
| 2452 | r1_bio->state = 0; | ||
| 2453 | set_bit(R1BIO_ReadError, &r1_bio->state); | 2565 | set_bit(R1BIO_ReadError, &r1_bio->state); |
| 2454 | r1_bio->mddev = mddev; | ||
| 2455 | r1_bio->sector = mbio->bi_iter.bi_sector + | ||
| 2456 | sectors_handled; | ||
| 2457 | 2566 | ||
| 2458 | goto read_more; | 2567 | goto read_more; |
| 2459 | } else { | 2568 | } else { |
| @@ -2472,6 +2581,7 @@ static void raid1d(struct md_thread *thread) | |||
| 2472 | struct r1conf *conf = mddev->private; | 2581 | struct r1conf *conf = mddev->private; |
| 2473 | struct list_head *head = &conf->retry_list; | 2582 | struct list_head *head = &conf->retry_list; |
| 2474 | struct blk_plug plug; | 2583 | struct blk_plug plug; |
| 2584 | int idx; | ||
| 2475 | 2585 | ||
| 2476 | md_check_recovery(mddev); | 2586 | md_check_recovery(mddev); |
| 2477 | 2587 | ||
| @@ -2479,17 +2589,15 @@ static void raid1d(struct md_thread *thread) | |||
| 2479 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { | 2589 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
| 2480 | LIST_HEAD(tmp); | 2590 | LIST_HEAD(tmp); |
| 2481 | spin_lock_irqsave(&conf->device_lock, flags); | 2591 | spin_lock_irqsave(&conf->device_lock, flags); |
| 2482 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { | 2592 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) |
| 2483 | while (!list_empty(&conf->bio_end_io_list)) { | 2593 | list_splice_init(&conf->bio_end_io_list, &tmp); |
| 2484 | list_move(conf->bio_end_io_list.prev, &tmp); | ||
| 2485 | conf->nr_queued--; | ||
| 2486 | } | ||
| 2487 | } | ||
| 2488 | spin_unlock_irqrestore(&conf->device_lock, flags); | 2594 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 2489 | while (!list_empty(&tmp)) { | 2595 | while (!list_empty(&tmp)) { |
| 2490 | r1_bio = list_first_entry(&tmp, struct r1bio, | 2596 | r1_bio = list_first_entry(&tmp, struct r1bio, |
| 2491 | retry_list); | 2597 | retry_list); |
| 2492 | list_del(&r1_bio->retry_list); | 2598 | list_del(&r1_bio->retry_list); |
| 2599 | idx = sector_to_idx(r1_bio->sector); | ||
| 2600 | atomic_dec(&conf->nr_queued[idx]); | ||
| 2493 | if (mddev->degraded) | 2601 | if (mddev->degraded) |
| 2494 | set_bit(R1BIO_Degraded, &r1_bio->state); | 2602 | set_bit(R1BIO_Degraded, &r1_bio->state); |
| 2495 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) | 2603 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) |
| @@ -2510,7 +2618,8 @@ static void raid1d(struct md_thread *thread) | |||
| 2510 | } | 2618 | } |
| 2511 | r1_bio = list_entry(head->prev, struct r1bio, retry_list); | 2619 | r1_bio = list_entry(head->prev, struct r1bio, retry_list); |
| 2512 | list_del(head->prev); | 2620 | list_del(head->prev); |
| 2513 | conf->nr_queued--; | 2621 | idx = sector_to_idx(r1_bio->sector); |
| 2622 | atomic_dec(&conf->nr_queued[idx]); | ||
| 2514 | spin_unlock_irqrestore(&conf->device_lock, flags); | 2623 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 2515 | 2624 | ||
| 2516 | mddev = r1_bio->mddev; | 2625 | mddev = r1_bio->mddev; |
| @@ -2549,7 +2658,6 @@ static int init_resync(struct r1conf *conf) | |||
| 2549 | conf->poolinfo); | 2658 | conf->poolinfo); |
| 2550 | if (!conf->r1buf_pool) | 2659 | if (!conf->r1buf_pool) |
| 2551 | return -ENOMEM; | 2660 | return -ENOMEM; |
| 2552 | conf->next_resync = 0; | ||
| 2553 | return 0; | 2661 | return 0; |
| 2554 | } | 2662 | } |
| 2555 | 2663 | ||
| @@ -2578,6 +2686,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2578 | int still_degraded = 0; | 2686 | int still_degraded = 0; |
| 2579 | int good_sectors = RESYNC_SECTORS; | 2687 | int good_sectors = RESYNC_SECTORS; |
| 2580 | int min_bad = 0; /* number of sectors that are bad in all devices */ | 2688 | int min_bad = 0; /* number of sectors that are bad in all devices */ |
| 2689 | int idx = sector_to_idx(sector_nr); | ||
| 2581 | 2690 | ||
| 2582 | if (!conf->r1buf_pool) | 2691 | if (!conf->r1buf_pool) |
| 2583 | if (init_resync(conf)) | 2692 | if (init_resync(conf)) |
| @@ -2627,7 +2736,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2627 | * If there is non-resync activity waiting for a turn, then let it | 2736 | * If there is non-resync activity waiting for a turn, then let it |
| 2628 | * through before starting on this new sync request. | 2737 | * through before starting on this new sync request. |
| 2629 | */ | 2738 | */ |
| 2630 | if (conf->nr_waiting) | 2739 | if (atomic_read(&conf->nr_waiting[idx])) |
| 2631 | schedule_timeout_uninterruptible(1); | 2740 | schedule_timeout_uninterruptible(1); |
| 2632 | 2741 | ||
| 2633 | /* we are incrementing sector_nr below. To be safe, we check against | 2742 | /* we are incrementing sector_nr below. To be safe, we check against |
| @@ -2654,6 +2763,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2654 | r1_bio->sector = sector_nr; | 2763 | r1_bio->sector = sector_nr; |
| 2655 | r1_bio->state = 0; | 2764 | r1_bio->state = 0; |
| 2656 | set_bit(R1BIO_IsSync, &r1_bio->state); | 2765 | set_bit(R1BIO_IsSync, &r1_bio->state); |
| 2766 | /* make sure good_sectors won't go across barrier unit boundary */ | ||
| 2767 | good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors); | ||
| 2657 | 2768 | ||
| 2658 | for (i = 0; i < conf->raid_disks * 2; i++) { | 2769 | for (i = 0; i < conf->raid_disks * 2; i++) { |
| 2659 | struct md_rdev *rdev; | 2770 | struct md_rdev *rdev; |
| @@ -2884,6 +2995,26 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2884 | if (!conf) | 2995 | if (!conf) |
| 2885 | goto abort; | 2996 | goto abort; |
| 2886 | 2997 | ||
| 2998 | conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR, | ||
| 2999 | sizeof(atomic_t), GFP_KERNEL); | ||
| 3000 | if (!conf->nr_pending) | ||
| 3001 | goto abort; | ||
| 3002 | |||
| 3003 | conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR, | ||
| 3004 | sizeof(atomic_t), GFP_KERNEL); | ||
| 3005 | if (!conf->nr_waiting) | ||
| 3006 | goto abort; | ||
| 3007 | |||
| 3008 | conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR, | ||
| 3009 | sizeof(atomic_t), GFP_KERNEL); | ||
| 3010 | if (!conf->nr_queued) | ||
| 3011 | goto abort; | ||
| 3012 | |||
| 3013 | conf->barrier = kcalloc(BARRIER_BUCKETS_NR, | ||
| 3014 | sizeof(atomic_t), GFP_KERNEL); | ||
| 3015 | if (!conf->barrier) | ||
| 3016 | goto abort; | ||
| 3017 | |||
| 2887 | conf->mirrors = kzalloc(sizeof(struct raid1_info) | 3018 | conf->mirrors = kzalloc(sizeof(struct raid1_info) |
| 2888 | * mddev->raid_disks * 2, | 3019 | * mddev->raid_disks * 2, |
| 2889 | GFP_KERNEL); | 3020 | GFP_KERNEL); |
| @@ -2939,9 +3070,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2939 | conf->pending_count = 0; | 3070 | conf->pending_count = 0; |
| 2940 | conf->recovery_disabled = mddev->recovery_disabled - 1; | 3071 | conf->recovery_disabled = mddev->recovery_disabled - 1; |
| 2941 | 3072 | ||
| 2942 | conf->start_next_window = MaxSector; | ||
| 2943 | conf->current_window_requests = conf->next_window_requests = 0; | ||
| 2944 | |||
| 2945 | err = -EIO; | 3073 | err = -EIO; |
| 2946 | for (i = 0; i < conf->raid_disks * 2; i++) { | 3074 | for (i = 0; i < conf->raid_disks * 2; i++) { |
| 2947 | 3075 | ||
| @@ -2984,6 +3112,10 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2984 | kfree(conf->mirrors); | 3112 | kfree(conf->mirrors); |
| 2985 | safe_put_page(conf->tmppage); | 3113 | safe_put_page(conf->tmppage); |
| 2986 | kfree(conf->poolinfo); | 3114 | kfree(conf->poolinfo); |
| 3115 | kfree(conf->nr_pending); | ||
| 3116 | kfree(conf->nr_waiting); | ||
| 3117 | kfree(conf->nr_queued); | ||
| 3118 | kfree(conf->barrier); | ||
| 2987 | kfree(conf); | 3119 | kfree(conf); |
| 2988 | } | 3120 | } |
| 2989 | return ERR_PTR(err); | 3121 | return ERR_PTR(err); |
| @@ -3085,6 +3217,10 @@ static void raid1_free(struct mddev *mddev, void *priv) | |||
| 3085 | kfree(conf->mirrors); | 3217 | kfree(conf->mirrors); |
| 3086 | safe_put_page(conf->tmppage); | 3218 | safe_put_page(conf->tmppage); |
| 3087 | kfree(conf->poolinfo); | 3219 | kfree(conf->poolinfo); |
| 3220 | kfree(conf->nr_pending); | ||
| 3221 | kfree(conf->nr_waiting); | ||
| 3222 | kfree(conf->nr_queued); | ||
| 3223 | kfree(conf->barrier); | ||
| 3088 | kfree(conf); | 3224 | kfree(conf); |
| 3089 | } | 3225 | } |
| 3090 | 3226 | ||
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index c52ef424a24b..dd22a37d0d83 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
| @@ -1,6 +1,30 @@ | |||
| 1 | #ifndef _RAID1_H | 1 | #ifndef _RAID1_H |
| 2 | #define _RAID1_H | 2 | #define _RAID1_H |
| 3 | 3 | ||
| 4 | /* | ||
| 5 | * each barrier unit size is 64MB for now | ||
| 6 | * note: it must be larger than RESYNC_DEPTH | ||
| 7 | */ | ||
| 8 | #define BARRIER_UNIT_SECTOR_BITS 17 | ||
| 9 | #define BARRIER_UNIT_SECTOR_SIZE (1<<17) | ||
| 10 | /* | ||
| 11 | * In struct r1conf, the following members are related to I/O barrier | ||
| 12 | * buckets, | ||
| 13 | * atomic_t *nr_pending; | ||
| 14 | * atomic_t *nr_waiting; | ||
| 15 | * atomic_t *nr_queued; | ||
| 16 | * atomic_t *barrier; | ||
| 17 | * Each of them points to an array of atomic_t variables, each array is | ||
| 18 | * designed to have BARRIER_BUCKETS_NR elements and occupy a single | ||
| 19 | * memory page. The data width of atomic_t variables is 4 bytes, equal | ||
| 20 | * to 1<<(ilog2(sizeof(atomic_t))), BARRIER_BUCKETS_NR_BITS is defined | ||
| 21 | * as (PAGE_SHIFT - ilog2(sizeof(int))) to make sure an array of | ||
| 22 | * atomic_t variables with BARRIER_BUCKETS_NR elements exactly | ||
| 23 | * occupies a single memory page. | ||
| 24 | */ | ||
| 25 | #define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(atomic_t))) | ||
| 26 | #define BARRIER_BUCKETS_NR (1<<BARRIER_BUCKETS_NR_BITS) | ||
| 27 | |||
| 4 | struct raid1_info { | 28 | struct raid1_info { |
| 5 | struct md_rdev *rdev; | 29 | struct md_rdev *rdev; |
| 6 | sector_t head_position; | 30 | sector_t head_position; |
| @@ -35,25 +59,6 @@ struct r1conf { | |||
| 35 | */ | 59 | */ |
| 36 | int raid_disks; | 60 | int raid_disks; |
| 37 | 61 | ||
| 38 | /* During resync, read_balancing is only allowed on the part | ||
| 39 | * of the array that has been resynced. 'next_resync' tells us | ||
| 40 | * where that is. | ||
| 41 | */ | ||
| 42 | sector_t next_resync; | ||
| 43 | |||
| 44 | /* When raid1 starts resync, we divide array into four partitions | ||
| 45 | * |---------|--------------|---------------------|-------------| | ||
| 46 | * next_resync start_next_window end_window | ||
| 47 | * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE | ||
| 48 | * end_window = start_next_window + NEXT_NORMALIO_DISTANCE | ||
| 49 | * current_window_requests means the count of normalIO between | ||
| 50 | * start_next_window and end_window. | ||
| 51 | * next_window_requests means the count of normalIO after end_window. | ||
| 52 | * */ | ||
| 53 | sector_t start_next_window; | ||
| 54 | int current_window_requests; | ||
| 55 | int next_window_requests; | ||
| 56 | |||
| 57 | spinlock_t device_lock; | 62 | spinlock_t device_lock; |
| 58 | 63 | ||
| 59 | /* list of 'struct r1bio' that need to be processed by raid1d, | 64 | /* list of 'struct r1bio' that need to be processed by raid1d, |
| @@ -79,10 +84,10 @@ struct r1conf { | |||
| 79 | */ | 84 | */ |
| 80 | wait_queue_head_t wait_barrier; | 85 | wait_queue_head_t wait_barrier; |
| 81 | spinlock_t resync_lock; | 86 | spinlock_t resync_lock; |
| 82 | int nr_pending; | 87 | atomic_t *nr_pending; |
| 83 | int nr_waiting; | 88 | atomic_t *nr_waiting; |
| 84 | int nr_queued; | 89 | atomic_t *nr_queued; |
| 85 | int barrier; | 90 | atomic_t *barrier; |
| 86 | int array_frozen; | 91 | int array_frozen; |
| 87 | 92 | ||
| 88 | /* Set to 1 if a full sync is needed, (fresh device added). | 93 | /* Set to 1 if a full sync is needed, (fresh device added). |
| @@ -135,7 +140,6 @@ struct r1bio { | |||
| 135 | * in this BehindIO request | 140 | * in this BehindIO request |
| 136 | */ | 141 | */ |
| 137 | sector_t sector; | 142 | sector_t sector; |
| 138 | sector_t start_next_window; | ||
| 139 | int sectors; | 143 | int sectors; |
| 140 | unsigned long state; | 144 | unsigned long state; |
| 141 | struct mddev *mddev; | 145 | struct mddev *mddev; |
| @@ -185,4 +189,10 @@ enum r1bio_state { | |||
| 185 | R1BIO_WriteError, | 189 | R1BIO_WriteError, |
| 186 | R1BIO_FailFast, | 190 | R1BIO_FailFast, |
| 187 | }; | 191 | }; |
| 192 | |||
| 193 | static inline int sector_to_idx(sector_t sector) | ||
| 194 | { | ||
| 195 | return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS, | ||
| 196 | BARRIER_BUCKETS_NR_BITS); | ||
| 197 | } | ||
| 188 | #endif | 198 | #endif |
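To see how the per-bucket counters above are addressed, here is a standalone sketch of sector_to_idx(), assuming 4kB pages (PAGE_SHIFT = 12) and substituting a golden-ratio multiplicative hash for the kernel's hash_long(); the hash constant is an assumption of the sketch, not part of the patch.

#include <stdint.h>
#include <stdio.h>

#define BARRIER_UNIT_SECTOR_BITS 17
#define PAGE_SHIFT 12                                   /* assume 4kB pages */
#define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - 2)        /* ilog2(sizeof(atomic_t)) == 2 */
#define BARRIER_BUCKETS_NR (1 << BARRIER_BUCKETS_NR_BITS)

/* stand-in for the kernel's hash_long(): 64-bit multiplicative hash */
static unsigned int hash_long_stub(uint64_t val, unsigned int bits)
{
        return (unsigned int)((val * 0x61C8864680B583EBULL) >> (64 - bits));
}

static int sector_to_idx(uint64_t sector)
{
        return hash_long_stub(sector >> BARRIER_UNIT_SECTOR_BITS,
                              BARRIER_BUCKETS_NR_BITS);
}

int main(void)
{
        /* two sectors in the same 64MB unit share a bucket */
        printf("%d %d %d (of %d buckets)\n",
               sector_to_idx(0), sector_to_idx(131071),
               sector_to_idx(131072), BARRIER_BUCKETS_NR);
        return 0;
}

Sectors 0 and 131071 lie in the same 64MB unit and share a bucket, while sector 131072 starts the next unit and typically hashes to a different one of the 1024 buckets, which is why I/O and resync in different units no longer contend on a single set of counters.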
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6bc5c2a85160..063c43d83b72 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
| @@ -1132,7 +1132,7 @@ read_again: | |||
| 1132 | } | 1132 | } |
| 1133 | slot = r10_bio->read_slot; | 1133 | slot = r10_bio->read_slot; |
| 1134 | 1134 | ||
| 1135 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1135 | read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); |
| 1136 | bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, | 1136 | bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, |
| 1137 | max_sectors); | 1137 | max_sectors); |
| 1138 | 1138 | ||
| @@ -1406,7 +1406,7 @@ retry_write: | |||
| 1406 | int d = r10_bio->devs[i].devnum; | 1406 | int d = r10_bio->devs[i].devnum; |
| 1407 | if (r10_bio->devs[i].bio) { | 1407 | if (r10_bio->devs[i].bio) { |
| 1408 | struct md_rdev *rdev = conf->mirrors[d].rdev; | 1408 | struct md_rdev *rdev = conf->mirrors[d].rdev; |
| 1409 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1409 | mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); |
| 1410 | bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, | 1410 | bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, |
| 1411 | max_sectors); | 1411 | max_sectors); |
| 1412 | r10_bio->devs[i].bio = mbio; | 1412 | r10_bio->devs[i].bio = mbio; |
| @@ -1457,7 +1457,7 @@ retry_write: | |||
| 1457 | smp_mb(); | 1457 | smp_mb(); |
| 1458 | rdev = conf->mirrors[d].rdev; | 1458 | rdev = conf->mirrors[d].rdev; |
| 1459 | } | 1459 | } |
| 1460 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1460 | mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); |
| 1461 | bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, | 1461 | bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, |
| 1462 | max_sectors); | 1462 | max_sectors); |
| 1463 | r10_bio->devs[i].repl_bio = mbio; | 1463 | r10_bio->devs[i].repl_bio = mbio; |
| @@ -2565,7 +2565,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) | |||
| 2565 | if (sectors > sect_to_write) | 2565 | if (sectors > sect_to_write) |
| 2566 | sectors = sect_to_write; | 2566 | sectors = sect_to_write; |
| 2567 | /* Write at 'sector' for 'sectors' */ | 2567 | /* Write at 'sector' for 'sectors' */ |
| 2568 | wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 2568 | wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); |
| 2569 | bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); | 2569 | bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); |
| 2570 | wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); | 2570 | wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); |
| 2571 | wbio->bi_iter.bi_sector = wsector + | 2571 | wbio->bi_iter.bi_sector = wsector + |
| @@ -2641,8 +2641,7 @@ read_more: | |||
| 2641 | mdname(mddev), | 2641 | mdname(mddev), |
| 2642 | bdevname(rdev->bdev, b), | 2642 | bdevname(rdev->bdev, b), |
| 2643 | (unsigned long long)r10_bio->sector); | 2643 | (unsigned long long)r10_bio->sector); |
| 2644 | bio = bio_clone_mddev(r10_bio->master_bio, | 2644 | bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set); |
| 2645 | GFP_NOIO, mddev); | ||
| 2646 | bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); | 2645 | bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); |
| 2647 | r10_bio->devs[slot].bio = bio; | 2646 | r10_bio->devs[slot].bio = bio; |
| 2648 | r10_bio->devs[slot].rdev = rdev; | 2647 | r10_bio->devs[slot].rdev = rdev; |
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 302dea3296ba..3f307be01b10 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/crc32c.h> | 20 | #include <linux/crc32c.h> |
| 21 | #include <linux/random.h> | 21 | #include <linux/random.h> |
| 22 | #include <linux/kthread.h> | 22 | #include <linux/kthread.h> |
| 23 | #include <linux/types.h> | ||
| 23 | #include "md.h" | 24 | #include "md.h" |
| 24 | #include "raid5.h" | 25 | #include "raid5.h" |
| 25 | #include "bitmap.h" | 26 | #include "bitmap.h" |
| @@ -164,9 +165,60 @@ struct r5l_log { | |||
| 164 | struct work_struct deferred_io_work; | 165 | struct work_struct deferred_io_work; |
| 165 | /* to disable write back during in degraded mode */ | 166 | /* to disable write back during in degraded mode */ |
| 166 | struct work_struct disable_writeback_work; | 167 | struct work_struct disable_writeback_work; |
| 168 | |||
| 169 | /* for chunk_aligned_read in writeback mode, details below */ | ||
| 170 | spinlock_t tree_lock; | ||
| 171 | struct radix_tree_root big_stripe_tree; | ||
| 167 | }; | 172 | }; |
| 168 | 173 | ||
| 169 | /* | 174 | /* |
| 175 | * Enable chunk_aligned_read() with write back cache. | ||
| 176 | * | ||
| 177 | * Each chunk may contain more than one stripe (for example, a 256kB | ||
| 178 | * chunk contains 64 4kB pages, so this chunk contains 64 stripes). For | ||
| 179 | * chunk_aligned_read, these stripes are grouped into one "big_stripe". | ||
| 180 | * For each big_stripe, we count how many stripes of this big_stripe | ||
| 181 | * are in the write back cache. These data are tracked in a radix tree | ||
| 182 | * (big_stripe_tree). We use radix_tree item pointer as the counter. | ||
| 183 | * r5c_tree_index() is used to calculate keys for the radix tree. | ||
| 184 | * | ||
| 185 | * chunk_aligned_read() calls r5c_big_stripe_cached() to look up | ||
| 186 | * big_stripe of each chunk in the tree. If this big_stripe is in the | ||
| 187 | * tree, chunk_aligned_read() aborts. This look up is protected by | ||
| 188 | * rcu_read_lock(). | ||
| 189 | * | ||
| 190 | * It is necessary to remember whether a stripe is counted in | ||
| 191 | * big_stripe_tree. Instead of adding a new flag, we reuse existing flags: | ||
| 192 | * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these | ||
| 193 | * two flags is set, the stripe is counted in big_stripe_tree. This | ||
| 194 | * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to | ||
| 195 | * r5c_try_caching_write(); and moving clear_bit of | ||
| 196 | * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to | ||
| 197 | * r5c_finish_stripe_write_out(). | ||
| 198 | */ | ||
| 199 | |||
| 200 | /* | ||
| 201 | * the radix tree requires the lowest 2 bits of the data pointer to be zero. | ||
| 202 | * So it is necessary to left shift the counter by 2 bits before using it | ||
| 203 | * as data pointer of the tree. | ||
| 204 | */ | ||
| 205 | #define R5C_RADIX_COUNT_SHIFT 2 | ||
| 206 | |||
| 207 | /* | ||
| 208 | * calculate key for big_stripe_tree | ||
| 209 | * | ||
| 210 | * sect: align_bi->bi_iter.bi_sector or sh->sector | ||
| 211 | */ | ||
| 212 | static inline sector_t r5c_tree_index(struct r5conf *conf, | ||
| 213 | sector_t sect) | ||
| 214 | { | ||
| 215 | sector_t offset; | ||
| 216 | |||
| 217 | offset = sector_div(sect, conf->chunk_sectors); | ||
| 218 | return sect; | ||
| 219 | } | ||
| 220 | |||
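The key and counter arithmetic used by big_stripe_tree can be pictured with a small user-space sketch; sector_div() is replaced by plain division and the helper signatures are simplified, so this is illustrative only.

#include <stdint.h>
#include <stdio.h>

#define R5C_RADIX_COUNT_SHIFT 2

/* key for big_stripe_tree: the chunk ("big stripe") number holding 'sect' */
static uint64_t tree_index(uint64_t sect, uint64_t chunk_sectors)
{
        return sect / chunk_sectors;            /* sector_div() in the driver */
}

/* the counter lives in the slot pointer itself; shifting left by 2 keeps
 * the low bits zero, as the radix tree requires for item pointers */
static void *encode_count(uintptr_t count)
{
        return (void *)(count << R5C_RADIX_COUNT_SHIFT);
}

static uintptr_t decode_count(void *slot)
{
        return (uintptr_t)slot >> R5C_RADIX_COUNT_SHIFT;
}

int main(void)
{
        uint64_t chunk_sectors = 512;           /* 256kB chunk in 512B sectors */
        void *slot = encode_count(1);           /* first cached stripe of the chunk */

        slot = encode_count(decode_count(slot) + 1);    /* second stripe cached */
        printf("key=%llu count=%lu\n",
               (unsigned long long)tree_index(1000000, chunk_sectors),
               (unsigned long)decode_count(slot));
        return 0;
}

Keeping the count shifted left by R5C_RADIX_COUNT_SHIFT leaves the two low bits of the slot value zero, so the stored value still looks like an ordinary item pointer to the radix tree.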
| 221 | /* | ||
| 170 | * an IO range starts from a meta data block and ends at the next meta data | 222 | * an IO range starts from a meta data block and ends at the next meta data |
| 171 | * block. The io unit's meta data block tracks the data/parity that follows it. An io | 223 | * block. The io unit's meta data block tracks the data/parity that follows it. An io |
| 172 | * unit is written to log disk with normal write, as we always flush log disk | 224 | * unit is written to log disk with normal write, as we always flush log disk |
| @@ -337,17 +389,30 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) | |||
| 337 | /* | 389 | /* |
| 338 | * Total log space (in sectors) needed to flush all data in cache | 390 | * Total log space (in sectors) needed to flush all data in cache |
| 339 | * | 391 | * |
| 340 | * Currently, writing-out phase automatically includes all pending writes | 392 | * To avoid deadlock due to log space, it is necessary to reserve log |
| 341 | * to the same sector. So the reclaim of each stripe takes up to | 393 | * space to flush critical stripes (stripes that occupying log space near |
| 342 | * (conf->raid_disks + 1) pages of log space. | 394 | * last_checkpoint). This function helps check how much log space is |
| 395 | * required to flush all cached stripes. | ||
| 343 | * | 396 | * |
| 344 | * To totally avoid deadlock due to log space, the code reserves | 397 | * To reduce log space requirements, two mechanisms are used to give cache |
| 345 | * (conf->raid_disks + 1) pages for each stripe in cache, which is not | 398 | * flush higher priorities: |
| 346 | * necessary in most cases. | 399 | * 1. In handle_stripe_dirtying() and schedule_reconstruction(), |
| 400 | * stripes ALREADY in journal can be flushed w/o pending writes; | ||
| 401 | * 2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal | ||
| 402 | * can be delayed (r5l_add_no_space_stripe). | ||
| 347 | * | 403 | * |
| 348 | * To improve this, we will need writing-out phase to be able to NOT include | 404 | * In cache flush, the stripe goes through 1 and then 2. For a stripe that |
| 349 | * pending writes, which will reduce the requirement to | 405 | * already passed 1, flushing it requires at most (conf->max_degraded + 1) |
| 350 | * (conf->max_degraded + 1) pages per stripe in cache. | 406 | * pages of journal space. For stripes that has not passed 1, flushing it |
| 407 | * requires (conf->raid_disks + 1) pages of journal space. There are at | ||
| 408 | * most (conf->group_cnt + 1) stripe that passed 1. So total journal space | ||
| 409 | * required to flush all cached stripes (in pages) is: | ||
| 410 | * | ||
| 411 | * (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) + | ||
| 412 | * (group_cnt + 1) * (raid_disks + 1) | ||
| 413 | * or | ||
| 414 | * (stripe_in_journal_count) * (max_degraded + 1) + | ||
| 415 | * (group_cnt + 1) * (raid_disks - max_degraded) | ||
| 351 | */ | 416 | */ |
| 352 | static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) | 417 | static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) |
| 353 | { | 418 | { |
| @@ -356,8 +421,9 @@ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) | |||
| 356 | if (!r5c_is_writeback(log)) | 421 | if (!r5c_is_writeback(log)) |
| 357 | return 0; | 422 | return 0; |
| 358 | 423 | ||
| 359 | return BLOCK_SECTORS * (conf->raid_disks + 1) * | 424 | return BLOCK_SECTORS * |
| 360 | atomic_read(&log->stripe_in_journal_count); | 425 | ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) + |
| 426 | (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1)); | ||
| 361 | } | 427 | } |
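As a worked example of the bound computed above (a sketch with made-up inputs, not driver code), the saving over the old (raid_disks + 1)-pages-per-stripe estimate is roughly a factor of two for a small RAID6 array.

#include <stdio.h>

#define BLOCK_SECTORS 8         /* one 4kB page in 512-byte sectors */

/* journal sectors needed to flush everything, per the formula above:
 * (max_degraded + 1) pages per stripe already counted in the journal, plus
 * (raid_disks - max_degraded) pages for each of the (group_cnt + 1) stripes
 * whose pending writes may still be included in the flush */
static long required_to_flush(int raid_disks, int max_degraded,
                              int group_cnt, int stripes_in_journal)
{
        return (long)BLOCK_SECTORS *
               ((max_degraded + 1) * stripes_in_journal +
                (raid_disks - max_degraded) * (group_cnt + 1));
}

int main(void)
{
        /* e.g. a 6-disk RAID6 (max_degraded = 2), one worker group,
         * 256 stripes currently in the journal: 6208 sectors (~3MB),
         * versus ~7MB under the old (raid_disks + 1)-per-stripe bound */
        printf("%ld sectors\n", required_to_flush(6, 2, 1, 256));
        return 0;
}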
| 362 | 428 | ||
| 363 | /* | 429 | /* |
| @@ -412,16 +478,6 @@ void r5c_make_stripe_write_out(struct stripe_head *sh) | |||
| 412 | 478 | ||
| 413 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 479 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
| 414 | atomic_inc(&conf->preread_active_stripes); | 480 | atomic_inc(&conf->preread_active_stripes); |
| 415 | |||
| 416 | if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { | ||
| 417 | BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); | ||
| 418 | atomic_dec(&conf->r5c_cached_partial_stripes); | ||
| 419 | } | ||
| 420 | |||
| 421 | if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { | ||
| 422 | BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); | ||
| 423 | atomic_dec(&conf->r5c_cached_full_stripes); | ||
| 424 | } | ||
| 425 | } | 481 | } |
| 426 | 482 | ||
| 427 | static void r5c_handle_data_cached(struct stripe_head *sh) | 483 | static void r5c_handle_data_cached(struct stripe_head *sh) |
| @@ -1271,6 +1327,10 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) | |||
| 1271 | atomic_inc(&conf->active_stripes); | 1327 | atomic_inc(&conf->active_stripes); |
| 1272 | r5c_make_stripe_write_out(sh); | 1328 | r5c_make_stripe_write_out(sh); |
| 1273 | 1329 | ||
| 1330 | if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) | ||
| 1331 | atomic_inc(&conf->r5c_flushing_partial_stripes); | ||
| 1332 | else | ||
| 1333 | atomic_inc(&conf->r5c_flushing_full_stripes); | ||
| 1274 | raid5_release_stripe(sh); | 1334 | raid5_release_stripe(sh); |
| 1275 | } | 1335 | } |
| 1276 | 1336 | ||
| @@ -1313,12 +1373,16 @@ static void r5c_do_reclaim(struct r5conf *conf) | |||
| 1313 | unsigned long flags; | 1373 | unsigned long flags; |
| 1314 | int total_cached; | 1374 | int total_cached; |
| 1315 | int stripes_to_flush; | 1375 | int stripes_to_flush; |
| 1376 | int flushing_partial, flushing_full; | ||
| 1316 | 1377 | ||
| 1317 | if (!r5c_is_writeback(log)) | 1378 | if (!r5c_is_writeback(log)) |
| 1318 | return; | 1379 | return; |
| 1319 | 1380 | ||
| 1381 | flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes); | ||
| 1382 | flushing_full = atomic_read(&conf->r5c_flushing_full_stripes); | ||
| 1320 | total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + | 1383 | total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + |
| 1321 | atomic_read(&conf->r5c_cached_full_stripes); | 1384 | atomic_read(&conf->r5c_cached_full_stripes) - |
| 1385 | flushing_full - flushing_partial; | ||
| 1322 | 1386 | ||
| 1323 | if (total_cached > conf->min_nr_stripes * 3 / 4 || | 1387 | if (total_cached > conf->min_nr_stripes * 3 / 4 || |
| 1324 | atomic_read(&conf->empty_inactive_list_nr) > 0) | 1388 | atomic_read(&conf->empty_inactive_list_nr) > 0) |
| @@ -1328,7 +1392,7 @@ static void r5c_do_reclaim(struct r5conf *conf) | |||
| 1328 | */ | 1392 | */ |
| 1329 | stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; | 1393 | stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; |
| 1330 | else if (total_cached > conf->min_nr_stripes * 1 / 2 || | 1394 | else if (total_cached > conf->min_nr_stripes * 1 / 2 || |
| 1331 | atomic_read(&conf->r5c_cached_full_stripes) > | 1395 | atomic_read(&conf->r5c_cached_full_stripes) - flushing_full > |
| 1332 | R5C_FULL_STRIPE_FLUSH_BATCH) | 1396 | R5C_FULL_STRIPE_FLUSH_BATCH) |
| 1333 | /* | 1397 | /* |
| 1334 | * if stripe cache pressure is moderate, or if there are many full | 1398 | * if stripe cache pressure is moderate, or if there are many full |
| @@ -1362,9 +1426,9 @@ static void r5c_do_reclaim(struct r5conf *conf) | |||
| 1362 | !test_bit(STRIPE_HANDLE, &sh->state) && | 1426 | !test_bit(STRIPE_HANDLE, &sh->state) && |
| 1363 | atomic_read(&sh->count) == 0) { | 1427 | atomic_read(&sh->count) == 0) { |
| 1364 | r5c_flush_stripe(conf, sh); | 1428 | r5c_flush_stripe(conf, sh); |
| 1429 | if (count++ >= R5C_RECLAIM_STRIPE_GROUP) | ||
| 1430 | break; | ||
| 1365 | } | 1431 | } |
| 1366 | if (count++ >= R5C_RECLAIM_STRIPE_GROUP) | ||
| 1367 | break; | ||
| 1368 | } | 1432 | } |
| 1369 | spin_unlock(&conf->device_lock); | 1433 | spin_unlock(&conf->device_lock); |
| 1370 | spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); | 1434 | spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); |
| @@ -2320,6 +2384,10 @@ int r5c_try_caching_write(struct r5conf *conf, | |||
| 2320 | int i; | 2384 | int i; |
| 2321 | struct r5dev *dev; | 2385 | struct r5dev *dev; |
| 2322 | int to_cache = 0; | 2386 | int to_cache = 0; |
| 2387 | void **pslot; | ||
| 2388 | sector_t tree_index; | ||
| 2389 | int ret; | ||
| 2390 | uintptr_t refcount; | ||
| 2323 | 2391 | ||
| 2324 | BUG_ON(!r5c_is_writeback(log)); | 2392 | BUG_ON(!r5c_is_writeback(log)); |
| 2325 | 2393 | ||
| @@ -2364,6 +2432,44 @@ int r5c_try_caching_write(struct r5conf *conf, | |||
| 2364 | } | 2432 | } |
| 2365 | } | 2433 | } |
| 2366 | 2434 | ||
| 2435 | /* if the stripe is not counted in big_stripe_tree, add it now */ | ||
| 2436 | if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && | ||
| 2437 | !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { | ||
| 2438 | tree_index = r5c_tree_index(conf, sh->sector); | ||
| 2439 | spin_lock(&log->tree_lock); | ||
| 2440 | pslot = radix_tree_lookup_slot(&log->big_stripe_tree, | ||
| 2441 | tree_index); | ||
| 2442 | if (pslot) { | ||
| 2443 | refcount = (uintptr_t)radix_tree_deref_slot_protected( | ||
| 2444 | pslot, &log->tree_lock) >> | ||
| 2445 | R5C_RADIX_COUNT_SHIFT; | ||
| 2446 | radix_tree_replace_slot( | ||
| 2447 | &log->big_stripe_tree, pslot, | ||
| 2448 | (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT)); | ||
| 2449 | } else { | ||
| 2450 | /* | ||
| 2451 | * this radix_tree_insert can fail safely, so no | ||
| 2452 | * need to call radix_tree_preload() | ||
| 2453 | */ | ||
| 2454 | ret = radix_tree_insert( | ||
| 2455 | &log->big_stripe_tree, tree_index, | ||
| 2456 | (void *)(1 << R5C_RADIX_COUNT_SHIFT)); | ||
| 2457 | if (ret) { | ||
| 2458 | spin_unlock(&log->tree_lock); | ||
| 2459 | r5c_make_stripe_write_out(sh); | ||
| 2460 | return -EAGAIN; | ||
| 2461 | } | ||
| 2462 | } | ||
| 2463 | spin_unlock(&log->tree_lock); | ||
| 2464 | |||
| 2465 | /* | ||
| 2466 | * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is | ||
| 2467 | * counted in the radix tree | ||
| 2468 | */ | ||
| 2469 | set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state); | ||
| 2470 | atomic_inc(&conf->r5c_cached_partial_stripes); | ||
| 2471 | } | ||
| 2472 | |||
| 2367 | for (i = disks; i--; ) { | 2473 | for (i = disks; i--; ) { |
| 2368 | dev = &sh->dev[i]; | 2474 | dev = &sh->dev[i]; |
| 2369 | if (dev->towrite) { | 2475 | if (dev->towrite) { |
| @@ -2438,17 +2544,20 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, | |||
| 2438 | struct stripe_head *sh, | 2544 | struct stripe_head *sh, |
| 2439 | struct stripe_head_state *s) | 2545 | struct stripe_head_state *s) |
| 2440 | { | 2546 | { |
| 2547 | struct r5l_log *log = conf->log; | ||
| 2441 | int i; | 2548 | int i; |
| 2442 | int do_wakeup = 0; | 2549 | int do_wakeup = 0; |
| 2550 | sector_t tree_index; | ||
| 2551 | void **pslot; | ||
| 2552 | uintptr_t refcount; | ||
| 2443 | 2553 | ||
| 2444 | if (!conf->log || | 2554 | if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) |
| 2445 | !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) | ||
| 2446 | return; | 2555 | return; |
| 2447 | 2556 | ||
| 2448 | WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); | 2557 | WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); |
| 2449 | clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); | 2558 | clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); |
| 2450 | 2559 | ||
| 2451 | if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) | 2560 | if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) |
| 2452 | return; | 2561 | return; |
| 2453 | 2562 | ||
| 2454 | for (i = sh->disks; i--; ) { | 2563 | for (i = sh->disks; i--; ) { |
| @@ -2470,12 +2579,45 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, | |||
| 2470 | if (do_wakeup) | 2579 | if (do_wakeup) |
| 2471 | wake_up(&conf->wait_for_overlap); | 2580 | wake_up(&conf->wait_for_overlap); |
| 2472 | 2581 | ||
| 2473 | spin_lock_irq(&conf->log->stripe_in_journal_lock); | 2582 | spin_lock_irq(&log->stripe_in_journal_lock); |
| 2474 | list_del_init(&sh->r5c); | 2583 | list_del_init(&sh->r5c); |
| 2475 | spin_unlock_irq(&conf->log->stripe_in_journal_lock); | 2584 | spin_unlock_irq(&log->stripe_in_journal_lock); |
| 2476 | sh->log_start = MaxSector; | 2585 | sh->log_start = MaxSector; |
| 2477 | atomic_dec(&conf->log->stripe_in_journal_count); | 2586 | |
| 2478 | r5c_update_log_state(conf->log); | 2587 | atomic_dec(&log->stripe_in_journal_count); |
| 2588 | r5c_update_log_state(log); | ||
| 2589 | |||
| 2590 | /* stop counting this stripe in big_stripe_tree */ | ||
| 2591 | if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) || | ||
| 2592 | test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { | ||
| 2593 | tree_index = r5c_tree_index(conf, sh->sector); | ||
| 2594 | spin_lock(&log->tree_lock); | ||
| 2595 | pslot = radix_tree_lookup_slot(&log->big_stripe_tree, | ||
| 2596 | tree_index); | ||
| 2597 | BUG_ON(pslot == NULL); | ||
| 2598 | refcount = (uintptr_t)radix_tree_deref_slot_protected( | ||
| 2599 | pslot, &log->tree_lock) >> | ||
| 2600 | R5C_RADIX_COUNT_SHIFT; | ||
| 2601 | if (refcount == 1) | ||
| 2602 | radix_tree_delete(&log->big_stripe_tree, tree_index); | ||
| 2603 | else | ||
| 2604 | radix_tree_replace_slot( | ||
| 2605 | &log->big_stripe_tree, pslot, | ||
| 2606 | (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT)); | ||
| 2607 | spin_unlock(&log->tree_lock); | ||
| 2608 | } | ||
| 2609 | |||
| 2610 | if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { | ||
| 2611 | BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); | ||
| 2612 | atomic_dec(&conf->r5c_flushing_partial_stripes); | ||
| 2613 | atomic_dec(&conf->r5c_cached_partial_stripes); | ||
| 2614 | } | ||
| 2615 | |||
| 2616 | if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { | ||
| 2617 | BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); | ||
| 2618 | atomic_dec(&conf->r5c_flushing_full_stripes); | ||
| 2619 | atomic_dec(&conf->r5c_cached_full_stripes); | ||
| 2620 | } | ||
| 2479 | } | 2621 | } |
| 2480 | 2622 | ||
| 2481 | int | 2623 | int |
| @@ -2535,6 +2677,22 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, | |||
| 2535 | return 0; | 2677 | return 0; |
| 2536 | } | 2678 | } |
| 2537 | 2679 | ||
| 2680 | /* check whether this big stripe is in write back cache. */ | ||
| 2681 | bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) | ||
| 2682 | { | ||
| 2683 | struct r5l_log *log = conf->log; | ||
| 2684 | sector_t tree_index; | ||
| 2685 | void *slot; | ||
| 2686 | |||
| 2687 | if (!log) | ||
| 2688 | return false; | ||
| 2689 | |||
| 2690 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 2691 | tree_index = r5c_tree_index(conf, sect); | ||
| 2692 | slot = radix_tree_lookup(&log->big_stripe_tree, tree_index); | ||
| 2693 | return slot != NULL; | ||
| 2694 | } | ||
| 2695 | |||
| 2538 | static int r5l_load_log(struct r5l_log *log) | 2696 | static int r5l_load_log(struct r5l_log *log) |
| 2539 | { | 2697 | { |
| 2540 | struct md_rdev *rdev = log->rdev; | 2698 | struct md_rdev *rdev = log->rdev; |
| @@ -2681,6 +2839,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) | |||
| 2681 | if (!log->meta_pool) | 2839 | if (!log->meta_pool) |
| 2682 | goto out_mempool; | 2840 | goto out_mempool; |
| 2683 | 2841 | ||
| 2842 | spin_lock_init(&log->tree_lock); | ||
| 2843 | INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); | ||
| 2844 | |||
| 2684 | log->reclaim_thread = md_register_thread(r5l_reclaim_thread, | 2845 | log->reclaim_thread = md_register_thread(r5l_reclaim_thread, |
| 2685 | log->rdev->mddev, "reclaim"); | 2846 | log->rdev->mddev, "reclaim"); |
| 2686 | if (!log->reclaim_thread) | 2847 | if (!log->reclaim_thread) |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6214e699342c..2ce23b01dbb2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -281,13 +281,13 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
| 281 | atomic_dec(&conf->r5c_cached_partial_stripes); | 281 | atomic_dec(&conf->r5c_cached_partial_stripes); |
| 282 | list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); | 282 | list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); |
| 283 | r5c_check_cached_full_stripe(conf); | 283 | r5c_check_cached_full_stripe(conf); |
| 284 | } else { | 284 | } else |
| 285 | /* partial stripe */ | 285 | /* |
| 286 | if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE, | 286 | * STRIPE_R5C_PARTIAL_STRIPE is set in |
| 287 | &sh->state)) | 287 | * r5c_try_caching_write(). No need to |
| 288 | atomic_inc(&conf->r5c_cached_partial_stripes); | 288 | * set it again. |
| 289 | */ | ||
| 289 | list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); | 290 | list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); |
| 290 | } | ||
| 291 | } | 291 | } |
| 292 | } | 292 | } |
| 293 | } | 293 | } |
| @@ -353,17 +353,15 @@ static void release_inactive_stripe_list(struct r5conf *conf, | |||
| 353 | static int release_stripe_list(struct r5conf *conf, | 353 | static int release_stripe_list(struct r5conf *conf, |
| 354 | struct list_head *temp_inactive_list) | 354 | struct list_head *temp_inactive_list) |
| 355 | { | 355 | { |
| 356 | struct stripe_head *sh; | 356 | struct stripe_head *sh, *t; |
| 357 | int count = 0; | 357 | int count = 0; |
| 358 | struct llist_node *head; | 358 | struct llist_node *head; |
| 359 | 359 | ||
| 360 | head = llist_del_all(&conf->released_stripes); | 360 | head = llist_del_all(&conf->released_stripes); |
| 361 | head = llist_reverse_order(head); | 361 | head = llist_reverse_order(head); |
| 362 | while (head) { | 362 | llist_for_each_entry_safe(sh, t, head, release_list) { |
| 363 | int hash; | 363 | int hash; |
| 364 | 364 | ||
| 365 | sh = llist_entry(head, struct stripe_head, release_list); | ||
| 366 | head = llist_next(head); | ||
| 367 | /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */ | 365 | /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */ |
| 368 | smp_mb(); | 366 | smp_mb(); |
| 369 | clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); | 367 | clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); |
| @@ -863,6 +861,43 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) | |||
| 863 | return 1; | 861 | return 1; |
| 864 | } | 862 | } |
| 865 | 863 | ||
| 864 | static void flush_deferred_bios(struct r5conf *conf) | ||
| 865 | { | ||
| 866 | struct bio_list tmp; | ||
| 867 | struct bio *bio; | ||
| 868 | |||
| 869 | if (!conf->batch_bio_dispatch || !conf->group_cnt) | ||
| 870 | return; | ||
| 871 | |||
| 872 | bio_list_init(&tmp); | ||
| 873 | spin_lock(&conf->pending_bios_lock); | ||
| 874 | bio_list_merge(&tmp, &conf->pending_bios); | ||
| 875 | bio_list_init(&conf->pending_bios); | ||
| 876 | spin_unlock(&conf->pending_bios_lock); | ||
| 877 | |||
| 878 | while ((bio = bio_list_pop(&tmp))) | ||
| 879 | generic_make_request(bio); | ||
| 880 | } | ||
| 881 | |||
| 882 | static void defer_bio_issue(struct r5conf *conf, struct bio *bio) | ||
| 883 | { | ||
| 884 | /* | ||
| 885 | * changing group_cnt will drain all bios, so this is safe | ||
| 886 | * | ||
| 887 | * A read here generally comes from a read-modify-write, which usually | ||
| 888 | * implies a random write, so we don't delay it | ||
| 889 | */ | ||
| 890 | if (!conf->batch_bio_dispatch || !conf->group_cnt || | ||
| 891 | bio_op(bio) == REQ_OP_READ) { | ||
| 892 | generic_make_request(bio); | ||
| 893 | return; | ||
| 894 | } | ||
| 895 | spin_lock(&conf->pending_bios_lock); | ||
| 896 | bio_list_add(&conf->pending_bios, bio); | ||
| 897 | spin_unlock(&conf->pending_bios_lock); | ||
| 898 | md_wakeup_thread(conf->mddev->thread); | ||
| 899 | } | ||
| 900 | |||
| 866 | static void | 901 | static void |
| 867 | raid5_end_read_request(struct bio *bi); | 902 | raid5_end_read_request(struct bio *bi); |
| 868 | static void | 903 | static void |
| @@ -1043,7 +1078,7 @@ again: | |||
| 1043 | trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), | 1078 | trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), |
| 1044 | bi, disk_devt(conf->mddev->gendisk), | 1079 | bi, disk_devt(conf->mddev->gendisk), |
| 1045 | sh->dev[i].sector); | 1080 | sh->dev[i].sector); |
| 1046 | generic_make_request(bi); | 1081 | defer_bio_issue(conf, bi); |
| 1047 | } | 1082 | } |
| 1048 | if (rrdev) { | 1083 | if (rrdev) { |
| 1049 | if (s->syncing || s->expanding || s->expanded | 1084 | if (s->syncing || s->expanding || s->expanded |
| @@ -1088,7 +1123,7 @@ again: | |||
| 1088 | trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), | 1123 | trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), |
| 1089 | rbi, disk_devt(conf->mddev->gendisk), | 1124 | rbi, disk_devt(conf->mddev->gendisk), |
| 1090 | sh->dev[i].sector); | 1125 | sh->dev[i].sector); |
| 1091 | generic_make_request(rbi); | 1126 | defer_bio_issue(conf, rbi); |
| 1092 | } | 1127 | } |
| 1093 | if (!rdev && !rrdev) { | 1128 | if (!rdev && !rrdev) { |
| 1094 | if (op_is_write(op)) | 1129 | if (op_is_write(op)) |
| @@ -2914,12 +2949,36 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
| 2914 | * like to flush data in journal to RAID disks first, so complex rmw | 2949 | * like to flush data in journal to RAID disks first, so complex rmw |
| 2915 | * is handled in the write path (handle_stripe_dirtying). | 2950 | * is handled in the write path (handle_stripe_dirtying). |
| 2916 | * | 2951 | * |
| 2952 | * 2. when journal space is critical (R5C_LOG_CRITICAL=1) | ||
| 2953 | * | ||
| 2954 | * It is important to be able to flush all stripes in raid5-cache. | ||
| 2955 | * Therefore, we need to reserve some space on the journal device for | ||
| 2956 | * these flushes. If the flush operation includes pending writes to the | ||
| 2957 | * stripe, we need to reserve (conf->raid_disks + 1) pages per stripe | ||
| 2958 | * for the flush out. If we exclude these pending writes from the flush | ||
| 2959 | * operation, we only need (conf->max_degraded + 1) pages per stripe. | ||
| 2960 | * Therefore, excluding pending writes in these cases enables more | ||
| 2961 | * efficient use of the journal device. | ||
| 2962 | * | ||
| 2963 | * Note: To make sure the stripe makes progress, we only delay | ||
| 2964 | * towrite for stripes with data already in journal (injournal > 0). | ||
| 2965 | * When LOG_CRITICAL, stripes with injournal == 0 will be sent to | ||
| 2966 | * no_space_stripes list. | ||
| 2967 | * | ||
| 2917 | */ | 2968 | */ |
| 2918 | static inline bool delay_towrite(struct r5dev *dev, | 2969 | static inline bool delay_towrite(struct r5conf *conf, |
| 2919 | struct stripe_head_state *s) | 2970 | struct r5dev *dev, |
| 2971 | struct stripe_head_state *s) | ||
| 2920 | { | 2972 | { |
| 2921 | return !test_bit(R5_OVERWRITE, &dev->flags) && | 2973 | /* case 1 above */ |
| 2922 | !test_bit(R5_Insync, &dev->flags) && s->injournal; | 2974 | if (!test_bit(R5_OVERWRITE, &dev->flags) && |
| 2975 | !test_bit(R5_Insync, &dev->flags) && s->injournal) | ||
| 2976 | return true; | ||
| 2977 | /* case 2 above */ | ||
| 2978 | if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && | ||
| 2979 | s->injournal > 0) | ||
| 2980 | return true; | ||
| 2981 | return false; | ||
| 2923 | } | 2982 | } |
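The two delay cases can be reduced to a boolean sketch; the R5_* flag tests and the R5C_LOG_CRITICAL check become plain booleans here, so this mirrors the shape of delay_towrite() above without being the driver code.

#include <stdbool.h>
#include <stdio.h>

/* the relevant per-device flags, simplified to booleans for illustration */
struct dev_state {
        bool overwrite;         /* R5_OVERWRITE: full-block overwrite */
        bool insync;            /* R5_Insync: device is in sync */
};

static bool delay_towrite(bool log_critical, const struct dev_state *dev,
                          int injournal)
{
        /* case 1: partial write to an out-of-sync block while the stripe
         * already has data in the journal */
        if (!dev->overwrite && !dev->insync && injournal)
                return true;
        /* case 2: journal space is critical, so flush the in-journal data
         * first and keep the new writes out of the flush */
        if (log_critical && injournal > 0)
                return true;
        return false;
}

int main(void)
{
        struct dev_state dev = { .overwrite = true, .insync = true };

        printf("normal=%d critical=%d\n",
               delay_towrite(false, &dev, 2),   /* 0: write rides along */
               delay_towrite(true, &dev, 2));   /* 1: delayed, journal is tight */
        return 0;
}

Under normal log pressure a full overwrite to an in-sync device is never delayed, even with data already in the journal; once the log is critical, any stripe with injournal > 0 is flushed without its new writes, which keeps the reserved journal space close to the (max_degraded + 1) pages-per-stripe bound.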
| 2924 | 2983 | ||
| 2925 | static void | 2984 | static void |
| @@ -2942,7 +3001,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | |||
| 2942 | for (i = disks; i--; ) { | 3001 | for (i = disks; i--; ) { |
| 2943 | struct r5dev *dev = &sh->dev[i]; | 3002 | struct r5dev *dev = &sh->dev[i]; |
| 2944 | 3003 | ||
| 2945 | if (dev->towrite && !delay_towrite(dev, s)) { | 3004 | if (dev->towrite && !delay_towrite(conf, dev, s)) { |
| 2946 | set_bit(R5_LOCKED, &dev->flags); | 3005 | set_bit(R5_LOCKED, &dev->flags); |
| 2947 | set_bit(R5_Wantdrain, &dev->flags); | 3006 | set_bit(R5_Wantdrain, &dev->flags); |
| 2948 | if (!expand) | 3007 | if (!expand) |
| @@ -3694,7 +3753,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, | |||
| 3694 | } else for (i = disks; i--; ) { | 3753 | } else for (i = disks; i--; ) { |
| 3695 | /* would I have to read this buffer for read_modify_write */ | 3754 | /* would I have to read this buffer for read_modify_write */ |
| 3696 | struct r5dev *dev = &sh->dev[i]; | 3755 | struct r5dev *dev = &sh->dev[i]; |
| 3697 | if (((dev->towrite && !delay_towrite(dev, s)) || | 3756 | if (((dev->towrite && !delay_towrite(conf, dev, s)) || |
| 3698 | i == sh->pd_idx || i == sh->qd_idx || | 3757 | i == sh->pd_idx || i == sh->qd_idx || |
| 3699 | test_bit(R5_InJournal, &dev->flags)) && | 3758 | test_bit(R5_InJournal, &dev->flags)) && |
| 3700 | !test_bit(R5_LOCKED, &dev->flags) && | 3759 | !test_bit(R5_LOCKED, &dev->flags) && |
| @@ -3718,8 +3777,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, | |||
| 3718 | } | 3777 | } |
| 3719 | } | 3778 | } |
| 3720 | 3779 | ||
| 3721 | pr_debug("for sector %llu, rmw=%d rcw=%d\n", | 3780 | pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", |
| 3722 | (unsigned long long)sh->sector, rmw, rcw); | 3781 | (unsigned long long)sh->sector, sh->state, rmw, rcw); |
| 3723 | set_bit(STRIPE_HANDLE, &sh->state); | 3782 | set_bit(STRIPE_HANDLE, &sh->state); |
| 3724 | if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { | 3783 | if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { |
| 3725 | /* prefer read-modify-write, but need to get some data */ | 3784 | /* prefer read-modify-write, but need to get some data */ |
| @@ -3759,7 +3818,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, | |||
| 3759 | 3818 | ||
| 3760 | for (i = disks; i--; ) { | 3819 | for (i = disks; i--; ) { |
| 3761 | struct r5dev *dev = &sh->dev[i]; | 3820 | struct r5dev *dev = &sh->dev[i]; |
| 3762 | if (((dev->towrite && !delay_towrite(dev, s)) || | 3821 | if (((dev->towrite && !delay_towrite(conf, dev, s)) || |
| 3763 | i == sh->pd_idx || i == sh->qd_idx || | 3822 | i == sh->pd_idx || i == sh->qd_idx || |
| 3764 | test_bit(R5_InJournal, &dev->flags)) && | 3823 | test_bit(R5_InJournal, &dev->flags)) && |
| 3765 | !test_bit(R5_LOCKED, &dev->flags) && | 3824 | !test_bit(R5_LOCKED, &dev->flags) && |
| @@ -4995,9 +5054,9 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) | |||
| 4995 | return 0; | 5054 | return 0; |
| 4996 | } | 5055 | } |
| 4997 | /* | 5056 | /* |
| 4998 | * use bio_clone_mddev to make a copy of the bio | 5057 | * use bio_clone_fast to make a copy of the bio |
| 4999 | */ | 5058 | */ |
| 5000 | align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); | 5059 | align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set); |
| 5001 | if (!align_bi) | 5060 | if (!align_bi) |
| 5002 | return 0; | 5061 | return 0; |
| 5003 | /* | 5062 | /* |
| @@ -5025,6 +5084,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) | |||
| 5025 | rdev->recovery_offset >= end_sector))) | 5084 | rdev->recovery_offset >= end_sector))) |
| 5026 | rdev = NULL; | 5085 | rdev = NULL; |
| 5027 | } | 5086 | } |
| 5087 | |||
| 5088 | if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { | ||
| 5089 | rcu_read_unlock(); | ||
| 5090 | bio_put(align_bi); | ||
| 5091 | return 0; | ||
| 5092 | } | ||
| 5093 | |||
| 5028 | if (rdev) { | 5094 | if (rdev) { |
| 5029 | sector_t first_bad; | 5095 | sector_t first_bad; |
| 5030 | int bad_sectors; | 5096 | int bad_sectors; |
| @@ -5381,7 +5447,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) | |||
| 5381 | * data on failed drives. | 5447 | * data on failed drives. |
| 5382 | */ | 5448 | */ |
| 5383 | if (rw == READ && mddev->degraded == 0 && | 5449 | if (rw == READ && mddev->degraded == 0 && |
| 5384 | !r5c_is_writeback(conf->log) && | ||
| 5385 | mddev->reshape_position == MaxSector) { | 5450 | mddev->reshape_position == MaxSector) { |
| 5386 | bi = chunk_aligned_read(mddev, bi); | 5451 | bi = chunk_aligned_read(mddev, bi); |
| 5387 | if (!bi) | 5452 | if (!bi) |
| @@ -6126,6 +6191,8 @@ static void raid5d(struct md_thread *thread) | |||
| 6126 | mutex_unlock(&conf->cache_size_mutex); | 6191 | mutex_unlock(&conf->cache_size_mutex); |
| 6127 | } | 6192 | } |
| 6128 | 6193 | ||
| 6194 | flush_deferred_bios(conf); | ||
| 6195 | |||
| 6129 | r5l_flush_stripe_to_raid(conf->log); | 6196 | r5l_flush_stripe_to_raid(conf->log); |
| 6130 | 6197 | ||
| 6131 | async_tx_issue_pending_all(); | 6198 | async_tx_issue_pending_all(); |
| @@ -6711,6 +6778,18 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 6711 | atomic_set(&conf->active_stripes, 0); | 6778 | atomic_set(&conf->active_stripes, 0); |
| 6712 | atomic_set(&conf->preread_active_stripes, 0); | 6779 | atomic_set(&conf->preread_active_stripes, 0); |
| 6713 | atomic_set(&conf->active_aligned_reads, 0); | 6780 | atomic_set(&conf->active_aligned_reads, 0); |
| 6781 | bio_list_init(&conf->pending_bios); | ||
| 6782 | spin_lock_init(&conf->pending_bios_lock); | ||
| 6783 | conf->batch_bio_dispatch = true; | ||
| 6784 | rdev_for_each(rdev, mddev) { | ||
| 6785 | if (test_bit(Journal, &rdev->flags)) | ||
| 6786 | continue; | ||
| 6787 | if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { | ||
| 6788 | conf->batch_bio_dispatch = false; | ||
| 6789 | break; | ||
| 6790 | } | ||
| 6791 | } | ||
| 6792 | |||
| 6714 | conf->bypass_threshold = BYPASS_THRESHOLD; | 6793 | conf->bypass_threshold = BYPASS_THRESHOLD; |
| 6715 | conf->recovery_disabled = mddev->recovery_disabled - 1; | 6794 | conf->recovery_disabled = mddev->recovery_disabled - 1; |
| 6716 | 6795 | ||
| @@ -6757,6 +6836,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 6757 | INIT_LIST_HEAD(&conf->r5c_full_stripe_list); | 6836 | INIT_LIST_HEAD(&conf->r5c_full_stripe_list); |
| 6758 | atomic_set(&conf->r5c_cached_partial_stripes, 0); | 6837 | atomic_set(&conf->r5c_cached_partial_stripes, 0); |
| 6759 | INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); | 6838 | INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); |
| 6839 | atomic_set(&conf->r5c_flushing_full_stripes, 0); | ||
| 6840 | atomic_set(&conf->r5c_flushing_partial_stripes, 0); | ||
| 6760 | 6841 | ||
| 6761 | conf->level = mddev->new_level; | 6842 | conf->level = mddev->new_level; |
| 6762 | conf->chunk_sectors = mddev->new_chunk_sectors; | 6843 | conf->chunk_sectors = mddev->new_chunk_sectors; |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 1440fa26e296..4bb27b97bf6b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
| @@ -663,6 +663,8 @@ struct r5conf { | |||
| 663 | struct list_head r5c_full_stripe_list; | 663 | struct list_head r5c_full_stripe_list; |
| 664 | atomic_t r5c_cached_partial_stripes; | 664 | atomic_t r5c_cached_partial_stripes; |
| 665 | struct list_head r5c_partial_stripe_list; | 665 | struct list_head r5c_partial_stripe_list; |
| 666 | atomic_t r5c_flushing_full_stripes; | ||
| 667 | atomic_t r5c_flushing_partial_stripes; | ||
| 666 | 668 | ||
| 667 | atomic_t empty_inactive_list_nr; | 669 | atomic_t empty_inactive_list_nr; |
| 668 | struct llist_head released_stripes; | 670 | struct llist_head released_stripes; |
| @@ -684,6 +686,10 @@ struct r5conf { | |||
| 684 | int group_cnt; | 686 | int group_cnt; |
| 685 | int worker_cnt_per_group; | 687 | int worker_cnt_per_group; |
| 686 | struct r5l_log *log; | 688 | struct r5l_log *log; |
| 689 | |||
| 690 | struct bio_list pending_bios; | ||
| 691 | spinlock_t pending_bios_lock; | ||
| 692 | bool batch_bio_dispatch; | ||
| 687 | }; | 693 | }; |
| 688 | 694 | ||
| 689 | 695 | ||
| @@ -788,4 +794,5 @@ extern void r5c_check_stripe_cache_usage(struct r5conf *conf); | |||
| 788 | extern void r5c_check_cached_full_stripe(struct r5conf *conf); | 794 | extern void r5c_check_cached_full_stripe(struct r5conf *conf); |
| 789 | extern struct md_sysfs_entry r5c_journal_mode; | 795 | extern struct md_sysfs_entry r5c_journal_mode; |
| 790 | extern void r5c_update_on_rdev_error(struct mddev *mddev); | 796 | extern void r5c_update_on_rdev_error(struct mddev *mddev); |
| 797 | extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); | ||
| 791 | #endif | 798 | #endif |
diff --git a/include/linux/bio.h b/include/linux/bio.h index 7cf8a6c70a3f..8e521194f6fc 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h | |||
| @@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, | |||
| 183 | 183 | ||
| 184 | #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) | 184 | #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) |
| 185 | 185 | ||
| 186 | static inline unsigned bio_segments(struct bio *bio) | 186 | static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec) |
| 187 | { | 187 | { |
| 188 | unsigned segs = 0; | 188 | unsigned segs = 0; |
| 189 | struct bio_vec bv; | 189 | struct bio_vec bv; |
| @@ -205,12 +205,17 @@ static inline unsigned bio_segments(struct bio *bio) | |||
| 205 | break; | 205 | break; |
| 206 | } | 206 | } |
| 207 | 207 | ||
| 208 | bio_for_each_segment(bv, bio, iter) | 208 | __bio_for_each_segment(bv, bio, iter, *bvec) |
| 209 | segs++; | 209 | segs++; |
| 210 | 210 | ||
| 211 | return segs; | 211 | return segs; |
| 212 | } | 212 | } |
| 213 | 213 | ||
| 214 | static inline unsigned bio_segments(struct bio *bio) | ||
| 215 | { | ||
| 216 | return __bio_segments(bio, &bio->bi_iter); | ||
| 217 | } | ||
| 218 | |||
| 214 | /* | 219 | /* |
| 215 | * get a reference to a bio, so it won't disappear. the intended use is | 220 | * get a reference to a bio, so it won't disappear. the intended use is |
| 216 | * something like: | 221 | * something like: |
| @@ -384,6 +389,8 @@ extern void bio_put(struct bio *); | |||
| 384 | extern void __bio_clone_fast(struct bio *, struct bio *); | 389 | extern void __bio_clone_fast(struct bio *, struct bio *); |
| 385 | extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); | 390 | extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); |
| 386 | extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); | 391 | extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); |
| 392 | extern struct bio *bio_clone_bioset_partial(struct bio *, gfp_t, | ||
| 393 | struct bio_set *, int, int); | ||
| 387 | 394 | ||
| 388 | extern struct bio_set *fs_bio_set; | 395 | extern struct bio_set *fs_bio_set; |
| 389 | 396 | ||
diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 84812a9fb16f..72fab4999c00 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c | |||
| @@ -1102,6 +1102,7 @@ void radix_tree_replace_slot(struct radix_tree_root *root, | |||
| 1102 | { | 1102 | { |
| 1103 | replace_slot(root, NULL, slot, item, true); | 1103 | replace_slot(root, NULL, slot, item, true); |
| 1104 | } | 1104 | } |
| 1105 | EXPORT_SYMBOL(radix_tree_replace_slot); | ||
| 1105 | 1106 | ||
| 1106 | /** | 1107 | /** |
| 1107 | * radix_tree_iter_replace - replace item in a slot | 1108 | * radix_tree_iter_replace - replace item in a slot |
