Diffstat (limited to 'drivers/md/raid5.c'):
 drivers/md/raid5.c | 158 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 128 insertions(+), 30 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b162b839a662..968dacaced6d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -63,6 +63,7 @@
 #define STRIPE_SHIFT		(PAGE_SHIFT - 9)
 #define STRIPE_SECTORS		(STRIPE_SIZE>>9)
 #define IO_THRESHOLD		1
+#define BYPASS_THRESHOLD	1
 #define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
 #define HASH_MASK		(NR_HASH - 1)
 
@@ -398,6 +399,7 @@ static void ops_run_io(struct stripe_head *sh)
 
 	might_sleep();
 
+	set_bit(STRIPE_IO_STARTED, &sh->state);
 	for (i = disks; i--; ) {
 		int rw;
 		struct bio *bi;
@@ -433,7 +435,7 @@ static void ops_run_io(struct stripe_head *sh)
 
 		bi->bi_bdev = rdev->bdev;
 		pr_debug("%s: for %llu schedule op %ld on disc %d\n",
-			__FUNCTION__, (unsigned long long)sh->sector,
+			__func__, (unsigned long long)sh->sector,
 			bi->bi_rw, i);
 		atomic_inc(&sh->count);
 		bi->bi_sector = sh->sector + rdev->data_offset;
@@ -520,7 +522,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
 	raid5_conf_t *conf = sh->raid_conf;
 	int i;
 
-	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	/* clear completed biofills */
@@ -569,7 +571,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 	raid5_conf_t *conf = sh->raid_conf;
 	int i;
 
-	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	for (i = sh->disks; i--; ) {
@@ -600,7 +602,7 @@ static void ops_complete_compute5(void *stripe_head_ref)
 	int target = sh->ops.target;
 	struct r5dev *tgt = &sh->dev[target];
 
-	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	set_bit(R5_UPTODATE, &tgt->flags);
@@ -625,7 +627,7 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending)
 	int i;
 
 	pr_debug("%s: stripe %llu block: %d\n",
-		__FUNCTION__, (unsigned long long)sh->sector, target);
+		__func__, (unsigned long long)sh->sector, target);
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 
 	for (i = disks; i--; )
@@ -653,7 +655,7 @@ static void ops_complete_prexor(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
 
-	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
@@ -670,7 +672,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 	/* existing parity data subtracted */
 	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 
-	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	for (i = disks; i--; ) {
@@ -699,7 +701,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 	 */
 	int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
 
-	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	for (i = disks; i--; ) {
@@ -744,7 +746,7 @@ static void ops_complete_postxor(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
 
-	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
@@ -757,7 +759,7 @@ static void ops_complete_write(void *stripe_head_ref)
 	struct stripe_head *sh = stripe_head_ref;
 	int disks = sh->disks, i, pd_idx = sh->pd_idx;
 
-	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	for (i = disks; i--; ) {
@@ -787,7 +789,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 	unsigned long flags;
 	dma_async_tx_callback callback;
 
-	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	/* check if prexor is active which means only process blocks
@@ -837,7 +839,7 @@ static void ops_complete_check(void *stripe_head_ref)
 	struct stripe_head *sh = stripe_head_ref;
 	int pd_idx = sh->pd_idx;
 
-	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
@@ -859,7 +861,7 @@ static void ops_run_check(struct stripe_head *sh)
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 
-	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	for (i = disks; i--; ) {
@@ -1260,8 +1262,8 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 	}
 	set_bit(Faulty, &rdev->flags);
 	printk (KERN_ALERT
-		"raid5: Disk failure on %s, disabling device."
-		" Operation continuing on %d devices\n",
+		"raid5: Disk failure on %s, disabling device.\n"
+		"raid5: Operation continuing on %d devices.\n",
 		bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
 	}
 }
@@ -1720,6 +1722,9 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
 				locked++;
 			}
 		}
+		if (locked + 1 == disks)
+			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
+				atomic_inc(&sh->raid_conf->pending_full_writes);
 	} else {
 		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
 			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
@@ -1759,7 +1764,7 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
 		locked++;
 
 	pr_debug("%s: stripe %llu locked: %d pending: %lx\n",
-		__FUNCTION__, (unsigned long long)sh->sector,
+		__func__, (unsigned long long)sh->sector,
 		locked, sh->ops.pending);
 
 	return locked;
@@ -1947,6 +1952,9 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
 					STRIPE_SECTORS, 0, 0);
 	}
 
+	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+		if (atomic_dec_and_test(&conf->pending_full_writes))
+			md_wakeup_thread(conf->mddev->thread);
 }
 
 /* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
@@ -2149,6 +2157,10 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
 						0);
 			}
 	}
+
+	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+		if (atomic_dec_and_test(&conf->pending_full_writes))
+			md_wakeup_thread(conf->mddev->thread);
 }
 
 static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
@@ -2333,6 +2345,9 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
 			s->locked++;
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
 		}
+		if (s->locked == disks)
+			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
+				atomic_inc(&conf->pending_full_writes);
 		/* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
 		set_bit(STRIPE_INSYNC, &sh->state);
 
@@ -3094,6 +3109,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		else
 			continue;
 
+		set_bit(STRIPE_IO_STARTED, &sh->state);
+
 		bi = &sh->dev[i].req;
 
 		bi->bi_rw = rw;
@@ -3164,7 +3181,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
-			list_add_tail(&sh->lru, &conf->handle_list);
+			list_add_tail(&sh->lru, &conf->hold_list);
 		}
 	} else
 		blk_plug_device(conf->mddev->queue);
@@ -3442,6 +3459,58 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 	}
 }
 
+/* __get_priority_stripe - get the next stripe to process
+ *
+ * Full stripe writes are allowed to pass preread active stripes up until
+ * the bypass_threshold is exceeded. In general the bypass_count
+ * increments when the handle_list is handled before the hold_list; however, it
+ * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
+ * stripe with in flight i/o. The bypass_count will be reset when the
+ * head of the hold_list has changed, i.e. the head was promoted to the
+ * handle_list.
+ */
+static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
+{
+	struct stripe_head *sh;
+
+	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
+		  __func__,
+		  list_empty(&conf->handle_list) ? "empty" : "busy",
+		  list_empty(&conf->hold_list) ? "empty" : "busy",
+		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
+
+	if (!list_empty(&conf->handle_list)) {
+		sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+
+		if (list_empty(&conf->hold_list))
+			conf->bypass_count = 0;
+		else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
+			if (conf->hold_list.next == conf->last_hold)
+				conf->bypass_count++;
+			else {
+				conf->last_hold = conf->hold_list.next;
+				conf->bypass_count -= conf->bypass_threshold;
+				if (conf->bypass_count < 0)
+					conf->bypass_count = 0;
+			}
+		}
+	} else if (!list_empty(&conf->hold_list) &&
+		   ((conf->bypass_threshold &&
+		     conf->bypass_count > conf->bypass_threshold) ||
+		    atomic_read(&conf->pending_full_writes) == 0)) {
+		sh = list_entry(conf->hold_list.next,
+				typeof(*sh), lru);
+		conf->bypass_count -= conf->bypass_threshold;
+		if (conf->bypass_count < 0)
+			conf->bypass_count = 0;
+	} else
+		return NULL;
+
+	list_del_init(&sh->lru);
+	atomic_inc(&sh->count);
+	BUG_ON(atomic_read(&sh->count) != 1);
+	return sh;
+}
 
 static int make_request(struct request_queue *q, struct bio * bi)
 {
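
The scheduling policy added above can be exercised outside the kernel. What follows is a minimal, self-contained user-space sketch (C, not kernel code) of the bypass accounting in __get_priority_stripe(): the list heads, stripe flags, and atomics are stubbed as plain ints, and only the counting policy is modeled.

/*
 * Minimal user-space model of the __get_priority_stripe() bypass
 * accounting. All kernel state (lists, stripe flags, atomics) is
 * stubbed as plain ints; only the counting policy is reproduced.
 */
#include <stdio.h>

struct sched_model {
	int handle_busy;	/* !list_empty(&conf->handle_list) */
	int hold_busy;		/* !list_empty(&conf->hold_list) */
	int io_started;		/* STRIPE_IO_STARTED set on handle head */
	int head_unchanged;	/* hold_list.next == conf->last_hold */
	int bypass_count;
	int bypass_threshold;
	int pending_full_writes;
};

/* Returns 0 when the handle_list head would be taken, 1 when the
 * hold_list head would be promoted, -1 when neither list is usable. */
static int pick(struct sched_model *m)
{
	if (m->handle_busy) {
		if (!m->hold_busy)
			m->bypass_count = 0;
		else if (!m->io_started) {
			if (m->head_unchanged)
				m->bypass_count++;
			else {
				m->head_unchanged = 1; /* last_hold updated */
				m->bypass_count -= m->bypass_threshold;
				if (m->bypass_count < 0)
					m->bypass_count = 0;
			}
		}
		return 0;
	}
	if (m->hold_busy &&
	    ((m->bypass_threshold &&
	      m->bypass_count > m->bypass_threshold) ||
	     m->pending_full_writes == 0)) {
		m->bypass_count -= m->bypass_threshold;
		if (m->bypass_count < 0)
			m->bypass_count = 0;
		return 1;
	}
	return -1;
}

int main(void)
{
	/* One stripe waits on hold_list while handle_list stays busy. */
	struct sched_model m = { 1, 1, 0, 1, 0, 1, 3 };
	int i;

	for (i = 0; i < 3; i++)
		printf("pick=%d bypass_count=%d\n", pick(&m), m.bypass_count);

	/* handle_list drains; the hold head is now allowed through. */
	m.handle_busy = 0;
	printf("pick=%d (hold_list head promoted)\n", pick(&m));
	return 0;
}

Each pass over the busy handle_list grows bypass_count; once it exceeds bypass_threshold (or no full writes are pending), the waiting hold_list head is promoted, which is exactly the fairness bound the comment above describes.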
@@ -3914,7 +3983,6 @@ static void raid5d(mddev_t *mddev)
 	handled = 0;
 	spin_lock_irq(&conf->device_lock);
 	while (1) {
-		struct list_head *first;
 		struct bio *bio;
 
 		if (conf->seq_flush != conf->seq_write) {
@@ -3936,17 +4004,12 @@ static void raid5d(mddev_t *mddev)
 			handled++;
 		}
 
-		if (list_empty(&conf->handle_list)) {
+		sh = __get_priority_stripe(conf);
+
+		if (!sh) {
 			async_tx_issue_pending_all();
 			break;
 		}
-
-		first = conf->handle_list.next;
-		sh = list_entry(first, struct stripe_head, lru);
-
-		list_del_init(first);
-		atomic_inc(&sh->count);
-		BUG_ON(atomic_read(&sh->count)!= 1);
 		spin_unlock_irq(&conf->device_lock);
 
 		handled++;
@@ -3978,15 +4041,13 @@ static ssize_t
 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
-	char *end;
-	int new;
+	unsigned long new;
 	if (len >= PAGE_SIZE)
 		return -EINVAL;
 	if (!conf)
 		return -ENODEV;
 
-	new = simple_strtoul(page, &end, 10);
-	if (!*page || (*end && *end != '\n') )
+	if (strict_strtoul(page, 10, &new))
 		return -EINVAL;
 	if (new <= 16 || new > 32768)
 		return -EINVAL;
@@ -4011,6 +4072,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
 				raid5_store_stripe_cache_size);
 
 static ssize_t
+raid5_show_preread_threshold(mddev_t *mddev, char *page)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	if (conf)
+		return sprintf(page, "%d\n", conf->bypass_threshold);
+	else
+		return 0;
+}
+
+static ssize_t
+raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	unsigned long new;
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+	if (!conf)
+		return -ENODEV;
+
+	if (strict_strtoul(page, 10, &new))
+		return -EINVAL;
+	if (new > conf->max_nr_stripes)
+		return -EINVAL;
+	conf->bypass_threshold = new;
+	return len;
+}
+
+static struct md_sysfs_entry
+raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
+					S_IRUGO | S_IWUSR,
+					raid5_show_preread_threshold,
+					raid5_store_preread_threshold);
+
+static ssize_t
 stripe_cache_active_show(mddev_t *mddev, char *page)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
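
The store handler above bounds the value by max_nr_stripes, i.e. the current stripe_cache_size. As a hypothetical usage sketch only — assuming an array named md0 and the usual /sys/block/<dev>/md/ attribute layout that stripe_cache_size already uses — the knob could be tuned from user space like this:

/*
 * Hypothetical tuning example (user space, not part of the patch).
 * Assumes an array named md0; raid5 attributes such as
 * stripe_cache_size live under /sys/block/<dev>/md/, and
 * preread_bypass_threshold is registered alongside them.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/md0/md/preread_bypass_threshold", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/*
	 * Larger values let more full-stripe writes jump ahead of
	 * stripes waiting on a preread; 0 removes the bound entirely.
	 * The store handler rejects values above stripe_cache_size.
	 */
	fprintf(f, "%d\n", 2);
	return fclose(f) ? 1 : 0;
}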
@@ -4026,6 +4121,7 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
 static struct attribute *raid5_attrs[] = {
 	&raid5_stripecache_size.attr,
 	&raid5_stripecache_active.attr,
+	&raid5_preread_bypass_threshold.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
@@ -4130,12 +4226,14 @@ static int run(mddev_t *mddev)
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
+	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
 	INIT_LIST_HEAD(&conf->inactive_list);
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
 	atomic_set(&conf->active_aligned_reads, 0);
+	conf->bypass_threshold = BYPASS_THRESHOLD;
 
 	pr_debug("raid5: run(%s) called.\n", mdname(mddev));
 