author		Shaohua Li <shli@kernel.org>	2012-07-19 02:01:31 -0400
committer	NeilBrown <neilb@suse.de>	2012-07-19 02:01:31 -0400
commit		b17459c05000fdbe8d10946570a26510f86ec0f6 (patch)
tree		251a9e640d91a9b9ffc804519ea07ed365035636 /drivers/md
parent		7eaf7e8eb31747e4259d60288b44b194fb3d56c7 (diff)
raid5: add a per-stripe lock
Add a per-stripe lock to protect stripe-specific data. The purpose is to
reduce lock contention on conf->device_lock.

A stripe's ->toread and ->towrite are protected by the per-stripe lock.
Access to a stripe's bio lists is always serialized by this lock, so adding
a bio to the lists (add_stripe_bio()) and removing a bio from them (as in
ops_run_biofill()) do not race.

If the bios on the ->read, ->written ... lists are not shared by multiple
stripes, we don't need any lock to protect ->read and ->written, because
STRIPE_ACTIVE protects them. If the bios are shared, there are two
protections:

1. bi_phys_segments acts as a reference count
2. list traversal uses r5_next_bio(), so a traversal never touches a bio
   that does not belong to the stripe

Let's have an example:

	|  stripe1  |  stripe2  |  stripe3  |
	...bio1......|bio2|bio3|....bio4.....

stripe2 has 4 bios. When it is finished, it decrements bi_phys_segments for
all of them, but only calls end_bio for bio2 and bio3. bio1->bi_next still
points to bio2, but this doesn't matter: when stripe1 is finished, it will
not touch bio2 because of the r5_next_bio() check. Later, stripe1 will call
end_bio for bio1 and stripe3 will call end_bio for bio4.

Before add_stripe_bio() adds a bio to a stripe, we have already incremented
the bio's bi_phys_segments, so we need not worry about other stripes
releasing the bio.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
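The r5_next_bio() check relied on above is a small helper in raid5.h; in
kernels of this vintage it reads roughly as follows (bi_sector and bi_size
are the field names that predate the 3.14 bvec_iter rework). It follows
->bi_next only while the current bio still ends inside this stripe's
STRIPE_SECTORS window, which is what keeps a traversal from walking onto a
neighbouring stripe's bios:

	/* Stop the walk once the bio no longer ends inside the stripe
	 * starting at @sector. */
	static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
	{
		int sectors = bio->bi_size >> 9;

		if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
			return bio->bi_next;
		else
			return NULL;
	}

In the example above, bio1 ends past stripe1's boundary, so stripe1's walk
gets NULL from r5_next_bio(bio1, ...) and never reaches bio2, even though
bio1->bi_next points at it.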
Diffstat (limited to 'drivers/md')
-rw-r--r--	drivers/md/raid5.c	35
-rw-r--r--	drivers/md/raid5.h	1
2 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9ad452c6d7e3..c2192a2907e4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -762,14 +762,12 @@ static void ops_complete_biofill(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
 	struct bio *return_bi = NULL;
-	struct r5conf *conf = sh->raid_conf;
 	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	/* clear completed biofills */
-	spin_lock_irq(&conf->device_lock);
 	for (i = sh->disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 
@@ -795,7 +793,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
 			}
 		}
 	}
-	spin_unlock_irq(&conf->device_lock);
 	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 
 	return_io(return_bi);
@@ -807,7 +804,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
 static void ops_run_biofill(struct stripe_head *sh)
 {
 	struct dma_async_tx_descriptor *tx = NULL;
-	struct r5conf *conf = sh->raid_conf;
 	struct async_submit_ctl submit;
 	int i;
 
@@ -818,10 +814,10 @@ static void ops_run_biofill(struct stripe_head *sh)
 		struct r5dev *dev = &sh->dev[i];
 		if (test_bit(R5_Wantfill, &dev->flags)) {
 			struct bio *rbi;
-			spin_lock_irq(&conf->device_lock);
+			spin_lock_irq(&sh->stripe_lock);
 			dev->read = rbi = dev->toread;
 			dev->toread = NULL;
-			spin_unlock_irq(&conf->device_lock);
+			spin_unlock_irq(&sh->stripe_lock);
 			while (rbi && rbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				tx = async_copy_data(0, rbi, dev->page,
@@ -1157,12 +1153,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
 			struct bio *wbi;
 
-			spin_lock_irq(&sh->raid_conf->device_lock);
+			spin_lock_irq(&sh->stripe_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
-			spin_unlock_irq(&sh->raid_conf->device_lock);
+			spin_unlock_irq(&sh->stripe_lock);
 
 			while (wbi && wbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
@@ -1467,6 +1463,8 @@ static int grow_one_stripe(struct r5conf *conf)
 	init_waitqueue_head(&sh->ops.wait_for_ops);
 #endif
 
+	spin_lock_init(&sh->stripe_lock);
+
 	if (grow_buffers(sh)) {
 		shrink_buffers(sh);
 		kmem_cache_free(conf->slab_cache, sh);
@@ -2353,8 +2351,15 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		(unsigned long long)bi->bi_sector,
 		(unsigned long long)sh->sector);
 
-
-	spin_lock_irq(&conf->device_lock);
+	/*
+	 * If several bios share a stripe, bi_phys_segments acts as a
+	 * reference count to avoid races. The reference count should already
+	 * have been increased before this function is called (for example,
+	 * in make_request()), so other bios sharing this stripe will not
+	 * free the stripe. If a bio is owned by a single stripe, the stripe
+	 * lock protects it.
+	 */
+	spin_lock_irq(&sh->stripe_lock);
 	if (forwrite) {
 		bip = &sh->dev[dd_idx].towrite;
 		if (*bip == NULL)
@@ -2388,7 +2393,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
-	spin_unlock_irq(&conf->device_lock);
+	spin_unlock_irq(&sh->stripe_lock);
 
 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
 		(unsigned long long)(*bip)->bi_sector,
@@ -2404,7 +2409,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 
  overlap:
 	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
-	spin_unlock_irq(&conf->device_lock);
+	spin_unlock_irq(&sh->stripe_lock);
 	return 0;
 }
 
@@ -2454,11 +2459,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 		}
-		spin_lock_irq(&conf->device_lock);
+		spin_lock_irq(&sh->stripe_lock);
 		/* fail all writes first */
 		bi = sh->dev[i].towrite;
 		sh->dev[i].towrite = NULL;
-		spin_unlock_irq(&conf->device_lock);
+		spin_unlock_irq(&sh->stripe_lock);
 		if (bi) {
 			s->to_write--;
 			bitmap_end = 1;
@@ -3192,7 +3197,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
 	/* Now to look around and see what can be done */
 	rcu_read_lock();
-	spin_lock_irq(&conf->device_lock);
 	for (i=disks; i--; ) {
 		struct md_rdev *rdev;
 		sector_t first_bad;
@@ -3338,7 +3342,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 				do_recovery = 1;
 		}
 	}
-	spin_unlock_irq(&conf->device_lock);
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
 		/* If there is a failed device being replaced,
 		 * we must be recovering.
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2164021f3b5f..f03fb3395183 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -210,6 +210,7 @@ struct stripe_head {
 	int			disks;		/* disks in stripe */
 	enum check_states	check_state;
 	enum reconstruct_states reconstruct_state;
+	spinlock_t		stripe_lock;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
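
As a closing illustration, the detach-then-traverse pattern the patch
converges on can be sketched in user-space C. This is a stand-in under
stated assumptions, not kernel code: the field names mirror raid5's, but
the pthread spinlock and the atomic reference-count decrement are
simplifications of what the kernel actually does around bi_phys_segments.

	/* Illustrative user-space sketch of the per-stripe locking pattern;
	 * simplified stand-ins, not the real raid5 code. */
	#include <pthread.h>
	#include <stddef.h>

	struct bio {
		struct bio *bi_next;
		int bi_phys_segments;	/* one reference per stripe using the bio */
	};

	struct stripe {
		pthread_spinlock_t stripe_lock;	/* per-stripe, not per-array */
		struct bio *towrite;	/* pending writes, guarded by stripe_lock */
	};

	/* Only the list-head swap needs the per-stripe lock; afterwards this
	 * stripe owns the detached list, and a bio shared with neighbouring
	 * stripes is completed only by whichever stripe drops the last
	 * reference. */
	static void drain_writes(struct stripe *sh, void (*end_bio)(struct bio *))
	{
		struct bio *wbi, *next;

		pthread_spin_lock(&sh->stripe_lock);
		wbi = sh->towrite;
		sh->towrite = NULL;
		pthread_spin_unlock(&sh->stripe_lock);

		while (wbi) {
			next = wbi->bi_next;	/* a real walk bounds this with r5_next_bio() */
			if (__sync_sub_and_fetch(&wbi->bi_phys_segments, 1) == 0)
				end_bio(wbi);	/* last reference: complete the bio */
			wbi = next;
		}
	}

The point of the conversion: two stripes draining their lists concurrently
no longer serialize on the array-wide conf->device_lock; they synchronize
only, through the reference count, on the bios they actually share.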