author     NeilBrown <neilb@suse.de>  2010-10-18 21:54:01 -0400
committer  NeilBrown <neilb@suse.de>  2010-10-28 02:34:07 -0400
commit     4e78064f42ad474ce9c31760861f7fb0cfc22532 (patch)
tree       3a1abaa98ebcbd62eacfbe95d72e44195fb3bc1f /drivers/md/raid1.c
parent     e804ac780e2f01cb3b914daca2fd4780d1743db1 (diff)
md: Fix possible deadlock with multiple mempool allocations.
It is not safe to allocate from a mempool while holding an item previously
allocated from that mempool, as that can deadlock when the mempool is close
to exhaustion.

So don't use a bio list to collect the bios to write to multiple devices in
raid1 and raid10.  Instead, queue each bio as it becomes available, so an
unplug will activate all previously allocated bios and a new bio has a
chance of being allocated.

This means we must set the 'remaining' count to '1' before submitting any
requests, then when all are submitted, decrement 'remaining' and possibly
handle the write completion at that point.

Reported-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Tested-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
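As an illustration of the counting scheme described above, here is a minimal,
hypothetical userspace sketch in C11; it is not the raid1 code itself.  The
names write_request, submit_one and write_done are made up and stand in for
r1bio_t, the per-device submission loop in make_request(), and
r1_bio_write_done() respectively.

#include <stdatomic.h>
#include <stdio.h>

struct write_request {
        atomic_int remaining;   /* outstanding sub-writes, plus 1 held by the submitter */
        int ndevs;
};

static void write_done(struct write_request *r)
{
        /* Only the final decrement performs the end-of-request work. */
        if (atomic_fetch_sub(&r->remaining, 1) == 1)
                printf("all %d writes finished, completing request\n", r->ndevs);
}

static void submit_one(struct write_request *r, int dev)
{
        atomic_fetch_add(&r->remaining, 1);
        printf("queued write to device %d\n", dev);
        /* In the driver the bio is queued (and may complete) immediately,
         * before the bios for the remaining devices have been allocated;
         * simulate that early completion here. */
        write_done(r);
}

int main(void)
{
        struct write_request r = { .ndevs = 3 };

        /* Start 'remaining' at 1: the submitter holds a reference until every
         * sub-write has been queued, so no early completion can end the
         * request while later bios are still being allocated. */
        atomic_init(&r.remaining, 1);

        for (int dev = 0; dev < r.ndevs; dev++)
                submit_one(&r, dev);

        /* Drop the submitter's reference; complete now if all I/O already has. */
        write_done(&r);
        return 0;
}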
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--  drivers/md/raid1.c  |  98
1 file changed, 46 insertions(+), 52 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a4b85a947532..3362cfc8073c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -306,6 +306,28 @@ static void raid1_end_read_request(struct bio *bio, int error)
         rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 }
 
+static void r1_bio_write_done(r1bio_t *r1_bio, int vcnt, struct bio_vec *bv,
+                              int behind)
+{
+        if (atomic_dec_and_test(&r1_bio->remaining))
+        {
+                /* it really is the end of this request */
+                if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+                        /* free extra copy of the data pages */
+                        int i = vcnt;
+                        while (i--)
+                                safe_put_page(bv[i].bv_page);
+                }
+                /* clear the bitmap if all writes complete successfully */
+                bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+                                r1_bio->sectors,
+                                !test_bit(R1BIO_Degraded, &r1_bio->state),
+                                behind);
+                md_write_end(r1_bio->mddev);
+                raid_end_bio_io(r1_bio);
+        }
+}
+
 static void raid1_end_write_request(struct bio *bio, int error)
 {
         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -373,21 +395,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
          * Let's see if all mirrored write operations have finished
          * already.
          */
-        if (atomic_dec_and_test(&r1_bio->remaining)) {
-                if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-                        /* free extra copy of the data pages */
-                        int i = bio->bi_vcnt;
-                        while (i--)
-                                safe_put_page(bio->bi_io_vec[i].bv_page);
-                }
-                /* clear the bitmap if all writes complete successfully */
-                bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
-                                r1_bio->sectors,
-                                !test_bit(R1BIO_Degraded, &r1_bio->state),
-                                behind);
-                md_write_end(r1_bio->mddev);
-                raid_end_bio_io(r1_bio);
-        }
+        r1_bio_write_done(r1_bio, bio->bi_vcnt, bio->bi_io_vec, behind);
 
         if (to_put)
                 bio_put(to_put);
@@ -735,23 +743,26 @@ static void unfreeze_array(conf_t *conf)
 }
 
 
-/* duplicate the data pages for behind I/O */
-static struct page **alloc_behind_pages(struct bio *bio)
+/* duplicate the data pages for behind I/O
+ * We return a list of bio_vec rather than just page pointers
+ * as it makes freeing easier
+ */
+static struct bio_vec *alloc_behind_pages(struct bio *bio)
 {
         int i;
         struct bio_vec *bvec;
-        struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
-                                GFP_NOIO);
+        struct bio_vec *pages = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
+                                        GFP_NOIO);
         if (unlikely(!pages))
                 goto do_sync_io;
 
         bio_for_each_segment(bvec, bio, i) {
-                pages[i] = alloc_page(GFP_NOIO);
-                if (unlikely(!pages[i]))
+                pages[i].bv_page = alloc_page(GFP_NOIO);
+                if (unlikely(!pages[i].bv_page))
                         goto do_sync_io;
-                memcpy(kmap(pages[i]) + bvec->bv_offset,
+                memcpy(kmap(pages[i].bv_page) + bvec->bv_offset,
                        kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
-                kunmap(pages[i]);
+                kunmap(pages[i].bv_page);
                 kunmap(bvec->bv_page);
         }
 
@@ -759,8 +770,8 @@ static struct page **alloc_behind_pages(struct bio *bio)
 
 do_sync_io:
         if (pages)
-                for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
-                        put_page(pages[i]);
+                for (i = 0; i < bio->bi_vcnt && pages[i].bv_page; i++)
+                        put_page(pages[i].bv_page);
         kfree(pages);
         PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
         return NULL;
@@ -775,8 +786,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
         int i, targets = 0, disks;
         struct bitmap *bitmap;
         unsigned long flags;
-        struct bio_list bl;
-        struct page **behind_pages = NULL;
+        struct bio_vec *behind_pages = NULL;
         const int rw = bio_data_dir(bio);
         const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
         const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
@@ -873,13 +883,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
          * bios[x] to bio
          */
         disks = conf->raid_disks;
-#if 0
-        { static int first=1;
-        if (first) printk("First Write sector %llu disks %d\n",
-                          (unsigned long long)r1_bio->sector, disks);
-        first = 0;
-        }
-#endif
  retry_write:
         blocked_rdev = NULL;
         rcu_read_lock();
@@ -937,10 +940,11 @@ static int make_request(mddev_t *mddev, struct bio * bio)
             (behind_pages = alloc_behind_pages(bio)) != NULL)
                 set_bit(R1BIO_BehindIO, &r1_bio->state);
 
-        atomic_set(&r1_bio->remaining, 0);
+        atomic_set(&r1_bio->remaining, 1);
         atomic_set(&r1_bio->behind_remaining, 0);
 
-        bio_list_init(&bl);
+        bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
+                          test_bit(R1BIO_BehindIO, &r1_bio->state));
         for (i = 0; i < disks; i++) {
                 struct bio *mbio;
                 if (!r1_bio->bios[i])
@@ -967,35 +971,25 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                  * them all
                  */
                 __bio_for_each_segment(bvec, mbio, j, 0)
-                        bvec->bv_page = behind_pages[j];
+                        bvec->bv_page = behind_pages[j].bv_page;
                 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
                         atomic_inc(&r1_bio->behind_remaining);
                 }
 
                 atomic_inc(&r1_bio->remaining);
-
-                bio_list_add(&bl, mbio);
+                spin_lock_irqsave(&conf->device_lock, flags);
+                bio_list_add(&conf->pending_bio_list, mbio);
+                blk_plug_device(mddev->queue);
+                spin_unlock_irqrestore(&conf->device_lock, flags);
         }
+        r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
         kfree(behind_pages); /* the behind pages are attached to the bios now */
 
-        bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
-                                test_bit(R1BIO_BehindIO, &r1_bio->state));
-        spin_lock_irqsave(&conf->device_lock, flags);
-        bio_list_merge(&conf->pending_bio_list, &bl);
-        bio_list_init(&bl);
-
-        blk_plug_device(mddev->queue);
-        spin_unlock_irqrestore(&conf->device_lock, flags);
-
-        /* In case raid1d snuck into freeze_array */
+        /* In case raid1d snuck in to freeze_array */
         wake_up(&conf->wait_barrier);
 
         if (do_sync)
                 md_wakeup_thread(mddev->thread);
-#if 0
-        while ((bio = bio_list_pop(&bl)) != NULL)
-                generic_make_request(bio);
-#endif
 
         return 0;
 }