author    NeilBrown <neilb@suse.de>  2010-10-18 21:54:01 -0400
committer NeilBrown <neilb@suse.de>  2010-10-28 02:34:07 -0400
commit    4e78064f42ad474ce9c31760861f7fb0cfc22532 (patch)
tree      3a1abaa98ebcbd62eacfbe95d72e44195fb3bc1f /drivers/md
parent    e804ac780e2f01cb3b914daca2fd4780d1743db1 (diff)
md: Fix possible deadlock with multiple mempool allocations.
It is not safe to allocate from a mempool while holding an item previously
allocated from that mempool, as that can deadlock when the mempool is close
to exhaustion.

So don't use a bio list to collect the bios to write to multiple devices in
raid1 and raid10.  Instead queue each bio as it becomes available, so an
unplug will activate all previously allocated bios and a new bio has a
chance of being allocated.

This means we must set the 'remaining' count to '1' before submitting any
requests, then when all are submitted, decrement 'remaining' and possibly
handle the write completion at that point.

Reported-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Tested-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
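The counting trick the last paragraph describes -- bias the completion counter
by one so the request cannot finish while writes are still being allocated and
queued -- can be sketched outside the kernel.  The following is only an
illustrative userspace sketch; struct request, submit_one, put_request and
complete_request are invented names for this example, not md code.

/*
 * Minimal sketch of the 'remaining' counting pattern described above.
 * All names are hypothetical; only the bias-the-counter-by-one idea is
 * taken from the patch.  Compile with: cc -std=c11 sketch.c
 */
#include <stdatomic.h>
#include <stdio.h>

struct request {
        atomic_int remaining;
};

static void complete_request(struct request *req)
{
        printf("all writes queued and finished; completing request\n");
}

static void put_request(struct request *req)
{
        /* Whoever drops the count to zero completes the request, whether
         * that is the last I/O completion or the submission path itself. */
        if (atomic_fetch_sub(&req->remaining, 1) == 1)
                complete_request(req);
}

static void submit_one(struct request *req, int idx)
{
        atomic_fetch_add(&req->remaining, 1);   /* one ref per queued write */
        printf("queued write %d\n", idx);       /* stands in for queuing a bio */
        put_request(req);                       /* stands in for its completion */
}

int main(void)
{
        struct request req;
        int i;

        /* Start at 1, not 0: the extra reference keeps the request alive
         * while writes are still being allocated and queued, so completion
         * cannot race with submission. */
        atomic_init(&req.remaining, 1);
        for (i = 0; i < 3; i++)
                submit_one(&req, i);
        put_request(&req);      /* drop the submission-path reference */
        return 0;
}

The same reasoning applies in the patch below: r1_bio->remaining (and
r10_bio->remaining) now starts at 1, each queued mbio adds one, and
r1_bio_write_done() or the final atomic_dec_and_test() drops the submitter's
reference once the loop has finished.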
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/raid1.c   98
-rw-r--r--  drivers/md/raid10.c  25
2 files changed, 58 insertions, 65 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a4b85a947532..3362cfc8073c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -306,6 +306,28 @@ static void raid1_end_read_request(struct bio *bio, int error)
 	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 }
 
+static void r1_bio_write_done(r1bio_t *r1_bio, int vcnt, struct bio_vec *bv,
+			      int behind)
+{
+	if (atomic_dec_and_test(&r1_bio->remaining))
+	{
+		/* it really is the end of this request */
+		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+			/* free extra copy of the data pages */
+			int i = vcnt;
+			while (i--)
+				safe_put_page(bv[i].bv_page);
+		}
+		/* clear the bitmap if all writes complete successfully */
+		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+				r1_bio->sectors,
+				!test_bit(R1BIO_Degraded, &r1_bio->state),
+				behind);
+		md_write_end(r1_bio->mddev);
+		raid_end_bio_io(r1_bio);
+	}
+}
+
 static void raid1_end_write_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -373,21 +395,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
-	if (atomic_dec_and_test(&r1_bio->remaining)) {
-		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-			/* free extra copy of the data pages */
-			int i = bio->bi_vcnt;
-			while (i--)
-				safe_put_page(bio->bi_io_vec[i].bv_page);
-		}
-		/* clear the bitmap if all writes complete successfully */
-		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
-				r1_bio->sectors,
-				!test_bit(R1BIO_Degraded, &r1_bio->state),
-				behind);
-		md_write_end(r1_bio->mddev);
-		raid_end_bio_io(r1_bio);
-	}
+	r1_bio_write_done(r1_bio, bio->bi_vcnt, bio->bi_io_vec, behind);
 
 	if (to_put)
 		bio_put(to_put);
@@ -735,23 +743,26 @@ static void unfreeze_array(conf_t *conf)
 }
 
 
-/* duplicate the data pages for behind I/O */
-static struct page **alloc_behind_pages(struct bio *bio)
+/* duplicate the data pages for behind I/O
+ * We return a list of bio_vec rather than just page pointers
+ * as it makes freeing easier
+ */
+static struct bio_vec *alloc_behind_pages(struct bio *bio)
 {
 	int i;
 	struct bio_vec *bvec;
-	struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
+	struct bio_vec *pages = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
 					GFP_NOIO);
 	if (unlikely(!pages))
 		goto do_sync_io;
 
 	bio_for_each_segment(bvec, bio, i) {
-		pages[i] = alloc_page(GFP_NOIO);
-		if (unlikely(!pages[i]))
+		pages[i].bv_page = alloc_page(GFP_NOIO);
+		if (unlikely(!pages[i].bv_page))
 			goto do_sync_io;
-		memcpy(kmap(pages[i]) + bvec->bv_offset,
+		memcpy(kmap(pages[i].bv_page) + bvec->bv_offset,
 		       kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
-		kunmap(pages[i]);
+		kunmap(pages[i].bv_page);
 		kunmap(bvec->bv_page);
 	}
 
@@ -759,8 +770,8 @@ static struct page **alloc_behind_pages(struct bio *bio)
 
 do_sync_io:
 	if (pages)
-		for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
-			put_page(pages[i]);
+		for (i = 0; i < bio->bi_vcnt && pages[i].bv_page; i++)
+			put_page(pages[i].bv_page);
 	kfree(pages);
 	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 	return NULL;
@@ -775,8 +786,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	int i, targets = 0, disks;
 	struct bitmap *bitmap;
 	unsigned long flags;
-	struct bio_list bl;
-	struct page **behind_pages = NULL;
+	struct bio_vec *behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
@@ -873,13 +883,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	 * bios[x] to bio
 	 */
 	disks = conf->raid_disks;
-#if 0
-	{ static int first=1;
-	if (first) printk("First Write sector %llu disks %d\n",
-			  (unsigned long long)r1_bio->sector, disks);
-	first = 0;
-	}
-#endif
  retry_write:
 	blocked_rdev = NULL;
 	rcu_read_lock();
@@ -937,10 +940,11 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	    (behind_pages = alloc_behind_pages(bio)) != NULL)
 		set_bit(R1BIO_BehindIO, &r1_bio->state);
 
-	atomic_set(&r1_bio->remaining, 0);
+	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
-	bio_list_init(&bl);
+	bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
+				test_bit(R1BIO_BehindIO, &r1_bio->state));
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
 		if (!r1_bio->bios[i])
@@ -967,35 +971,25 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 			 * them all
 			 */
 			__bio_for_each_segment(bvec, mbio, j, 0)
-				bvec->bv_page = behind_pages[j];
+				bvec->bv_page = behind_pages[j].bv_page;
 			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
 				atomic_inc(&r1_bio->behind_remaining);
 		}
 
 		atomic_inc(&r1_bio->remaining);
-
-		bio_list_add(&bl, mbio);
+		spin_lock_irqsave(&conf->device_lock, flags);
+		bio_list_add(&conf->pending_bio_list, mbio);
+		blk_plug_device(mddev->queue);
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
+	r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
 	kfree(behind_pages); /* the behind pages are attached to the bios now */
 
-	bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
-				test_bit(R1BIO_BehindIO, &r1_bio->state));
-	spin_lock_irqsave(&conf->device_lock, flags);
-	bio_list_merge(&conf->pending_bio_list, &bl);
-	bio_list_init(&bl);
-
-	blk_plug_device(mddev->queue);
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-
-	/* In case raid1d snuck into freeze_array */
+	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
 	if (do_sync)
 		md_wakeup_thread(mddev->thread);
-#if 0
-	while ((bio = bio_list_pop(&bl)) != NULL)
-		generic_make_request(bio);
-#endif
 
 	return 0;
 }
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 387fe4b4fab7..8f5543a62416 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -801,7 +801,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
-	struct bio_list bl;
 	unsigned long flags;
 	mdk_rdev_t *blocked_rdev;
 
@@ -950,9 +949,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		goto retry_write;
 	}
 
-	atomic_set(&r10_bio->remaining, 0);
+	atomic_set(&r10_bio->remaining, 1);
+	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
 
-	bio_list_init(&bl);
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
@@ -970,22 +969,22 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		mbio->bi_private = r10_bio;
 
 		atomic_inc(&r10_bio->remaining);
-		bio_list_add(&bl, mbio);
+		spin_lock_irqsave(&conf->device_lock, flags);
+		bio_list_add(&conf->pending_bio_list, mbio);
+		blk_plug_device(mddev->queue);
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
-	if (unlikely(!atomic_read(&r10_bio->remaining))) {
-		/* the array is dead */
+	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		/* This matches the end of raid10_end_write_request() */
+		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
+				r10_bio->sectors,
+				!test_bit(R10BIO_Degraded, &r10_bio->state),
+				0);
 		md_write_end(mddev);
 		raid_end_bio_io(r10_bio);
-		return 0;
 	}
 
-	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
-	spin_lock_irqsave(&conf->device_lock, flags);
-	bio_list_merge(&conf->pending_bio_list, &bl);
-	blk_plug_device(mddev->queue);
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 