author	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-29 11:47:36 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-29 11:47:36 -0400
commit	d8d048f69a618c531575cb1f398a7186f0532ef2 (patch)
tree	cb8d6ee7a3cb2aea53e5e95de0ca689cc69411a2
parent	53113b06e48c6c38f7612c1f8043b8a0d2adf72b (diff)
parent	f3ac8bf7ce1c5abd763ea762e95d1cdcf7799372 (diff)
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md:
  md: tidy up device searches in read_balance.
  md/raid1: fix some typos in comments.
  md/raid1: discard unused variable.
  md: unplug writes to external bitmaps.
  md: use separate bio pool for each md device.
  md: change type of first arg to sync_page_io.
  md/raid1: perform mem allocation before disabling writes during resync.
  md: use bio_kmalloc rather than bio_alloc when failure is acceptable.
  md: Fix possible deadlock with multiple mempool allocations.
  md: fix and update workqueue usage
  md: use sector_t in bitmap_get_counter
  md: remove md_mutex locking.
  md: Fix regression with raid1 arrays without persistent metadata.
-rw-r--r--	drivers/md/bitmap.c	30
-rw-r--r--	drivers/md/bitmap.h	4
-rw-r--r--	drivers/md/faulty.c	2
-rw-r--r--	drivers/md/md.c	162
-rw-r--r--	drivers/md/md.h	8
-rw-r--r--	drivers/md/raid1.c	224
-rw-r--r--	drivers/md/raid1.h	2
-rw-r--r--	drivers/md/raid10.c	42
-rw-r--r--	drivers/md/raid5.c	6
9 files changed, 275 insertions, 205 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index e4fb58db5454..5a1ffe3527aa 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -212,7 +212,7 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
 
        target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
 
-       if (sync_page_io(rdev->bdev, target,
+       if (sync_page_io(rdev, target,
                         roundup(size, bdev_logical_block_size(rdev->bdev)),
                         page, READ)) {
            page->index = index;
@@ -343,7 +343,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
            atomic_inc(&bitmap->pending_writes);
            set_buffer_locked(bh);
            set_buffer_mapped(bh);
-           submit_bh(WRITE, bh);
+           submit_bh(WRITE | REQ_UNPLUG | REQ_SYNC, bh);
            bh = bh->b_this_page;
        }
 
@@ -1101,7 +1101,7 @@ static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
    bitmap_checkfree(bitmap, page);
 }
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
-                                           sector_t offset, int *blocks,
+                                           sector_t offset, sector_t *blocks,
                                            int create);
 
 /*
@@ -1115,7 +1115,7 @@ void bitmap_daemon_work(mddev_t *mddev)
    unsigned long j;
    unsigned long flags;
    struct page *page = NULL, *lastpage = NULL;
-   int blocks;
+   sector_t blocks;
    void *paddr;
    struct dm_dirty_log *log = mddev->bitmap_info.log;
 
@@ -1258,7 +1258,7 @@ void bitmap_daemon_work(mddev_t *mddev)
 }
 
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
-                                           sector_t offset, int *blocks,
+                                           sector_t offset, sector_t *blocks,
                                            int create)
 __releases(bitmap->lock)
 __acquires(bitmap->lock)
@@ -1316,7 +1316,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
    }
 
    while (sectors) {
-       int blocks;
+       sector_t blocks;
        bitmap_counter_t *bmc;
 
        spin_lock_irq(&bitmap->lock);
@@ -1381,7 +1381,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
        success = 0;
 
    while (sectors) {
-       int blocks;
+       sector_t blocks;
        unsigned long flags;
        bitmap_counter_t *bmc;
 
@@ -1423,7 +1423,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 }
 EXPORT_SYMBOL(bitmap_endwrite);
 
-static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
                               int degraded)
 {
    bitmap_counter_t *bmc;
@@ -1452,7 +1452,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *bloc
    return rv;
 }
 
-int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
                      int degraded)
 {
    /* bitmap_start_sync must always report on multiples of whole
@@ -1463,7 +1463,7 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
     * Return the 'or' of the result.
     */
    int rv = 0;
-   int blocks1;
+   sector_t blocks1;
 
    *blocks = 0;
    while (*blocks < (PAGE_SIZE>>9)) {
@@ -1476,7 +1476,7 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
 }
 EXPORT_SYMBOL(bitmap_start_sync);
 
-void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted)
+void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted)
 {
    bitmap_counter_t *bmc;
    unsigned long flags;
@@ -1515,7 +1515,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
     * RESYNC bit wherever it is still on
     */
    sector_t sector = 0;
-   int blocks;
+   sector_t blocks;
    if (!bitmap)
        return;
    while (sector < bitmap->mddev->resync_max_sectors) {
@@ -1528,7 +1528,7 @@ EXPORT_SYMBOL(bitmap_close_sync);
 void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
 {
    sector_t s = 0;
-   int blocks;
+   sector_t blocks;
 
    if (!bitmap)
        return;
@@ -1562,7 +1562,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
     * be 0 at this point
     */
 
-   int secs;
+   sector_t secs;
    bitmap_counter_t *bmc;
    spin_lock_irq(&bitmap->lock);
    bmc = bitmap_get_counter(bitmap, offset, &secs, 1);
@@ -1790,7 +1790,7 @@ int bitmap_load(mddev_t *mddev)
     * All chunks should be clean, but some might need_sync.
     */
    while (sector < mddev->resync_max_sectors) {
-       int blocks;
+       sector_t blocks;
        bitmap_start_sync(bitmap, sector, &blocks, 0);
        sector += blocks;
    }
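The int -> sector_t conversions above matter because sector_t is 64-bit on typical configurations while int is 32-bit, so a block count returned through an int can truncate on very large devices or chunk sizes. A minimal userspace sketch of the hazard, with hypothetical values rather than anything taken from the driver:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;  /* 64-bit, as in the kernel with large-device support */

    int main(void)
    {
        sector_t span = 1ULL << 33;   /* hypothetical count of sectors covered */
        int as_int = (int)span;       /* the old 'int *blocks' would truncate to 0 */
        sector_t kept = span;         /* the new 'sector_t *blocks' keeps the value */

        printf("int: %d  sector_t: %llu\n", as_int, (unsigned long long)kept);
        return 0;
    }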
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index e872a7bad6b8..931a7a7c3796 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -271,8 +271,8 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
            unsigned long sectors, int behind);
 void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
            unsigned long sectors, int success, int behind);
-int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded);
-void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
+void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
 void bitmap_close_sync(struct bitmap *bitmap);
 void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
 
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 1a8987884614..339fdc670751 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -210,7 +210,7 @@ static int make_request(mddev_t *mddev, struct bio *bio)
            }
        }
        if (failit) {
-           struct bio *b = bio_clone(bio, GFP_NOIO);
+           struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev);
            b->bi_bdev = conf->rdev->bdev;
            b->bi_private = bio;
            b->bi_end_io = faulty_fail;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 225815197a3d..4e957f3140a8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -57,8 +57,6 @@
 #define DEBUG 0
 #define dprintk(x...) ((void)(DEBUG && printk(x)))
 
-static DEFINE_MUTEX(md_mutex);
-
 #ifndef MODULE
 static void autostart_arrays(int part);
 #endif
@@ -69,6 +67,8 @@ static DEFINE_SPINLOCK(pers_lock);
 static void md_print_devices(void);
 
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+static struct workqueue_struct *md_wq;
+static struct workqueue_struct *md_misc_wq;
 
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 
@@ -149,6 +149,72 @@ static const struct block_device_operations md_fops;
 
 static int start_readonly;
 
+/* bio_clone_mddev
+ * like bio_clone, but with a local bio set
+ */
+
+static void mddev_bio_destructor(struct bio *bio)
+{
+   mddev_t *mddev, **mddevp;
+
+   mddevp = (void*)bio;
+   mddev = mddevp[-1];
+
+   bio_free(bio, mddev->bio_set);
+}
+
+struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
+                           mddev_t *mddev)
+{
+   struct bio *b;
+   mddev_t **mddevp;
+
+   if (!mddev || !mddev->bio_set)
+       return bio_alloc(gfp_mask, nr_iovecs);
+
+   b = bio_alloc_bioset(gfp_mask, nr_iovecs,
+                        mddev->bio_set);
+   if (!b)
+       return NULL;
+   mddevp = (void*)b;
+   mddevp[-1] = mddev;
+   b->bi_destructor = mddev_bio_destructor;
+   return b;
+}
+EXPORT_SYMBOL_GPL(bio_alloc_mddev);
+
+struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
+                           mddev_t *mddev)
+{
+   struct bio *b;
+   mddev_t **mddevp;
+
+   if (!mddev || !mddev->bio_set)
+       return bio_clone(bio, gfp_mask);
+
+   b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
+                        mddev->bio_set);
+   if (!b)
+       return NULL;
+   mddevp = (void*)b;
+   mddevp[-1] = mddev;
+   b->bi_destructor = mddev_bio_destructor;
+   __bio_clone(b, bio);
+   if (bio_integrity(bio)) {
+       int ret;
+
+       ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
+
+       if (ret < 0) {
+           bio_put(b);
+           return NULL;
+       }
+   }
+
+   return b;
+}
+EXPORT_SYMBOL_GPL(bio_clone_mddev);
+
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
@@ -300,7 +366,7 @@ static void md_end_flush(struct bio *bio, int err)
 
    if (atomic_dec_and_test(&mddev->flush_pending)) {
        /* The pre-request flush has finished */
-       schedule_work(&mddev->flush_work);
+       queue_work(md_wq, &mddev->flush_work);
    }
    bio_put(bio);
 }
@@ -321,7 +387,7 @@ static void submit_flushes(mddev_t *mddev)
            atomic_inc(&rdev->nr_pending);
            atomic_inc(&rdev->nr_pending);
            rcu_read_unlock();
-           bi = bio_alloc(GFP_KERNEL, 0);
+           bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev);
            bi->bi_end_io = md_end_flush;
            bi->bi_private = rdev;
            bi->bi_bdev = rdev->bdev;
@@ -369,7 +435,7 @@ void md_flush_request(mddev_t *mddev, struct bio *bio)
    submit_flushes(mddev);
 
    if (atomic_dec_and_test(&mddev->flush_pending))
-       schedule_work(&mddev->flush_work);
+       queue_work(md_wq, &mddev->flush_work);
 }
 EXPORT_SYMBOL(md_flush_request);
 
@@ -428,6 +494,8 @@ static void mddev_delayed_delete(struct work_struct *ws);
 
 static void mddev_put(mddev_t *mddev)
 {
+   struct bio_set *bs = NULL;
+
    if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
        return;
    if (!mddev->raid_disks && list_empty(&mddev->disks) &&
@@ -435,19 +503,22 @@ static void mddev_put(mddev_t *mddev)
        /* Array is not configured at all, and not held active,
         * so destroy it */
        list_del(&mddev->all_mddevs);
+       bs = mddev->bio_set;
+       mddev->bio_set = NULL;
        if (mddev->gendisk) {
-           /* we did a probe so need to clean up.
-            * Call schedule_work inside the spinlock
-            * so that flush_scheduled_work() after
-            * mddev_find will succeed in waiting for the
-            * work to be done.
+           /* We did a probe so need to clean up.  Call
+            * queue_work inside the spinlock so that
+            * flush_workqueue() after mddev_find will
+            * succeed in waiting for the work to be done.
             */
            INIT_WORK(&mddev->del_work, mddev_delayed_delete);
-           schedule_work(&mddev->del_work);
+           queue_work(md_misc_wq, &mddev->del_work);
        } else
            kfree(mddev);
    }
    spin_unlock(&all_mddevs_lock);
+   if (bs)
+       bioset_free(bs);
 }
 
 void mddev_init(mddev_t *mddev)
@@ -691,7 +762,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
     * if zero is reached.
     * If an error occurred, call md_error
     */
-   struct bio *bio = bio_alloc(GFP_NOIO, 1);
+   struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
 
    bio->bi_bdev = rdev->bdev;
    bio->bi_sector = sector;
@@ -722,16 +793,16 @@ static void bi_complete(struct bio *bio, int error)
    complete((struct completion*)bio->bi_private);
 }
 
-int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
         struct page *page, int rw)
 {
-   struct bio *bio = bio_alloc(GFP_NOIO, 1);
+   struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
    struct completion event;
    int ret;
 
    rw |= REQ_SYNC | REQ_UNPLUG;
 
-   bio->bi_bdev = bdev;
+   bio->bi_bdev = rdev->bdev;
    bio->bi_sector = sector;
    bio_add_page(bio, page, size, 0);
    init_completion(&event);
@@ -757,7 +828,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
        return 0;
 
 
-   if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
+   if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ))
        goto fail;
    rdev->sb_loaded = 1;
    return 0;
@@ -1850,7 +1921,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
    synchronize_rcu();
    INIT_WORK(&rdev->del_work, md_delayed_delete);
    kobject_get(&rdev->kobj);
-   schedule_work(&rdev->del_work);
+   queue_work(md_misc_wq, &rdev->del_work);
 }
 
 /*
@@ -2108,6 +2179,8 @@ repeat:
    if (!mddev->persistent) {
        clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
        clear_bit(MD_CHANGE_DEVS, &mddev->flags);
+       if (!mddev->external)
+           clear_bit(MD_CHANGE_PENDING, &mddev->flags);
        wake_up(&mddev->sb_wait);
        return;
    }
@@ -4192,10 +4265,10 @@ static int md_alloc(dev_t dev, char *name)
    shift = partitioned ? MdpMinorShift : 0;
    unit = MINOR(mddev->unit) >> shift;
 
-   /* wait for any previous instance if this device
-    * to be completed removed (mddev_delayed_delete).
+   /* wait for any previous instance of this device to be
+    * completely removed (mddev_delayed_delete).
     */
-   flush_scheduled_work();
+   flush_workqueue(md_misc_wq);
 
    mutex_lock(&disks_mutex);
    error = -EEXIST;
@@ -4378,6 +4451,9 @@ int md_run(mddev_t *mddev)
        sysfs_notify_dirent_safe(rdev->sysfs_state);
    }
 
+   if (mddev->bio_set == NULL)
+       mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev));
+
    spin_lock(&pers_lock);
    pers = find_pers(mddev->level, mddev->clevel);
    if (!pers || !try_module_get(pers->owner)) {
@@ -5885,16 +5961,14 @@ static int md_open(struct block_device *bdev, fmode_t mode)
    mddev_t *mddev = mddev_find(bdev->bd_dev);
    int err;
 
-   mutex_lock(&md_mutex);
    if (mddev->gendisk != bdev->bd_disk) {
        /* we are racing with mddev_put which is discarding this
         * bd_disk.
         */
        mddev_put(mddev);
        /* Wait until bdev->bd_disk is definitely gone */
-       flush_scheduled_work();
+       flush_workqueue(md_misc_wq);
        /* Then retry the open from the top */
-       mutex_unlock(&md_mutex);
        return -ERESTARTSYS;
    }
    BUG_ON(mddev != bdev->bd_disk->private_data);
@@ -5908,7 +5982,6 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 
    check_disk_size_change(mddev->gendisk, bdev);
  out:
-   mutex_unlock(&md_mutex);
    return err;
 }
 
@@ -5917,10 +5990,8 @@ static int md_release(struct gendisk *disk, fmode_t mode)
    mddev_t *mddev = disk->private_data;
 
    BUG_ON(!mddev);
-   mutex_lock(&md_mutex);
    atomic_dec(&mddev->openers);
    mddev_put(mddev);
-   mutex_unlock(&md_mutex);
 
    return 0;
 }
@@ -6052,7 +6123,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
    set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
    md_wakeup_thread(mddev->thread);
    if (mddev->event_work.func)
-       schedule_work(&mddev->event_work);
+       queue_work(md_misc_wq, &mddev->event_work);
    md_new_event_inintr(mddev);
 }
 
@@ -7212,12 +7283,23 @@ static void md_geninit(void)
 
 static int __init md_init(void)
 {
-   if (register_blkdev(MD_MAJOR, "md"))
-       return -1;
-   if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
-       unregister_blkdev(MD_MAJOR, "md");
-       return -1;
-   }
+   int ret = -ENOMEM;
+
+   md_wq = alloc_workqueue("md", WQ_RESCUER, 0);
+   if (!md_wq)
+       goto err_wq;
+
+   md_misc_wq = alloc_workqueue("md_misc", 0, 0);
+   if (!md_misc_wq)
+       goto err_misc_wq;
+
+   if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
+       goto err_md;
+
+   if ((ret = register_blkdev(0, "mdp")) < 0)
+       goto err_mdp;
+   mdp_major = ret;
+
    blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
                md_probe, NULL, NULL);
    blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
@@ -7228,8 +7310,16 @@ static int __init md_init(void)
 
    md_geninit();
    return 0;
-}
 
+err_mdp:
+   unregister_blkdev(MD_MAJOR, "md");
+err_md:
+   destroy_workqueue(md_misc_wq);
+err_misc_wq:
+   destroy_workqueue(md_wq);
+err_wq:
+   return ret;
+}
 
 #ifndef MODULE
 
@@ -7316,6 +7406,8 @@ static __exit void md_exit(void)
        export_array(mddev);
        mddev->hold_active = 0;
    }
+   destroy_workqueue(md_misc_wq);
+   destroy_workqueue(md_wq);
 }
 
 subsys_initcall(md_init);
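The per-device bio pool added above relies on bioset_create()'s front padding: md_run() requests sizeof(mddev) bytes in front of each bio, and bio_alloc_mddev() stores the owning mddev_t pointer there (mddevp[-1]) so that mddev_bio_destructor() can find the right pool to free into. A standalone userspace model of that pointer-stashing layout (the struct and names here are stand-ins, not the kernel's):

    #include <stdio.h>
    #include <stdlib.h>

    struct bio { int payload; };            /* stand-in for the real struct bio */

    /* allocate with one pointer of front padding, as the bio set does */
    static struct bio *alloc_with_owner(void *owner)
    {
        void **p = malloc(sizeof(void *) + sizeof(struct bio));
        if (!p)
            return NULL;
        p[0] = owner;                       /* mddevp[-1] = mddev */
        return (struct bio *)(p + 1);
    }

    /* recover the owner, as mddev_bio_destructor() does */
    static void *owner_of(struct bio *b)
    {
        return ((void **)b)[-1];
    }

    int main(void)
    {
        int fake_mddev;
        struct bio *b = alloc_with_owner(&fake_mddev);

        printf("owner recovered: %s\n",
               owner_of(b) == &fake_mddev ? "yes" : "no");
        free((void **)b - 1);               /* free from the padded base */
        return 0;
    }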
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 112a2c32db0c..d05bab55df4e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -331,6 +331,8 @@ struct mddev_s
    struct attribute_group      *to_remove;
    struct plug_handle      *plug; /* if used by personality */
 
+   struct bio_set          *bio_set;
+
    /* Generic flush handling.
     * The last to finish preflush schedules a worker to submit
     * the rest of the request (without the REQ_FLUSH flag).
@@ -495,7 +497,7 @@ extern void md_flush_request(mddev_t *mddev, struct bio *bio);
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
               sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
-extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
            struct page *page, int rw);
 extern void md_do_sync(mddev_t *mddev);
 extern void md_new_event(mddev_t *mddev);
@@ -517,4 +519,8 @@ extern void md_rdev_init(mdk_rdev_t *rdev);
 
 extern void mddev_suspend(mddev_t *mddev);
 extern void mddev_resume(mddev_t *mddev);
+extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
+                  mddev_t *mddev);
+extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
+                  mddev_t *mddev);
 #endif /* _MD_MD_H */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 378a25894c57..45f8324196ec 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -100,7 +100,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
     * Allocate bios : 1 for reading, n-1 for writing
     */
    for (j = pi->raid_disks ; j-- ; ) {
-       bio = bio_alloc(gfp_flags, RESYNC_PAGES);
+       bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
        if (!bio)
            goto out_free_bio;
        r1_bio->bios[j] = bio;
@@ -306,6 +306,28 @@ static void raid1_end_read_request(struct bio *bio, int error)
    rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 }
 
+static void r1_bio_write_done(r1bio_t *r1_bio, int vcnt, struct bio_vec *bv,
+                             int behind)
+{
+   if (atomic_dec_and_test(&r1_bio->remaining))
+   {
+       /* it really is the end of this request */
+       if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+           /* free extra copy of the data pages */
+           int i = vcnt;
+           while (i--)
+               safe_put_page(bv[i].bv_page);
+       }
+       /* clear the bitmap if all writes complete successfully */
+       bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+               r1_bio->sectors,
+               !test_bit(R1BIO_Degraded, &r1_bio->state),
+               behind);
+       md_write_end(r1_bio->mddev);
+       raid_end_bio_io(r1_bio);
+   }
+}
+
 static void raid1_end_write_request(struct bio *bio, int error)
 {
    int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -373,21 +395,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
     * Let's see if all mirrored write operations have finished
     * already.
     */
-   if (atomic_dec_and_test(&r1_bio->remaining)) {
-       if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-           /* free extra copy of the data pages */
-           int i = bio->bi_vcnt;
-           while (i--)
-               safe_put_page(bio->bi_io_vec[i].bv_page);
-       }
-       /* clear the bitmap if all writes complete successfully */
-       bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
-               r1_bio->sectors,
-               !test_bit(R1BIO_Degraded, &r1_bio->state),
-               behind);
-       md_write_end(r1_bio->mddev);
-       raid_end_bio_io(r1_bio);
-   }
+   r1_bio_write_done(r1_bio, bio->bi_vcnt, bio->bi_io_vec, behind);
 
    if (to_put)
        bio_put(to_put);
@@ -411,11 +419,13 @@ static void raid1_end_write_request(struct bio *bio, int error)
 static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 {
    const sector_t this_sector = r1_bio->sector;
-   int new_disk = conf->last_used, disk = new_disk;
-   int wonly_disk = -1;
    const int sectors = r1_bio->sectors;
+   int new_disk = -1;
+   int start_disk;
+   int i;
    sector_t new_distance, current_distance;
    mdk_rdev_t *rdev;
+   int choose_first;
 
    rcu_read_lock();
    /*
@@ -426,54 +436,33 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
  retry:
    if (conf->mddev->recovery_cp < MaxSector &&
        (this_sector + sectors >= conf->next_resync)) {
-       /* Choose the first operational device, for consistancy */
-       new_disk = 0;
-
-       for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
-            r1_bio->bios[new_disk] == IO_BLOCKED ||
-            !rdev || !test_bit(In_sync, &rdev->flags)
-            || test_bit(WriteMostly, &rdev->flags);
-            rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
-
-           if (rdev && test_bit(In_sync, &rdev->flags) &&
-               r1_bio->bios[new_disk] != IO_BLOCKED)
-               wonly_disk = new_disk;
-
-           if (new_disk == conf->raid_disks - 1) {
-               new_disk = wonly_disk;
-               break;
-           }
-       }
-       goto rb_out;
+       choose_first = 1;
+       start_disk = 0;
+   } else {
+       choose_first = 0;
+       start_disk = conf->last_used;
    }
 
-
    /* make sure the disk is operational */
-   for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
-        r1_bio->bios[new_disk] == IO_BLOCKED ||
-        !rdev || !test_bit(In_sync, &rdev->flags) ||
-        test_bit(WriteMostly, &rdev->flags);
-        rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
-
-       if (rdev && test_bit(In_sync, &rdev->flags) &&
-           r1_bio->bios[new_disk] != IO_BLOCKED)
-           wonly_disk = new_disk;
-
-       if (new_disk <= 0)
-           new_disk = conf->raid_disks;
-       new_disk--;
-       if (new_disk == disk) {
-           new_disk = wonly_disk;
+   for (i = 0 ; i < conf->raid_disks ; i++) {
+       int disk = start_disk + i;
+       if (disk >= conf->raid_disks)
+           disk -= conf->raid_disks;
+
+       rdev = rcu_dereference(conf->mirrors[disk].rdev);
+       if (r1_bio->bios[disk] == IO_BLOCKED
+           || rdev == NULL
+           || !test_bit(In_sync, &rdev->flags))
+           continue;
+
+       new_disk = disk;
+       if (!test_bit(WriteMostly, &rdev->flags))
            break;
-       }
    }
 
-   if (new_disk < 0)
+   if (new_disk < 0 || choose_first)
        goto rb_out;
 
-   disk = new_disk;
-   /* now disk == new_disk == starting point for search */
-
    /*
     * Don't change to another disk for sequential reads:
     */
@@ -482,20 +471,21 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
    if (this_sector == conf->mirrors[new_disk].head_position)
        goto rb_out;
 
-   current_distance = abs(this_sector - conf->mirrors[disk].head_position);
-
-   /* Find the disk whose head is closest */
+   current_distance = abs(this_sector
+                  - conf->mirrors[new_disk].head_position);
 
-   do {
-       if (disk <= 0)
-           disk = conf->raid_disks;
-       disk--;
+   /* look for a better disk - i.e. head is closer */
+   start_disk = new_disk;
+   for (i = 1; i < conf->raid_disks; i++) {
+       int disk = start_disk + 1;
+       if (disk >= conf->raid_disks)
+           disk -= conf->raid_disks;
 
        rdev = rcu_dereference(conf->mirrors[disk].rdev);
-
-       if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
-           !test_bit(In_sync, &rdev->flags) ||
-           test_bit(WriteMostly, &rdev->flags))
+       if (r1_bio->bios[disk] == IO_BLOCKED
+           || rdev == NULL
+           || !test_bit(In_sync, &rdev->flags)
+           || test_bit(WriteMostly, &rdev->flags))
            continue;
 
        if (!atomic_read(&rdev->nr_pending)) {
@@ -507,11 +497,9 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
            current_distance = new_distance;
            new_disk = disk;
        }
-   } while (disk != conf->last_used);
+   }
 
  rb_out:
-
-
    if (new_disk >= 0) {
        rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
        if (!rdev)
@@ -658,7 +646,7 @@ static void raise_barrier(conf_t *conf)
    /* block any new IO from starting */
    conf->barrier++;
 
-   /* No wait for all pending IO to complete */
+   /* Now wait for all pending IO to complete */
    wait_event_lock_irq(conf->wait_barrier,
                !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
                conf->resync_lock,
@@ -735,23 +723,26 @@ static void unfreeze_array(conf_t *conf)
 }
 
 
-/* duplicate the data pages for behind I/O */
-static struct page **alloc_behind_pages(struct bio *bio)
+/* duplicate the data pages for behind I/O
+ * We return a list of bio_vec rather than just page pointers
+ * as it makes freeing easier
+ */
+static struct bio_vec *alloc_behind_pages(struct bio *bio)
 {
    int i;
    struct bio_vec *bvec;
-   struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
-                   GFP_NOIO);
+   struct bio_vec *pages = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
+                   GFP_NOIO);
    if (unlikely(!pages))
        goto do_sync_io;
 
    bio_for_each_segment(bvec, bio, i) {
-       pages[i] = alloc_page(GFP_NOIO);
-       if (unlikely(!pages[i]))
+       pages[i].bv_page = alloc_page(GFP_NOIO);
+       if (unlikely(!pages[i].bv_page))
            goto do_sync_io;
-       memcpy(kmap(pages[i]) + bvec->bv_offset,
+       memcpy(kmap(pages[i].bv_page) + bvec->bv_offset,
               kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
-       kunmap(pages[i]);
+       kunmap(pages[i].bv_page);
        kunmap(bvec->bv_page);
    }
 
@@ -759,8 +750,8 @@ static struct page **alloc_behind_pages(struct bio *bio)
 
 do_sync_io:
    if (pages)
-       for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
-           put_page(pages[i]);
+       for (i = 0; i < bio->bi_vcnt && pages[i].bv_page; i++)
+           put_page(pages[i].bv_page);
    kfree(pages);
    PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
    return NULL;
@@ -775,8 +766,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
    int i, targets = 0, disks;
    struct bitmap *bitmap;
    unsigned long flags;
-   struct bio_list bl;
-   struct page **behind_pages = NULL;
+   struct bio_vec *behind_pages = NULL;
    const int rw = bio_data_dir(bio);
    const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
    const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
@@ -851,7 +841,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
    }
    r1_bio->read_disk = rdisk;
 
-   read_bio = bio_clone(bio, GFP_NOIO);
+   read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 
    r1_bio->bios[rdisk] = read_bio;
 
@@ -873,13 +863,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
     * bios[x] to bio
     */
    disks = conf->raid_disks;
-#if 0
-   { static int first=1;
-   if (first) printk("First Write sector %llu disks %d\n",
-             (unsigned long long)r1_bio->sector, disks);
-   first = 0;
-   }
-#endif
  retry_write:
    blocked_rdev = NULL;
    rcu_read_lock();
@@ -937,16 +920,17 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        (behind_pages = alloc_behind_pages(bio)) != NULL)
        set_bit(R1BIO_BehindIO, &r1_bio->state);
 
-   atomic_set(&r1_bio->remaining, 0);
+   atomic_set(&r1_bio->remaining, 1);
    atomic_set(&r1_bio->behind_remaining, 0);
 
-   bio_list_init(&bl);
+   bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
+             test_bit(R1BIO_BehindIO, &r1_bio->state));
    for (i = 0; i < disks; i++) {
        struct bio *mbio;
        if (!r1_bio->bios[i])
            continue;
 
-       mbio = bio_clone(bio, GFP_NOIO);
+       mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
        r1_bio->bios[i] = mbio;
 
        mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
@@ -963,39 +947,29 @@ static int make_request(mddev_t *mddev, struct bio * bio)
         * we clear any unused pointer in the io_vec, rather
         * than leave them unchanged. This is important
         * because when we come to free the pages, we won't
-        * know the originial bi_idx, so we just free
+        * know the original bi_idx, so we just free
         * them all
         */
        __bio_for_each_segment(bvec, mbio, j, 0)
-           bvec->bv_page = behind_pages[j];
+           bvec->bv_page = behind_pages[j].bv_page;
        if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
            atomic_inc(&r1_bio->behind_remaining);
        }
 
        atomic_inc(&r1_bio->remaining);
-
-       bio_list_add(&bl, mbio);
+       spin_lock_irqsave(&conf->device_lock, flags);
+       bio_list_add(&conf->pending_bio_list, mbio);
+       blk_plug_device(mddev->queue);
+       spin_unlock_irqrestore(&conf->device_lock, flags);
    }
+   r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
    kfree(behind_pages); /* the behind pages are attached to the bios now */
 
-   bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
-               test_bit(R1BIO_BehindIO, &r1_bio->state));
-   spin_lock_irqsave(&conf->device_lock, flags);
-   bio_list_merge(&conf->pending_bio_list, &bl);
-   bio_list_init(&bl);
-
-   blk_plug_device(mddev->queue);
-   spin_unlock_irqrestore(&conf->device_lock, flags);
-
-   /* In case raid1d snuck into freeze_array */
+   /* In case raid1d snuck in to freeze_array */
    wake_up(&conf->wait_barrier);
 
    if (do_sync)
        md_wakeup_thread(mddev->thread);
-#if 0
-   while ((bio = bio_list_pop(&bl)) != NULL)
-       generic_make_request(bio);
-#endif
 
    return 0;
 }
@@ -1183,7 +1157,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
            err = -EBUSY;
            goto abort;
        }
-       /* Only remove non-faulty devices is recovery
+       /* Only remove non-faulty devices if recovery
        * is not possible.
        */
        if (!test_bit(Faulty, &rdev->flags) &&
@@ -1245,7 +1219,7 @@ static void end_sync_write(struct bio *bio, int error)
            break;
        }
    if (!uptodate) {
-       int sync_blocks = 0;
+       sector_t sync_blocks = 0;
        sector_t s = r1_bio->sector;
        long sectors_to_go = r1_bio->sectors;
        /* make sure these bits doesn't get cleared. */
@@ -1388,7 +1362,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
                 * active, and resync is currently active
                 */
                rdev = conf->mirrors[d].rdev;
-               if (sync_page_io(rdev->bdev,
+               if (sync_page_io(rdev,
                         sect + rdev->data_offset,
                         s<<9,
                         bio->bi_io_vec[idx].bv_page,
@@ -1414,7 +1388,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
                    continue;
                rdev = conf->mirrors[d].rdev;
                atomic_add(s, &rdev->corrected_errors);
-               if (sync_page_io(rdev->bdev,
+               if (sync_page_io(rdev,
                         sect + rdev->data_offset,
                         s<<9,
                         bio->bi_io_vec[idx].bv_page,
@@ -1429,7 +1403,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
                if (r1_bio->bios[d]->bi_end_io != end_sync_read)
                    continue;
                rdev = conf->mirrors[d].rdev;
-               if (sync_page_io(rdev->bdev,
+               if (sync_page_io(rdev,
                         sect + rdev->data_offset,
                         s<<9,
                         bio->bi_io_vec[idx].bv_page,
@@ -1513,7 +1487,7 @@ static void fix_read_error(conf_t *conf, int read_disk,
            rdev = conf->mirrors[d].rdev;
            if (rdev &&
                test_bit(In_sync, &rdev->flags) &&
-               sync_page_io(rdev->bdev,
+               sync_page_io(rdev,
                     sect + rdev->data_offset,
                     s<<9,
                     conf->tmppage, READ))
@@ -1539,7 +1513,7 @@ static void fix_read_error(conf_t *conf, int read_disk,
            rdev = conf->mirrors[d].rdev;
            if (rdev &&
                test_bit(In_sync, &rdev->flags)) {
-               if (sync_page_io(rdev->bdev,
+               if (sync_page_io(rdev,
                         sect + rdev->data_offset,
                         s<<9, conf->tmppage, WRITE)
                    == 0)
@@ -1556,7 +1530,7 @@ static void fix_read_error(conf_t *conf, int read_disk,
            rdev = conf->mirrors[d].rdev;
            if (rdev &&
                test_bit(In_sync, &rdev->flags)) {
-               if (sync_page_io(rdev->bdev,
+               if (sync_page_io(rdev,
                         sect + rdev->data_offset,
                         s<<9, conf->tmppage, READ)
                    == 0)
@@ -1646,7 +1620,8 @@ static void raid1d(mddev_t *mddev)
                    mddev->ro ? IO_BLOCKED : NULL;
                r1_bio->read_disk = disk;
                bio_put(bio);
-               bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
+               bio = bio_clone_mddev(r1_bio->master_bio,
+                             GFP_NOIO, mddev);
                r1_bio->bios[r1_bio->read_disk] = bio;
                rdev = conf->mirrors[disk].rdev;
                if (printk_ratelimit())
@@ -1705,7 +1680,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
    int i;
    int wonly = -1;
    int write_targets = 0, read_targets = 0;
-   int sync_blocks;
+   sector_t sync_blocks;
    int still_degraded = 0;
 
    if (!conf->r1buf_pool)
@@ -1755,11 +1730,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
        msleep_interruptible(1000);
 
    bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+   r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
    raise_barrier(conf);
 
    conf->next_resync = sector_nr;
 
-   r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
    rcu_read_lock();
    /*
     * If we get a correctably read error during resync or recovery,
@@ -1971,7 +1946,6 @@ static conf_t *setup_conf(mddev_t *mddev)
    init_waitqueue_head(&conf->wait_barrier);
 
    bio_list_init(&conf->pending_bio_list);
-   bio_list_init(&conf->flushing_bio_list);
 
    conf->last_used = -1;
    for (i = 0; i < conf->raid_disks; i++) {
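The read_balance() rewrite above replaces two open-coded backward scans with one forward loop that starts at conf->last_used and wraps modulo conf->raid_disks. A self-contained sketch of that circular-scan shape (the usable[] predicate stands in for the IO_BLOCKED/In_sync/WriteMostly checks done on each rdev):

    /* visit every disk once, starting at start_disk and wrapping */
    static int first_usable(int start_disk, int raid_disks, const int usable[])
    {
        int i;

        for (i = 0; i < raid_disks; i++) {
            int disk = start_disk + i;

            if (disk >= raid_disks)
                disk -= raid_disks;
            if (usable[disk])
                return disk;
        }
        return -1;      /* no operational disk found */
    }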
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index adf8cfd73313..cbfdf1a6acd9 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -35,8 +35,6 @@ struct r1_private_data_s {
    struct list_head    retry_list;
    /* queue pending writes and submit them on unplug */
    struct bio_list     pending_bio_list;
-   /* queue of writes that have been unplugged */
-   struct bio_list     flushing_bio_list;
 
    /* for use when syncing mirrors: */
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f0d082f749be..c67aa54694ae 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -120,7 +120,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
     * Allocate bios.
     */
    for (j = nalloc ; j-- ; ) {
-       bio = bio_alloc(gfp_flags, RESYNC_PAGES);
+       bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
        if (!bio)
            goto out_free_bio;
        r10_bio->devs[j].bio = bio;
@@ -801,7 +801,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
    const int rw = bio_data_dir(bio);
    const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
    const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
-   struct bio_list bl;
    unsigned long flags;
    mdk_rdev_t *blocked_rdev;
 
@@ -890,7 +889,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
    }
    mirror = conf->mirrors + disk;
 
-   read_bio = bio_clone(bio, GFP_NOIO);
+   read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 
    r10_bio->devs[slot].bio = read_bio;
 
@@ -950,16 +949,16 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        goto retry_write;
    }
 
-   atomic_set(&r10_bio->remaining, 0);
+   atomic_set(&r10_bio->remaining, 1);
+   bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
 
-   bio_list_init(&bl);
    for (i = 0; i < conf->copies; i++) {
        struct bio *mbio;
        int d = r10_bio->devs[i].devnum;
        if (!r10_bio->devs[i].bio)
            continue;
 
-       mbio = bio_clone(bio, GFP_NOIO);
+       mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
        r10_bio->devs[i].bio = mbio;
 
        mbio->bi_sector = r10_bio->devs[i].addr+
@@ -970,22 +969,22 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        mbio->bi_private = r10_bio;
 
        atomic_inc(&r10_bio->remaining);
-       bio_list_add(&bl, mbio);
+       spin_lock_irqsave(&conf->device_lock, flags);
+       bio_list_add(&conf->pending_bio_list, mbio);
+       blk_plug_device(mddev->queue);
+       spin_unlock_irqrestore(&conf->device_lock, flags);
    }
 
-   if (unlikely(!atomic_read(&r10_bio->remaining))) {
-       /* the array is dead */
+   if (atomic_dec_and_test(&r10_bio->remaining)) {
+       /* This matches the end of raid10_end_write_request() */
+       bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
+               r10_bio->sectors,
+               !test_bit(R10BIO_Degraded, &r10_bio->state),
+               0);
        md_write_end(mddev);
        raid_end_bio_io(r10_bio);
-       return 0;
    }
 
-   bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
-   spin_lock_irqsave(&conf->device_lock, flags);
-   bio_list_merge(&conf->pending_bio_list, &bl);
-   blk_plug_device(mddev->queue);
-   spin_unlock_irqrestore(&conf->device_lock, flags);
-
    /* In case raid10d snuck in to freeze_array */
    wake_up(&conf->wait_barrier);
 
@@ -1558,7 +1557,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                test_bit(In_sync, &rdev->flags)) {
                atomic_inc(&rdev->nr_pending);
                rcu_read_unlock();
-               success = sync_page_io(rdev->bdev,
+               success = sync_page_io(rdev,
                               r10_bio->devs[sl].addr +
                               sect + rdev->data_offset,
                               s<<9,
@@ -1597,7 +1596,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                atomic_inc(&rdev->nr_pending);
                rcu_read_unlock();
                atomic_add(s, &rdev->corrected_errors);
-               if (sync_page_io(rdev->bdev,
+               if (sync_page_io(rdev,
                         r10_bio->devs[sl].addr +
                         sect + rdev->data_offset,
                         s<<9, conf->tmppage, WRITE)
@@ -1634,7 +1633,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                char b[BDEVNAME_SIZE];
                atomic_inc(&rdev->nr_pending);
                rcu_read_unlock();
-               if (sync_page_io(rdev->bdev,
+               if (sync_page_io(rdev,
                         r10_bio->devs[sl].addr +
                         sect + rdev->data_offset,
                         s<<9, conf->tmppage,
@@ -1747,7 +1746,8 @@ static void raid10d(mddev_t *mddev)
                    mdname(mddev),
                    bdevname(rdev->bdev,b),
                    (unsigned long long)r10_bio->sector);
-               bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
+               bio = bio_clone_mddev(r10_bio->master_bio,
+                             GFP_NOIO, mddev);
                r10_bio->devs[r10_bio->read_slot].bio = bio;
                bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
                    + rdev->data_offset;
@@ -1820,7 +1820,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
    int disk;
    int i;
    int max_sync;
-   int sync_blocks;
+   sector_t sync_blocks;
 
    sector_t sectors_skipped = 0;
    int chunks_skipped = 0;
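The remaining-count change in make_request() above (initialising the count to 1 and doing atomic_dec_and_test() after the submission loop) is the usual bias trick: the submitter holds one reference of its own so that sub-bios completing early cannot drop the count to zero, and whoever performs the final decrement runs the end-of-request work exactly once. A compact C11 model of the pattern, where complete_request() stands in for the bitmap_endwrite()/md_write_end()/raid_end_bio_io() sequence:

    #include <stdatomic.h>

    static void complete_request(void)
    {
        /* end-of-request bookkeeping runs exactly once */
    }

    static void sub_bio_done(atomic_int *remaining)
    {
        if (atomic_fetch_sub(remaining, 1) == 1)
            complete_request();
    }

    static void submit_all(atomic_int *remaining, int copies)
    {
        int i;

        atomic_store(remaining, 1);             /* the submitter's bias */
        for (i = 0; i < copies; i++) {
            atomic_fetch_add(remaining, 1);
            /* queue copy i; its completion calls sub_bio_done() */
        }
        sub_bio_done(remaining);                /* drop the bias */
    }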
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 31140d1259dc..dc574f303f8b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3876,9 +3876,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
        return 0;
    }
    /*
-    * use bio_clone to make a copy of the bio
+    * use bio_clone_mddev to make a copy of the bio
     */
-   align_bi = bio_clone(raid_bio, GFP_NOIO);
+   align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
    if (!align_bi)
        return 0;
    /*
@@ -4360,7 +4360,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
    raid5_conf_t *conf = mddev->private;
    struct stripe_head *sh;
    sector_t max_sector = mddev->dev_sectors;
-   int sync_blocks;
+   sector_t sync_blocks;
    int still_degraded = 0;
    int i;
 