aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeilBrown <neilb@cse.unsw.edu.au>2005-09-09 19:23:47 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2005-09-09 19:39:10 -0400
commit4b6d287f627b5fb6a49f78f9e81649ff98c62bb7 (patch)
tree7b6cbc6a997e25a7fb6185da7129e539c4ffda8b
parent8ddf9efe6708f3674f0ddfeb6425fd27bea109a2 (diff)
[PATCH] md: add write-behind support for md/raid1
If a device is flagged 'WriteMostly' and the array has a bitmap, and the bitmap superblock indicates that write_behind is allowed, then write_behind is enabled for WriteMostly devices. Write requests will be acknowledged as complete to the caller (via b_end_io) when all non-WriteMostly devices have completed the write, but will not be cleared from the bitmap until all devices complete. This requires memory allocation to make a local copy of the data being written. If there is insufficient memory, then we fall back on normal write semantics. Signed-Off-By: Paul Clements <paul.clements@steeleye.com> Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--drivers/md/bitmap.c26
-rw-r--r--drivers/md/raid1.c124
-rw-r--r--include/linux/raid/bitmap.h15
-rw-r--r--include/linux/raid/md_k.h3
-rw-r--r--include/linux/raid/raid1.h13
5 files changed, 165 insertions, 16 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 2925219f0881..2c84de2b4ad5 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -437,6 +437,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
437 printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); 437 printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
438 printk(KERN_DEBUG " sync size: %llu KB\n", 438 printk(KERN_DEBUG " sync size: %llu KB\n",
439 (unsigned long long)le64_to_cpu(sb->sync_size)/2); 439 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
440 printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
440 kunmap(bitmap->sb_page); 441 kunmap(bitmap->sb_page);
441} 442}
442 443
@@ -445,7 +446,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
445{ 446{
446 char *reason = NULL; 447 char *reason = NULL;
447 bitmap_super_t *sb; 448 bitmap_super_t *sb;
448 unsigned long chunksize, daemon_sleep; 449 unsigned long chunksize, daemon_sleep, write_behind;
449 unsigned long bytes_read; 450 unsigned long bytes_read;
450 unsigned long long events; 451 unsigned long long events;
451 int err = -EINVAL; 452 int err = -EINVAL;
@@ -474,6 +475,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
474 475
475 chunksize = le32_to_cpu(sb->chunksize); 476 chunksize = le32_to_cpu(sb->chunksize);
476 daemon_sleep = le32_to_cpu(sb->daemon_sleep); 477 daemon_sleep = le32_to_cpu(sb->daemon_sleep);
478 write_behind = le32_to_cpu(sb->write_behind);
477 479
478 /* verify that the bitmap-specific fields are valid */ 480 /* verify that the bitmap-specific fields are valid */
479 if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) 481 if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -485,7 +487,9 @@ static int bitmap_read_sb(struct bitmap *bitmap)
485 else if ((1 << ffz(~chunksize)) != chunksize) 487 else if ((1 << ffz(~chunksize)) != chunksize)
486 reason = "bitmap chunksize not a power of 2"; 488 reason = "bitmap chunksize not a power of 2";
487 else if (daemon_sleep < 1 || daemon_sleep > 15) 489 else if (daemon_sleep < 1 || daemon_sleep > 15)
488 reason = "daemon sleep period out of range"; 490 reason = "daemon sleep period out of range (1-15s)";
491 else if (write_behind > COUNTER_MAX)
492 reason = "write-behind limit out of range (0 - 16383)";
489 if (reason) { 493 if (reason) {
490 printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", 494 printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n",
491 bmname(bitmap), reason); 495 bmname(bitmap), reason);
@@ -518,6 +522,7 @@ success:
518 /* assign fields using values from superblock */ 522 /* assign fields using values from superblock */
519 bitmap->chunksize = chunksize; 523 bitmap->chunksize = chunksize;
520 bitmap->daemon_sleep = daemon_sleep; 524 bitmap->daemon_sleep = daemon_sleep;
525 bitmap->max_write_behind = write_behind;
521 bitmap->flags |= sb->state; 526 bitmap->flags |= sb->state;
522 bitmap->events_cleared = le64_to_cpu(sb->events_cleared); 527 bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
523 if (sb->state & BITMAP_STALE) 528 if (sb->state & BITMAP_STALE)
@@ -1282,9 +1287,16 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1282 } 1287 }
1283} 1288}
1284 1289
1285int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors) 1290int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind)
1286{ 1291{
1287 if (!bitmap) return 0; 1292 if (!bitmap) return 0;
1293
1294 if (behind) {
1295 atomic_inc(&bitmap->behind_writes);
1296 PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n",
1297 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
1298 }
1299
1288 while (sectors) { 1300 while (sectors) {
1289 int blocks; 1301 int blocks;
1290 bitmap_counter_t *bmc; 1302 bitmap_counter_t *bmc;
@@ -1319,9 +1331,15 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1319} 1331}
1320 1332
1321void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, 1333void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
1322 int success) 1334 int success, int behind)
1323{ 1335{
1324 if (!bitmap) return; 1336 if (!bitmap) return;
1337 if (behind) {
1338 atomic_dec(&bitmap->behind_writes);
1339 PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
1340 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
1341 }
1342
1325 while (sectors) { 1343 while (sectors) {
1326 int blocks; 1344 int blocks;
1327 unsigned long flags; 1345 unsigned long flags;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 28839a8193f2..ba7f5f256161 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
222{ 222{
223 struct bio *bio = r1_bio->master_bio; 223 struct bio *bio = r1_bio->master_bio;
224 224
225 bio_endio(bio, bio->bi_size, 225 /* if nobody has done the final endio yet, do it now */
226 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); 226 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
227 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
228 (bio_data_dir(bio) == WRITE) ? "write" : "read",
229 (unsigned long long) bio->bi_sector,
230 (unsigned long long) bio->bi_sector +
231 (bio->bi_size >> 9) - 1);
232
233 bio_endio(bio, bio->bi_size,
234 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
235 }
227 free_r1bio(r1_bio); 236 free_r1bio(r1_bio);
228} 237}
229 238
@@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
292{ 301{
293 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 302 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
294 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 303 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
295 int mirror; 304 int mirror, behind;
296 conf_t *conf = mddev_to_conf(r1_bio->mddev); 305 conf_t *conf = mddev_to_conf(r1_bio->mddev);
297 306
298 if (bio->bi_size) 307 if (bio->bi_size)
@@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
323 332
324 update_head_pos(mirror, r1_bio); 333 update_head_pos(mirror, r1_bio);
325 334
335 behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
336 if (behind) {
337 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
338 atomic_dec(&r1_bio->behind_remaining);
339
340 /* In behind mode, we ACK the master bio once the I/O has safely
341 * reached all non-writemostly disks. Setting the Returned bit
342 * ensures that this gets done only once -- we don't ever want to
343 * return -EIO here, instead we'll wait */
344
345 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
346 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
347 /* Maybe we can return now */
348 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
349 struct bio *mbio = r1_bio->master_bio;
350 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
351 (unsigned long long) mbio->bi_sector,
352 (unsigned long long) mbio->bi_sector +
353 (mbio->bi_size >> 9) - 1);
354 bio_endio(mbio, mbio->bi_size, 0);
355 }
356 }
357 }
326 /* 358 /*
327 * 359 *
328 * Let's see if all mirrored write operations have finished 360 * Let's see if all mirrored write operations have finished
329 * already. 361 * already.
330 */ 362 */
331 if (atomic_dec_and_test(&r1_bio->remaining)) { 363 if (atomic_dec_and_test(&r1_bio->remaining)) {
364 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
365 /* free extra copy of the data pages */
366 int i = bio->bi_vcnt;
367 while (i--)
368 __free_page(bio->bi_io_vec[i].bv_page);
369 }
332 /* clear the bitmap if all writes complete successfully */ 370 /* clear the bitmap if all writes complete successfully */
333 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, 371 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
334 r1_bio->sectors, 372 r1_bio->sectors,
335 !test_bit(R1BIO_Degraded, &r1_bio->state)); 373 !test_bit(R1BIO_Degraded, &r1_bio->state),
374 behind);
336 md_write_end(r1_bio->mddev); 375 md_write_end(r1_bio->mddev);
337 raid_end_bio_io(r1_bio); 376 raid_end_bio_io(r1_bio);
338 } 377 }
@@ -562,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect)
562 spin_unlock_irq(&conf->resync_lock); 601 spin_unlock_irq(&conf->resync_lock);
563} 602}
564 603
604/* duplicate the data pages for behind I/O */
605static struct page **alloc_behind_pages(struct bio *bio)
606{
607 int i;
608 struct bio_vec *bvec;
609 struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *),
610 GFP_NOIO);
611 if (unlikely(!pages))
612 goto do_sync_io;
613
614 memset(pages, 0, bio->bi_vcnt * sizeof(struct page *));
615
616 bio_for_each_segment(bvec, bio, i) {
617 pages[i] = alloc_page(GFP_NOIO);
618 if (unlikely(!pages[i]))
619 goto do_sync_io;
620 memcpy(kmap(pages[i]) + bvec->bv_offset,
621 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
622 kunmap(pages[i]);
623 kunmap(bvec->bv_page);
624 }
625
626 return pages;
627
628do_sync_io:
629 if (pages)
630 for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
631 __free_page(pages[i]);
632 kfree(pages);
633 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
634 return NULL;
635}
636
565static int make_request(request_queue_t *q, struct bio * bio) 637static int make_request(request_queue_t *q, struct bio * bio)
566{ 638{
567 mddev_t *mddev = q->queuedata; 639 mddev_t *mddev = q->queuedata;
@@ -574,6 +646,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
574 struct bitmap *bitmap = mddev->bitmap; 646 struct bitmap *bitmap = mddev->bitmap;
575 unsigned long flags; 647 unsigned long flags;
576 struct bio_list bl; 648 struct bio_list bl;
649 struct page **behind_pages = NULL;
577 650
578 if (unlikely(bio_barrier(bio))) { 651 if (unlikely(bio_barrier(bio))) {
579 bio_endio(bio, bio->bi_size, -EOPNOTSUPP); 652 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -613,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio)
613 r1_bio->mddev = mddev; 686 r1_bio->mddev = mddev;
614 r1_bio->sector = bio->bi_sector; 687 r1_bio->sector = bio->bi_sector;
615 688
616 r1_bio->state = 0;
617
618 if (bio_data_dir(bio) == READ) { 689 if (bio_data_dir(bio) == READ) {
619 /* 690 /*
620 * read balancing logic: 691 * read balancing logic:
@@ -675,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio)
675 } 746 }
676 rcu_read_unlock(); 747 rcu_read_unlock();
677 748
749 BUG_ON(targets == 0); /* we never fail the last device */
750
678 if (targets < conf->raid_disks) { 751 if (targets < conf->raid_disks) {
679 /* array is degraded, we will not clear the bitmap 752 /* array is degraded, we will not clear the bitmap
680 * on I/O completion (see raid1_end_write_request) */ 753 * on I/O completion (see raid1_end_write_request) */
681 set_bit(R1BIO_Degraded, &r1_bio->state); 754 set_bit(R1BIO_Degraded, &r1_bio->state);
682 } 755 }
683 756
757 /* do behind I/O ? */
758 if (bitmap &&
759 atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
760 (behind_pages = alloc_behind_pages(bio)) != NULL)
761 set_bit(R1BIO_BehindIO, &r1_bio->state);
762
684 atomic_set(&r1_bio->remaining, 0); 763 atomic_set(&r1_bio->remaining, 0);
764 atomic_set(&r1_bio->behind_remaining, 0);
685 765
686 bio_list_init(&bl); 766 bio_list_init(&bl);
687 for (i = 0; i < disks; i++) { 767 for (i = 0; i < disks; i++) {
@@ -698,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio)
698 mbio->bi_rw = WRITE; 778 mbio->bi_rw = WRITE;
699 mbio->bi_private = r1_bio; 779 mbio->bi_private = r1_bio;
700 780
781 if (behind_pages) {
782 struct bio_vec *bvec;
783 int j;
784
785 /* Yes, I really want the '__' version so that
786 * we clear any unused pointer in the io_vec, rather
787 * than leave them unchanged. This is important
788 * because when we come to free the pages, we won't
789 * know the originial bi_idx, so we just free
790 * them all
791 */
792 __bio_for_each_segment(bvec, mbio, j, 0)
793 bvec->bv_page = behind_pages[j];
794 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
795 atomic_inc(&r1_bio->behind_remaining);
796 }
797
701 atomic_inc(&r1_bio->remaining); 798 atomic_inc(&r1_bio->remaining);
702 799
703 bio_list_add(&bl, mbio); 800 bio_list_add(&bl, mbio);
704 } 801 }
802 kfree(behind_pages); /* the behind pages are attached to the bios now */
705 803
706 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); 804 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
805 test_bit(R1BIO_BehindIO, &r1_bio->state));
707 spin_lock_irqsave(&conf->device_lock, flags); 806 spin_lock_irqsave(&conf->device_lock, flags);
708 bio_list_merge(&conf->pending_bio_list, &bl); 807 bio_list_merge(&conf->pending_bio_list, &bl);
709 bio_list_init(&bl); 808 bio_list_init(&bl);
@@ -1471,6 +1570,17 @@ out:
1471static int stop(mddev_t *mddev) 1570static int stop(mddev_t *mddev)
1472{ 1571{
1473 conf_t *conf = mddev_to_conf(mddev); 1572 conf_t *conf = mddev_to_conf(mddev);
1573 struct bitmap *bitmap = mddev->bitmap;
1574 int behind_wait = 0;
1575
1576 /* wait for behind writes to complete */
1577 while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
1578 behind_wait++;
1579 printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
1580 set_current_state(TASK_UNINTERRUPTIBLE);
1581 schedule_timeout(HZ); /* wait a second */
1582 /* need to kick something here to make sure I/O goes? */
1583 }
1474 1584
1475 md_unregister_thread(mddev->thread); 1585 md_unregister_thread(mddev->thread);
1476 mddev->thread = NULL; 1586 mddev->thread = NULL;
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 4bf1659f8aa8..9de99198caf1 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -7,7 +7,7 @@
7#define BITMAP_H 1 7#define BITMAP_H 1
8 8
9#define BITMAP_MAJOR 3 9#define BITMAP_MAJOR 3
10#define BITMAP_MINOR 38 10#define BITMAP_MINOR 39
11 11
12/* 12/*
13 * in-memory bitmap: 13 * in-memory bitmap:
@@ -147,8 +147,9 @@ typedef struct bitmap_super_s {
147 __u32 state; /* 48 bitmap state information */ 147 __u32 state; /* 48 bitmap state information */
148 __u32 chunksize; /* 52 the bitmap chunk size in bytes */ 148 __u32 chunksize; /* 52 the bitmap chunk size in bytes */
149 __u32 daemon_sleep; /* 56 seconds between disk flushes */ 149 __u32 daemon_sleep; /* 56 seconds between disk flushes */
150 __u32 write_behind; /* 60 number of outstanding write-behind writes */
150 151
151 __u8 pad[256 - 60]; /* set to zero */ 152 __u8 pad[256 - 64]; /* set to zero */
152} bitmap_super_t; 153} bitmap_super_t;
153 154
154/* notes: 155/* notes:
@@ -226,6 +227,9 @@ struct bitmap {
226 227
227 unsigned long flags; 228 unsigned long flags;
228 229
230 unsigned long max_write_behind; /* write-behind mode */
231 atomic_t behind_writes;
232
229 /* 233 /*
230 * the bitmap daemon - periodically wakes up and sweeps the bitmap 234 * the bitmap daemon - periodically wakes up and sweeps the bitmap
231 * file, cleaning up bits and flushing out pages to disk as necessary 235 * file, cleaning up bits and flushing out pages to disk as necessary
@@ -260,9 +264,10 @@ int bitmap_setallbits(struct bitmap *bitmap);
260void bitmap_write_all(struct bitmap *bitmap); 264void bitmap_write_all(struct bitmap *bitmap);
261 265
262/* these are exported */ 266/* these are exported */
263int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors); 267int bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
264void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, 268 unsigned long sectors, int behind);
265 int success); 269void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
270 unsigned long sectors, int success, int behind);
266int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); 271int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded);
267void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); 272void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
268void bitmap_close_sync(struct bitmap *bitmap); 273void bitmap_close_sync(struct bitmap *bitmap);
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 7ef78e15ce04..2514e5fcda7f 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -275,6 +275,9 @@ struct mddev_s
275 atomic_t writes_pending; 275 atomic_t writes_pending;
276 request_queue_t *queue; /* for plugging ... */ 276 request_queue_t *queue; /* for plugging ... */
277 277
278 atomic_t write_behind; /* outstanding async IO */
279 unsigned int max_write_behind; /* 0 = sync */
280
278 struct bitmap *bitmap; /* the bitmap for the device */ 281 struct bitmap *bitmap; /* the bitmap for the device */
279 struct file *bitmap_file; /* the bitmap file */ 282 struct file *bitmap_file; /* the bitmap file */
280 long bitmap_offset; /* offset from superblock of 283 long bitmap_offset; /* offset from superblock of
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index 9d93cf12e890..60e19b667548 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -80,6 +80,9 @@ struct r1bio_s {
80 atomic_t remaining; /* 'have we finished' count, 80 atomic_t remaining; /* 'have we finished' count,
81 * used from IRQ handlers 81 * used from IRQ handlers
82 */ 82 */
83 atomic_t behind_remaining; /* number of write-behind ios remaining
84 * in this BehindIO request
85 */
83 sector_t sector; 86 sector_t sector;
84 int sectors; 87 int sectors;
85 unsigned long state; 88 unsigned long state;
@@ -107,4 +110,14 @@ struct r1bio_s {
107#define R1BIO_Uptodate 0 110#define R1BIO_Uptodate 0
108#define R1BIO_IsSync 1 111#define R1BIO_IsSync 1
109#define R1BIO_Degraded 2 112#define R1BIO_Degraded 2
113#define R1BIO_BehindIO 3
114/* For write-behind requests, we call bi_end_io when
115 * the last non-write-behind device completes, providing
116 * any write was successful. Otherwise we call when
117 * any write-behind write succeeds, otherwise we call
118 * with failure when last write completes (and all failed).
119 * Record that bi_end_io was called with this flag...
120 */
121#define R1BIO_Returned 4
122
110#endif 123#endif