[PATCH] md: add write-intent-bitmap support to raid5

Most awkward part of this is delaying write requests until bitmap updates have been flushed. To achieve this, we have a sequence number (seq_flush) which is incremented each time the raid5 is unplugged. If the raid thread notices that this has changed, it flushes bitmap changes, and assigned the value of seq_flush to seq_write. When a write request arrives, it is given the number from seq_write, and that write request may not complete until seq_flush is larger than the saved seq number. We have a new queue for storing stripes which are waiting for a bitmap flush and an extra flag for stripes to record if the write was 'degraded' and so should not clear the a bit in the bitmap. Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: NeilBrown <neilb@cse.unsw.edu.au> 2005-09-09 19:23:54 -0400
committer: Linus Torvalds <torvalds@g5.osdl.org> 2005-09-09 19:39:12 -0400
commit: 72626685dc66d455742a7f215a0535c551628b9e (patch)
tree: 91e19a61a5a3b782007132b6b2e353e8936dd656 /drivers/md/raid5.c
parent: 0002b2718dd04da67c21f8a7830de8d95a9b0345 (diff)
1 files changed, 123 insertions, 10 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ed859e08d600..4683ca24c046 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -24,6 +24,8 @@
 #include <linux/bitops.h>
 #include <asm/atomic.h>
+#include <linux/raid/bitmap.h>
 /*
 * Stripe cache
 */
@@ -79,8 +81,13 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
                if (test_bit(STRIPE_HANDLE, &sh->state)) {
                        if (test_bit(STRIPE_DELAYED, &sh->state))
                                list_add_tail(&sh->lru, &conf->delayed_list);
-                        else
+                        else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+                                 conf->seq_write == sh->bm_seq)
+                                list_add_tail(&sh->lru, &conf->bitmap_list);
+                        else {
+                                clear_bit(STRIPE_BIT_DELAY, &sh->state);
                                list_add_tail(&sh->lru, &conf->handle_list);
+                        }
                        md_wakeup_thread(conf->mddev->thread);
                } else {
                        if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -244,6 +251,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
        spin_lock_irq(&conf->device_lock);
        do {
+                wait_event_lock_irq(conf->wait_for_stripe,
+                                    conf->quiesce == 0,
+                                    conf->device_lock, /* nothing */);
                sh = __find_stripe(conf, sector);
                if (!sh) {
                        if (!conf->inactive_blocked)
@@ -803,6 +813,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 {
        struct bio **bip;
        raid5_conf_t *conf = sh->raid_conf;
+        int firstwrite=0;
        PRINTK("adding bh b#%llu to stripe s#%llu\n",
                (unsigned long long)bi->bi_sector,
@@ -811,9 +822,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
        spin_lock(&sh->lock);
        spin_lock_irq(&conf->device_lock);
-        if (forwrite)
+        if (forwrite) {
                bip = &sh->dev[dd_idx].towrite;
-        else
+                if (*bip == NULL && sh->dev[dd_idx].written == NULL)
+                        firstwrite = 1;
+        } else
                bip = &sh->dev[dd_idx].toread;
        while (*bip && (*bip)->bi_sector < bi->bi_sector) {
                if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
@@ -836,6 +849,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                (unsigned long long)bi->bi_sector,
                (unsigned long long)sh->sector, dd_idx);
+        if (conf->mddev->bitmap && firstwrite) {
+                sh->bm_seq = conf->seq_write;
+                bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+                                  STRIPE_SECTORS, 0);
+                set_bit(STRIPE_BIT_DELAY, &sh->state);
+        }
        if (forwrite) {
                /* check if page is covered */
                sector_t sector = sh->dev[dd_idx].sector;
@@ -958,12 +978,13 @@ static void handle_stripe(struct stripe_head *sh)
         * need to be failed
         */
        if (failed > 1 && to_read+to_write+written) {
-                spin_lock_irq(&conf->device_lock);
                for (i=disks; i--; ) {
+                        int bitmap_end = 0;
+                        spin_lock_irq(&conf->device_lock);
                        /* fail all writes first */
                        bi = sh->dev[i].towrite;
                        sh->dev[i].towrite = NULL;
-                        if (bi) to_write--;
+                        if (bi) { to_write--; bitmap_end = 1; }
                        if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                                wake_up(&conf->wait_for_overlap);
@@ -981,6 +1002,7 @@ static void handle_stripe(struct stripe_head *sh)
                        /* and fail all 'written' */
                        bi = sh->dev[i].written;
                        sh->dev[i].written = NULL;
+                        if (bi) bitmap_end = 1;
                        while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
                                struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
                                clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1009,8 +1031,11 @@ static void handle_stripe(struct stripe_head *sh)
                                        bi = nextbi;
                                }
                        }
+                        spin_unlock_irq(&conf->device_lock);
+                        if (bitmap_end)
+                                bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+                                                STRIPE_SECTORS, 0, 0);
                }
-                spin_unlock_irq(&conf->device_lock);
        }
        if (failed > 1 && syncing) {
                md_done_sync(conf->mddev, STRIPE_SECTORS,0);
@@ -1038,6 +1063,7 @@ static void handle_stripe(struct stripe_head *sh)
                         test_bit(R5_UPTODATE, &dev->flags) ) {
                        /* We can return any write requests */
                            struct bio *wbi, *wbi2;
+                            int bitmap_end = 0;
                            PRINTK("Return write for disc %d\n", i);
                            spin_lock_irq(&conf->device_lock);
                            wbi = dev->written;
@@ -1051,7 +1077,13 @@ static void handle_stripe(struct stripe_head *sh)
                                    }
                                    wbi = wbi2;
                            }
+                            if (dev->towrite == NULL)
+                                    bitmap_end = 1;
                            spin_unlock_irq(&conf->device_lock);
+                            if (bitmap_end)
+                                    bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+                                                    STRIPE_SECTORS,
+                                                    !test_bit(STRIPE_DEGRADED, &sh->state), 0);
                    }
                }
        }
@@ -1175,7 +1207,8 @@ static void handle_stripe(struct stripe_head *sh)
                                }
                        }
                /* now if nothing is locked, and if we have enough data, we can start a write request */
-                if (locked == 0 && (rcw == 0 ||rmw == 0)) {
+                if (locked == 0 && (rcw == 0 ||rmw == 0) &&
+                    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
                        PRINTK("Computing parity...\n");
                        compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
                        /* now every locked buffer is ready to be written */
@@ -1231,6 +1264,7 @@ static void handle_stripe(struct stripe_head *sh)
                        dev = &sh->dev[failed_num];
                        set_bit(R5_LOCKED, &dev->flags);
                        set_bit(R5_Wantwrite, &dev->flags);
+                        clear_bit(STRIPE_DEGRADED, &sh->state);
                        locked++;
                        set_bit(STRIPE_INSYNC, &sh->state);
                        set_bit(R5_Syncio, &dev->flags);
@@ -1298,6 +1332,8 @@ static void handle_stripe(struct stripe_head *sh)
                        bi->bi_next = NULL;
                        generic_make_request(bi);
                } else {
+                        if (rw == 1)
+                                set_bit(STRIPE_DEGRADED, &sh->state);
                        PRINTK("skip op %ld on disc %d for sector %llu\n",
                                bi->bi_rw, i, (unsigned long long)sh->sector);
                        clear_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -1322,6 +1358,20 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
        }
 }
+static inline void activate_bit_delay(raid5_conf_t *conf)
+{
+        /* device_lock is held */
+        struct list_head head;
+        list_add(&head, &conf->bitmap_list);
+        list_del_init(&conf->bitmap_list);
+        while (!list_empty(&head)) {
+                struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+                list_del_init(&sh->lru);
+                atomic_inc(&sh->count);
+                __release_stripe(conf, sh);
+        }
+}
 static void unplug_slaves(mddev_t *mddev)
 {
        raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -1354,8 +1404,10 @@ static void raid5_unplug_device(request_queue_t *q)
        spin_lock_irqsave(&conf->device_lock, flags);
-        if (blk_remove_plug(q))
+        if (blk_remove_plug(q)) {
+                conf->seq_flush++;
                raid5_activate_delayed(conf);
+        }
        md_wakeup_thread(mddev->thread);
        spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1493,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
        sector_t first_sector;
        int raid_disks = conf->raid_disks;
        int data_disks = raid_disks-1;
+        sector_t max_sector = mddev->size << 1;
+        int sync_blocks;
-        if (sector_nr >= mddev->size <<1) {
+        if (sector_nr >= max_sector) {
                /* just being told to finish up .. nothing much to do */
                unplug_slaves(mddev);
+                if (mddev->curr_resync < max_sector) /* aborted */
+                        bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+                                        &sync_blocks, 1);
+                else /* compelted sync */
+                        conf->fullsync = 0;
+                bitmap_close_sync(mddev->bitmap);
                return 0;
        }
        /* if there is 1 or more failed drives and we are trying
@@ -1508,6 +1570,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                *skipped = 1;
                return rv;
        }
+        if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+            !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
+                /* we can skip this block, and probably more */
+                sync_blocks /= STRIPE_SECTORS;
+                *skipped = 1;
+                return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
+        }
        x = sector_nr;
        chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1525,6 +1594,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                set_current_state(TASK_UNINTERRUPTIBLE);
                schedule_timeout(1);
        }
+        bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0);
        spin_lock(&sh->lock);   
        set_bit(STRIPE_SYNCING, &sh->state);
        clear_bit(STRIPE_INSYNC, &sh->state);
@@ -1558,6 +1628,13 @@ static void raid5d (mddev_t *mddev)
        while (1) {
                struct list_head *first;
+                if (conf->seq_flush - conf->seq_write > 0) {
+                        int seq = conf->seq_flush;
+                        bitmap_unplug(mddev->bitmap);
+                        conf->seq_write = seq;
+                        activate_bit_delay(conf);
+                }
                if (list_empty(&conf->handle_list) &&
                    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
                    !blk_queue_plugged(mddev->queue) &&
@@ -1591,7 +1668,7 @@ static void raid5d (mddev_t *mddev)
        PRINTK("--- raid5d inactive\n");
 }
-static int run (mddev_t *mddev)
+static int run(mddev_t *mddev)
 {
        raid5_conf_t *conf;
        int raid_disk, memory;
@@ -1621,6 +1698,7 @@ static int run (mddev_t *mddev)
        init_waitqueue_head(&conf->wait_for_overlap);
        INIT_LIST_HEAD(&conf->handle_list);
        INIT_LIST_HEAD(&conf->delayed_list);
+        INIT_LIST_HEAD(&conf->bitmap_list);
        INIT_LIST_HEAD(&conf->inactive_list);
        atomic_set(&conf->active_stripes, 0);
        atomic_set(&conf->preread_active_stripes, 0);
@@ -1732,6 +1810,9 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
        /* Ok, everything is just fine now */
+        if (mddev->bitmap)
+                mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
        mddev->queue->unplug_fn = raid5_unplug_device;
        mddev->queue->issue_flush_fn = raid5_issue_flush;
@@ -1912,6 +1993,8 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
                        rdev->in_sync = 0;
                        rdev->raid_disk = disk;
                        found = 1;
+                        if (rdev->saved_raid_disk != disk)
+                                conf->fullsync = 1;
                        p->rdev = rdev;
                        break;
                }
@@ -1941,6 +2024,35 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
        return 0;
 }
+static void raid5_quiesce(mddev_t *mddev, int state)
+{
+        raid5_conf_t *conf = mddev_to_conf(mddev);
+        switch(state) {
+        case 1: /* stop all writes */
+                spin_lock_irq(&conf->device_lock);
+                conf->quiesce = 1;
+                wait_event_lock_irq(conf->wait_for_stripe,
+                                    atomic_read(&conf->active_stripes) == 0,
+                                    conf->device_lock, /* nothing */);
+                spin_unlock_irq(&conf->device_lock);
+                break;
+        case 0: /* re-enable writes */
+                spin_lock_irq(&conf->device_lock);
+                conf->quiesce = 0;
+                wake_up(&conf->wait_for_stripe);
+                spin_unlock_irq(&conf->device_lock);
+                break;
+        }
+        if (mddev->thread) {
+                if (mddev->bitmap)
+                        mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+                else
+                        mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+                md_wakeup_thread(mddev->thread);
+        }
+}
 static mdk_personality_t raid5_personality=
 {
        .name           = "raid5",
@@ -1955,6 +2067,7 @@ static mdk_personality_t raid5_personality=
        .spare_active   = raid5_spare_active,
        .sync_request   = sync_request,
        .resize         = raid5_resize,
+        .quiesce        = raid5_quiesce,
 };
 static int __init raid5_init (void)
author	NeilBrown <neilb@cse.unsw.edu.au>	2005-09-09 19:23:54 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-09-09 19:39:12 -0400
commit	72626685dc66d455742a7f215a0535c551628b9e (patch)
tree	91e19a61a5a3b782007132b6b2e353e8936dd656 /drivers/md/raid5.c
parent	0002b2718dd04da67c21f8a7830de8d95a9b0345 (diff)