From 6ff8d8ec06690f4011a6c3ad9e0759b9094f0601 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Jan 2006 00:20:15 -0800 Subject: [PATCH] md: allow dirty raid[456] arrays to be started at boot See patch to md.txt for more details Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid6main.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'drivers/md/raid6main.c') diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 0000d162d198..9ac6dcd55127 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -1929,13 +1929,18 @@ static int run(mddev_t *mddev) goto abort; } -#if 0 /* FIX: For now */ if (mddev->degraded > 0 && mddev->recovery_cp != MaxSector) { - printk(KERN_ERR "raid6: cannot start dirty degraded array for %s\n", mdname(mddev)); - goto abort; + if (mddev->ok_start_degraded) + printk(KERN_WARNING "raid6: starting dirty degraded array:%s" + "- data corruption possible.\n", + mdname(mddev)); + else { + printk(KERN_ERR "raid6: cannot start dirty degraded array" + " for %s\n", mdname(mddev)); + goto abort; + } } -#endif { mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6"); -- cgit v1.2.2 From b15c2e57f0f5bf596a19e9c5571e5b07cdfc7363 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Jan 2006 00:20:16 -0800 Subject: [PATCH] md: move bitmap_create to after md array has been initialised This is important because bitmap_create uses mddev->resync_max_sectors and that doesn't have a valid value until after the array has been initialised (with pers->run()). [It doesn't make a difference for current personalities that support bitmaps, but will make a difference for raid10] This has the added advantage of meaning with can move the thread->timeout manipulation inside the bitmap.c code instead of sprinkling identical code throughout all personalities. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid6main.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'drivers/md/raid6main.c') diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 9ac6dcd55127..304455d236f9 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -1990,9 +1990,6 @@ static int run(mddev_t *mddev) /* Ok, everything is just fine now */ mddev->array_size = mddev->size * (mddev->raid_disks - 2); - if (mddev->bitmap) - mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; - mddev->queue->unplug_fn = raid6_unplug_device; mddev->queue->issue_flush_fn = raid6_issue_flush; return 0; @@ -2228,14 +2225,8 @@ static void raid6_quiesce(mddev_t *mddev, int state) spin_unlock_irq(&conf->device_lock); break; } - if (mddev->thread) { - if (mddev->bitmap) - mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; - else - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - md_wakeup_thread(mddev->thread); - } } + static mdk_personality_t raid6_personality= { .name = "raid6", -- cgit v1.2.2 From ca65b73bd9c301d243df93780f7b26579e6c9204 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Jan 2006 00:20:17 -0800 Subject: [PATCH] md: fix raid6 resync check/repair code raid6 currently does not check the P/Q syndromes when doing a resync, it just calculates the correct value and writes it. Doing the check can reduce writes (often to 0) for a resync, and it is needed to properly implement the echo check > sync_action operation. This patch implements the appropriate checks and tidies up some related code. It also allows raid6 user-requested resync to bypass the intent bitmap. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid6main.c | 182 ++++++++++++++++++++++++++++--------------------- 1 file changed, 106 insertions(+), 76 deletions(-) (limited to 'drivers/md/raid6main.c') diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 304455d236f9..52e8796bb8ac 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -805,7 +805,7 @@ static void compute_parity(struct stripe_head *sh, int method) } /* Compute one missing block */ -static void compute_block_1(struct stripe_head *sh, int dd_idx) +static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) { raid6_conf_t *conf = sh->raid_conf; int i, count, disks = conf->raid_disks; @@ -821,7 +821,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx) compute_parity(sh, UPDATE_PARITY); } else { ptr[0] = page_address(sh->dev[dd_idx].page); - memset(ptr[0], 0, STRIPE_SIZE); + if (!nozero) memset(ptr[0], 0, STRIPE_SIZE); count = 1; for (i = disks ; i--; ) { if (i == dd_idx || i == qd_idx) @@ -838,7 +838,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx) } if (count != 1) xor_block(count, STRIPE_SIZE, ptr); - set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); + if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); + else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); } } @@ -871,7 +872,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) return; } else { /* We're missing D+Q; recompute D from P */ - compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1); + compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */ return; } @@ -982,6 +983,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in } +static int page_is_zero(struct page *p) +{ + char *a = page_address(p); + return ((*(u32*)a) == 0 && + memcmp(a, a+4, STRIPE_SIZE-4)==0); +} /* * handle_stripe - do things to a stripe. * @@ -1000,7 +1007,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in * */ -static void handle_stripe(struct stripe_head *sh) +static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) { raid6_conf_t *conf = sh->raid_conf; int disks = conf->raid_disks; @@ -1228,7 +1235,7 @@ static void handle_stripe(struct stripe_head *sh) if (uptodate == disks-1) { PRINTK("Computing stripe %llu block %d\n", (unsigned long long)sh->sector, i); - compute_block_1(sh, i); + compute_block_1(sh, i, 0); uptodate++; } else if ( uptodate == disks-2 && failed >= 2 ) { /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ @@ -1323,7 +1330,7 @@ static void handle_stripe(struct stripe_head *sh) /* We have failed blocks and need to compute them */ switch ( failed ) { case 0: BUG(); - case 1: compute_block_1(sh, failed_num[0]); break; + case 1: compute_block_1(sh, failed_num[0], 0); break; case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; default: BUG(); /* This request should have been failed? */ } @@ -1338,12 +1345,10 @@ static void handle_stripe(struct stripe_head *sh) (unsigned long long)sh->sector, i); locked++; set_bit(R5_Wantwrite, &sh->dev[i].flags); -#if 0 /**** FIX: I don't understand the logic here... ****/ - if (!test_bit(R5_Insync, &sh->dev[i].flags) - || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */ - set_bit(STRIPE_INSYNC, &sh->state); -#endif } + /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ + set_bit(STRIPE_INSYNC, &sh->state); + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) @@ -1356,79 +1361,97 @@ static void handle_stripe(struct stripe_head *sh) * Any reads will already have been scheduled, so we just see if enough data * is available */ - if (syncing && locked == 0 && - !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) { - set_bit(STRIPE_HANDLE, &sh->state); -#if 0 /* RAID-6: Don't support CHECK PARITY yet */ - if (failed == 0) { - char *pagea; - if (uptodate != disks) - BUG(); - compute_parity(sh, CHECK_PARITY); - uptodate--; - pagea = page_address(sh->dev[pd_idx].page); - if ((*(u32*)pagea) == 0 && - !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { - /* parity is correct (on disc, not in buffer any more) */ - set_bit(STRIPE_INSYNC, &sh->state); - } - } -#endif - if (!test_bit(STRIPE_INSYNC, &sh->state)) { - int failed_needupdate[2]; - struct r5dev *adev, *bdev; - - if ( failed < 1 ) - failed_num[0] = pd_idx; - if ( failed < 2 ) - failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx; + if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) { + int update_p = 0, update_q = 0; + struct r5dev *dev; - failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags); - failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags); + set_bit(STRIPE_HANDLE, &sh->state); - PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n", - failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]); + BUG_ON(failed>2); + BUG_ON(uptodate < disks); + /* Want to check and possibly repair P and Q. + * However there could be one 'failed' device, in which + * case we can only check one of them, possibly using the + * other to generate missing data + */ -#if 0 /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */ - /* should be able to compute the missing block(s) and write to spare */ - if ( failed_needupdate[0] ^ failed_needupdate[1] ) { - if (uptodate+1 != disks) - BUG(); - compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]); - uptodate++; - } else if ( failed_needupdate[0] & failed_needupdate[1] ) { - if (uptodate+2 != disks) - BUG(); - compute_block_2(sh, failed_num[0], failed_num[1]); - uptodate += 2; + /* If !tmp_page, we cannot do the calculations, + * but as we have set STRIPE_HANDLE, we will soon be called + * by stripe_handle with a tmp_page - just wait until then. + */ + if (tmp_page) { + if (failed == q_failed) { + /* The only possible failed device holds 'Q', so it makes + * sense to check P (If anything else were failed, we would + * have used P to recreate it). + */ + compute_block_1(sh, pd_idx, 1); + if (!page_is_zero(sh->dev[pd_idx].page)) { + compute_block_1(sh,pd_idx,0); + update_p = 1; + } + } + if (!q_failed && failed < 2) { + /* q is not failed, and we didn't use it to generate + * anything, so it makes sense to check it + */ + memcpy(page_address(tmp_page), + page_address(sh->dev[qd_idx].page), + STRIPE_SIZE); + compute_parity(sh, UPDATE_PARITY); + if (memcmp(page_address(tmp_page), + page_address(sh->dev[qd_idx].page), + STRIPE_SIZE)!= 0) { + clear_bit(STRIPE_INSYNC, &sh->state); + update_q = 1; + } + } + if (update_p || update_q) { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + update_p = update_q = 0; } -#else - compute_block_2(sh, failed_num[0], failed_num[1]); - uptodate += failed_needupdate[0] + failed_needupdate[1]; -#endif - if (uptodate != disks) - BUG(); + /* now write out any block on a failed drive, + * or P or Q if they need it + */ - PRINTK("Marking for sync stripe %llu blocks %d,%d\n", - (unsigned long long)sh->sector, failed_num[0], failed_num[1]); + if (failed == 2) { + dev = &sh->dev[failed_num[1]]; + locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + set_bit(R5_Syncio, &dev->flags); + } + if (failed >= 1) { + dev = &sh->dev[failed_num[0]]; + locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + set_bit(R5_Syncio, &dev->flags); + } - /**** FIX: Should we really do both of these unconditionally? ****/ - adev = &sh->dev[failed_num[0]]; - locked += !test_bit(R5_LOCKED, &adev->flags); - set_bit(R5_LOCKED, &adev->flags); - set_bit(R5_Wantwrite, &adev->flags); - bdev = &sh->dev[failed_num[1]]; - locked += !test_bit(R5_LOCKED, &bdev->flags); - set_bit(R5_LOCKED, &bdev->flags); + if (update_p) { + dev = &sh->dev[pd_idx]; + locked ++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + set_bit(R5_Syncio, &dev->flags); + } + if (update_q) { + dev = &sh->dev[qd_idx]; + locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + set_bit(R5_Syncio, &dev->flags); + } clear_bit(STRIPE_DEGRADED, &sh->state); - set_bit(R5_Wantwrite, &bdev->flags); set_bit(STRIPE_INSYNC, &sh->state); - set_bit(R5_Syncio, &adev->flags); - set_bit(R5_Syncio, &bdev->flags); } } + if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS,1); clear_bit(STRIPE_SYNCING, &sh->state); @@ -1664,7 +1687,7 @@ static int make_request (request_queue_t *q, struct bio * bi) } finish_wait(&conf->wait_for_overlap, &w); raid6_plug_device(conf); - handle_stripe(sh); + handle_stripe(sh, NULL); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ @@ -1728,6 +1751,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return rv; } if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { /* we can skip this block, and probably more */ sync_blocks /= STRIPE_SECTORS; @@ -1765,7 +1789,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i clear_bit(STRIPE_INSYNC, &sh->state); spin_unlock(&sh->lock); - handle_stripe(sh); + handle_stripe(sh, NULL); release_stripe(sh); return STRIPE_SECTORS; @@ -1821,7 +1845,7 @@ static void raid6d (mddev_t *mddev) spin_unlock_irq(&conf->device_lock); handled++; - handle_stripe(sh); + handle_stripe(sh, conf->spare_page); release_stripe(sh); spin_lock_irq(&conf->device_lock); @@ -1860,6 +1884,10 @@ static int run(mddev_t *mddev) goto abort; memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); + conf->spare_page = alloc_page(GFP_KERNEL); + if (!conf->spare_page) + goto abort; + spin_lock_init(&conf->device_lock); init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); @@ -1996,6 +2024,8 @@ static int run(mddev_t *mddev) abort: if (conf) { print_raid6_conf(conf); + if (conf->spare_page) + page_cache_release(conf->spare_page); if (conf->stripe_hashtbl) free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); -- cgit v1.2.2 From d69762e98456b71167865db9e33e732a28dd36ab Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Jan 2006 00:20:18 -0800 Subject: [PATCH] md: improve handing of read errors with raid6 This is a simple port of match functionality across from raid5. If we get a read error, we don't kick the drive straight away, but try to over-write with good data first. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid6main.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 4 deletions(-) (limited to 'drivers/md/raid6main.c') diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 52e8796bb8ac..7a51553d8be5 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -367,8 +367,8 @@ static void shrink_stripes(raid6_conf_t *conf) conf->slab_cache = NULL; } -static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done, - int error) +static int raid6_end_read_request(struct bio * bi, unsigned int bytes_done, + int error) { struct stripe_head *sh = bi->bi_private; raid6_conf_t *conf = sh->raid_conf; @@ -420,9 +420,35 @@ static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done, #else set_bit(R5_UPTODATE, &sh->dev[i].flags); #endif + if (test_bit(R5_ReadError, &sh->dev[i].flags)) { + printk(KERN_INFO "raid6: read error corrected!!\n"); + clear_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReWrite, &sh->dev[i].flags); + } + if (atomic_read(&conf->disks[i].rdev->read_errors)) + atomic_set(&conf->disks[i].rdev->read_errors, 0); } else { - md_error(conf->mddev, conf->disks[i].rdev); + int retry = 0; clear_bit(R5_UPTODATE, &sh->dev[i].flags); + atomic_inc(&conf->disks[i].rdev->read_errors); + if (conf->mddev->degraded) + printk(KERN_WARNING "raid6: read error not correctable.\n"); + else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) + /* Oh, no!!! */ + printk(KERN_WARNING "raid6: read error NOT corrected!!\n"); + else if (atomic_read(&conf->disks[i].rdev->read_errors) + > conf->max_nr_stripes) + printk(KERN_WARNING + "raid6: Too many read errors, failing device.\n"); + else + retry = 1; + if (retry) + set_bit(R5_ReadError, &sh->dev[i].flags); + else { + clear_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReWrite, &sh->dev[i].flags); + md_error(conf->mddev, conf->disks[i].rdev); + } } rdev_dec_pending(conf->disks[i].rdev, conf->mddev); #if 0 @@ -1079,6 +1105,12 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) if (dev->written) written++; rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ if (!rdev || !test_bit(In_sync, &rdev->flags)) { + /* The ReadError flag will just be confusing now */ + clear_bit(R5_ReadError, &dev->flags); + clear_bit(R5_ReWrite, &dev->flags); + } + if (!rdev || !test_bit(In_sync, &rdev->flags) + || test_bit(R5_ReadError, &dev->flags)) { if ( failed < 2 ) failed_num[failed] = i; failed++; @@ -1095,6 +1127,14 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) if (failed > 2 && to_read+to_write+written) { for (i=disks; i--; ) { int bitmap_end = 0; + + if (test_bit(R5_ReadError, &sh->dev[i].flags)) { + mdk_rdev_t *rdev = conf->disks[i].rdev; + if (rdev && test_bit(In_sync, &rdev->flags)) + /* multiple read failures in one stripe */ + md_error(conf->mddev, rdev); + } + spin_lock_irq(&conf->device_lock); /* fail all writes first */ bi = sh->dev[i].towrite; @@ -1130,7 +1170,8 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) } /* fail any reads if this device is non-operational */ - if (!test_bit(R5_Insync, &sh->dev[i].flags)) { + if (!test_bit(R5_Insync, &sh->dev[i].flags) || + test_bit(R5_ReadError, &sh->dev[i].flags)) { bi = sh->dev[i].toread; sh->dev[i].toread = NULL; if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) @@ -1457,6 +1498,27 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) clear_bit(STRIPE_SYNCING, &sh->state); } + /* If the failed drives are just a ReadError, then we might need + * to progress the repair/check process + */ + if (failed <= 2 && ! conf->mddev->ro) + for (i=0; idev[failed_num[i]]; + if (test_bit(R5_ReadError, &dev->flags) + && !test_bit(R5_LOCKED, &dev->flags) + && test_bit(R5_UPTODATE, &dev->flags) + ) { + if (!test_bit(R5_ReWrite, &dev->flags)) { + set_bit(R5_Wantwrite, &dev->flags); + set_bit(R5_ReWrite, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + } else { + /* let's read it back */ + set_bit(R5_Wantread, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + } + } + } spin_unlock(&sh->lock); while ((bi=return_bi)) { -- cgit v1.2.2 From 9910f16af35419a5382fa7850eecc220103036fa Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Jan 2006 00:20:24 -0800 Subject: [PATCH] md: fix up some rdev rcu locking in raid5/6 There is this "FIXME" comment with a typo in it!! that been annoying me for days, so I just had to remove it. conf->disks[i].rdev should only be accessed if - we know we hold a reference or - the mddev->reconfig_sem is down or - we have a rcu_readlock handle_stripe was referencing rdev in three places without any of these. For the first two, get an rcu_readlock. For the last, the same access (md_sync_acct call) is made a little later after the rdev has been claimed under and rcu_readlock, if R5_Syncio is set. So just use that access... However R5_Syncio isn't really needed as the 'syncing' variable contains the same information. So use that instead. Issues, comment, and fix are identical in raid5 and raid6. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid6main.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'drivers/md/raid6main.c') diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 7a51553d8be5..b5b7a8d0b165 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -1060,11 +1060,11 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) syncing = test_bit(STRIPE_SYNCING, &sh->state); /* Now to look around and see what can be done */ + rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; dev = &sh->dev[i]; clear_bit(R5_Insync, &dev->flags); - clear_bit(R5_Syncio, &dev->flags); PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, dev->flags, dev->toread, dev->towrite, dev->written); @@ -1103,7 +1103,7 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) non_overwrite++; } if (dev->written) written++; - rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ + rdev = rcu_dereference(conf->disks[i].rdev); if (!rdev || !test_bit(In_sync, &rdev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); @@ -1117,6 +1117,7 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) } else set_bit(R5_Insync, &dev->flags); } + rcu_read_unlock(); PRINTK("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d,%d\n", locked, uptodate, to_read, to_write, failed, @@ -1129,10 +1130,13 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) int bitmap_end = 0; if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - mdk_rdev_t *rdev = conf->disks[i].rdev; + mdk_rdev_t *rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) /* multiple read failures in one stripe */ md_error(conf->mddev, rdev); + rcu_read_unlock(); } spin_lock_irq(&conf->device_lock); @@ -1307,9 +1311,6 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) locked++; PRINTK("Reading block %d (sync=%d)\n", i, syncing); - if (syncing) - md_sync_acct(conf->disks[i].rdev->bdev, - STRIPE_SECTORS); } } } @@ -1463,14 +1464,12 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) locked++; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); - set_bit(R5_Syncio, &dev->flags); } if (failed >= 1) { dev = &sh->dev[failed_num[0]]; locked++; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); - set_bit(R5_Syncio, &dev->flags); } if (update_p) { @@ -1478,14 +1477,12 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) locked ++; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); - set_bit(R5_Syncio, &dev->flags); } if (update_q) { dev = &sh->dev[qd_idx]; locked++; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); - set_bit(R5_Syncio, &dev->flags); } clear_bit(STRIPE_DEGRADED, &sh->state); @@ -1557,7 +1554,7 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) rcu_read_unlock(); if (rdev) { - if (test_bit(R5_Syncio, &sh->dev[i].flags)) + if (syncing) md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; -- cgit v1.2.2 From 2d1f3b5d1b2cd11a162eb29645df749ec0036413 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Jan 2006 00:20:31 -0800 Subject: [PATCH] md: clean up 'page' related names in md Substitute: page_cache_get -> get_page page_cache_release -> put_page PAGE_CACHE_SHIFT -> PAGE_SHIFT PAGE_CACHE_SIZE -> PAGE_SIZE PAGE_CACHE_MASK -> PAGE_MASK __free_page -> put_page because we aren't using the page cache, we are just using pages. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid6main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid6main.c') diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index b5b7a8d0b165..4062fc16ac2b 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -186,7 +186,7 @@ static void shrink_buffers(struct stripe_head *sh, int num) if (!p) continue; sh->dev[i].page = NULL; - page_cache_release(p); + put_page(p); } } @@ -2069,7 +2069,7 @@ static int run(mddev_t *mddev) */ { int stripe = (mddev->raid_disks-2) * mddev->chunk_size - / PAGE_CACHE_SIZE; + / PAGE_SIZE; if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; } @@ -2084,7 +2084,7 @@ abort: if (conf) { print_raid6_conf(conf); if (conf->spare_page) - page_cache_release(conf->spare_page); + put_page(conf->spare_page); if (conf->stripe_hashtbl) free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); -- cgit v1.2.2 From fccddba060f2b4916a30aa27acc3d03b01bb981e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Jan 2006 00:20:33 -0800 Subject: [PATCH] md: tidy up raid5/6 hash table code - replace open-coded hash chain with hlist macros - Fix hash-table size at one page - it is already quite generous, so there will never be a need to use multiple pages, so no need for __get_free_pages No functional change. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid6main.c | 46 +++++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 29 deletions(-) (limited to 'drivers/md/raid6main.c') diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 4062fc16ac2b..79b5244f44f4 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -40,12 +40,10 @@ #define STRIPE_SHIFT (PAGE_SHIFT - 9) #define STRIPE_SECTORS (STRIPE_SIZE>>9) #define IO_THRESHOLD 1 -#define HASH_PAGES 1 -#define HASH_PAGES_ORDER 0 -#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) +#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) #define HASH_MASK (NR_HASH - 1) -#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) +#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])) /* bio's attached to a stripe+device for I/O are linked together in bi_sector * order without overlap. There may be several bio's per stripe+device, and @@ -132,29 +130,21 @@ static void release_stripe(struct stripe_head *sh) spin_unlock_irqrestore(&conf->device_lock, flags); } -static void remove_hash(struct stripe_head *sh) +static inline void remove_hash(struct stripe_head *sh) { PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); - if (sh->hash_pprev) { - if (sh->hash_next) - sh->hash_next->hash_pprev = sh->hash_pprev; - *sh->hash_pprev = sh->hash_next; - sh->hash_pprev = NULL; - } + hlist_del_init(&sh->hash); } -static __inline__ void insert_hash(raid6_conf_t *conf, struct stripe_head *sh) +static inline void insert_hash(raid6_conf_t *conf, struct stripe_head *sh) { - struct stripe_head **shp = &stripe_hash(conf, sh->sector); + struct hlist_head *hp = stripe_hash(conf, sh->sector); PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); CHECK_DEVLOCK(); - if ((sh->hash_next = *shp) != NULL) - (*shp)->hash_pprev = &sh->hash_next; - *shp = sh; - sh->hash_pprev = shp; + hlist_add_head(&sh->hash, hp); } @@ -247,10 +237,11 @@ static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_i static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector) { struct stripe_head *sh; + struct hlist_node *hn; CHECK_DEVLOCK(); PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); - for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) + hlist_for_each_entry (sh, hn, stripe_hash(conf, sector), hash) if (sh->sector == sector) return sh; PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); @@ -1931,17 +1922,15 @@ static int run(mddev_t *mddev) return -EIO; } - mddev->private = kmalloc (sizeof (raid6_conf_t) - + mddev->raid_disks * sizeof(struct disk_info), - GFP_KERNEL); + mddev->private = kzalloc(sizeof (raid6_conf_t) + + mddev->raid_disks * sizeof(struct disk_info), + GFP_KERNEL); if ((conf = mddev->private) == NULL) goto abort; - memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) ); conf->mddev = mddev; - if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) + if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; - memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); conf->spare_page = alloc_page(GFP_KERNEL); if (!conf->spare_page) @@ -2085,9 +2074,7 @@ abort: print_raid6_conf(conf); if (conf->spare_page) put_page(conf->spare_page); - if (conf->stripe_hashtbl) - free_pages((unsigned long) conf->stripe_hashtbl, - HASH_PAGES_ORDER); + kfree(conf->stripe_hashtbl); kfree(conf); } mddev->private = NULL; @@ -2104,7 +2091,7 @@ static int stop (mddev_t *mddev) md_unregister_thread(mddev->thread); mddev->thread = NULL; shrink_stripes(conf); - free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); + kfree(conf->stripe_hashtbl); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf); mddev->private = NULL; @@ -2131,12 +2118,13 @@ static void print_sh (struct seq_file *seq, struct stripe_head *sh) static void printall (struct seq_file *seq, raid6_conf_t *conf) { struct stripe_head *sh; + struct hlist_node *hn; int i; spin_lock_irq(&conf->device_lock); for (i = 0; i < NR_HASH; i++) { sh = conf->stripe_hashtbl[i]; - for (; sh; sh = sh->hash_next) { + hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { if (sh->raid_conf != conf) continue; print_sh(seq, sh); -- cgit v1.2.2 From 2604b703b6b3db80e3c75ce472a54dfd0b7bf9f4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Jan 2006 00:20:36 -0800 Subject: [PATCH] md: remove personality numbering from md md supports multiple different RAID level, each being implemented by a 'personality' (which is often in a separate module). These personalities have fairly artificial 'numbers'. The numbers are use to: 1- provide an index into an array where the various personalities are recorded 2- identify the module (via an alias) which implements are particular personality. Neither of these uses really justify the existence of personality numbers. The array can be replaced by a linked list which is searched (array lookup only happens very rarely). Module identification can be done using an alias based on level rather than 'personality' number. The current 'raid5' modules support two level (4 and 5) but only one personality. This slight awkwardness (which was handled in the mapping from level to personality) can be better handled by allowing raid5 to register 2 personalities. With this change in place, the core md module does not need to have an exhaustive list of all possible personalities, so other personalities can be added independently. This patch also moves the check for chunksize being non-zero into the ->run routines for the personalities that need it, rather than having it in core-md. This has a side effect of allowing 'faulty' and 'linear' not to have a chunk-size set. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid6main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers/md/raid6main.c') diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 79b5244f44f4..950e5fa6e1f2 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -2304,9 +2304,10 @@ static void raid6_quiesce(mddev_t *mddev, int state) } } -static mdk_personality_t raid6_personality= +static struct mdk_personality raid6_personality = { .name = "raid6", + .level = 6, .owner = THIS_MODULE, .make_request = make_request, .run = run, @@ -2321,7 +2322,7 @@ static mdk_personality_t raid6_personality= .quiesce = raid6_quiesce, }; -static int __init raid6_init (void) +static int __init raid6_init(void) { int e; @@ -2329,15 +2330,16 @@ static int __init raid6_init (void) if ( e ) return e; - return register_md_personality (RAID6, &raid6_personality); + return register_md_personality(&raid6_personality); } static void raid6_exit (void) { - unregister_md_personality (RAID6); + unregister_md_personality(&raid6_personality); } module_init(raid6_init); module_exit(raid6_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS("md-personality-8"); /* RAID6 */ +MODULE_ALIAS("md-level-6"); -- cgit v1.2.2 From 1345b1d8adbdeceb1c871d9a4af5e2a700b341c6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Jan 2006 00:20:40 -0800 Subject: [PATCH] md: define and use safe_put_page for md md sometimes call put_page on NULL pointers (treating it like kfree). This is not safe, so define and use a 'safe_put_page' which checks for NULL. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid6main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md/raid6main.c') diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 950e5fa6e1f2..06b32bd671a3 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -2072,8 +2072,7 @@ static int run(mddev_t *mddev) abort: if (conf) { print_raid6_conf(conf); - if (conf->spare_page) - put_page(conf->spare_page); + safe_put_page(conf->spare_page); kfree(conf->stripe_hashtbl); kfree(conf); } -- cgit v1.2.2 From d9d166c2a9d5d01af34396793950aa695883eed4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Jan 2006 00:20:51 -0800 Subject: [PATCH] md: allow array level to be set textually via sysfs Signed-off-by: Neil Brown Acked-by: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid6main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md/raid6main.c') diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 06b32bd671a3..84dd875bb2f6 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -2341,4 +2341,5 @@ module_init(raid6_init); module_exit(raid6_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS("md-personality-8"); /* RAID6 */ +MODULE_ALIAS("md-raid6"); MODULE_ALIAS("md-level-6"); -- cgit v1.2.2 From 4dbcdc751cb25ffca3a8374cbc5ab6de961cc545 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Jan 2006 00:20:52 -0800 Subject: [PATCH] md: count corrected read errors per drive Store this total in superblock (As appropriate), and make it available to userspace via sysfs. Signed-off-by: Neil Brown Acked-by: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid6main.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md/raid6main.c') diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 84dd875bb2f6..8c823d686a60 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -1562,6 +1562,9 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; bi->bi_next = NULL; + if (rw == WRITE && + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); generic_make_request(bi); } else { if (rw == 1) -- cgit v1.2.2