aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2011-07-27 21:31:46 -0400
committerNeilBrown <neilb@suse.de>2011-07-27 21:31:46 -0400
commit2230dfe4ccc3add340dc6d437965b2de1d269fde (patch)
treefc45b727ad2e1a148e7d20f327b45a3afc474e9d /drivers
parenta519b26dbe6533416d21b552053b0bf687f878d7 (diff)
md: beginnings of bad block management.
This is the first step in allowing md to track bad-blocks per-device so that we can fail individual blocks rather than the whole device. This patch just adds a data structure for recording bad blocks, with routines to add to, remove from, and search the list. Signed-off-by: NeilBrown <neilb@suse.de> Reviewed-by: Namhyung Kim <namhyung@gmail.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/md/md.c415
-rw-r--r--drivers/md/md.h48
2 files changed, 459 insertions, 4 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4279b3b58d1a..463a392c0705 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1952,6 +1952,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1952 sysfs_remove_link(&rdev->kobj, "block"); 1952 sysfs_remove_link(&rdev->kobj, "block");
1953 sysfs_put(rdev->sysfs_state); 1953 sysfs_put(rdev->sysfs_state);
1954 rdev->sysfs_state = NULL; 1954 rdev->sysfs_state = NULL;
1955 kfree(rdev->badblocks.page);
1956 rdev->badblocks.count = 0;
1957 rdev->badblocks.page = NULL;
1955 /* We need to delay this, otherwise we can deadlock when 1958 /* We need to delay this, otherwise we can deadlock when
1956 * writing to 'remove' to "dev/state". We also need 1959 * writing to 'remove' to "dev/state". We also need
1957 * to delay it due to rcu usage. 1960 * to delay it due to rcu usage.
@@ -2778,7 +2781,7 @@ static struct kobj_type rdev_ktype = {
2778 .default_attrs = rdev_default_attrs, 2781 .default_attrs = rdev_default_attrs,
2779}; 2782};
2780 2783
2781void md_rdev_init(mdk_rdev_t *rdev) 2784int md_rdev_init(mdk_rdev_t *rdev)
2782{ 2785{
2783 rdev->desc_nr = -1; 2786 rdev->desc_nr = -1;
2784 rdev->saved_raid_disk = -1; 2787 rdev->saved_raid_disk = -1;
@@ -2794,6 +2797,19 @@ void md_rdev_init(mdk_rdev_t *rdev)
2794 2797
2795 INIT_LIST_HEAD(&rdev->same_set); 2798 INIT_LIST_HEAD(&rdev->same_set);
2796 init_waitqueue_head(&rdev->blocked_wait); 2799 init_waitqueue_head(&rdev->blocked_wait);
2800
2801 /* Add space to store bad block list.
2802 * This reserves the space even on arrays where it cannot
2803 * be used - I wonder if that matters
2804 */
2805 rdev->badblocks.count = 0;
2806 rdev->badblocks.shift = 0;
2807 rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
2808 seqlock_init(&rdev->badblocks.lock);
2809 if (rdev->badblocks.page == NULL)
2810 return -ENOMEM;
2811
2812 return 0;
2797} 2813}
2798EXPORT_SYMBOL_GPL(md_rdev_init); 2814EXPORT_SYMBOL_GPL(md_rdev_init);
2799/* 2815/*
@@ -2819,8 +2835,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2819 return ERR_PTR(-ENOMEM); 2835 return ERR_PTR(-ENOMEM);
2820 } 2836 }
2821 2837
2822 md_rdev_init(rdev); 2838 err = md_rdev_init(rdev);
2823 if ((err = alloc_disk_sb(rdev))) 2839 if (err)
2840 goto abort_free;
2841 err = alloc_disk_sb(rdev);
2842 if (err)
2824 goto abort_free; 2843 goto abort_free;
2825 2844
2826 err = lock_rdev(rdev, newdev, super_format == -2); 2845 err = lock_rdev(rdev, newdev, super_format == -2);
@@ -2865,6 +2884,7 @@ abort_free:
2865 unlock_rdev(rdev); 2884 unlock_rdev(rdev);
2866 free_disk_sb(rdev); 2885 free_disk_sb(rdev);
2867 } 2886 }
2887 kfree(rdev->badblocks.page);
2868 kfree(rdev); 2888 kfree(rdev);
2869 return ERR_PTR(err); 2889 return ERR_PTR(err);
2870} 2890}
@@ -7327,6 +7347,395 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
7327} 7347}
7328EXPORT_SYMBOL(md_wait_for_blocked_rdev); 7348EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7329 7349
7350
7351/* Bad block management.
7352 * We can record which blocks on each device are 'bad' and so just
7353 * fail those blocks, or that stripe, rather than the whole device.
7354 * Entries in the bad-block table are 64bits wide. This comprises:
7355 * Length of bad-range, in sectors: 0-511 for lengths 1-512
7356 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
7357 * A 'shift' can be set so that larger blocks are tracked and
7358 * consequently larger devices can be covered.
7359 * 'Acknowledged' flag - 1 bit. - the most significant bit.
7360 *
7361 * Locking of the bad-block table uses a seqlock so md_is_badblock
7362 * might need to retry if it is very unlucky.
7363 * We will sometimes want to check for bad blocks in a bi_end_io function,
7364 * so we use the write_seqlock_irq variant.
7365 *
7366 * When looking for a bad block we specify a range and want to
7367 * know if any block in the range is bad. So we binary-search
7368 * to the last range that starts at-or-before the given endpoint,
7369 * (or "before the sector after the target range")
7370 * then see if it ends after the given start.
7371 * We return
7372 * 0 if there are no known bad blocks in the range
7373 * 1 if there are known bad block which are all acknowledged
7374 * -1 if there are bad blocks which have not yet been acknowledged in metadata.
7375 * plus the start/length of the first bad section we overlap.
7376 */
7377int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
7378 sector_t *first_bad, int *bad_sectors)
7379{
7380 int hi;
7381 int lo = 0;
7382 u64 *p = bb->page;
7383 int rv = 0;
7384 sector_t target = s + sectors;
7385 unsigned seq;
7386
7387 if (bb->shift > 0) {
7388 /* round the start down, and the end up */
7389 s >>= bb->shift;
7390 target += (1<<bb->shift) - 1;
7391 target >>= bb->shift;
7392 sectors = target - s;
7393 }
7394 /* 'target' is now the first block after the bad range */
7395
7396retry:
7397 seq = read_seqbegin(&bb->lock);
7398
7399 hi = bb->count;
7400
7401 /* Binary search between lo and hi for 'target'
7402 * i.e. for the last range that starts before 'target'
7403 */
7404 /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
7405 * are known not to be the last range before target.
7406 * VARIANT: hi-lo is the number of possible
7407 * ranges, and decreases until it reaches 1
7408 */
7409 while (hi - lo > 1) {
7410 int mid = (lo + hi) / 2;
7411 sector_t a = BB_OFFSET(p[mid]);
7412 if (a < target)
7413 /* This could still be the one, earlier ranges
7414 * could not. */
7415 lo = mid;
7416 else
7417 /* This and later ranges are definitely out. */
7418 hi = mid;
7419 }
7420 /* 'lo' might be the last that started before target, but 'hi' isn't */
7421 if (hi > lo) {
7422 /* need to check all range that end after 's' to see if
7423 * any are unacknowledged.
7424 */
7425 while (lo >= 0 &&
7426 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7427 if (BB_OFFSET(p[lo]) < target) {
7428 /* starts before the end, and finishes after
7429 * the start, so they must overlap
7430 */
7431 if (rv != -1 && BB_ACK(p[lo]))
7432 rv = 1;
7433 else
7434 rv = -1;
7435 *first_bad = BB_OFFSET(p[lo]);
7436 *bad_sectors = BB_LEN(p[lo]);
7437 }
7438 lo--;
7439 }
7440 }
7441
7442 if (read_seqretry(&bb->lock, seq))
7443 goto retry;
7444
7445 return rv;
7446}
7447EXPORT_SYMBOL_GPL(md_is_badblock);
7448
7449/*
7450 * Add a range of bad blocks to the table.
7451 * This might extend the table, or might contract it
7452 * if two adjacent ranges can be merged.
7453 * We binary-search to find the 'insertion' point, then
7454 * decide how best to handle it.
7455 */
7456static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
7457 int acknowledged)
7458{
7459 u64 *p;
7460 int lo, hi;
7461 int rv = 1;
7462
7463 if (bb->shift < 0)
7464 /* badblocks are disabled */
7465 return 0;
7466
7467 if (bb->shift) {
7468 /* round the start down, and the end up */
7469 sector_t next = s + sectors;
7470 s >>= bb->shift;
7471 next += (1<<bb->shift) - 1;
7472 next >>= bb->shift;
7473 sectors = next - s;
7474 }
7475
7476 write_seqlock_irq(&bb->lock);
7477
7478 p = bb->page;
7479 lo = 0;
7480 hi = bb->count;
7481 /* Find the last range that starts at-or-before 's' */
7482 while (hi - lo > 1) {
7483 int mid = (lo + hi) / 2;
7484 sector_t a = BB_OFFSET(p[mid]);
7485 if (a <= s)
7486 lo = mid;
7487 else
7488 hi = mid;
7489 }
7490 if (hi > lo && BB_OFFSET(p[lo]) > s)
7491 hi = lo;
7492
7493 if (hi > lo) {
7494 /* we found a range that might merge with the start
7495 * of our new range
7496 */
7497 sector_t a = BB_OFFSET(p[lo]);
7498 sector_t e = a + BB_LEN(p[lo]);
7499 int ack = BB_ACK(p[lo]);
7500 if (e >= s) {
7501 /* Yes, we can merge with a previous range */
7502 if (s == a && s + sectors >= e)
7503 /* new range covers old */
7504 ack = acknowledged;
7505 else
7506 ack = ack && acknowledged;
7507
7508 if (e < s + sectors)
7509 e = s + sectors;
7510 if (e - a <= BB_MAX_LEN) {
7511 p[lo] = BB_MAKE(a, e-a, ack);
7512 s = e;
7513 } else {
7514 /* does not all fit in one range,
7515 * make p[lo] maximal
7516 */
7517 if (BB_LEN(p[lo]) != BB_MAX_LEN)
7518 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
7519 s = a + BB_MAX_LEN;
7520 }
7521 sectors = e - s;
7522 }
7523 }
7524 if (sectors && hi < bb->count) {
7525 /* 'hi' points to the first range that starts after 's'.
7526 * Maybe we can merge with the start of that range */
7527 sector_t a = BB_OFFSET(p[hi]);
7528 sector_t e = a + BB_LEN(p[hi]);
7529 int ack = BB_ACK(p[hi]);
7530 if (a <= s + sectors) {
7531 /* merging is possible */
7532 if (e <= s + sectors) {
7533 /* full overlap */
7534 e = s + sectors;
7535 ack = acknowledged;
7536 } else
7537 ack = ack && acknowledged;
7538
7539 a = s;
7540 if (e - a <= BB_MAX_LEN) {
7541 p[hi] = BB_MAKE(a, e-a, ack);
7542 s = e;
7543 } else {
7544 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
7545 s = a + BB_MAX_LEN;
7546 }
7547 sectors = e - s;
7548 lo = hi;
7549 hi++;
7550 }
7551 }
7552 if (sectors == 0 && hi < bb->count) {
7553 /* we might be able to combine lo and hi */
7554 /* Note: 's' is at the end of 'lo' */
7555 sector_t a = BB_OFFSET(p[hi]);
7556 int lolen = BB_LEN(p[lo]);
7557 int hilen = BB_LEN(p[hi]);
7558 int newlen = lolen + hilen - (s - a);
7559 if (s >= a && newlen < BB_MAX_LEN) {
7560 /* yes, we can combine them */
7561 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
7562 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
7563 memmove(p + hi, p + hi + 1,
7564 (bb->count - hi - 1) * 8);
7565 bb->count--;
7566 }
7567 }
7568 while (sectors) {
7569 /* didn't merge (it all).
7570 * Need to add a range just before 'hi' */
7571 if (bb->count >= MD_MAX_BADBLOCKS) {
7572 /* No room for more */
7573 rv = 0;
7574 break;
7575 } else {
7576 int this_sectors = sectors;
7577 memmove(p + hi + 1, p + hi,
7578 (bb->count - hi) * 8);
7579 bb->count++;
7580
7581 if (this_sectors > BB_MAX_LEN)
7582 this_sectors = BB_MAX_LEN;
7583 p[hi] = BB_MAKE(s, this_sectors, acknowledged);
7584 sectors -= this_sectors;
7585 s += this_sectors;
7586 }
7587 }
7588
7589 bb->changed = 1;
7590 write_sequnlock_irq(&bb->lock);
7591
7592 return rv;
7593}
7594
7595int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
7596 int acknowledged)
7597{
7598 int rv = md_set_badblocks(&rdev->badblocks,
7599 s + rdev->data_offset, sectors, acknowledged);
7600 if (rv) {
7601 /* Make sure they get written out promptly */
7602 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
7603 md_wakeup_thread(rdev->mddev->thread);
7604 }
7605 return rv;
7606}
7607EXPORT_SYMBOL_GPL(rdev_set_badblocks);
7608
7609/*
7610 * Remove a range of bad blocks from the table.
7611 * This may involve extending the table if we spilt a region,
7612 * but it must not fail. So if the table becomes full, we just
7613 * drop the remove request.
7614 */
7615static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
7616{
7617 u64 *p;
7618 int lo, hi;
7619 sector_t target = s + sectors;
7620 int rv = 0;
7621
7622 if (bb->shift > 0) {
7623 /* When clearing we round the start up and the end down.
7624 * This should not matter as the shift should align with
7625 * the block size and no rounding should ever be needed.
7626 * However it is better the think a block is bad when it
7627 * isn't than to think a block is not bad when it is.
7628 */
7629 s += (1<<bb->shift) - 1;
7630 s >>= bb->shift;
7631 target >>= bb->shift;
7632 sectors = target - s;
7633 }
7634
7635 write_seqlock_irq(&bb->lock);
7636
7637 p = bb->page;
7638 lo = 0;
7639 hi = bb->count;
7640 /* Find the last range that starts before 'target' */
7641 while (hi - lo > 1) {
7642 int mid = (lo + hi) / 2;
7643 sector_t a = BB_OFFSET(p[mid]);
7644 if (a < target)
7645 lo = mid;
7646 else
7647 hi = mid;
7648 }
7649 if (hi > lo) {
7650 /* p[lo] is the last range that could overlap the
7651 * current range. Earlier ranges could also overlap,
7652 * but only this one can overlap the end of the range.
7653 */
7654 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
7655 /* Partial overlap, leave the tail of this range */
7656 int ack = BB_ACK(p[lo]);
7657 sector_t a = BB_OFFSET(p[lo]);
7658 sector_t end = a + BB_LEN(p[lo]);
7659
7660 if (a < s) {
7661 /* we need to split this range */
7662 if (bb->count >= MD_MAX_BADBLOCKS) {
7663 rv = 0;
7664 goto out;
7665 }
7666 memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
7667 bb->count++;
7668 p[lo] = BB_MAKE(a, s-a, ack);
7669 lo++;
7670 }
7671 p[lo] = BB_MAKE(target, end - target, ack);
7672 /* there is no longer an overlap */
7673 hi = lo;
7674 lo--;
7675 }
7676 while (lo >= 0 &&
7677 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7678 /* This range does overlap */
7679 if (BB_OFFSET(p[lo]) < s) {
7680 /* Keep the early parts of this range. */
7681 int ack = BB_ACK(p[lo]);
7682 sector_t start = BB_OFFSET(p[lo]);
7683 p[lo] = BB_MAKE(start, s - start, ack);
7684 /* now low doesn't overlap, so.. */
7685 break;
7686 }
7687 lo--;
7688 }
7689 /* 'lo' is strictly before, 'hi' is strictly after,
7690 * anything between needs to be discarded
7691 */
7692 if (hi - lo > 1) {
7693 memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
7694 bb->count -= (hi - lo - 1);
7695 }
7696 }
7697
7698 bb->changed = 1;
7699out:
7700 write_sequnlock_irq(&bb->lock);
7701 return rv;
7702}
7703
7704int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors)
7705{
7706 return md_clear_badblocks(&rdev->badblocks,
7707 s + rdev->data_offset,
7708 sectors);
7709}
7710EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
7711
7712/*
7713 * Acknowledge all bad blocks in a list.
7714 * This only succeeds if ->changed is clear. It is used by
7715 * in-kernel metadata updates
7716 */
7717void md_ack_all_badblocks(struct badblocks *bb)
7718{
7719 if (bb->page == NULL || bb->changed)
7720 /* no point even trying */
7721 return;
7722 write_seqlock_irq(&bb->lock);
7723
7724 if (bb->changed == 0) {
7725 u64 *p = bb->page;
7726 int i;
7727 for (i = 0; i < bb->count ; i++) {
7728 if (!BB_ACK(p[i])) {
7729 sector_t start = BB_OFFSET(p[i]);
7730 int len = BB_LEN(p[i]);
7731 p[i] = BB_MAKE(start, len, 1);
7732 }
7733 }
7734 }
7735 write_sequnlock_irq(&bb->lock);
7736}
7737EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
7738
7330static int md_notify_reboot(struct notifier_block *this, 7739static int md_notify_reboot(struct notifier_block *this,
7331 unsigned long code, void *x) 7740 unsigned long code, void *x)
7332{ 7741{
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7d906a96477a..85af8433f8b8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,6 +29,13 @@
29typedef struct mddev_s mddev_t; 29typedef struct mddev_s mddev_t;
30typedef struct mdk_rdev_s mdk_rdev_t; 30typedef struct mdk_rdev_s mdk_rdev_t;
31 31
32/* Bad block numbers are stored sorted in a single page.
33 * 64bits is used for each block or extent.
34 * 54 bits are sector number, 9 bits are extent size,
35 * 1 bit is an 'acknowledged' flag.
36 */
37#define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
38
32/* 39/*
33 * MD's 'extended' device 40 * MD's 'extended' device
34 */ 41 */
@@ -111,8 +118,47 @@ struct mdk_rdev_s
111 118
112 struct sysfs_dirent *sysfs_state; /* handle for 'state' 119 struct sysfs_dirent *sysfs_state; /* handle for 'state'
113 * sysfs entry */ 120 * sysfs entry */
121
122 struct badblocks {
123 int count; /* count of bad blocks */
124 int shift; /* shift from sectors to block size
125 * a -ve shift means badblocks are
126 * disabled.*/
127 u64 *page; /* badblock list */
128 int changed;
129 seqlock_t lock;
130 } badblocks;
114}; 131};
115 132
/* Layout of one 64-bit bad-block table entry:
 *   bits  0-8  length of the range minus one (so 1..BB_MAX_LEN)
 *   bits  9-62 start of the range
 *   bit  63    'acknowledged' flag
 */
#define BB_LEN_MASK	(0x00000000000001FFULL)
#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK	(0x8000000000000000ULL)
#define BB_MAX_LEN	512
#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
/* Widen the offset to 64 bits before shifting so that a narrower
 * argument type (e.g. a 32-bit sector_t) cannot lose its top bits.
 */
#define BB_MAKE(a, l, ack) (((u64)(a) << 9) | ((l) - 1) | ((u64)(!!(ack)) << 63))
141
142extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
143 sector_t *first_bad, int *bad_sectors);
144static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors,
145 sector_t *first_bad, int *bad_sectors)
146{
147 if (unlikely(rdev->badblocks.count)) {
148 int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
149 sectors,
150 first_bad, bad_sectors);
151 if (rv)
152 *first_bad -= rdev->data_offset;
153 return rv;
154 }
155 return 0;
156}
157extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
158 int acknowledged);
159extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors);
160extern void md_ack_all_badblocks(struct badblocks *bb);
161
116struct mddev_s 162struct mddev_s
117{ 163{
118 void *private; 164 void *private;
@@ -517,7 +563,7 @@ extern void mddev_init(mddev_t *mddev);
517extern int md_run(mddev_t *mddev); 563extern int md_run(mddev_t *mddev);
518extern void md_stop(mddev_t *mddev); 564extern void md_stop(mddev_t *mddev);
519extern void md_stop_writes(mddev_t *mddev); 565extern void md_stop_writes(mddev_t *mddev);
520extern void md_rdev_init(mdk_rdev_t *rdev); 566extern int md_rdev_init(mdk_rdev_t *rdev);
521 567
522extern void mddev_suspend(mddev_t *mddev); 568extern void mddev_suspend(mddev_t *mddev);
523extern void mddev_resume(mddev_t *mddev); 569extern void mddev_resume(mddev_t *mddev);