author		NeilBrown <neilb@suse.de>	2011-07-27 21:31:46 -0400
committer	NeilBrown <neilb@suse.de>	2011-07-27 21:31:46 -0400
commit		2230dfe4ccc3add340dc6d437965b2de1d269fde (patch)
tree		fc45b727ad2e1a148e7d20f327b45a3afc474e9d /drivers/md/md.c
parent		a519b26dbe6533416d21b552053b0bf687f878d7 (diff)
md: beginnings of bad block management.
This is the first step in allowing md to track bad blocks per-device so
that we can fail individual blocks rather than the whole device.

This patch just adds a data structure for recording bad blocks, with
routines to add, remove, and search the list.
Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
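
For reference, the table these routines manage is defined in the companion md.h change, which this md.c-only diffstat does not show. The following is a minimal sketch of that layout, reconstructed from the comment block added to md.c below; treat the exact names and mask values as assumptions to be checked against the full patch:

	/* Sketch of the companion md.h hunk (not shown in this diffstat):
	 * each table entry packs a length (max 512 sectors), a 54-bit start
	 * offset and an 'acknowledged' bit into one u64.
	 */
	#define BB_LEN_MASK	(0x00000000000001FFULL)	/* bits 0-8: length - 1 */
	#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)	/* bits 9-62: start sector */
	#define BB_ACK_MASK	(0x8000000000000000ULL)	/* bit 63: acknowledged */
	#define BB_MAX_LEN	512
	#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
	#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
	#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
	#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))

	struct badblocks {
		int		count;		/* number of entries in 'page' */
		int		shift;		/* sectors-to-block shift; -1 disables */
		u64		*page;		/* one page of BB_MAKE()-packed entries */
		int		changed;	/* unwritten changes pending */
		seqlock_t	lock;
	};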
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--	drivers/md/md.c | 415
1 file changed, 412 insertions(+), 3 deletions(-)
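
Before the diff itself, a hypothetical caller-side sketch of how a RAID personality might consult the new table before issuing I/O. The helper name check_then_read and the -EIO policy are invented for illustration; only md_is_badblock(), rdev->badblocks and rdev->data_offset come from this patch, and the data_offset addition mirrors what rdev_set_badblocks() does when it stores a range.

	/* Hypothetical sketch, not part of this patch: 'sector' is relative to
	 * the start of the rdev's data area, so add data_offset to query the
	 * same address space that rdev_set_badblocks() stores into.
	 */
	static int check_then_read(mdk_rdev_t *rdev, sector_t sector, int sectors)
	{
		sector_t first_bad;
		int bad_sectors;

		if (md_is_badblock(&rdev->badblocks, sector + rdev->data_offset,
				   sectors, &first_bad, &bad_sectors))
			return -EIO;	/* overlaps a known-bad range */
		/* ...safe to issue the read here... */
		return 0;
	}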
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4279b3b58d1a..463a392c0705 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1952,6 +1952,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 	sysfs_remove_link(&rdev->kobj, "block");
 	sysfs_put(rdev->sysfs_state);
 	rdev->sysfs_state = NULL;
+	kfree(rdev->badblocks.page);
+	rdev->badblocks.count = 0;
+	rdev->badblocks.page = NULL;
 	/* We need to delay this, otherwise we can deadlock when
 	 * writing to 'remove' to "dev/state". We also need
 	 * to delay it due to rcu usage.
@@ -2778,7 +2781,7 @@ static struct kobj_type rdev_ktype = {
 	.default_attrs	= rdev_default_attrs,
 };
 
-void md_rdev_init(mdk_rdev_t *rdev)
+int md_rdev_init(mdk_rdev_t *rdev)
 {
 	rdev->desc_nr = -1;
 	rdev->saved_raid_disk = -1;
@@ -2794,6 +2797,19 @@ void md_rdev_init(mdk_rdev_t *rdev)
 
 	INIT_LIST_HEAD(&rdev->same_set);
 	init_waitqueue_head(&rdev->blocked_wait);
+
+	/* Add space to store bad block list.
+	 * This reserves the space even on arrays where it cannot
+	 * be used - I wonder if that matters
+	 */
+	rdev->badblocks.count = 0;
+	rdev->badblocks.shift = 0;
+	rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	seqlock_init(&rdev->badblocks.lock);
+	if (rdev->badblocks.page == NULL)
+		return -ENOMEM;
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(md_rdev_init);
 /*
@@ -2819,8 +2835,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 		return ERR_PTR(-ENOMEM);
 	}
 
-	md_rdev_init(rdev);
-	if ((err = alloc_disk_sb(rdev)))
+	err = md_rdev_init(rdev);
+	if (err)
+		goto abort_free;
+	err = alloc_disk_sb(rdev);
+	if (err)
 		goto abort_free;
 
 	err = lock_rdev(rdev, newdev, super_format == -2);
@@ -2865,6 +2884,7 @@ abort_free:
 		unlock_rdev(rdev);
 		free_disk_sb(rdev);
 	}
+	kfree(rdev->badblocks.page);
 	kfree(rdev);
 	return ERR_PTR(err);
 }
@@ -7327,6 +7347,395 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 }
 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
 
+
+/* Bad block management.
+ * We can record which blocks on each device are 'bad' and so just
+ * fail those blocks, or that stripe, rather than the whole device.
+ * Entries in the bad-block table are 64bits wide.  This comprises:
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
+ *  A 'shift' can be set so that larger blocks are tracked and
+ *  consequently larger devices can be covered.
+ * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ *
+ * Locking of the bad-block table uses a seqlock so md_is_badblock
+ * might need to retry if it is very unlucky.
+ * We will sometimes want to check for bad blocks in a bi_end_io function,
+ * so we use the write_seqlock_irq variant.
+ *
+ * When looking for a bad block we specify a range and want to
+ * know if any block in the range is bad.  So we binary-search
+ * to the last range that starts at-or-before the given endpoint,
+ * (or "before the sector after the target range")
+ * then see if it ends after the given start.
+ * We return
+ *  0 if there are no known bad blocks in the range
+ *  1 if there are known bad block which are all acknowledged
+ * -1 if there are bad blocks which have not yet been acknowledged in metadata.
+ * plus the start/length of the first bad section we overlap.
+ */
+int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
+		   sector_t *first_bad, int *bad_sectors)
+{
+	int hi;
+	int lo = 0;
+	u64 *p = bb->page;
+	int rv = 0;
+	sector_t target = s + sectors;
+	unsigned seq;
+
+	if (bb->shift > 0) {
+		/* round the start down, and the end up */
+		s >>= bb->shift;
+		target += (1<<bb->shift) - 1;
+		target >>= bb->shift;
+		sectors = target - s;
+	}
+	/* 'target' is now the first block after the bad range */
+
+retry:
+	seq = read_seqbegin(&bb->lock);
+
+	hi = bb->count;
+
+	/* Binary search between lo and hi for 'target'
+	 * i.e. for the last range that starts before 'target'
+	 */
+	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
+	 * are known not to be the last range before target.
+	 * VARIANT: hi-lo is the number of possible
+	 * ranges, and decreases until it reaches 1
+	 */
+	while (hi - lo > 1) {
+		int mid = (lo + hi) / 2;
+		sector_t a = BB_OFFSET(p[mid]);
+		if (a < target)
+			/* This could still be the one, earlier ranges
+			 * could not. */
+			lo = mid;
+		else
+			/* This and later ranges are definitely out. */
+			hi = mid;
+	}
+	/* 'lo' might be the last that started before target, but 'hi' isn't */
+	if (hi > lo) {
+		/* need to check all range that end after 's' to see if
+		 * any are unacknowledged.
+		 */
+		while (lo >= 0 &&
+		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+			if (BB_OFFSET(p[lo]) < target) {
+				/* starts before the end, and finishes after
+				 * the start, so they must overlap
+				 */
+				if (rv != -1 && BB_ACK(p[lo]))
+					rv = 1;
+				else
+					rv = -1;
+				*first_bad = BB_OFFSET(p[lo]);
+				*bad_sectors = BB_LEN(p[lo]);
+			}
+			lo--;
+		}
+	}
+
+	if (read_seqretry(&bb->lock, seq))
+		goto retry;
+
+	return rv;
+}
+EXPORT_SYMBOL_GPL(md_is_badblock);
+
+/*
+ * Add a range of bad blocks to the table.
+ * This might extend the table, or might contract it
+ * if two adjacent ranges can be merged.
+ * We binary-search to find the 'insertion' point, then
+ * decide how best to handle it.
+ */
+static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
+			    int acknowledged)
+{
+	u64 *p;
+	int lo, hi;
+	int rv = 1;
+
+	if (bb->shift < 0)
+		/* badblocks are disabled */
+		return 0;
+
+	if (bb->shift) {
+		/* round the start down, and the end up */
+		sector_t next = s + sectors;
+		s >>= bb->shift;
+		next += (1<<bb->shift) - 1;
+		next >>= bb->shift;
+		sectors = next - s;
+	}
+
+	write_seqlock_irq(&bb->lock);
+
+	p = bb->page;
+	lo = 0;
+	hi = bb->count;
+	/* Find the last range that starts at-or-before 's' */
+	while (hi - lo > 1) {
+		int mid = (lo + hi) / 2;
+		sector_t a = BB_OFFSET(p[mid]);
+		if (a <= s)
+			lo = mid;
+		else
+			hi = mid;
+	}
+	if (hi > lo && BB_OFFSET(p[lo]) > s)
+		hi = lo;
+
+	if (hi > lo) {
+		/* we found a range that might merge with the start
+		 * of our new range
+		 */
+		sector_t a = BB_OFFSET(p[lo]);
+		sector_t e = a + BB_LEN(p[lo]);
+		int ack = BB_ACK(p[lo]);
+		if (e >= s) {
+			/* Yes, we can merge with a previous range */
+			if (s == a && s + sectors >= e)
+				/* new range covers old */
+				ack = acknowledged;
+			else
+				ack = ack && acknowledged;
+
+			if (e < s + sectors)
+				e = s + sectors;
+			if (e - a <= BB_MAX_LEN) {
+				p[lo] = BB_MAKE(a, e-a, ack);
+				s = e;
+			} else {
+				/* does not all fit in one range,
+				 * make p[lo] maximal
+				 */
+				if (BB_LEN(p[lo]) != BB_MAX_LEN)
+					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
+				s = a + BB_MAX_LEN;
+			}
+			sectors = e - s;
+		}
+	}
+	if (sectors && hi < bb->count) {
+		/* 'hi' points to the first range that starts after 's'.
+		 * Maybe we can merge with the start of that range */
+		sector_t a = BB_OFFSET(p[hi]);
+		sector_t e = a + BB_LEN(p[hi]);
+		int ack = BB_ACK(p[hi]);
+		if (a <= s + sectors) {
+			/* merging is possible */
+			if (e <= s + sectors) {
+				/* full overlap */
+				e = s + sectors;
+				ack = acknowledged;
+			} else
+				ack = ack && acknowledged;
+
+			a = s;
+			if (e - a <= BB_MAX_LEN) {
+				p[hi] = BB_MAKE(a, e-a, ack);
+				s = e;
+			} else {
+				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
+				s = a + BB_MAX_LEN;
+			}
+			sectors = e - s;
+			lo = hi;
+			hi++;
+		}
+	}
+	if (sectors == 0 && hi < bb->count) {
+		/* we might be able to combine lo and hi */
+		/* Note: 's' is at the end of 'lo' */
+		sector_t a = BB_OFFSET(p[hi]);
+		int lolen = BB_LEN(p[lo]);
+		int hilen = BB_LEN(p[hi]);
+		int newlen = lolen + hilen - (s - a);
+		if (s >= a && newlen < BB_MAX_LEN) {
+			/* yes, we can combine them */
+			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
+			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+			memmove(p + hi, p + hi + 1,
+				(bb->count - hi - 1) * 8);
+			bb->count--;
+		}
+	}
+	while (sectors) {
+		/* didn't merge (it all).
+		 * Need to add a range just before 'hi' */
+		if (bb->count >= MD_MAX_BADBLOCKS) {
+			/* No room for more */
+			rv = 0;
+			break;
+		} else {
+			int this_sectors = sectors;
+			memmove(p + hi + 1, p + hi,
+				(bb->count - hi) * 8);
+			bb->count++;
+
+			if (this_sectors > BB_MAX_LEN)
+				this_sectors = BB_MAX_LEN;
+			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
+			sectors -= this_sectors;
+			s += this_sectors;
+		}
+	}
+
+	bb->changed = 1;
+	write_sequnlock_irq(&bb->lock);
+
+	return rv;
+}
+
+int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
+		       int acknowledged)
+{
+	int rv = md_set_badblocks(&rdev->badblocks,
+				  s + rdev->data_offset, sectors, acknowledged);
+	if (rv) {
+		/* Make sure they get written out promptly */
+		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
+		md_wakeup_thread(rdev->mddev->thread);
+	}
+	return rv;
+}
+EXPORT_SYMBOL_GPL(rdev_set_badblocks);
+
+/*
+ * Remove a range of bad blocks from the table.
+ * This may involve extending the table if we spilt a region,
+ * but it must not fail.  So if the table becomes full, we just
+ * drop the remove request.
+ */
+static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
+{
+	u64 *p;
+	int lo, hi;
+	sector_t target = s + sectors;
+	int rv = 0;
+
+	if (bb->shift > 0) {
+		/* When clearing we round the start up and the end down.
+		 * This should not matter as the shift should align with
+		 * the block size and no rounding should ever be needed.
+		 * However it is better the think a block is bad when it
+		 * isn't than to think a block is not bad when it is.
+		 */
+		s += (1<<bb->shift) - 1;
+		s >>= bb->shift;
+		target >>= bb->shift;
+		sectors = target - s;
+	}
+
+	write_seqlock_irq(&bb->lock);
+
+	p = bb->page;
+	lo = 0;
+	hi = bb->count;
+	/* Find the last range that starts before 'target' */
+	while (hi - lo > 1) {
+		int mid = (lo + hi) / 2;
+		sector_t a = BB_OFFSET(p[mid]);
+		if (a < target)
+			lo = mid;
+		else
+			hi = mid;
+	}
+	if (hi > lo) {
+		/* p[lo] is the last range that could overlap the
+		 * current range.  Earlier ranges could also overlap,
+		 * but only this one can overlap the end of the range.
+		 */
+		if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
+			/* Partial overlap, leave the tail of this range */
+			int ack = BB_ACK(p[lo]);
+			sector_t a = BB_OFFSET(p[lo]);
+			sector_t end = a + BB_LEN(p[lo]);
+
+			if (a < s) {
+				/* we need to split this range */
+				if (bb->count >= MD_MAX_BADBLOCKS) {
+					rv = 0;
+					goto out;
+				}
+				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+				bb->count++;
+				p[lo] = BB_MAKE(a, s-a, ack);
+				lo++;
+			}
+			p[lo] = BB_MAKE(target, end - target, ack);
+			/* there is no longer an overlap */
+			hi = lo;
+			lo--;
+		}
+		while (lo >= 0 &&
+		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+			/* This range does overlap */
+			if (BB_OFFSET(p[lo]) < s) {
+				/* Keep the early parts of this range. */
+				int ack = BB_ACK(p[lo]);
+				sector_t start = BB_OFFSET(p[lo]);
+				p[lo] = BB_MAKE(start, s - start, ack);
+				/* now low doesn't overlap, so.. */
+				break;
+			}
+			lo--;
+		}
+		/* 'lo' is strictly before, 'hi' is strictly after,
+		 * anything between needs to be discarded
+		 */
+		if (hi - lo > 1) {
+			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+			bb->count -= (hi - lo - 1);
+		}
+	}
+
+	bb->changed = 1;
+out:
+	write_sequnlock_irq(&bb->lock);
+	return rv;
+}
+
+int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors)
+{
+	return md_clear_badblocks(&rdev->badblocks,
+				  s + rdev->data_offset,
+				  sectors);
+}
+EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
+
+/*
+ * Acknowledge all bad blocks in a list.
+ * This only succeeds if ->changed is clear.  It is used by
+ * in-kernel metadata updates
+ */
+void md_ack_all_badblocks(struct badblocks *bb)
+{
+	if (bb->page == NULL || bb->changed)
+		/* no point even trying */
+		return;
+	write_seqlock_irq(&bb->lock);
+
+	if (bb->changed == 0) {
+		u64 *p = bb->page;
+		int i;
+		for (i = 0; i < bb->count ; i++) {
+			if (!BB_ACK(p[i])) {
+				sector_t start = BB_OFFSET(p[i]);
+				int len = BB_LEN(p[i]);
+				p[i] = BB_MAKE(start, len, 1);
+			}
+		}
+	}
+	write_sequnlock_irq(&bb->lock);
+}
+EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
+
 static int md_notify_reboot(struct notifier_block *this,
 			    unsigned long code, void *x)
 {