diff options
author | Dan Williams <dan.j.williams@intel.com> | 2008-04-30 03:52:32 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-30 11:29:33 -0400 |
commit | 6bfe0b499082fd3950429017cd8ebf2a6c458aa5 (patch) | |
tree | 81476cf7f7ddbea135bdb93729e0bffae0e7c163 /drivers/md | |
parent | 11e2ede0228ee0f81ccacd15894908c3bf241f73 (diff) |
md: support blocking writes to an array on device failure
Allows a userspace metadata handler to take action upon detecting a device
failure.
Based on an original patch by Neil Brown.
Changes:
-added blocked_wait waitqueue to rdev
-don't qualify Blocked with Faulty; always let userspace block writes
-added md_wait_for_blocked_rdev to wait for the Blocked flag on the device to
be cleared; if userspace misses the notification, another one is sent every 5 seconds
-set MD_RECOVERY_NEEDED after clearing "blocked"
-kill DoBlock flag, just test mddev->external
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/md.c | 33 | ||||
-rw-r--r-- | drivers/md/raid1.c | 27 | ||||
-rw-r--r-- | drivers/md/raid10.c | 29 | ||||
-rw-r--r-- | drivers/md/raid5.c | 33 |
4 files changed, 115 insertions, 7 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index bec00b201a73..83eb78b00137 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -1828,6 +1828,10 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
1828 | len += sprintf(page+len, "%swrite_mostly",sep); | 1828 | len += sprintf(page+len, "%swrite_mostly",sep); |
1829 | sep = ","; | 1829 | sep = ","; |
1830 | } | 1830 | } |
1831 | if (test_bit(Blocked, &rdev->flags)) { | ||
1832 | len += sprintf(page+len, "%sblocked", sep); | ||
1833 | sep = ","; | ||
1834 | } | ||
1831 | if (!test_bit(Faulty, &rdev->flags) && | 1835 | if (!test_bit(Faulty, &rdev->flags) && |
1832 | !test_bit(In_sync, &rdev->flags)) { | 1836 | !test_bit(In_sync, &rdev->flags)) { |
1833 | len += sprintf(page+len, "%sspare", sep); | 1837 | len += sprintf(page+len, "%sspare", sep); |
@@ -1844,6 +1848,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
1844 | * remove - disconnects the device | 1848 | * remove - disconnects the device |
1845 | * writemostly - sets write_mostly | 1849 | * writemostly - sets write_mostly |
1846 | * -writemostly - clears write_mostly | 1850 | * -writemostly - clears write_mostly |
1851 | * blocked - sets the Blocked flag | ||
1852 | * -blocked - clears the Blocked flag | ||
1847 | */ | 1853 | */ |
1848 | int err = -EINVAL; | 1854 | int err = -EINVAL; |
1849 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { | 1855 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { |
@@ -1866,6 +1872,16 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
1866 | } else if (cmd_match(buf, "-writemostly")) { | 1872 | } else if (cmd_match(buf, "-writemostly")) { |
1867 | clear_bit(WriteMostly, &rdev->flags); | 1873 | clear_bit(WriteMostly, &rdev->flags); |
1868 | err = 0; | 1874 | err = 0; |
1875 | } else if (cmd_match(buf, "blocked")) { | ||
1876 | set_bit(Blocked, &rdev->flags); | ||
1877 | err = 0; | ||
1878 | } else if (cmd_match(buf, "-blocked")) { | ||
1879 | clear_bit(Blocked, &rdev->flags); | ||
1880 | wake_up(&rdev->blocked_wait); | ||
1881 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | ||
1882 | md_wakeup_thread(rdev->mddev->thread); | ||
1883 | |||
1884 | err = 0; | ||
1869 | } | 1885 | } |
1870 | return err ? err : len; | 1886 | return err ? err : len; |
1871 | } | 1887 | } |
@@ -2194,7 +2210,9 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi | |||
2194 | goto abort_free; | 2210 | goto abort_free; |
2195 | } | 2211 | } |
2196 | } | 2212 | } |
2213 | |||
2197 | INIT_LIST_HEAD(&rdev->same_set); | 2214 | INIT_LIST_HEAD(&rdev->same_set); |
2215 | init_waitqueue_head(&rdev->blocked_wait); | ||
2198 | 2216 | ||
2199 | return rdev; | 2217 | return rdev; |
2200 | 2218 | ||
@@ -4958,6 +4976,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
4958 | 4976 | ||
4959 | if (!rdev || test_bit(Faulty, &rdev->flags)) | 4977 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
4960 | return; | 4978 | return; |
4979 | |||
4980 | if (mddev->external) | ||
4981 | set_bit(Blocked, &rdev->flags); | ||
4961 | /* | 4982 | /* |
4962 | dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", | 4983 | dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", |
4963 | mdname(mddev), | 4984 | mdname(mddev), |
@@ -5760,7 +5781,7 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
5760 | 5781 | ||
5761 | rdev_for_each(rdev, rtmp, mddev) | 5782 | rdev_for_each(rdev, rtmp, mddev) |
5762 | if (rdev->raid_disk >= 0 && | 5783 | if (rdev->raid_disk >= 0 && |
5763 | !mddev->external && | 5784 | !test_bit(Blocked, &rdev->flags) && |
5764 | (test_bit(Faulty, &rdev->flags) || | 5785 | (test_bit(Faulty, &rdev->flags) || |
5765 | ! test_bit(In_sync, &rdev->flags)) && | 5786 | ! test_bit(In_sync, &rdev->flags)) && |
5766 | atomic_read(&rdev->nr_pending)==0) { | 5787 | atomic_read(&rdev->nr_pending)==0) { |
@@ -5959,6 +5980,16 @@ void md_check_recovery(mddev_t *mddev) | |||
5959 | } | 5980 | } |
5960 | } | 5981 | } |
5961 | 5982 | ||
5983 | void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) | ||
5984 | { | ||
5985 | sysfs_notify(&rdev->kobj, NULL, "state"); | ||
5986 | wait_event_timeout(rdev->blocked_wait, | ||
5987 | !test_bit(Blocked, &rdev->flags), | ||
5988 | msecs_to_jiffies(5000)); | ||
5989 | rdev_dec_pending(rdev, mddev); | ||
5990 | } | ||
5991 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); | ||
5992 | |||
5962 | static int md_notify_reboot(struct notifier_block *this, | 5993 | static int md_notify_reboot(struct notifier_block *this, |
5963 | unsigned long code, void *x) | 5994 | unsigned long code, void *x) |
5964 | { | 5995 | { |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 9fd473a6dbf5..6778b7cb39bd 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -773,7 +773,6 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
773 | r1bio_t *r1_bio; | 773 | r1bio_t *r1_bio; |
774 | struct bio *read_bio; | 774 | struct bio *read_bio; |
775 | int i, targets = 0, disks; | 775 | int i, targets = 0, disks; |
776 | mdk_rdev_t *rdev; | ||
777 | struct bitmap *bitmap = mddev->bitmap; | 776 | struct bitmap *bitmap = mddev->bitmap; |
778 | unsigned long flags; | 777 | unsigned long flags; |
779 | struct bio_list bl; | 778 | struct bio_list bl; |
@@ -781,6 +780,7 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
781 | const int rw = bio_data_dir(bio); | 780 | const int rw = bio_data_dir(bio); |
782 | const int do_sync = bio_sync(bio); | 781 | const int do_sync = bio_sync(bio); |
783 | int do_barriers; | 782 | int do_barriers; |
783 | mdk_rdev_t *blocked_rdev; | ||
784 | 784 | ||
785 | /* | 785 | /* |
786 | * Register the new request and wait if the reconstruction | 786 | * Register the new request and wait if the reconstruction |
@@ -862,10 +862,17 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
862 | first = 0; | 862 | first = 0; |
863 | } | 863 | } |
864 | #endif | 864 | #endif |
865 | retry_write: | ||
866 | blocked_rdev = NULL; | ||
865 | rcu_read_lock(); | 867 | rcu_read_lock(); |
866 | for (i = 0; i < disks; i++) { | 868 | for (i = 0; i < disks; i++) { |
867 | if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL && | 869 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); |
868 | !test_bit(Faulty, &rdev->flags)) { | 870 | if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { |
871 | atomic_inc(&rdev->nr_pending); | ||
872 | blocked_rdev = rdev; | ||
873 | break; | ||
874 | } | ||
875 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | ||
869 | atomic_inc(&rdev->nr_pending); | 876 | atomic_inc(&rdev->nr_pending); |
870 | if (test_bit(Faulty, &rdev->flags)) { | 877 | if (test_bit(Faulty, &rdev->flags)) { |
871 | rdev_dec_pending(rdev, mddev); | 878 | rdev_dec_pending(rdev, mddev); |
@@ -878,6 +885,20 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
878 | } | 885 | } |
879 | rcu_read_unlock(); | 886 | rcu_read_unlock(); |
880 | 887 | ||
888 | if (unlikely(blocked_rdev)) { | ||
889 | /* Wait for this device to become unblocked */ | ||
890 | int j; | ||
891 | |||
892 | for (j = 0; j < i; j++) | ||
893 | if (r1_bio->bios[j]) | ||
894 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); | ||
895 | |||
896 | allow_barrier(conf); | ||
897 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | ||
898 | wait_barrier(conf); | ||
899 | goto retry_write; | ||
900 | } | ||
901 | |||
881 | BUG_ON(targets == 0); /* we never fail the last device */ | 902 | BUG_ON(targets == 0); /* we never fail the last device */ |
882 | 903 | ||
883 | if (targets < conf->raid_disks) { | 904 | if (targets < conf->raid_disks) { |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1e96aa3ff513..5938fa962922 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -790,6 +790,7 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
790 | const int do_sync = bio_sync(bio); | 790 | const int do_sync = bio_sync(bio); |
791 | struct bio_list bl; | 791 | struct bio_list bl; |
792 | unsigned long flags; | 792 | unsigned long flags; |
793 | mdk_rdev_t *blocked_rdev; | ||
793 | 794 | ||
794 | if (unlikely(bio_barrier(bio))) { | 795 | if (unlikely(bio_barrier(bio))) { |
795 | bio_endio(bio, -EOPNOTSUPP); | 796 | bio_endio(bio, -EOPNOTSUPP); |
@@ -879,17 +880,23 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
879 | /* | 880 | /* |
880 | * WRITE: | 881 | * WRITE: |
881 | */ | 882 | */ |
882 | /* first select target devices under spinlock and | 883 | /* first select target devices under rcu_lock and |
883 | * inc refcount on their rdev. Record them by setting | 884 | * inc refcount on their rdev. Record them by setting |
884 | * bios[x] to bio | 885 | * bios[x] to bio |
885 | */ | 886 | */ |
886 | raid10_find_phys(conf, r10_bio); | 887 | raid10_find_phys(conf, r10_bio); |
888 | retry_write: | ||
889 | blocked_rdev = 0; | ||
887 | rcu_read_lock(); | 890 | rcu_read_lock(); |
888 | for (i = 0; i < conf->copies; i++) { | 891 | for (i = 0; i < conf->copies; i++) { |
889 | int d = r10_bio->devs[i].devnum; | 892 | int d = r10_bio->devs[i].devnum; |
890 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); | 893 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); |
891 | if (rdev && | 894 | if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { |
892 | !test_bit(Faulty, &rdev->flags)) { | 895 | atomic_inc(&rdev->nr_pending); |
896 | blocked_rdev = rdev; | ||
897 | break; | ||
898 | } | ||
899 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | ||
893 | atomic_inc(&rdev->nr_pending); | 900 | atomic_inc(&rdev->nr_pending); |
894 | r10_bio->devs[i].bio = bio; | 901 | r10_bio->devs[i].bio = bio; |
895 | } else { | 902 | } else { |
@@ -899,6 +906,22 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
899 | } | 906 | } |
900 | rcu_read_unlock(); | 907 | rcu_read_unlock(); |
901 | 908 | ||
909 | if (unlikely(blocked_rdev)) { | ||
910 | /* Have to wait for this device to get unblocked, then retry */ | ||
911 | int j; | ||
912 | int d; | ||
913 | |||
914 | for (j = 0; j < i; j++) | ||
915 | if (r10_bio->devs[j].bio) { | ||
916 | d = r10_bio->devs[j].devnum; | ||
917 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | ||
918 | } | ||
919 | allow_barrier(conf); | ||
920 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | ||
921 | wait_barrier(conf); | ||
922 | goto retry_write; | ||
923 | } | ||
924 | |||
902 | atomic_set(&r10_bio->remaining, 0); | 925 | atomic_set(&r10_bio->remaining, 0); |
903 | 926 | ||
904 | bio_list_init(&bl); | 927 | bio_list_init(&bl); |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 968dacaced6d..087eee0cb809 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -2607,6 +2607,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2607 | } | 2607 | } |
2608 | } | 2608 | } |
2609 | 2609 | ||
2610 | |||
2610 | /* | 2611 | /* |
2611 | * handle_stripe - do things to a stripe. | 2612 | * handle_stripe - do things to a stripe. |
2612 | * | 2613 | * |
@@ -2632,6 +2633,7 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2632 | struct stripe_head_state s; | 2633 | struct stripe_head_state s; |
2633 | struct r5dev *dev; | 2634 | struct r5dev *dev; |
2634 | unsigned long pending = 0; | 2635 | unsigned long pending = 0; |
2636 | mdk_rdev_t *blocked_rdev = NULL; | ||
2635 | 2637 | ||
2636 | memset(&s, 0, sizeof(s)); | 2638 | memset(&s, 0, sizeof(s)); |
2637 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " | 2639 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " |
@@ -2691,6 +2693,11 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2691 | if (dev->written) | 2693 | if (dev->written) |
2692 | s.written++; | 2694 | s.written++; |
2693 | rdev = rcu_dereference(conf->disks[i].rdev); | 2695 | rdev = rcu_dereference(conf->disks[i].rdev); |
2696 | if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | ||
2697 | blocked_rdev = rdev; | ||
2698 | atomic_inc(&rdev->nr_pending); | ||
2699 | break; | ||
2700 | } | ||
2694 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { | 2701 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { |
2695 | /* The ReadError flag will just be confusing now */ | 2702 | /* The ReadError flag will just be confusing now */ |
2696 | clear_bit(R5_ReadError, &dev->flags); | 2703 | clear_bit(R5_ReadError, &dev->flags); |
@@ -2705,6 +2712,11 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2705 | } | 2712 | } |
2706 | rcu_read_unlock(); | 2713 | rcu_read_unlock(); |
2707 | 2714 | ||
2715 | if (unlikely(blocked_rdev)) { | ||
2716 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2717 | goto unlock; | ||
2718 | } | ||
2719 | |||
2708 | if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) | 2720 | if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) |
2709 | sh->ops.count++; | 2721 | sh->ops.count++; |
2710 | 2722 | ||
@@ -2894,8 +2906,13 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2894 | if (sh->ops.count) | 2906 | if (sh->ops.count) |
2895 | pending = get_stripe_work(sh); | 2907 | pending = get_stripe_work(sh); |
2896 | 2908 | ||
2909 | unlock: | ||
2897 | spin_unlock(&sh->lock); | 2910 | spin_unlock(&sh->lock); |
2898 | 2911 | ||
2912 | /* wait for this device to become unblocked */ | ||
2913 | if (unlikely(blocked_rdev)) | ||
2914 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | ||
2915 | |||
2899 | if (pending) | 2916 | if (pending) |
2900 | raid5_run_ops(sh, pending); | 2917 | raid5_run_ops(sh, pending); |
2901 | 2918 | ||
@@ -2912,6 +2929,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
2912 | struct stripe_head_state s; | 2929 | struct stripe_head_state s; |
2913 | struct r6_state r6s; | 2930 | struct r6_state r6s; |
2914 | struct r5dev *dev, *pdev, *qdev; | 2931 | struct r5dev *dev, *pdev, *qdev; |
2932 | mdk_rdev_t *blocked_rdev = NULL; | ||
2915 | 2933 | ||
2916 | r6s.qd_idx = raid6_next_disk(pd_idx, disks); | 2934 | r6s.qd_idx = raid6_next_disk(pd_idx, disks); |
2917 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " | 2935 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " |
@@ -2975,6 +2993,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
2975 | if (dev->written) | 2993 | if (dev->written) |
2976 | s.written++; | 2994 | s.written++; |
2977 | rdev = rcu_dereference(conf->disks[i].rdev); | 2995 | rdev = rcu_dereference(conf->disks[i].rdev); |
2996 | if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | ||
2997 | blocked_rdev = rdev; | ||
2998 | atomic_inc(&rdev->nr_pending); | ||
2999 | break; | ||
3000 | } | ||
2978 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { | 3001 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { |
2979 | /* The ReadError flag will just be confusing now */ | 3002 | /* The ReadError flag will just be confusing now */ |
2980 | clear_bit(R5_ReadError, &dev->flags); | 3003 | clear_bit(R5_ReadError, &dev->flags); |
@@ -2989,6 +3012,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
2989 | set_bit(R5_Insync, &dev->flags); | 3012 | set_bit(R5_Insync, &dev->flags); |
2990 | } | 3013 | } |
2991 | rcu_read_unlock(); | 3014 | rcu_read_unlock(); |
3015 | |||
3016 | if (unlikely(blocked_rdev)) { | ||
3017 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3018 | goto unlock; | ||
3019 | } | ||
2992 | pr_debug("locked=%d uptodate=%d to_read=%d" | 3020 | pr_debug("locked=%d uptodate=%d to_read=%d" |
2993 | " to_write=%d failed=%d failed_num=%d,%d\n", | 3021 | " to_write=%d failed=%d failed_num=%d,%d\n", |
2994 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, | 3022 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, |
@@ -3094,8 +3122,13 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3094 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) | 3122 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) |
3095 | handle_stripe_expansion(conf, sh, &r6s); | 3123 | handle_stripe_expansion(conf, sh, &r6s); |
3096 | 3124 | ||
3125 | unlock: | ||
3097 | spin_unlock(&sh->lock); | 3126 | spin_unlock(&sh->lock); |
3098 | 3127 | ||
3128 | /* wait for this device to become unblocked */ | ||
3129 | if (unlikely(blocked_rdev)) | ||
3130 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | ||
3131 | |||
3099 | return_io(return_bi); | 3132 | return_io(return_bi); |
3100 | 3133 | ||
3101 | for (i=disks; i-- ;) { | 3134 | for (i=disks; i-- ;) { |