aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Dan Williams <dan.j.williams@intel.com> 2008-04-30 03:52:32 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-04-30 11:29:33 -0400
commit: 6bfe0b499082fd3950429017cd8ebf2a6c458aa5 (patch)
tree: 81476cf7f7ddbea135bdb93729e0bffae0e7c163
parent: 11e2ede0228ee0f81ccacd15894908c3bf241f73 (diff)
md: support blocking writes to an array on device failure
Allows a userspace metadata handler to take action upon detecting a device failure.

Based on an original patch by Neil Brown.

Changes:
- added blocked_wait waitqueue to rdev
- don't qualify Blocked with Faulty; always let userspace block writes
- added md_wait_for_blocked_rdev to wait for the block device to be clear; if userspace misses the notification, another one is sent every 5 seconds
- set MD_RECOVERY_NEEDED after clearing "blocked"
- kill DoBlock flag, just test mddev->external

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- drivers/md/md.c 33
-rw-r--r-- drivers/md/raid1.c 27
-rw-r--r-- drivers/md/raid10.c 29
-rw-r--r-- drivers/md/raid5.c 33
-rw-r--r-- include/linux/raid/md.h 1
-rw-r--r-- include/linux/raid/md_k.h 4
6 files changed, 120 insertions, 7 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index bec00b201a73..83eb78b00137 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1828,6 +1828,10 @@ state_show(mdk_rdev_t *rdev, char *page)
1828 len += sprintf(page+len, "%swrite_mostly",sep); 1828 len += sprintf(page+len, "%swrite_mostly",sep);
1829 sep = ","; 1829 sep = ",";
1830 } 1830 }
1831 if (test_bit(Blocked, &rdev->flags)) {
1832 len += sprintf(page+len, "%sblocked", sep);
1833 sep = ",";
1834 }
1831 if (!test_bit(Faulty, &rdev->flags) && 1835 if (!test_bit(Faulty, &rdev->flags) &&
1832 !test_bit(In_sync, &rdev->flags)) { 1836 !test_bit(In_sync, &rdev->flags)) {
1833 len += sprintf(page+len, "%sspare", sep); 1837 len += sprintf(page+len, "%sspare", sep);
@@ -1844,6 +1848,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1844 * remove - disconnects the device 1848 * remove - disconnects the device
1845 * writemostly - sets write_mostly 1849 * writemostly - sets write_mostly
1846 * -writemostly - clears write_mostly 1850 * -writemostly - clears write_mostly
1851 * blocked - sets the Blocked flag
1852 * -blocked - clears the Blocked flag
1847 */ 1853 */
1848 int err = -EINVAL; 1854 int err = -EINVAL;
1849 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 1855 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
@@ -1866,6 +1872,16 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1866 } else if (cmd_match(buf, "-writemostly")) { 1872 } else if (cmd_match(buf, "-writemostly")) {
1867 clear_bit(WriteMostly, &rdev->flags); 1873 clear_bit(WriteMostly, &rdev->flags);
1868 err = 0; 1874 err = 0;
1875 } else if (cmd_match(buf, "blocked")) {
1876 set_bit(Blocked, &rdev->flags);
1877 err = 0;
1878 } else if (cmd_match(buf, "-blocked")) {
1879 clear_bit(Blocked, &rdev->flags);
1880 wake_up(&rdev->blocked_wait);
1881 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
1882 md_wakeup_thread(rdev->mddev->thread);
1883
1884 err = 0;
1869 } 1885 }
1870 return err ? err : len; 1886 return err ? err : len;
1871} 1887}
@@ -2194,7 +2210,9 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2194 goto abort_free; 2210 goto abort_free;
2195 } 2211 }
2196 } 2212 }
2213
2197 INIT_LIST_HEAD(&rdev->same_set); 2214 INIT_LIST_HEAD(&rdev->same_set);
2215 init_waitqueue_head(&rdev->blocked_wait);
2198 2216
2199 return rdev; 2217 return rdev;
2200 2218
@@ -4958,6 +4976,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
4958 4976
4959 if (!rdev || test_bit(Faulty, &rdev->flags)) 4977 if (!rdev || test_bit(Faulty, &rdev->flags))
4960 return; 4978 return;
4979
4980 if (mddev->external)
4981 set_bit(Blocked, &rdev->flags);
4961/* 4982/*
4962 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 4983 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
4963 mdname(mddev), 4984 mdname(mddev),
@@ -5760,7 +5781,7 @@ static int remove_and_add_spares(mddev_t *mddev)
5760 5781
5761 rdev_for_each(rdev, rtmp, mddev) 5782 rdev_for_each(rdev, rtmp, mddev)
5762 if (rdev->raid_disk >= 0 && 5783 if (rdev->raid_disk >= 0 &&
5763 !mddev->external && 5784 !test_bit(Blocked, &rdev->flags) &&
5764 (test_bit(Faulty, &rdev->flags) || 5785 (test_bit(Faulty, &rdev->flags) ||
5765 ! test_bit(In_sync, &rdev->flags)) && 5786 ! test_bit(In_sync, &rdev->flags)) &&
5766 atomic_read(&rdev->nr_pending)==0) { 5787 atomic_read(&rdev->nr_pending)==0) {
@@ -5959,6 +5980,16 @@ void md_check_recovery(mddev_t *mddev)
5959 } 5980 }
5960} 5981}
5961 5982
5983void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
5984{
5985 sysfs_notify(&rdev->kobj, NULL, "state");
5986 wait_event_timeout(rdev->blocked_wait,
5987 !test_bit(Blocked, &rdev->flags),
5988 msecs_to_jiffies(5000));
5989 rdev_dec_pending(rdev, mddev);
5990}
5991EXPORT_SYMBOL(md_wait_for_blocked_rdev);
5992
5962static int md_notify_reboot(struct notifier_block *this, 5993static int md_notify_reboot(struct notifier_block *this,
5963 unsigned long code, void *x) 5994 unsigned long code, void *x)
5964{ 5995{
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 9fd473a6dbf5..6778b7cb39bd 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -773,7 +773,6 @@ static int make_request(struct request_queue *q, struct bio * bio)
773 r1bio_t *r1_bio; 773 r1bio_t *r1_bio;
774 struct bio *read_bio; 774 struct bio *read_bio;
775 int i, targets = 0, disks; 775 int i, targets = 0, disks;
776 mdk_rdev_t *rdev;
777 struct bitmap *bitmap = mddev->bitmap; 776 struct bitmap *bitmap = mddev->bitmap;
778 unsigned long flags; 777 unsigned long flags;
779 struct bio_list bl; 778 struct bio_list bl;
@@ -781,6 +780,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
781 const int rw = bio_data_dir(bio); 780 const int rw = bio_data_dir(bio);
782 const int do_sync = bio_sync(bio); 781 const int do_sync = bio_sync(bio);
783 int do_barriers; 782 int do_barriers;
783 mdk_rdev_t *blocked_rdev;
784 784
785 /* 785 /*
786 * Register the new request and wait if the reconstruction 786 * Register the new request and wait if the reconstruction
@@ -862,10 +862,17 @@ static int make_request(struct request_queue *q, struct bio * bio)
862 first = 0; 862 first = 0;
863 } 863 }
864#endif 864#endif
865 retry_write:
866 blocked_rdev = NULL;
865 rcu_read_lock(); 867 rcu_read_lock();
866 for (i = 0; i < disks; i++) { 868 for (i = 0; i < disks; i++) {
867 if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL && 869 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
868 !test_bit(Faulty, &rdev->flags)) { 870 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
871 atomic_inc(&rdev->nr_pending);
872 blocked_rdev = rdev;
873 break;
874 }
875 if (rdev && !test_bit(Faulty, &rdev->flags)) {
869 atomic_inc(&rdev->nr_pending); 876 atomic_inc(&rdev->nr_pending);
870 if (test_bit(Faulty, &rdev->flags)) { 877 if (test_bit(Faulty, &rdev->flags)) {
871 rdev_dec_pending(rdev, mddev); 878 rdev_dec_pending(rdev, mddev);
@@ -878,6 +885,20 @@ static int make_request(struct request_queue *q, struct bio * bio)
878 } 885 }
879 rcu_read_unlock(); 886 rcu_read_unlock();
880 887
888 if (unlikely(blocked_rdev)) {
889 /* Wait for this device to become unblocked */
890 int j;
891
892 for (j = 0; j < i; j++)
893 if (r1_bio->bios[j])
894 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
895
896 allow_barrier(conf);
897 md_wait_for_blocked_rdev(blocked_rdev, mddev);
898 wait_barrier(conf);
899 goto retry_write;
900 }
901
881 BUG_ON(targets == 0); /* we never fail the last device */ 902 BUG_ON(targets == 0); /* we never fail the last device */
882 903
883 if (targets < conf->raid_disks) { 904 if (targets < conf->raid_disks) {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1e96aa3ff513..5938fa962922 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -790,6 +790,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
790 const int do_sync = bio_sync(bio); 790 const int do_sync = bio_sync(bio);
791 struct bio_list bl; 791 struct bio_list bl;
792 unsigned long flags; 792 unsigned long flags;
793 mdk_rdev_t *blocked_rdev;
793 794
794 if (unlikely(bio_barrier(bio))) { 795 if (unlikely(bio_barrier(bio))) {
795 bio_endio(bio, -EOPNOTSUPP); 796 bio_endio(bio, -EOPNOTSUPP);
@@ -879,17 +880,23 @@ static int make_request(struct request_queue *q, struct bio * bio)
879 /* 880 /*
880 * WRITE: 881 * WRITE:
881 */ 882 */
882 /* first select target devices under spinlock and 883 /* first select target devices under rcu_lock and
883 * inc refcount on their rdev. Record them by setting 884 * inc refcount on their rdev. Record them by setting
884 * bios[x] to bio 885 * bios[x] to bio
885 */ 886 */
886 raid10_find_phys(conf, r10_bio); 887 raid10_find_phys(conf, r10_bio);
888 retry_write:
889 blocked_rdev = 0;
887 rcu_read_lock(); 890 rcu_read_lock();
888 for (i = 0; i < conf->copies; i++) { 891 for (i = 0; i < conf->copies; i++) {
889 int d = r10_bio->devs[i].devnum; 892 int d = r10_bio->devs[i].devnum;
890 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); 893 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
891 if (rdev && 894 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
892 !test_bit(Faulty, &rdev->flags)) { 895 atomic_inc(&rdev->nr_pending);
896 blocked_rdev = rdev;
897 break;
898 }
899 if (rdev && !test_bit(Faulty, &rdev->flags)) {
893 atomic_inc(&rdev->nr_pending); 900 atomic_inc(&rdev->nr_pending);
894 r10_bio->devs[i].bio = bio; 901 r10_bio->devs[i].bio = bio;
895 } else { 902 } else {
@@ -899,6 +906,22 @@ static int make_request(struct request_queue *q, struct bio * bio)
899 } 906 }
900 rcu_read_unlock(); 907 rcu_read_unlock();
901 908
909 if (unlikely(blocked_rdev)) {
910 /* Have to wait for this device to get unblocked, then retry */
911 int j;
912 int d;
913
914 for (j = 0; j < i; j++)
915 if (r10_bio->devs[j].bio) {
916 d = r10_bio->devs[j].devnum;
917 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
918 }
919 allow_barrier(conf);
920 md_wait_for_blocked_rdev(blocked_rdev, mddev);
921 wait_barrier(conf);
922 goto retry_write;
923 }
924
902 atomic_set(&r10_bio->remaining, 0); 925 atomic_set(&r10_bio->remaining, 0);
903 926
904 bio_list_init(&bl); 927 bio_list_init(&bl);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 968dacaced6d..087eee0cb809 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2607,6 +2607,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2607 } 2607 }
2608} 2608}
2609 2609
2610
2610/* 2611/*
2611 * handle_stripe - do things to a stripe. 2612 * handle_stripe - do things to a stripe.
2612 * 2613 *
@@ -2632,6 +2633,7 @@ static void handle_stripe5(struct stripe_head *sh)
2632 struct stripe_head_state s; 2633 struct stripe_head_state s;
2633 struct r5dev *dev; 2634 struct r5dev *dev;
2634 unsigned long pending = 0; 2635 unsigned long pending = 0;
2636 mdk_rdev_t *blocked_rdev = NULL;
2635 2637
2636 memset(&s, 0, sizeof(s)); 2638 memset(&s, 0, sizeof(s));
2637 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " 2639 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
@@ -2691,6 +2693,11 @@ static void handle_stripe5(struct stripe_head *sh)
2691 if (dev->written) 2693 if (dev->written)
2692 s.written++; 2694 s.written++;
2693 rdev = rcu_dereference(conf->disks[i].rdev); 2695 rdev = rcu_dereference(conf->disks[i].rdev);
2696 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
2697 blocked_rdev = rdev;
2698 atomic_inc(&rdev->nr_pending);
2699 break;
2700 }
2694 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 2701 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
2695 /* The ReadError flag will just be confusing now */ 2702 /* The ReadError flag will just be confusing now */
2696 clear_bit(R5_ReadError, &dev->flags); 2703 clear_bit(R5_ReadError, &dev->flags);
@@ -2705,6 +2712,11 @@ static void handle_stripe5(struct stripe_head *sh)
2705 } 2712 }
2706 rcu_read_unlock(); 2713 rcu_read_unlock();
2707 2714
2715 if (unlikely(blocked_rdev)) {
2716 set_bit(STRIPE_HANDLE, &sh->state);
2717 goto unlock;
2718 }
2719
2708 if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) 2720 if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
2709 sh->ops.count++; 2721 sh->ops.count++;
2710 2722
@@ -2894,8 +2906,13 @@ static void handle_stripe5(struct stripe_head *sh)
2894 if (sh->ops.count) 2906 if (sh->ops.count)
2895 pending = get_stripe_work(sh); 2907 pending = get_stripe_work(sh);
2896 2908
2909 unlock:
2897 spin_unlock(&sh->lock); 2910 spin_unlock(&sh->lock);
2898 2911
2912 /* wait for this device to become unblocked */
2913 if (unlikely(blocked_rdev))
2914 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2915
2899 if (pending) 2916 if (pending)
2900 raid5_run_ops(sh, pending); 2917 raid5_run_ops(sh, pending);
2901 2918
@@ -2912,6 +2929,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2912 struct stripe_head_state s; 2929 struct stripe_head_state s;
2913 struct r6_state r6s; 2930 struct r6_state r6s;
2914 struct r5dev *dev, *pdev, *qdev; 2931 struct r5dev *dev, *pdev, *qdev;
2932 mdk_rdev_t *blocked_rdev = NULL;
2915 2933
2916 r6s.qd_idx = raid6_next_disk(pd_idx, disks); 2934 r6s.qd_idx = raid6_next_disk(pd_idx, disks);
2917 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 2935 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
@@ -2975,6 +2993,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2975 if (dev->written) 2993 if (dev->written)
2976 s.written++; 2994 s.written++;
2977 rdev = rcu_dereference(conf->disks[i].rdev); 2995 rdev = rcu_dereference(conf->disks[i].rdev);
2996 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
2997 blocked_rdev = rdev;
2998 atomic_inc(&rdev->nr_pending);
2999 break;
3000 }
2978 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 3001 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
2979 /* The ReadError flag will just be confusing now */ 3002 /* The ReadError flag will just be confusing now */
2980 clear_bit(R5_ReadError, &dev->flags); 3003 clear_bit(R5_ReadError, &dev->flags);
@@ -2989,6 +3012,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2989 set_bit(R5_Insync, &dev->flags); 3012 set_bit(R5_Insync, &dev->flags);
2990 } 3013 }
2991 rcu_read_unlock(); 3014 rcu_read_unlock();
3015
3016 if (unlikely(blocked_rdev)) {
3017 set_bit(STRIPE_HANDLE, &sh->state);
3018 goto unlock;
3019 }
2992 pr_debug("locked=%d uptodate=%d to_read=%d" 3020 pr_debug("locked=%d uptodate=%d to_read=%d"
2993 " to_write=%d failed=%d failed_num=%d,%d\n", 3021 " to_write=%d failed=%d failed_num=%d,%d\n",
2994 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3022 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -3094,8 +3122,13 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3094 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 3122 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
3095 handle_stripe_expansion(conf, sh, &r6s); 3123 handle_stripe_expansion(conf, sh, &r6s);
3096 3124
3125 unlock:
3097 spin_unlock(&sh->lock); 3126 spin_unlock(&sh->lock);
3098 3127
3128 /* wait for this device to become unblocked */
3129 if (unlikely(blocked_rdev))
3130 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3131
3099 return_io(return_bi); 3132 return_io(return_bi);
3100 3133
3101 for (i=disks; i-- ;) { 3134 for (i=disks; i-- ;) {
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 8ab630b67fcc..81a1a02d4566 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -94,6 +94,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
94extern void md_do_sync(mddev_t *mddev); 94extern void md_do_sync(mddev_t *mddev);
95extern void md_new_event(mddev_t *mddev); 95extern void md_new_event(mddev_t *mddev);
96extern void md_allow_write(mddev_t *mddev); 96extern void md_allow_write(mddev_t *mddev);
97extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
97 98
98#endif /* CONFIG_MD */ 99#endif /* CONFIG_MD */
99#endif 100#endif
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 7bb6d1abf71e..812ffa590cff 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -84,6 +84,10 @@ struct mdk_rdev_s
84#define AllReserved 6 /* If whole device is reserved for 84#define AllReserved 6 /* If whole device is reserved for
85 * one array */ 85 * one array */
86#define AutoDetected 7 /* added by auto-detect */ 86#define AutoDetected 7 /* added by auto-detect */
87#define Blocked 8 /* An error occured on an externally
88 * managed array, don't allow writes
89 * until it is cleared */
90 wait_queue_head_t blocked_wait;
87 91
88 int desc_nr; /* descriptor index in the superblock */ 92 int desc_nr; /* descriptor index in the superblock */
89 int raid_disk; /* role of device in array */ 93 int raid_disk; /* role of device in array */