author	Goldwyn Rodrigues <rgoldwyn@suse.com>	2015-06-24 10:30:32 -0400
committer	NeilBrown <neilb@suse.com>	2015-07-23 23:37:59 -0400
commit	90382ed9afeafd42ef193f0eadc6b2a252d6c24d (patch)
tree	3469e56a27837bfe7a01a399baccc90579e46ef6
parent	33e38ac6887d975fe2635c7fcaefb6d5495cb2e1 (diff)
Fix read-balancing during node failure
During a node failure, we need to suspend read balancing so that reads are directed to the first device and stale data is not read. Suspending writes is not required, because those writes would be recorded and synced eventually.

A new flag, MD_CLUSTER_SUSPEND_READ_BALANCING, is set in recover_prep(). While it is set, area_resyncing() returns true for the whole device if the request type is READ. The flag is cleared in recover_done().

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Reported-by: David Teigland <teigland@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.com>
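In outline, the patch below amounts to the following. This is a condensed sketch of the new md-cluster code paths, not the full change: locking, the suspend_list scan in area_resyncing(), and the rest of recover_done() are omitted here.

/* Condensed sketch -- simplified from the patch below. */
static void recover_prep(void *arg)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	/* Another node failed: suspend read balancing so reads
	 * cannot return stale data from other devices.
	 */
	set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

static int area_resyncing(struct mddev *mddev, int direction,
			  sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	/* Report the whole device as resyncing for READs, so
	 * raid1's read_balance() falls back to the first device.
	 */
	if ((direction == READ) &&
	    test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
		return 1;

	/* ... otherwise check cinfo->suspend_list as before ... */
	return 0;
}

recover_done() clears the bit again once recovery has finished, restoring normal read balancing.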
-rw-r--r--	drivers/md/md-cluster.c	12
-rw-r--r--	drivers/md/md-cluster.h	2
-rw-r--r--	drivers/md/raid1.c	7
3 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index fcfc4b9b2672..0072190515e0 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -44,6 +44,7 @@ struct resync_info {
 
 /* md_cluster_info flags */
 #define	MD_CLUSTER_WAITING_FOR_NEWDISK		1
+#define	MD_CLUSTER_SUSPEND_READ_BALANCING	2
 
 
 struct md_cluster_info {
@@ -275,6 +276,9 @@ clear_bit:
 
 static void recover_prep(void *arg)
 {
+	struct mddev *mddev = arg;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
 }
 
 static void recover_slot(void *arg, struct dlm_slot *slot)
@@ -307,6 +311,7 @@ static void recover_done(void *arg, struct dlm_slot *slots,
 
 	cinfo->slot_number = our_slot;
 	complete(&cinfo->completion);
+	clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
 }
 
 static const struct dlm_lockspace_ops md_ls_ops = {
@@ -816,12 +821,17 @@ static void resync_finish(struct mddev *mddev)
 	resync_send(mddev, RESYNCING, 0, 0);
 }
 
-static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi)
+static int area_resyncing(struct mddev *mddev, int direction,
+		sector_t lo, sector_t hi)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 	int ret = 0;
 	struct suspend_info *s;
 
+	if ((direction == READ) &&
+		test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
+		return 1;
+
 	spin_lock_irq(&cinfo->suspend_lock);
 	if (list_empty(&cinfo->suspend_list))
 		goto out;
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 6817ee00e053..00defe2badbc 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -18,7 +18,7 @@ struct md_cluster_operations {
 	int (*metadata_update_start)(struct mddev *mddev);
 	int (*metadata_update_finish)(struct mddev *mddev);
 	int (*metadata_update_cancel)(struct mddev *mddev);
-	int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi);
+	int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
 	int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
 	int (*add_new_disk_finish)(struct mddev *mddev);
 	int (*new_disk_ack)(struct mddev *mddev, bool ack);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 50cf0c893b16..94f5b55069e0 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -541,7 +541,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 
 	if ((conf->mddev->recovery_cp < this_sector + sectors) ||
 	    (mddev_is_clustered(conf->mddev) &&
-	     md_cluster_ops->area_resyncing(conf->mddev, this_sector,
+	     md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
 		    this_sector + sectors)))
 		choose_first = 1;
 	else
@@ -1111,7 +1111,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	    ((bio_end_sector(bio) > mddev->suspend_lo &&
 	    bio->bi_iter.bi_sector < mddev->suspend_hi) ||
 	    (mddev_is_clustered(mddev) &&
-	     md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
+	     md_cluster_ops->area_resyncing(mddev, WRITE,
+		     bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
 		/* As the suspend_* range is controlled by
 		 * userspace, we want an interruptible
 		 * wait.
@@ -1124,7 +1125,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 		if (bio_end_sector(bio) <= mddev->suspend_lo ||
 		    bio->bi_iter.bi_sector >= mddev->suspend_hi ||
 		    (mddev_is_clustered(mddev) &&
-		     !md_cluster_ops->area_resyncing(mddev,
+		     !md_cluster_ops->area_resyncing(mddev, WRITE,
 			     bio->bi_iter.bi_sector, bio_end_sector(bio))))
 			break;
 		schedule();