Merge tag 'md/4.8-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD fixes from Shaohua Li:
 "A few bug fixes for MD:

  - Guoqing fixed a bug compiling md-cluster in kernel

  - I fixed a potential deadlock in raid5-cache superblock write, a hang
    in raid5 reshape resume and a race condition introduced in rc4"

* tag 'md/4.8-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  raid5: fix a small race condition
  md-cluster: make md-cluster also can work when compiled into kernel
  raid5: guarantee enough stripes to avoid reshape hang
  raid5-cache: fix a deadlock in superblock write
author: Linus Torvalds <torvalds@linux-foundation.org> 2016-09-13 14:19:52 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2016-09-13 14:19:52 -0400
commit: 106f2e59ee3b89a2f93735f65499eae4e8d55abc (patch)
tree: 113f2892a5ebf0ba2b3be38133b47df2e3795c2d
parent: 309a18ae360d1d8741c676a37a3daae319fe722a (diff)
parent: c94455558337eece474eebb6a16b905f98930418 (diff)
3 files changed, 31 insertions, 41 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 67642bacd597..915e84d631a2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7610,16 +7610,12 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
 int md_setup_cluster(struct mddev *mddev, int nodes)
 {
-        int err;
+        if (!md_cluster_ops)
+                request_module("md-cluster");
-        err = request_module("md-cluster");
-        if (err) {
-                pr_err("md-cluster module not found.\n");
-                return -ENOENT;
-        }
        spin_lock(&pers_lock);
+        /* ensure module won't be unloaded */
        if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
+                pr_err("can't find md-cluster module or get it's reference.\n");
                spin_unlock(&pers_lock);
                return -ENOENT;
        }
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 51f76ddbe265..1b1ab4a1d132 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -96,7 +96,6 @@ struct r5l_log {
        spinlock_t no_space_stripes_lock;
        bool need_cache_flush;
-        bool in_teardown;
 };
 /*
@@ -704,31 +703,22 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
        mddev = log->rdev->mddev;
        /*
-         * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and
+         * Discard could zero data, so before discard we must make sure
-         * wait for this thread to finish. This thread waits for
+         * superblock is updated to new log tail. Updating superblock (either
-         * MD_CHANGE_PENDING clear, which is supposed to be done in
+         * directly call md_update_sb() or depend on md thread) must hold
-         * md_check_recovery(). md_check_recovery() tries to get
+         * reconfig mutex. On the other hand, raid5_quiesce is called with
-         * reconfig_mutex. Since r5l_quiesce already holds the mutex,
+         * reconfig_mutex hold. The first step of raid5_quiesce() is waitting
-         * md_check_recovery() fails, so the PENDING never get cleared. The
+         * for all IO finish, hence waitting for reclaim thread, while reclaim
-         * in_teardown check workaround this issue.
+         * thread is calling this function and waitting for reconfig mutex. So
+         * there is a deadlock. We workaround this issue with a trylock.
+         * FIXME: we could miss discard if we can't take reconfig mutex
         */
-        if (!log->in_teardown) {
+        set_mask_bits(&mddev->flags, 0,
-                set_mask_bits(&mddev->flags, 0,
+                BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
-                              BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
+        if (!mddev_trylock(mddev))
-                md_wakeup_thread(mddev->thread);
+                return;
-                wait_event(mddev->sb_wait,
+        md_update_sb(mddev, 1);
-                        !test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
+        mddev_unlock(mddev);
-                        log->in_teardown);
-                /*
-                 * r5l_quiesce could run after in_teardown check and hold
-                 * mutex first. Superblock might get updated twice.
-                 */
-                if (log->in_teardown)
-                        md_update_sb(mddev, 1);
-        } else {
-                WARN_ON(!mddev_is_locked(mddev));
-                md_update_sb(mddev, 1);
-        }
        /* discard IO error really doesn't matter, ignore it */
        if (log->last_checkpoint < end) {
@@ -827,7 +817,6 @@ void r5l_quiesce(struct r5l_log *log, int state)
        if (!log || state == 2)
                return;
        if (state == 0) {
-                log->in_teardown = 0;
                /*
                 * This is a special case for hotadd. In suspend, the array has
                 * no journal. In resume, journal is initialized as well as the
@@ -838,11 +827,6 @@ void r5l_quiesce(struct r5l_log *log, int state)
                log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
                                        log->rdev->mddev, "reclaim");
        } else if (state == 1) {
-                /*
-                 * at this point all stripes are finished, so io_unit is at
-                 * least in STRIPE_END state
-                 */
-                log->in_teardown = 1;
                /* make sure r5l_write_super_and_discard_space exits */
                mddev = log->rdev->mddev;
                wake_up(&mddev->sb_wait);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index da583bb43c84..ee7fc3701700 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2423,10 +2423,10 @@ static void raid5_end_read_request(struct bio * bi)
                }
        }
        rdev_dec_pending(rdev, conf->mddev);
+        bio_reset(bi);
        clear_bit(R5_LOCKED, &sh->dev[i].flags);
        set_bit(STRIPE_HANDLE, &sh->state);
        raid5_release_stripe(sh);
-        bio_reset(bi);
 }
 static void raid5_end_write_request(struct bio *bi)
@@ -2498,6 +2498,7 @@ static void raid5_end_write_request(struct bio *bi)
        if (sh->batch_head && bi->bi_error && !replacement)
                set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
+        bio_reset(bi);
        if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
                clear_bit(R5_LOCKED, &sh->dev[i].flags);
        set_bit(STRIPE_HANDLE, &sh->state);
@@ -2505,7 +2506,6 @@ static void raid5_end_write_request(struct bio *bi)
        if (sh->batch_head && sh != sh->batch_head)
                raid5_release_stripe(sh->batch_head);
-        bio_reset(bi);
 }
 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
@@ -6639,6 +6639,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        }
        conf->min_nr_stripes = NR_STRIPES;
+        if (mddev->reshape_position != MaxSector) {
+                int stripes = max_t(int,
+                        ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
+                        ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
+                conf->min_nr_stripes = max(NR_STRIPES, stripes);
+                if (conf->min_nr_stripes != NR_STRIPES)
+                        printk(KERN_INFO
+                                "md/raid:%s: force stripe size %d for reshape\n",
+                                mdname(mddev), conf->min_nr_stripes);
+        }
        memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
                 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
        atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
author	Linus Torvalds <torvalds@linux-foundation.org>	2016-09-13 14:19:52 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-09-13 14:19:52 -0400
commit	106f2e59ee3b89a2f93735f65499eae4e8d55abc (patch)
tree	113f2892a5ebf0ba2b3be38133b47df2e3795c2d
parent	309a18ae360d1d8741c676a37a3daae319fe722a (diff)
parent	c94455558337eece474eebb6a16b905f98930418 (diff)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 67642bacd597..915e84d631a2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7610,16 +7610,12 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
7610		7610
7611	int md_setup_cluster(struct mddev *mddev, int nodes)	7611	int md_setup_cluster(struct mddev *mddev, int nodes)
7612	{	7612	{
7613	int err;	7613	if (!md_cluster_ops)
7614		7614	request_module("md-cluster");
7615	err = request_module("md-cluster");
7616	if (err) {
7617	pr_err("md-cluster module not found.\n");
7618	return -ENOENT;
7619	}
7620
7621	spin_lock(&pers_lock);	7615	spin_lock(&pers_lock);
		7616	/* ensure module won't be unloaded */
7622	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {	7617	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
		7618	pr_err("can't find md-cluster module or get it's reference.\n");
7623	spin_unlock(&pers_lock);	7619	spin_unlock(&pers_lock);
7624	return -ENOENT;	7620	return -ENOENT;
7625	}	7621	}


diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 51f76ddbe265..1b1ab4a1d132 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -96,7 +96,6 @@ struct r5l_log {
96	spinlock_t no_space_stripes_lock;	96	spinlock_t no_space_stripes_lock;
97		97
98	bool need_cache_flush;	98	bool need_cache_flush;
99	bool in_teardown;
100	};	99	};
101		100
102	/*	101	/*
@@ -704,31 +703,22 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
704		703
705	mddev = log->rdev->mddev;	704	mddev = log->rdev->mddev;
706	/*	705	/*
707	* This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and	706	* Discard could zero data, so before discard we must make sure
708	* wait for this thread to finish. This thread waits for	707	* superblock is updated to new log tail. Updating superblock (either
709	* MD_CHANGE_PENDING clear, which is supposed to be done in	708	* directly call md_update_sb() or depend on md thread) must hold
710	* md_check_recovery(). md_check_recovery() tries to get	709	* reconfig mutex. On the other hand, raid5_quiesce is called with
711	* reconfig_mutex. Since r5l_quiesce already holds the mutex,	710	* reconfig_mutex hold. The first step of raid5_quiesce() is waitting
712	* md_check_recovery() fails, so the PENDING never get cleared. The	711	* for all IO finish, hence waitting for reclaim thread, while reclaim
713	* in_teardown check workaround this issue.	712	* thread is calling this function and waitting for reconfig mutex. So
		713	* there is a deadlock. We workaround this issue with a trylock.
		714	* FIXME: we could miss discard if we can't take reconfig mutex
714	*/	715	*/
715	if (!log->in_teardown) {	716	set_mask_bits(&mddev->flags, 0,
716	set_mask_bits(&mddev->flags, 0,	717	BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
717	BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));	718	if (!mddev_trylock(mddev))
718	md_wakeup_thread(mddev->thread);	719	return;
719	wait_event(mddev->sb_wait,	720	md_update_sb(mddev, 1);
720	!test_bit(MD_CHANGE_PENDING, &mddev->flags) ||	721	mddev_unlock(mddev);
721	log->in_teardown);
722	/*
723	* r5l_quiesce could run after in_teardown check and hold
724	* mutex first. Superblock might get updated twice.
725	*/
726	if (log->in_teardown)
727	md_update_sb(mddev, 1);
728	} else {
729	WARN_ON(!mddev_is_locked(mddev));
730	md_update_sb(mddev, 1);
731	}
732		722
733	/* discard IO error really doesn't matter, ignore it */	723	/* discard IO error really doesn't matter, ignore it */
734	if (log->last_checkpoint < end) {	724	if (log->last_checkpoint < end) {
@@ -827,7 +817,6 @@ void r5l_quiesce(struct r5l_log *log, int state)
827	if (!log || state == 2)	817	if (!log || state == 2)
828	return;	818	return;
829	if (state == 0) {	819	if (state == 0) {
830	log->in_teardown = 0;
831	/*	820	/*
832	* This is a special case for hotadd. In suspend, the array has	821	* This is a special case for hotadd. In suspend, the array has
833	* no journal. In resume, journal is initialized as well as the	822	* no journal. In resume, journal is initialized as well as the
@@ -838,11 +827,6 @@ void r5l_quiesce(struct r5l_log *log, int state)
838	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,	827	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
839	log->rdev->mddev, "reclaim");	828	log->rdev->mddev, "reclaim");
840	} else if (state == 1) {	829	} else if (state == 1) {
841	/*
842	* at this point all stripes are finished, so io_unit is at
843	* least in STRIPE_END state
844	*/
845	log->in_teardown = 1;
846	/* make sure r5l_write_super_and_discard_space exits */	830	/* make sure r5l_write_super_and_discard_space exits */
847	mddev = log->rdev->mddev;	831	mddev = log->rdev->mddev;
848	wake_up(&mddev->sb_wait);	832	wake_up(&mddev->sb_wait);


diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index da583bb43c84..ee7fc3701700 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2423,10 +2423,10 @@ static void raid5_end_read_request(struct bio * bi)
2423	}	2423	}
2424	}	2424	}
2425	rdev_dec_pending(rdev, conf->mddev);	2425	rdev_dec_pending(rdev, conf->mddev);
		2426	bio_reset(bi);
2426	clear_bit(R5_LOCKED, &sh->dev[i].flags);	2427	clear_bit(R5_LOCKED, &sh->dev[i].flags);
2427	set_bit(STRIPE_HANDLE, &sh->state);	2428	set_bit(STRIPE_HANDLE, &sh->state);
2428	raid5_release_stripe(sh);	2429	raid5_release_stripe(sh);
2429	bio_reset(bi);
2430	}	2430	}
2431		2431
2432	static void raid5_end_write_request(struct bio *bi)	2432	static void raid5_end_write_request(struct bio *bi)
@@ -2498,6 +2498,7 @@ static void raid5_end_write_request(struct bio *bi)
2498	if (sh->batch_head && bi->bi_error && !replacement)	2498	if (sh->batch_head && bi->bi_error && !replacement)
2499	set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);	2499	set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2500		2500
		2501	bio_reset(bi);
2501	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))	2502	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2502	clear_bit(R5_LOCKED, &sh->dev[i].flags);	2503	clear_bit(R5_LOCKED, &sh->dev[i].flags);
2503	set_bit(STRIPE_HANDLE, &sh->state);	2504	set_bit(STRIPE_HANDLE, &sh->state);
@@ -2505,7 +2506,6 @@ static void raid5_end_write_request(struct bio *bi)
2505		2506
2506	if (sh->batch_head && sh != sh->batch_head)	2507	if (sh->batch_head && sh != sh->batch_head)
2507	raid5_release_stripe(sh->batch_head);	2508	raid5_release_stripe(sh->batch_head);
2508	bio_reset(bi);
2509	}	2509	}
2510		2510
2511	static void raid5_build_block(struct stripe_head *sh, int i, int previous)	2511	static void raid5_build_block(struct stripe_head *sh, int i, int previous)
@@ -6639,6 +6639,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
6639	}	6639	}
6640		6640
6641	conf->min_nr_stripes = NR_STRIPES;	6641	conf->min_nr_stripes = NR_STRIPES;
		6642	if (mddev->reshape_position != MaxSector) {
		6643	int stripes = max_t(int,
		6644	((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
		6645	((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
		6646	conf->min_nr_stripes = max(NR_STRIPES, stripes);
		6647	if (conf->min_nr_stripes != NR_STRIPES)
		6648	printk(KERN_INFO
		6649	"md/raid:%s: force stripe size %d for reshape\n",
		6650	mdname(mddev), conf->min_nr_stripes);
		6651	}
6642	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +	6652	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
6643	max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;	6653	max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
6644	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);	6654	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);