Diffstat (limited to 'drivers/md/raid5.c')
 drivers/md/raid5.c | 79 ++++++++++++++++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 48 insertions(+), 31 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2e38cfac5b1d..9c4f7659f8b1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -103,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
 {
 	int i;
-	local_irq_disable();
-	spin_lock(conf->hash_locks);
+	spin_lock_irq(conf->hash_locks);
 	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
 		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
 	spin_lock(&conf->device_lock);
@@ -114,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
 {
 	int i;
 	spin_unlock(&conf->device_lock);
-	for (i = NR_STRIPE_HASH_LOCKS; i; i--)
-		spin_unlock(conf->hash_locks + i - 1);
-	local_irq_enable();
+	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
+		spin_unlock(conf->hash_locks + i);
+	spin_unlock_irq(conf->hash_locks);
 }
 
 /* Find first data disk in a raid6 stripe */
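These two hunks fold the open-coded local_irq_disable()/local_irq_enable() pair into the first lock acquisition: spin_lock_irq() on hash_locks[0] now owns the irq-off state, and the unlock loop stops before index 0 so that the final spin_unlock_irq() on hash_locks[0] re-enables interrupts. A minimal self-contained sketch of the conversion pattern (hypothetical lock, not code from this patch):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);

static void demo_enter(void)
{
	/* was: local_irq_disable(); spin_lock(&demo_lock); */
	spin_lock_irq(&demo_lock);	/* disable irqs and lock in one call */
}

static void demo_exit(void)
{
	/* was: spin_unlock(&demo_lock); local_irq_enable(); */
	spin_unlock_irq(&demo_lock);	/* unlock, then re-enable irqs */
}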
@@ -234,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 		if (test_bit(R5_InJournal, &sh->dev[i].flags))
 			injournal++;
 	/*
-	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
-	 * data in journal, so they are not released to cached lists
+	 * In the following cases, the stripe cannot be released to cached
+	 * lists. Therefore, we make the stripe write out and set
+	 * STRIPE_HANDLE:
+	 * 1. when quiesce in r5c write back;
+	 * 2. when resync is requested for the stripe.
 	 */
-	if (conf->quiesce && r5c_is_writeback(conf->log) &&
-	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
+	    (conf->quiesce && r5c_is_writeback(conf->log) &&
+	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
 		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 			r5c_make_stripe_write_out(sh);
 		set_bit(STRIPE_HANDLE, &sh->state);
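The release condition in do_release_stripe() now covers two cases. Restated as a hypothetical helper (reconstructed from the hunk above, not part of the patch):

static bool must_write_out(struct r5conf *conf, struct stripe_head *sh,
			   int injournal)
{
	/* case 2: resync was requested for this stripe */
	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
		return true;
	/* case 1: quiesce while the r5c cache is in write-back mode */
	return conf->quiesce && r5c_is_writeback(conf->log) &&
	       !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0;
}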
@@ -714,12 +717,11 @@ static bool is_full_stripe_write(struct stripe_head *sh)
 
 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 {
-	local_irq_disable();
 	if (sh1 > sh2) {
-		spin_lock(&sh2->stripe_lock);
+		spin_lock_irq(&sh2->stripe_lock);
 		spin_lock_nested(&sh1->stripe_lock, 1);
 	} else {
-		spin_lock(&sh1->stripe_lock);
+		spin_lock_irq(&sh1->stripe_lock);
 		spin_lock_nested(&sh2->stripe_lock, 1);
 	}
 }
@@ -727,8 +729,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 {
 	spin_unlock(&sh1->stripe_lock);
-	spin_unlock(&sh2->stripe_lock);
-	local_irq_enable();
+	spin_unlock_irq(&sh2->stripe_lock);
 }
 
 /* Only freshly new full stripe normal write stripe can be added to a batch list */
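lock_two_stripes() keeps the ABBA-safe ordering by pointer address; the change is that the first acquisition is now spin_lock_irq(), so it owns the irq-off state, and unlock_two_stripes() re-enables interrupts with its final spin_unlock_irq(). The general idiom as a self-contained sketch (illustrative type and names):

#include <linux/spinlock.h>
#include <linux/lockdep.h>

struct item {
	spinlock_t lock;
};

static void lock_item_pair(struct item *a, struct item *b)
{
	struct item *first = a < b ? a : b;	/* lower address first: no ABBA */
	struct item *second = a < b ? b : a;

	spin_lock_irq(&first->lock);
	/* tell lockdep the second lock of the same class is intentional */
	spin_lock_nested(&second->lock, SINGLE_DEPTH_NESTING);
}

static void unlock_item_pair(struct item *a, struct item *b)
{
	struct item *first = a < b ? a : b;
	struct item *second = a < b ? b : a;

	spin_unlock(&second->lock);
	spin_unlock_irq(&first->lock);	/* last unlock re-enables irqs */
}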
@@ -2312,14 +2313,12 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	struct stripe_head *osh, *nsh;
 	LIST_HEAD(newstripes);
 	struct disk_info *ndisks;
-	int err;
+	int err = 0;
 	struct kmem_cache *sc;
 	int i;
 	int hash, cnt;
 
-	err = md_allow_write(conf->mddev);
-	if (err)
-		return err;
+	md_allow_write(conf->mddev);
 
 	/* Step 1 */
 	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -2694,7 +2693,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 		bdevname(rdev->bdev, b),
 		mdname(mddev),
 		conf->raid_disks - mddev->degraded);
-	r5c_update_on_rdev_error(mddev);
+	r5c_update_on_rdev_error(mddev, rdev);
 }
 
 /*
@@ -3055,6 +3054,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
  * When LOG_CRITICAL, stripes with injournal == 0 will be sent to
  * no_space_stripes list.
  *
+ * 3. during journal failure
+ *    On journal failure, we try to flush all cached data to the raid
+ *    disks based on data in the stripe cache. The array is read-only
+ *    to upper layers, so we skip all pending writes.
+ *
  */
 static inline bool delay_towrite(struct r5conf *conf,
 				 struct r5dev *dev,
@@ -3068,6 +3072,9 @@ static inline bool delay_towrite(struct r5conf *conf,
 	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
 	    s->injournal > 0)
 		return true;
+	/* case 3 above */
+	if (s->log_failed && s->injournal)
+		return true;
 	return false;
 }
 
@@ -4653,8 +4660,13 @@ static void handle_stripe(struct stripe_head *sh)
 
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
 		spin_lock(&sh->stripe_lock);
-		/* Cannot process 'sync' concurrently with 'discard' */
-		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+		/*
+		 * Cannot process 'sync' concurrently with 'discard'.
+		 * Flush data in r5cache before 'sync'.
+		 */
+		if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+		    !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
+		    !test_bit(STRIPE_DISCARD, &sh->state) &&
 		    test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
 			set_bit(STRIPE_SYNCING, &sh->state);
 			clear_bit(STRIPE_INSYNC, &sh->state);
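The extra flag tests defer a resync request while the stripe still carries data in the r5c write-back cache: STRIPE_SYNC_REQUESTED stays set, the cached data is written out first, and a later handle_stripe() pass starts the actual sync. The condition, expressed as a hypothetical predicate (not in the patch):

/* hypothetical: true once the stripe is clean enough to start 'sync' */
static bool can_start_sync(struct stripe_head *sh)
{
	return !test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
	       !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
	       !test_bit(STRIPE_DISCARD, &sh->state);
}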
@@ -4701,10 +4713,15 @@ static void handle_stripe(struct stripe_head *sh)
 	       " to_write=%d failed=%d failed_num=%d,%d\n",
 	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
 	       s.failed_num[0], s.failed_num[1]);
-	/* check if the array has lost more than max_degraded devices and,
+	/*
+	 * check if the array has lost more than max_degraded devices and,
 	 * if so, some requests might need to be failed.
+	 *
+	 * When the journal device has failed (log_failed), we only process
+	 * the stripe if there is data that needs writing to the raid disks.
 	 */
-	if (s.failed > conf->max_degraded || s.log_failed) {
+	if (s.failed > conf->max_degraded ||
+	    (s.log_failed && s.injournal == 0)) {
 		sh->check_state = 0;
 		sh->reconstruct_state = 0;
 		break_stripe_batch_list(sh, 0);
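With a dead journal, a stripe is failed outright only when nothing of it remains in the journal; stripes that still hold journaled data are processed so the data reaches the raid disks (see delay_towrite() case 3 above). As a hypothetical restatement of the check:

static bool stripe_beyond_recovery(struct r5conf *conf,
				   struct stripe_head_state *s)
{
	return s->failed > conf->max_degraded ||
	       (s->log_failed && s->injournal == 0);
}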
@@ -5277,8 +5294,10 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 	struct stripe_head *sh, *tmp;
 	struct list_head *handle_list = NULL;
 	struct r5worker_group *wg;
-	bool second_try = !r5c_is_writeback(conf->log);
-	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
+	bool second_try = !r5c_is_writeback(conf->log) &&
+		!r5l_log_disk_error(conf);
+	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
+		r5l_log_disk_error(conf);
 
 again:
 	wg = NULL;
@@ -6313,7 +6332,6 @@ int
 raid5_set_cache_size(struct mddev *mddev, int size)
 {
 	struct r5conf *conf = mddev->private;
-	int err;
 
 	if (size <= 16 || size > 32768)
 		return -EINVAL;
@@ -6325,10 +6343,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 		;
 	mutex_unlock(&conf->cache_size_mutex);
 
-
-	err = md_allow_write(mddev);
-	if (err)
-		return err;
+	md_allow_write(mddev);
 
 	mutex_lock(&conf->cache_size_mutex);
 	while (size > conf->max_nr_stripes)
@@ -7530,7 +7545,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * neilb: there is no locking about new writes here,
 		 * so this cannot be safe.
 		 */
-		if (atomic_read(&conf->active_stripes)) {
+		if (atomic_read(&conf->active_stripes) ||
+		    atomic_read(&conf->r5c_cached_full_stripes) ||
+		    atomic_read(&conf->r5c_cached_partial_stripes)) {
 			return -EBUSY;
 		}
 		log_exit(conf);
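The strengthened busy check means the journal device can be detached only after every piece of r5c-cached data has reached the raid disks: no active stripes, and no full or partial stripes left on the write-back cache lists. Restated as a hypothetical helper:

static bool r5c_journal_busy(struct r5conf *conf)
{
	return atomic_read(&conf->active_stripes) ||
	       atomic_read(&conf->r5c_cached_full_stripes) ||
	       atomic_read(&conf->r5c_cached_partial_stripes);
}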