author		Dan Williams <dan.j.williams@intel.com>		2009-07-14 16:40:57 -0400
committer	Dan Williams <dan.j.williams@intel.com>		2009-08-29 22:13:13 -0400
commit		d82dfee0ad8f240fef1b28e2258891c07da57367 (patch)
tree		44431399bef701c52f413c364f80751c18ff1179 /drivers/md/raid5.c
parent		a9b39a741a7e3b262b9f51fefb68e17b32756999 (diff)
md/raid6: asynchronous handle_parity_check6
[ Based on an original patch by Yuri Tikhonov ]

Implement the state machine for handling the RAID-6 parities check and
repair functionality.  Note that the raid6 case does not need to check for
new failures, like raid5, as it will always writeback the correct disks.
The raid5 case can be updated to check zero_sum_result to avoid getting
confused by new failures rather than retrying the entire check operation.

Signed-off-by: Yuri Tikhonov <yur@emcraft.com>
Signed-off-by: Ilya Yanok <yanok@emcraft.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
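Illustration only, not part of the patch: the sketch below is a minimal
userspace model of the check_state progression that this patch introduces for
the raid6 path.  The check_state_* names mirror the kernel's enum; the stripe
struct, the SUM_CHECK_* flags and the step()/main() harness are simplified,
hypothetical stand-ins for the kernel's stripe_head machinery.

/*
 * Illustration only -- not the kernel code.  Models the raid6 parity-check
 * state machine: idle -> run/run_q/run_pq -> check_result ->
 * (compute_run -> compute_result) -> idle.
 */
#include <stdio.h>

enum check_states {
	check_state_idle = 0,
	check_state_run,		/* parity check of P only */
	check_state_run_q,		/* parity check of Q only */
	check_state_run_pq,		/* parity check of P and Q */
	check_state_check_result,
	check_state_compute_run,	/* recompute mismatched parity */
	check_state_compute_result,
};

#define SUM_CHECK_P_RESULT (1 << 0)	/* P failed the zero-sum check */
#define SUM_CHECK_Q_RESULT (1 << 1)	/* Q failed the zero-sum check */

struct stripe {
	enum check_states check_state;
	int failed;		/* number of failed devices (0..2) */
	int q_failed;		/* 1 if the failed device is the Q drive */
	int zero_sum_result;	/* filled in when the async check completes */
	int repair;		/* 1 unless running in check-only mode */
};

/* One pass over the stripe; returns 0 once nothing is left to do. */
static int step(struct stripe *sh)
{
	switch (sh->check_state) {
	case check_state_idle:
		/* decide which parities can be checked, as in the patch */
		if (sh->failed == sh->q_failed)
			sh->check_state = check_state_run;	/* check P */
		if (!sh->q_failed && sh->failed < 2)
			sh->check_state =
				(sh->check_state == check_state_run) ?
				check_state_run_pq : check_state_run_q;
		if (sh->check_state == check_state_idle)
			/* 2-disk failure: nothing to check, just write back */
			sh->check_state = check_state_compute_result;
		return 1;
	case check_state_run:
	case check_state_run_q:
	case check_state_run_pq:
		/* the asynchronous zero-sum operation completes here */
		sh->check_state = check_state_check_result;
		return 1;
	case check_state_check_result:
		if (!sh->zero_sum_result || !sh->repair) {
			/* parity correct, or check-only mode: stop here */
			sh->check_state = check_state_idle;
			return 0;
		}
		sh->check_state = check_state_compute_run;
		return 1;
	case check_state_compute_run:
		/* asynchronous recompute of P and/or Q completes here */
		sh->check_state = check_state_compute_result;
		return 1;
	case check_state_compute_result:
		/* the repaired parity block(s) would be written back here */
		sh->zero_sum_result = 0;
		sh->check_state = check_state_idle;
		return 0;
	}
	return 0;
}

int main(void)
{
	/* healthy array, but the Q block fails the zero-sum check */
	struct stripe sh = { .check_state = check_state_idle,
			     .failed = 0, .q_failed = 0,
			     .zero_sum_result = SUM_CHECK_Q_RESULT,
			     .repair = 1 };

	while (step(&sh))
		printf("check_state=%d\n", sh.check_state);
	printf("stripe in sync\n");
	return 0;
}

In the kernel the transitions are driven by handle_stripe() re-entering
handle_parity_checks6() as each asynchronous operation completes ("we will be
called again upon completion"), rather than by a loop like the one above.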
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--	drivers/md/raid5.c	206
1 file changed, 139 insertions(+), 67 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 08f806379b07..3c31f7f8aa65 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2901,91 +2901,163 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 				  struct stripe_head_state *s,
 				  struct r6_state *r6s, int disks)
 {
-	int update_p = 0, update_q = 0;
-	struct r5dev *dev;
 	int pd_idx = sh->pd_idx;
 	int qd_idx = sh->qd_idx;
-	unsigned long cpu;
-	struct page *tmp_page;
+	struct r5dev *dev;
 
 	set_bit(STRIPE_HANDLE, &sh->state);
 
 	BUG_ON(s->failed > 2);
-	BUG_ON(s->uptodate < disks);
+
 	/* Want to check and possibly repair P and Q.
 	 * However there could be one 'failed' device, in which
 	 * case we can only check one of them, possibly using the
 	 * other to generate missing data
 	 */
-	cpu = get_cpu();
-	tmp_page = per_cpu_ptr(conf->percpu, cpu)->spare_page;
-	if (s->failed == r6s->q_failed) {
-		/* The only possible failed device holds 'Q', so it
-		 * makes sense to check P (If anything else were failed,
-		 * we would have used P to recreate it).
-		 */
-		compute_block_1(sh, pd_idx, 1);
-		if (!page_is_zero(sh->dev[pd_idx].page)) {
-			compute_block_1(sh, pd_idx, 0);
-			update_p = 1;
+
+	switch (sh->check_state) {
+	case check_state_idle:
+		/* start a new check operation if there are < 2 failures */
+		if (s->failed == r6s->q_failed) {
+			/* The only possible failed device holds Q, so it
+			 * makes sense to check P (If anything else were failed,
+			 * we would have used P to recreate it).
+			 */
+			sh->check_state = check_state_run;
 		}
-	}
-	if (!r6s->q_failed && s->failed < 2) {
-		/* q is not failed, and we didn't use it to generate
-		 * anything, so it makes sense to check it
-		 */
-		memcpy(page_address(tmp_page),
-		       page_address(sh->dev[qd_idx].page),
-		       STRIPE_SIZE);
-		compute_parity6(sh, UPDATE_PARITY);
-		if (memcmp(page_address(tmp_page),
-			   page_address(sh->dev[qd_idx].page),
-			   STRIPE_SIZE) != 0) {
-			clear_bit(STRIPE_INSYNC, &sh->state);
-			update_q = 1;
+		if (!r6s->q_failed && s->failed < 2) {
+			/* Q is not failed, and we didn't use it to generate
+			 * anything, so it makes sense to check it
+			 */
+			if (sh->check_state == check_state_run)
+				sh->check_state = check_state_run_pq;
+			else
+				sh->check_state = check_state_run_q;
 		}
-	}
-	put_cpu();
 
-	if (update_p || update_q) {
-		conf->mddev->resync_mismatches += STRIPE_SECTORS;
-		if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
-			/* don't try to repair!! */
-			update_p = update_q = 0;
-	}
+		/* discard potentially stale zero_sum_result */
+		sh->ops.zero_sum_result = 0;
 
-	/* now write out any block on a failed drive,
-	 * or P or Q if they need it
-	 */
+		if (sh->check_state == check_state_run) {
+			/* async_xor_zero_sum destroys the contents of P */
+			clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+			s->uptodate--;
+		}
+		if (sh->check_state >= check_state_run &&
+		    sh->check_state <= check_state_run_pq) {
+			/* async_syndrome_zero_sum preserves P and Q, so
+			 * no need to mark them !uptodate here
+			 */
+			set_bit(STRIPE_OP_CHECK, &s->ops_request);
+			break;
+		}
 
-	if (s->failed == 2) {
-		dev = &sh->dev[r6s->failed_num[1]];
-		s->locked++;
-		set_bit(R5_LOCKED, &dev->flags);
-		set_bit(R5_Wantwrite, &dev->flags);
-	}
-	if (s->failed >= 1) {
-		dev = &sh->dev[r6s->failed_num[0]];
-		s->locked++;
-		set_bit(R5_LOCKED, &dev->flags);
-		set_bit(R5_Wantwrite, &dev->flags);
-	}
+		/* we have 2-disk failure */
+		BUG_ON(s->failed != 2);
+		/* fall through */
+	case check_state_compute_result:
+		sh->check_state = check_state_idle;
 
-	if (update_p) {
-		dev = &sh->dev[pd_idx];
-		s->locked++;
-		set_bit(R5_LOCKED, &dev->flags);
-		set_bit(R5_Wantwrite, &dev->flags);
-	}
-	if (update_q) {
-		dev = &sh->dev[qd_idx];
-		s->locked++;
-		set_bit(R5_LOCKED, &dev->flags);
-		set_bit(R5_Wantwrite, &dev->flags);
-	}
-	clear_bit(STRIPE_DEGRADED, &sh->state);
+		/* check that a write has not made the stripe insync */
+		if (test_bit(STRIPE_INSYNC, &sh->state))
+			break;
 
-	set_bit(STRIPE_INSYNC, &sh->state);
+		/* now write out any block on a failed drive,
+		 * or P or Q if they were recomputed
+		 */
+		BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
+		if (s->failed == 2) {
+			dev = &sh->dev[r6s->failed_num[1]];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+		if (s->failed >= 1) {
+			dev = &sh->dev[r6s->failed_num[0]];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+		if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
+			dev = &sh->dev[pd_idx];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+		if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
+			dev = &sh->dev[qd_idx];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+		clear_bit(STRIPE_DEGRADED, &sh->state);
+
+		set_bit(STRIPE_INSYNC, &sh->state);
+		break;
+	case check_state_run:
+	case check_state_run_q:
+	case check_state_run_pq:
+		break; /* we will be called again upon completion */
+	case check_state_check_result:
+		sh->check_state = check_state_idle;
+
+		/* handle a successful check operation, if parity is correct
+		 * we are done.  Otherwise update the mismatch count and repair
+		 * parity if !MD_RECOVERY_CHECK
+		 */
+		if (sh->ops.zero_sum_result == 0) {
+			/* both parities are correct */
+			if (!s->failed)
+				set_bit(STRIPE_INSYNC, &sh->state);
+			else {
+				/* in contrast to the raid5 case we can validate
+				 * parity, but still have a failure to write
+				 * back
+				 */
+				sh->check_state = check_state_compute_result;
+				/* Returning at this point means that we may go
+				 * off and bring p and/or q uptodate again so
+				 * we make sure to check zero_sum_result again
+				 * to verify if p or q need writeback
+				 */
+			}
+		} else {
+			conf->mddev->resync_mismatches += STRIPE_SECTORS;
+			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+				/* don't try to repair!! */
+				set_bit(STRIPE_INSYNC, &sh->state);
+			else {
+				int *target = &sh->ops.target;
+
+				sh->ops.target = -1;
+				sh->ops.target2 = -1;
+				sh->check_state = check_state_compute_run;
+				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+				if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
+					set_bit(R5_Wantcompute,
+						&sh->dev[pd_idx].flags);
+					*target = pd_idx;
+					target = &sh->ops.target2;
+					s->uptodate++;
+				}
+				if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
+					set_bit(R5_Wantcompute,
+						&sh->dev[qd_idx].flags);
+					*target = qd_idx;
+					s->uptodate++;
+				}
+			}
+		}
+		break;
+	case check_state_compute_run:
+		break;
+	default:
+		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+		       __func__, sh->check_state,
+		       (unsigned long long) sh->sector);
+		BUG();
+	}
 }
 
 static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,