aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Dan Williams <dan.j.williams@intel.com> 2008-06-27 18:31:57 -0400
committer Neil Brown <neilb@notabene.brown> 2008-06-27 18:31:57 -0400
commit ecc65c9b3f9b9d740a5deade3d85b39be56401b6 (patch)
tree e6b4e827befc6849716689f573c89aa0a41e5d26
parent f0e43bcdebf709d747a3effb210aff1941e819ab (diff)
md: replace STRIPE_OP_CHECK with 'check_states'
From: Dan Williams <dan.j.williams@intel.com>

The STRIPE_OP_* flags record the state of stripe operations which are performed outside the stripe lock. Their use in indicating which operations need to be run is straightforward; however, interpolating what the next state of the stripe should be based on a given combination of these flags is not straightforward, and has led to bugs. An easier-to-read implementation with minimal degrees of freedom is needed.

Towards this goal, this patch introduces explicit states to replace what was previously interpolated from the STRIPE_OP_* flags. For now this only converts the handle_parity_checks5 path, removing a user of the ops.{pending,ack,complete,count} fields of struct stripe_operations.

This conversion also found a remaining issue with the current code. There is a small window for a drive to fail between when we schedule a repair and when the parity calculation for that repair completes. When this happens we will write back to 'failed_num' when we really want to write back to 'pd_idx'.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
-rw-r--r-- drivers/md/raid5.c 172
-rw-r--r-- include/linux/raid/raid5.h 46
2 files changed, 123 insertions, 95 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6f3dd12dd3a4..544e1600f208 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -605,7 +605,11 @@ static void ops_complete_compute5(void *stripe_head_ref)
605 set_bit(R5_UPTODATE, &tgt->flags); 605 set_bit(R5_UPTODATE, &tgt->flags);
606 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 606 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
607 clear_bit(R5_Wantcompute, &tgt->flags); 607 clear_bit(R5_Wantcompute, &tgt->flags);
608 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); 608 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
609 if (sh->check_state == check_state_compute_run)
610 sh->check_state = check_state_compute_result;
611 else
612 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
609 set_bit(STRIPE_HANDLE, &sh->state); 613 set_bit(STRIPE_HANDLE, &sh->state);
610 release_stripe(sh); 614 release_stripe(sh);
611} 615}
@@ -838,7 +842,7 @@ static void ops_complete_check(void *stripe_head_ref)
838 pr_debug("%s: stripe %llu\n", __func__, 842 pr_debug("%s: stripe %llu\n", __func__,
839 (unsigned long long)sh->sector); 843 (unsigned long long)sh->sector);
840 844
841 set_bit(STRIPE_OP_CHECK, &sh->ops.complete); 845 sh->check_state = check_state_check_result;
842 set_bit(STRIPE_HANDLE, &sh->state); 846 set_bit(STRIPE_HANDLE, &sh->state);
843 release_stripe(sh); 847 release_stripe(sh);
844} 848}
@@ -870,7 +874,8 @@ static void ops_run_check(struct stripe_head *sh)
870 ops_complete_check, sh); 874 ops_complete_check, sh);
871} 875}
872 876
873static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) 877static void raid5_run_ops(struct stripe_head *sh, unsigned long pending,
878 unsigned long ops_request)
874{ 879{
875 int overlap_clear = 0, i, disks = sh->disks; 880 int overlap_clear = 0, i, disks = sh->disks;
876 struct dma_async_tx_descriptor *tx = NULL; 881 struct dma_async_tx_descriptor *tx = NULL;
@@ -880,7 +885,8 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
880 overlap_clear++; 885 overlap_clear++;
881 } 886 }
882 887
883 if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) 888 if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending) ||
889 test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request))
884 tx = ops_run_compute5(sh, pending); 890 tx = ops_run_compute5(sh, pending);
885 891
886 if (test_bit(STRIPE_OP_PREXOR, &pending)) 892 if (test_bit(STRIPE_OP_PREXOR, &pending))
@@ -894,7 +900,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
894 if (test_bit(STRIPE_OP_POSTXOR, &pending)) 900 if (test_bit(STRIPE_OP_POSTXOR, &pending))
895 ops_run_postxor(sh, tx, pending); 901 ops_run_postxor(sh, tx, pending);
896 902
897 if (test_bit(STRIPE_OP_CHECK, &pending)) 903 if (test_bit(STRIPE_OP_CHECK, &ops_request))
898 ops_run_check(sh); 904 ops_run_check(sh);
899 905
900 if (overlap_clear) 906 if (overlap_clear)
@@ -1961,8 +1967,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
1961 /* don't schedule compute operations or reads on the parity block while 1967 /* don't schedule compute operations or reads on the parity block while
1962 * a check is in flight 1968 * a check is in flight
1963 */ 1969 */
1964 if ((disk_idx == sh->pd_idx) && 1970 if (disk_idx == sh->pd_idx && sh->check_state)
1965 test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
1966 return ~0; 1971 return ~0;
1967 1972
1968 /* is the data in this block needed, and can we get it? */ 1973 /* is the data in this block needed, and can we get it? */
@@ -1983,9 +1988,8 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
1983 * 3/ We hold off parity block re-reads until check operations 1988 * 3/ We hold off parity block re-reads until check operations
1984 * have quiesced. 1989 * have quiesced.
1985 */ 1990 */
1986 if ((s->uptodate == disks - 1) && 1991 if ((s->uptodate == disks - 1) && !sh->check_state &&
1987 (s->failed && disk_idx == s->failed_num) && 1992 (s->failed && disk_idx == s->failed_num)) {
1988 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
1989 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); 1993 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
1990 set_bit(R5_Wantcompute, &dev->flags); 1994 set_bit(R5_Wantcompute, &dev->flags);
1991 sh->ops.target = disk_idx; 1995 sh->ops.target = disk_idx;
@@ -2021,12 +2025,8 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh,
2021{ 2025{
2022 int i; 2026 int i;
2023 2027
2024 /* Clear completed compute operations. Parity recovery 2028 /* Clear completed compute operations */
2025 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled 2029 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete)) {
2026 * later on in this routine
2027 */
2028 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2029 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2030 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); 2030 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2031 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); 2031 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2032 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); 2032 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
@@ -2350,90 +2350,85 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
2350static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2350static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2351 struct stripe_head_state *s, int disks) 2351 struct stripe_head_state *s, int disks)
2352{ 2352{
2353 int canceled_check = 0; 2353 struct r5dev *dev = NULL;
2354 2354
2355 set_bit(STRIPE_HANDLE, &sh->state); 2355 set_bit(STRIPE_HANDLE, &sh->state);
2356 2356
2357 /* complete a check operation */ 2357 switch (sh->check_state) {
2358 if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { 2358 case check_state_idle:
2359 clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); 2359 /* start a new check operation if there are no failures */
2360 clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
2361 if (s->failed == 0) { 2360 if (s->failed == 0) {
2362 if (sh->ops.zero_sum_result == 0)
2363 /* parity is correct (on disc,
2364 * not in buffer any more)
2365 */
2366 set_bit(STRIPE_INSYNC, &sh->state);
2367 else {
2368 conf->mddev->resync_mismatches +=
2369 STRIPE_SECTORS;
2370 if (test_bit(
2371 MD_RECOVERY_CHECK, &conf->mddev->recovery))
2372 /* don't try to repair!! */
2373 set_bit(STRIPE_INSYNC, &sh->state);
2374 else {
2375 set_bit(STRIPE_OP_COMPUTE_BLK,
2376 &sh->ops.pending);
2377 set_bit(STRIPE_OP_MOD_REPAIR_PD,
2378 &sh->ops.pending);
2379 set_bit(R5_Wantcompute,
2380 &sh->dev[sh->pd_idx].flags);
2381 sh->ops.target = sh->pd_idx;
2382 sh->ops.count++;
2383 s->uptodate++;
2384 }
2385 }
2386 } else
2387 canceled_check = 1; /* STRIPE_INSYNC is not set */
2388 }
2389
2390 /* start a new check operation if there are no failures, the stripe is
2391 * not insync, and a repair is not in flight
2392 */
2393 if (s->failed == 0 &&
2394 !test_bit(STRIPE_INSYNC, &sh->state) &&
2395 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2396 if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
2397 BUG_ON(s->uptodate != disks); 2361 BUG_ON(s->uptodate != disks);
2362 sh->check_state = check_state_run;
2363 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2398 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2364 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2399 sh->ops.count++;
2400 s->uptodate--; 2365 s->uptodate--;
2366 break;
2401 } 2367 }
2402 } 2368 dev = &sh->dev[s->failed_num];
2403 2369 /* fall through */
2404 /* check if we can clear a parity disk reconstruct */ 2370 case check_state_compute_result:
2405 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && 2371 sh->check_state = check_state_idle;
2406 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { 2372 if (!dev)
2407 2373 dev = &sh->dev[sh->pd_idx];
2408 clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); 2374
2409 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); 2375 /* check that a write has not made the stripe insync */
2410 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); 2376 if (test_bit(STRIPE_INSYNC, &sh->state))
2411 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); 2377 break;
2412 }
2413
2414 2378
2415 /* Wait for check parity and compute block operations to complete
2416 * before write-back. If a failure occurred while the check operation
2417 * was in flight we need to cycle this stripe through handle_stripe
2418 * since the parity block may not be uptodate
2419 */
2420 if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
2421 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
2422 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
2423 struct r5dev *dev;
2424 /* either failed parity check, or recovery is happening */ 2379 /* either failed parity check, or recovery is happening */
2425 if (s->failed == 0)
2426 s->failed_num = sh->pd_idx;
2427 dev = &sh->dev[s->failed_num];
2428 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2380 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2429 BUG_ON(s->uptodate != disks); 2381 BUG_ON(s->uptodate != disks);
2430 2382
2431 set_bit(R5_LOCKED, &dev->flags); 2383 set_bit(R5_LOCKED, &dev->flags);
2384 s->locked++;
2432 set_bit(R5_Wantwrite, &dev->flags); 2385 set_bit(R5_Wantwrite, &dev->flags);
2433 2386
2434 clear_bit(STRIPE_DEGRADED, &sh->state); 2387 clear_bit(STRIPE_DEGRADED, &sh->state);
2435 s->locked++;
2436 set_bit(STRIPE_INSYNC, &sh->state); 2388 set_bit(STRIPE_INSYNC, &sh->state);
2389 break;
2390 case check_state_run:
2391 break; /* we will be called again upon completion */
2392 case check_state_check_result:
2393 sh->check_state = check_state_idle;
2394
2395 /* if a failure occurred during the check operation, leave
2396 * STRIPE_INSYNC not set and let the stripe be handled again
2397 */
2398 if (s->failed)
2399 break;
2400
2401 /* handle a successful check operation, if parity is correct
2402 * we are done. Otherwise update the mismatch count and repair
2403 * parity if !MD_RECOVERY_CHECK
2404 */
2405 if (sh->ops.zero_sum_result == 0)
2406 /* parity is correct (on disc,
2407 * not in buffer any more)
2408 */
2409 set_bit(STRIPE_INSYNC, &sh->state);
2410 else {
2411 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2412 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2413 /* don't try to repair!! */
2414 set_bit(STRIPE_INSYNC, &sh->state);
2415 else {
2416 sh->check_state = check_state_compute_run;
2417 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2418 set_bit(R5_Wantcompute,
2419 &sh->dev[sh->pd_idx].flags);
2420 sh->ops.target = sh->pd_idx;
2421 s->uptodate++;
2422 }
2423 }
2424 break;
2425 case check_state_compute_run:
2426 break;
2427 default:
2428 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2429 __func__, sh->check_state,
2430 (unsigned long long) sh->sector);
2431 BUG();
2437 } 2432 }
2438} 2433}
2439 2434
@@ -2807,7 +2802,7 @@ static void handle_stripe5(struct stripe_head *sh)
2807 * block. 2802 * block.
2808 */ 2803 */
2809 if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && 2804 if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
2810 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) 2805 !sh->check_state)
2811 handle_issuing_new_write_requests5(conf, sh, &s, disks); 2806 handle_issuing_new_write_requests5(conf, sh, &s, disks);
2812 2807
2813 /* maybe we need to check and possibly fix the parity for this stripe 2808 /* maybe we need to check and possibly fix the parity for this stripe
@@ -2815,11 +2810,10 @@ static void handle_stripe5(struct stripe_head *sh)
2815 * data is available. The parity check is held off while parity 2810 * data is available. The parity check is held off while parity
2816 * dependent operations are in flight. 2811 * dependent operations are in flight.
2817 */ 2812 */
2818 if ((s.syncing && s.locked == 0 && 2813 if (sh->check_state ||
2814 (s.syncing && s.locked == 0 &&
2819 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && 2815 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
2820 !test_bit(STRIPE_INSYNC, &sh->state)) || 2816 !test_bit(STRIPE_INSYNC, &sh->state)))
2821 test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
2822 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
2823 handle_parity_checks5(conf, sh, &s, disks); 2817 handle_parity_checks5(conf, sh, &s, disks);
2824 2818
2825 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 2819 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -2897,8 +2891,8 @@ static void handle_stripe5(struct stripe_head *sh)
2897 if (unlikely(blocked_rdev)) 2891 if (unlikely(blocked_rdev))
2898 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 2892 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2899 2893
2900 if (pending) 2894 if (pending || s.ops_request)
2901 raid5_run_ops(sh, pending); 2895 raid5_run_ops(sh, pending, s.ops_request);
2902 2896
2903 ops_run_io(sh, &s); 2897 ops_run_io(sh, &s);
2904 2898
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 1301195abf4b..2c96d5fd54bf 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -158,6 +158,41 @@
158 * the compute block completes. 158 * the compute block completes.
159 */ 159 */
160 160
161/*
162 * Operations state - intermediate states that are visible outside of sh->lock
163 * In general _idle indicates nothing is running, _run indicates a data
164 * processing operation is active, and _result means the data processing result
165 * is stable and can be acted upon. For simple operations like biofill and
166 * compute that only have an _idle and _run state they are indicated with
167 * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
168 */
169/**
170 * enum check_states - handles syncing / repairing a stripe
171 * @check_state_idle - check operations are quiesced
172 * @check_state_run - check operation is running
173 * @check_state_check_result - set outside lock when check result is valid
174 * @check_state_compute_run - check failed and we are repairing
175 * @check_state_compute_result - set outside lock when compute result is valid
176 */
177enum check_states {
178 check_state_idle = 0,
179 check_state_run, /* parity check */
180 check_state_check_result,
181 check_state_compute_run, /* parity repair */
182 check_state_compute_result,
183};
184
185/**
186 * enum reconstruct_states - handles writing or expanding a stripe
187 */
188enum reconstruct_states {
189 reconstruct_state_idle = 0,
190 reconstruct_state_drain_run, /* write */
191 reconstruct_state_run, /* expand */
192 reconstruct_state_drain_result,
193 reconstruct_state_result,
194};
195
161struct stripe_head { 196struct stripe_head {
162 struct hlist_node hash; 197 struct hlist_node hash;
163 struct list_head lru; /* inactive_list or handle_list */ 198 struct list_head lru; /* inactive_list or handle_list */
@@ -169,6 +204,7 @@ struct stripe_head {
169 spinlock_t lock; 204 spinlock_t lock;
170 int bm_seq; /* sequence number for bitmap flushes */ 205 int bm_seq; /* sequence number for bitmap flushes */
171 int disks; /* disks in stripe */ 206 int disks; /* disks in stripe */
207 enum check_states check_state;
172 /* stripe_operations 208 /* stripe_operations
173 * @pending - pending ops flags (set for request->issue->complete) 209 * @pending - pending ops flags (set for request->issue->complete)
174 * @ack - submitted ops flags (set for issue->complete) 210 * @ack - submitted ops flags (set for issue->complete)
@@ -202,6 +238,7 @@ struct stripe_head_state {
202 int locked, uptodate, to_read, to_write, failed, written; 238 int locked, uptodate, to_read, to_write, failed, written;
203 int to_fill, compute, req_compute, non_overwrite; 239 int to_fill, compute, req_compute, non_overwrite;
204 int failed_num; 240 int failed_num;
241 unsigned long ops_request;
205}; 242};
206 243
207/* r6_state - extra state data only relevant to r6 */ 244/* r6_state - extra state data only relevant to r6 */
@@ -254,8 +291,10 @@ struct r6_state {
254#define STRIPE_EXPAND_READY 11 291#define STRIPE_EXPAND_READY 11
255#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ 292#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */
256#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ 293#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
294#define STRIPE_BIOFILL_RUN 14
295#define STRIPE_COMPUTE_RUN 15
257/* 296/*
258 * Operations flags (in issue order) 297 * Operation request flags
259 */ 298 */
260#define STRIPE_OP_BIOFILL 0 299#define STRIPE_OP_BIOFILL 0
261#define STRIPE_OP_COMPUTE_BLK 1 300#define STRIPE_OP_COMPUTE_BLK 1
@@ -264,11 +303,6 @@ struct r6_state {
264#define STRIPE_OP_POSTXOR 4 303#define STRIPE_OP_POSTXOR 4
265#define STRIPE_OP_CHECK 5 304#define STRIPE_OP_CHECK 5
266 305
267/* modifiers to the base operations
268 * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
269 */
270#define STRIPE_OP_MOD_REPAIR_PD 7
271
272/* 306/*
273 * Plugging: 307 * Plugging:
274 * 308 *