diff options
author | Dan Williams <dan.j.williams@intel.com> | 2008-06-27 18:31:57 -0400 |
---|---|---|
committer | Neil Brown <neilb@notabene.brown> | 2008-06-27 18:31:57 -0400 |
commit | ecc65c9b3f9b9d740a5deade3d85b39be56401b6 (patch) | |
tree | e6b4e827befc6849716689f573c89aa0a41e5d26 /drivers | |
parent | f0e43bcdebf709d747a3effb210aff1941e819ab (diff) |
md: replace STRIPE_OP_CHECK with 'check_states'
From: Dan Williams <dan.j.williams@intel.com>
The STRIPE_OP_* flags record the state of stripe operations which are
performed outside the stripe lock. Their use in indicating which
operations need to be run is straightforward; however, interpolating what
the next state of the stripe should be based on a given combination of
these flags is not straightforward, and has led to bugs. An easier to read
implementation with minimal degrees of freedom is needed.
Towards this goal, this patch introduces explicit states to replace what was
previously interpolated from the STRIPE_OP_* flags. For now this only converts
the handle_parity_checks5 path, removing a user of the
ops.{pending,ack,complete,count} fields of struct stripe_operations.
This conversion also found a remaining issue with the current code. There is
a small window for a drive to fail between when we schedule a repair and when
the parity calculation for that repair completes. When this happens we will
writeback to 'failed_num' when we really want to write back to 'pd_idx'.
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/md/raid5.c | 172 |
1 files changed, 83 insertions, 89 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6f3dd12dd3a4..544e1600f208 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -605,7 +605,11 @@ static void ops_complete_compute5(void *stripe_head_ref) | |||
605 | set_bit(R5_UPTODATE, &tgt->flags); | 605 | set_bit(R5_UPTODATE, &tgt->flags); |
606 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | 606 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); |
607 | clear_bit(R5_Wantcompute, &tgt->flags); | 607 | clear_bit(R5_Wantcompute, &tgt->flags); |
608 | set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | 608 | clear_bit(STRIPE_COMPUTE_RUN, &sh->state); |
609 | if (sh->check_state == check_state_compute_run) | ||
610 | sh->check_state = check_state_compute_result; | ||
611 | else | ||
612 | set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | ||
609 | set_bit(STRIPE_HANDLE, &sh->state); | 613 | set_bit(STRIPE_HANDLE, &sh->state); |
610 | release_stripe(sh); | 614 | release_stripe(sh); |
611 | } | 615 | } |
@@ -838,7 +842,7 @@ static void ops_complete_check(void *stripe_head_ref) | |||
838 | pr_debug("%s: stripe %llu\n", __func__, | 842 | pr_debug("%s: stripe %llu\n", __func__, |
839 | (unsigned long long)sh->sector); | 843 | (unsigned long long)sh->sector); |
840 | 844 | ||
841 | set_bit(STRIPE_OP_CHECK, &sh->ops.complete); | 845 | sh->check_state = check_state_check_result; |
842 | set_bit(STRIPE_HANDLE, &sh->state); | 846 | set_bit(STRIPE_HANDLE, &sh->state); |
843 | release_stripe(sh); | 847 | release_stripe(sh); |
844 | } | 848 | } |
@@ -870,7 +874,8 @@ static void ops_run_check(struct stripe_head *sh) | |||
870 | ops_complete_check, sh); | 874 | ops_complete_check, sh); |
871 | } | 875 | } |
872 | 876 | ||
873 | static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) | 877 | static void raid5_run_ops(struct stripe_head *sh, unsigned long pending, |
878 | unsigned long ops_request) | ||
874 | { | 879 | { |
875 | int overlap_clear = 0, i, disks = sh->disks; | 880 | int overlap_clear = 0, i, disks = sh->disks; |
876 | struct dma_async_tx_descriptor *tx = NULL; | 881 | struct dma_async_tx_descriptor *tx = NULL; |
@@ -880,7 +885,8 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) | |||
880 | overlap_clear++; | 885 | overlap_clear++; |
881 | } | 886 | } |
882 | 887 | ||
883 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) | 888 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending) || |
889 | test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) | ||
884 | tx = ops_run_compute5(sh, pending); | 890 | tx = ops_run_compute5(sh, pending); |
885 | 891 | ||
886 | if (test_bit(STRIPE_OP_PREXOR, &pending)) | 892 | if (test_bit(STRIPE_OP_PREXOR, &pending)) |
@@ -894,7 +900,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) | |||
894 | if (test_bit(STRIPE_OP_POSTXOR, &pending)) | 900 | if (test_bit(STRIPE_OP_POSTXOR, &pending)) |
895 | ops_run_postxor(sh, tx, pending); | 901 | ops_run_postxor(sh, tx, pending); |
896 | 902 | ||
897 | if (test_bit(STRIPE_OP_CHECK, &pending)) | 903 | if (test_bit(STRIPE_OP_CHECK, &ops_request)) |
898 | ops_run_check(sh); | 904 | ops_run_check(sh); |
899 | 905 | ||
900 | if (overlap_clear) | 906 | if (overlap_clear) |
@@ -1961,8 +1967,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, | |||
1961 | /* don't schedule compute operations or reads on the parity block while | 1967 | /* don't schedule compute operations or reads on the parity block while |
1962 | * a check is in flight | 1968 | * a check is in flight |
1963 | */ | 1969 | */ |
1964 | if ((disk_idx == sh->pd_idx) && | 1970 | if (disk_idx == sh->pd_idx && sh->check_state) |
1965 | test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) | ||
1966 | return ~0; | 1971 | return ~0; |
1967 | 1972 | ||
1968 | /* is the data in this block needed, and can we get it? */ | 1973 | /* is the data in this block needed, and can we get it? */ |
@@ -1983,9 +1988,8 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, | |||
1983 | * 3/ We hold off parity block re-reads until check operations | 1988 | * 3/ We hold off parity block re-reads until check operations |
1984 | * have quiesced. | 1989 | * have quiesced. |
1985 | */ | 1990 | */ |
1986 | if ((s->uptodate == disks - 1) && | 1991 | if ((s->uptodate == disks - 1) && !sh->check_state && |
1987 | (s->failed && disk_idx == s->failed_num) && | 1992 | (s->failed && disk_idx == s->failed_num)) { |
1988 | !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { | ||
1989 | set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); | 1993 | set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); |
1990 | set_bit(R5_Wantcompute, &dev->flags); | 1994 | set_bit(R5_Wantcompute, &dev->flags); |
1991 | sh->ops.target = disk_idx; | 1995 | sh->ops.target = disk_idx; |
@@ -2021,12 +2025,8 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh, | |||
2021 | { | 2025 | { |
2022 | int i; | 2026 | int i; |
2023 | 2027 | ||
2024 | /* Clear completed compute operations. Parity recovery | 2028 | /* Clear completed compute operations */ |
2025 | * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled | 2029 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete)) { |
2026 | * later on in this routine | ||
2027 | */ | ||
2028 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && | ||
2029 | !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { | ||
2030 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | 2030 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); |
2031 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); | 2031 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); |
2032 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); | 2032 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); |
@@ -2350,90 +2350,85 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, | |||
2350 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | 2350 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, |
2351 | struct stripe_head_state *s, int disks) | 2351 | struct stripe_head_state *s, int disks) |
2352 | { | 2352 | { |
2353 | int canceled_check = 0; | 2353 | struct r5dev *dev = NULL; |
2354 | 2354 | ||
2355 | set_bit(STRIPE_HANDLE, &sh->state); | 2355 | set_bit(STRIPE_HANDLE, &sh->state); |
2356 | 2356 | ||
2357 | /* complete a check operation */ | 2357 | switch (sh->check_state) { |
2358 | if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { | 2358 | case check_state_idle: |
2359 | clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); | 2359 | /* start a new check operation if there are no failures */ |
2360 | clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); | ||
2361 | if (s->failed == 0) { | 2360 | if (s->failed == 0) { |
2362 | if (sh->ops.zero_sum_result == 0) | ||
2363 | /* parity is correct (on disc, | ||
2364 | * not in buffer any more) | ||
2365 | */ | ||
2366 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2367 | else { | ||
2368 | conf->mddev->resync_mismatches += | ||
2369 | STRIPE_SECTORS; | ||
2370 | if (test_bit( | ||
2371 | MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
2372 | /* don't try to repair!! */ | ||
2373 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2374 | else { | ||
2375 | set_bit(STRIPE_OP_COMPUTE_BLK, | ||
2376 | &sh->ops.pending); | ||
2377 | set_bit(STRIPE_OP_MOD_REPAIR_PD, | ||
2378 | &sh->ops.pending); | ||
2379 | set_bit(R5_Wantcompute, | ||
2380 | &sh->dev[sh->pd_idx].flags); | ||
2381 | sh->ops.target = sh->pd_idx; | ||
2382 | sh->ops.count++; | ||
2383 | s->uptodate++; | ||
2384 | } | ||
2385 | } | ||
2386 | } else | ||
2387 | canceled_check = 1; /* STRIPE_INSYNC is not set */ | ||
2388 | } | ||
2389 | |||
2390 | /* start a new check operation if there are no failures, the stripe is | ||
2391 | * not insync, and a repair is not in flight | ||
2392 | */ | ||
2393 | if (s->failed == 0 && | ||
2394 | !test_bit(STRIPE_INSYNC, &sh->state) && | ||
2395 | !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { | ||
2396 | if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { | ||
2397 | BUG_ON(s->uptodate != disks); | 2361 | BUG_ON(s->uptodate != disks); |
2362 | sh->check_state = check_state_run; | ||
2363 | set_bit(STRIPE_OP_CHECK, &s->ops_request); | ||
2398 | clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); | 2364 | clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); |
2399 | sh->ops.count++; | ||
2400 | s->uptodate--; | 2365 | s->uptodate--; |
2366 | break; | ||
2401 | } | 2367 | } |
2402 | } | 2368 | dev = &sh->dev[s->failed_num]; |
2403 | 2369 | /* fall through */ | |
2404 | /* check if we can clear a parity disk reconstruct */ | 2370 | case check_state_compute_result: |
2405 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && | 2371 | sh->check_state = check_state_idle; |
2406 | test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { | 2372 | if (!dev) |
2407 | 2373 | dev = &sh->dev[sh->pd_idx]; | |
2408 | clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); | 2374 | |
2409 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | 2375 | /* check that a write has not made the stripe insync */ |
2410 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); | 2376 | if (test_bit(STRIPE_INSYNC, &sh->state)) |
2411 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); | 2377 | break; |
2412 | } | ||
2413 | |||
2414 | 2378 | ||
2415 | /* Wait for check parity and compute block operations to complete | ||
2416 | * before write-back. If a failure occurred while the check operation | ||
2417 | * was in flight we need to cycle this stripe through handle_stripe | ||
2418 | * since the parity block may not be uptodate | ||
2419 | */ | ||
2420 | if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) && | ||
2421 | !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && | ||
2422 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { | ||
2423 | struct r5dev *dev; | ||
2424 | /* either failed parity check, or recovery is happening */ | 2379 | /* either failed parity check, or recovery is happening */ |
2425 | if (s->failed == 0) | ||
2426 | s->failed_num = sh->pd_idx; | ||
2427 | dev = &sh->dev[s->failed_num]; | ||
2428 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | 2380 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); |
2429 | BUG_ON(s->uptodate != disks); | 2381 | BUG_ON(s->uptodate != disks); |
2430 | 2382 | ||
2431 | set_bit(R5_LOCKED, &dev->flags); | 2383 | set_bit(R5_LOCKED, &dev->flags); |
2384 | s->locked++; | ||
2432 | set_bit(R5_Wantwrite, &dev->flags); | 2385 | set_bit(R5_Wantwrite, &dev->flags); |
2433 | 2386 | ||
2434 | clear_bit(STRIPE_DEGRADED, &sh->state); | 2387 | clear_bit(STRIPE_DEGRADED, &sh->state); |
2435 | s->locked++; | ||
2436 | set_bit(STRIPE_INSYNC, &sh->state); | 2388 | set_bit(STRIPE_INSYNC, &sh->state); |
2389 | break; | ||
2390 | case check_state_run: | ||
2391 | break; /* we will be called again upon completion */ | ||
2392 | case check_state_check_result: | ||
2393 | sh->check_state = check_state_idle; | ||
2394 | |||
2395 | /* if a failure occurred during the check operation, leave | ||
2396 | * STRIPE_INSYNC not set and let the stripe be handled again | ||
2397 | */ | ||
2398 | if (s->failed) | ||
2399 | break; | ||
2400 | |||
2401 | /* handle a successful check operation, if parity is correct | ||
2402 | * we are done. Otherwise update the mismatch count and repair | ||
2403 | * parity if !MD_RECOVERY_CHECK | ||
2404 | */ | ||
2405 | if (sh->ops.zero_sum_result == 0) | ||
2406 | /* parity is correct (on disc, | ||
2407 | * not in buffer any more) | ||
2408 | */ | ||
2409 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2410 | else { | ||
2411 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
2412 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
2413 | /* don't try to repair!! */ | ||
2414 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2415 | else { | ||
2416 | sh->check_state = check_state_compute_run; | ||
2417 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
2418 | set_bit(R5_Wantcompute, | ||
2419 | &sh->dev[sh->pd_idx].flags); | ||
2420 | sh->ops.target = sh->pd_idx; | ||
2421 | s->uptodate++; | ||
2422 | } | ||
2423 | } | ||
2424 | break; | ||
2425 | case check_state_compute_run: | ||
2426 | break; | ||
2427 | default: | ||
2428 | printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", | ||
2429 | __func__, sh->check_state, | ||
2430 | (unsigned long long) sh->sector); | ||
2431 | BUG(); | ||
2437 | } | 2432 | } |
2438 | } | 2433 | } |
2439 | 2434 | ||
@@ -2807,7 +2802,7 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2807 | * block. | 2802 | * block. |
2808 | */ | 2803 | */ |
2809 | if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && | 2804 | if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && |
2810 | !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) | 2805 | !sh->check_state) |
2811 | handle_issuing_new_write_requests5(conf, sh, &s, disks); | 2806 | handle_issuing_new_write_requests5(conf, sh, &s, disks); |
2812 | 2807 | ||
2813 | /* maybe we need to check and possibly fix the parity for this stripe | 2808 | /* maybe we need to check and possibly fix the parity for this stripe |
@@ -2815,11 +2810,10 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2815 | * data is available. The parity check is held off while parity | 2810 | * data is available. The parity check is held off while parity |
2816 | * dependent operations are in flight. | 2811 | * dependent operations are in flight. |
2817 | */ | 2812 | */ |
2818 | if ((s.syncing && s.locked == 0 && | 2813 | if (sh->check_state || |
2814 | (s.syncing && s.locked == 0 && | ||
2819 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && | 2815 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && |
2820 | !test_bit(STRIPE_INSYNC, &sh->state)) || | 2816 | !test_bit(STRIPE_INSYNC, &sh->state))) |
2821 | test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || | ||
2822 | test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) | ||
2823 | handle_parity_checks5(conf, sh, &s, disks); | 2817 | handle_parity_checks5(conf, sh, &s, disks); |
2824 | 2818 | ||
2825 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 2819 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
@@ -2897,8 +2891,8 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2897 | if (unlikely(blocked_rdev)) | 2891 | if (unlikely(blocked_rdev)) |
2898 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 2892 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
2899 | 2893 | ||
2900 | if (pending) | 2894 | if (pending || s.ops_request) |
2901 | raid5_run_ops(sh, pending); | 2895 | raid5_run_ops(sh, pending, s.ops_request); |
2902 | 2896 | ||
2903 | ops_run_io(sh, &s); | 2897 | ops_run_io(sh, &s); |
2904 | 2898 | ||