aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDan Williams <dan.j.williams@intel.com>2007-01-02 15:52:30 -0500
committerDan Williams <dan.j.williams@intel.com>2007-07-13 11:06:17 -0400
commitf38e12199a94ca458e4f03c5a2c984fb80adadc5 (patch)
tree706f54c46d4a4c839dd43c1403854dde860c6be5
parente33129d84130459dbb764a1a52a4bfceab3da978 (diff)
md: handle_stripe5 - add request/completion logic for async compute ops
handle_stripe will compute a block when a backing disk has failed, or when it determines it can save a disk read by computing the block from all the other up-to-date blocks. Previously a block would be computed under the lock and subsequent logic in handle_stripe could use the newly up-to-date block. With the raid5_run_ops implementation the compute operation is carried out at a later time outside the lock. To preserve the old functionality we take advantage of the dependency chain feature of async_tx to flag the block as R5_Wantcompute and then let other parts of handle_stripe operate on the block as if it were up-to-date. raid5_run_ops guarantees that the block will be ready before it is used in another operation. However, this only works in cases where the compute and the dependent operation are scheduled at the same time. If a previous call to handle_stripe sets the R5_Wantcompute flag there is no facility to pass the async_tx dependency chain across successive calls to raid5_run_ops. The req_compute variable protects against this case. Changelog: * remove the req_compute BUG_ON Signed-off-by: Dan Williams <dan.j.williams@intel.com> Acked-By: NeilBrown <neilb@suse.de>
-rw-r--r--drivers/md/raid5.c149
-rw-r--r--include/linux/raid/raid5.h2
2 files changed, 115 insertions, 36 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d9521aa69461..42439a4c1c51 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2077,36 +2077,101 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
2077 2077
2078} 2078}
2079 2079
2080/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
2081 * to process
2082 */
2083static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
2084 struct stripe_head_state *s, int disk_idx, int disks)
2085{
2086 struct r5dev *dev = &sh->dev[disk_idx];
2087 struct r5dev *failed_dev = &sh->dev[s->failed_num];
2088
2089 /* don't schedule compute operations or reads on the parity block while
2090 * a check is in flight
2091 */
2092 if ((disk_idx == sh->pd_idx) &&
2093 test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
2094 return ~0;
2095
2096 /* is the data in this block needed, and can we get it? */
2097 if (!test_bit(R5_LOCKED, &dev->flags) &&
2098 !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread ||
2099 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2100 s->syncing || s->expanding || (s->failed &&
2101 (failed_dev->toread || (failed_dev->towrite &&
2102 !test_bit(R5_OVERWRITE, &failed_dev->flags)
2103 ))))) {
2104 /* 1/ We would like to get this block, possibly by computing it,
2105 * but we might not be able to.
2106 *
2107 * 2/ Since parity check operations potentially make the parity
2108 * block !uptodate it will need to be refreshed before any
2109 * compute operations on data disks are scheduled.
2110 *
2111 * 3/ We hold off parity block re-reads until check operations
2112 * have quiesced.
2113 */
2114 if ((s->uptodate == disks - 1) &&
2115 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
2116 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2117 set_bit(R5_Wantcompute, &dev->flags);
2118 sh->ops.target = disk_idx;
2119 s->req_compute = 1;
2120 sh->ops.count++;
2121 /* Careful: from this point on 'uptodate' is in the eye
2122 * of raid5_run_ops which services 'compute' operations
2123 * before writes. R5_Wantcompute flags a block that will
2124 * be R5_UPTODATE by the time it is needed for a
2125 * subsequent operation.
2126 */
2127 s->uptodate++;
2128 return 0; /* uptodate + compute == disks */
2129 } else if ((s->uptodate < disks - 1) &&
2130 test_bit(R5_Insync, &dev->flags)) {
2131 /* Note: we hold off compute operations while checks are
2132 * in flight, but we still prefer 'compute' over 'read'
2133 * hence we only read if (uptodate < disks-1)
2134 */
2135 set_bit(R5_LOCKED, &dev->flags);
2136 set_bit(R5_Wantread, &dev->flags);
2137 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2138 sh->ops.count++;
2139 s->locked++;
2140 pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2141 s->syncing);
2142 }
2143 }
2144
2145 return ~0;
2146}
2147
2080static void handle_issuing_new_read_requests5(struct stripe_head *sh, 2148static void handle_issuing_new_read_requests5(struct stripe_head *sh,
2081 struct stripe_head_state *s, int disks) 2149 struct stripe_head_state *s, int disks)
2082{ 2150{
2083 int i; 2151 int i;
2084 for (i = disks; i--; ) { 2152
2085 struct r5dev *dev = &sh->dev[i]; 2153 /* Clear completed compute operations. Parity recovery
2086 if (!test_bit(R5_LOCKED, &dev->flags) && 2154 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
2087 !test_bit(R5_UPTODATE, &dev->flags) && 2155 * later on in this routine
2088 (dev->toread || 2156 */
2089 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2157 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2090 s->syncing || s->expanding || 2158 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2091 (s->failed && (sh->dev[s->failed_num].toread || 2159 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2092 (sh->dev[s->failed_num].towrite && 2160 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2093 !test_bit(R5_OVERWRITE, &sh->dev[s->failed_num].flags)) 2161 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2094 )))) { 2162 }
2095 /* we would like to get this block, possibly 2163
2096 * by computing it, but we might not be able to 2164 /* look for blocks to read/compute, skip this if a compute
2097 */ 2165 * is already in flight, or if the stripe contents are in the
2098 if (s->uptodate == disks-1) { 2166 * midst of changing due to a write
2099 pr_debug("Computing block %d\n", i); 2167 */
2100 compute_block(sh, i); 2168 if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
2101 s->uptodate++; 2169 !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
2102 } else if (test_bit(R5_Insync, &dev->flags)) { 2170 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2103 set_bit(R5_LOCKED, &dev->flags); 2171 for (i = disks; i--; )
2104 set_bit(R5_Wantread, &dev->flags); 2172 if (__handle_issuing_new_read_requests5(
2105 s->locked++; 2173 sh, s, i, disks) == 0)
2106 pr_debug("Reading block %d (sync=%d)\n", 2174 break;
2107 i, s->syncing);
2108 }
2109 }
2110 } 2175 }
2111 set_bit(STRIPE_HANDLE, &sh->state); 2176 set_bit(STRIPE_HANDLE, &sh->state);
2112} 2177}
@@ -2223,7 +2288,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2223 struct r5dev *dev = &sh->dev[i]; 2288 struct r5dev *dev = &sh->dev[i];
2224 if ((dev->towrite || i == sh->pd_idx) && 2289 if ((dev->towrite || i == sh->pd_idx) &&
2225 !test_bit(R5_LOCKED, &dev->flags) && 2290 !test_bit(R5_LOCKED, &dev->flags) &&
2226 !test_bit(R5_UPTODATE, &dev->flags)) { 2291 !(test_bit(R5_UPTODATE, &dev->flags) ||
2292 test_bit(R5_Wantcompute, &dev->flags))) {
2227 if (test_bit(R5_Insync, &dev->flags)) 2293 if (test_bit(R5_Insync, &dev->flags))
2228 rmw++; 2294 rmw++;
2229 else 2295 else
@@ -2232,9 +2298,9 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2232 /* Would I have to read this buffer for reconstruct_write */ 2298 /* Would I have to read this buffer for reconstruct_write */
2233 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2299 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2234 !test_bit(R5_LOCKED, &dev->flags) && 2300 !test_bit(R5_LOCKED, &dev->flags) &&
2235 !test_bit(R5_UPTODATE, &dev->flags)) { 2301 !(test_bit(R5_UPTODATE, &dev->flags) ||
2236 if (test_bit(R5_Insync, &dev->flags)) 2302 test_bit(R5_Wantcompute, &dev->flags))) {
2237 rcw++; 2303 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2238 else 2304 else
2239 rcw += 2*disks; 2305 rcw += 2*disks;
2240 } 2306 }
@@ -2248,7 +2314,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2248 struct r5dev *dev = &sh->dev[i]; 2314 struct r5dev *dev = &sh->dev[i];
2249 if ((dev->towrite || i == sh->pd_idx) && 2315 if ((dev->towrite || i == sh->pd_idx) &&
2250 !test_bit(R5_LOCKED, &dev->flags) && 2316 !test_bit(R5_LOCKED, &dev->flags) &&
2251 !test_bit(R5_UPTODATE, &dev->flags) && 2317 !(test_bit(R5_UPTODATE, &dev->flags) ||
2318 test_bit(R5_Wantcompute, &dev->flags)) &&
2252 test_bit(R5_Insync, &dev->flags)) { 2319 test_bit(R5_Insync, &dev->flags)) {
2253 if ( 2320 if (
2254 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2321 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -2270,7 +2337,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2270 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2337 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2271 i != sh->pd_idx && 2338 i != sh->pd_idx &&
2272 !test_bit(R5_LOCKED, &dev->flags) && 2339 !test_bit(R5_LOCKED, &dev->flags) &&
2273 !test_bit(R5_UPTODATE, &dev->flags) && 2340 !(test_bit(R5_UPTODATE, &dev->flags) ||
2341 test_bit(R5_Wantcompute, &dev->flags)) &&
2274 test_bit(R5_Insync, &dev->flags)) { 2342 test_bit(R5_Insync, &dev->flags)) {
2275 if ( 2343 if (
2276 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2344 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -2288,8 +2356,17 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2288 /* now if nothing is locked, and if we have enough data, 2356 /* now if nothing is locked, and if we have enough data,
2289 * we can start a write request 2357 * we can start a write request
2290 */ 2358 */
2291 if (s->locked == 0 && (rcw == 0 || rmw == 0) && 2359 /* since handle_stripe can be called at any time we need to handle the
2292 !test_bit(STRIPE_BIT_DELAY, &sh->state)) 2360 * case where a compute block operation has been submitted and then a
2361 * subsequent call wants to start a write request. raid5_run_ops only
2362 * handles the case where compute block and postxor are requested
2363 * simultaneously. If this is not the case then new writes need to be
2364 * held off until the compute completes.
2365 */
2366 if ((s->req_compute ||
2367 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
2368 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2369 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2293 s->locked += handle_write_operations5(sh, rcw == 0, 0); 2370 s->locked += handle_write_operations5(sh, rcw == 0, 0);
2294} 2371}
2295 2372
@@ -2650,6 +2727,7 @@ static void handle_stripe5(struct stripe_head *sh)
2650 /* now count some things */ 2727 /* now count some things */
2651 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 2728 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
2652 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 2729 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
2730 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
2653 2731
2654 if (dev->toread) 2732 if (dev->toread)
2655 s.to_read++; 2733 s.to_read++;
@@ -2706,7 +2784,8 @@ static void handle_stripe5(struct stripe_head *sh)
2706 * or to load a block that is being partially written. 2784 * or to load a block that is being partially written.
2707 */ 2785 */
2708 if (s.to_read || s.non_overwrite || 2786 if (s.to_read || s.non_overwrite ||
2709 (s.syncing && (s.uptodate < disks)) || s.expanding) 2787 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding ||
2788 test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
2710 handle_issuing_new_read_requests5(sh, &s, disks); 2789 handle_issuing_new_read_requests5(sh, &s, disks);
2711 2790
2712 /* Now we check to see if any write operations have recently 2791 /* Now we check to see if any write operations have recently
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 6fb9d94e6f2e..2293015de1d5 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -200,7 +200,7 @@ struct stripe_head {
200struct stripe_head_state { 200struct stripe_head_state {
201 int syncing, expanding, expanded; 201 int syncing, expanding, expanded;
202 int locked, uptodate, to_read, to_write, failed, written; 202 int locked, uptodate, to_read, to_write, failed, written;
203 int non_overwrite; 203 int compute, req_compute, non_overwrite;
204 int failed_num; 204 int failed_num;
205}; 205};
206 206