diff options
author | Dan Williams <dan.j.williams@intel.com> | 2007-01-02 15:52:30 -0500 |
---|---|---|
committer | Dan Williams <dan.j.williams@intel.com> | 2007-07-13 11:06:15 -0400 |
commit | 91c00924846a0034020451c280c76baa4299f9dc (patch) | |
tree | 7124ed6706937b793a10c37a861c5fc0f2e5b348 /include/linux/raid | |
parent | 45b4233caac05da0118b608a9fc2a40a9fc580cd (diff) |
md: raid5_run_ops - run stripe operations outside sh->lock
When the raid acceleration work was proposed, Neil laid out the following
attack plan:
1/ move the xor and copy operations outside spin_lock(&sh->lock)
2/ find/implement an asynchronous offload api
The raid5_run_ops routine uses the asynchronous offload api (async_tx) and
the stripe_operations member of a stripe_head to carry out xor+copy
operations asynchronously, outside the lock.
To perform operations outside the lock a new set of state flags is needed
to track new requests, in-flight requests, and completed requests. In this
new model handle_stripe is tasked with scanning the stripe_head for work,
updating the stripe_operations structure, and finally dropping the lock and
calling raid5_run_ops for processing. The following flags outline the
requests that handle_stripe can make of raid5_run_ops:
STRIPE_OP_BIOFILL
- copy data into request buffers to satisfy a read request
STRIPE_OP_COMPUTE_BLK
- generate a missing block in the cache from the other blocks
STRIPE_OP_PREXOR
- subtract existing data as part of the read-modify-write process
STRIPE_OP_BIODRAIN
- copy data out of request buffers to satisfy a write request
STRIPE_OP_POSTXOR
- recalculate parity for new data that has entered the cache
STRIPE_OP_CHECK
- verify that the parity is correct
STRIPE_OP_IO
- submit i/o to the member disks (note this was already performed outside
the stripe lock, but it made sense to add it as an operation type)
The flow is:
1/ handle_stripe sets STRIPE_OP_* in sh->ops.pending
2/ raid5_run_ops reads sh->ops.pending, sets sh->ops.ack, and submits the
operation to the async_tx api
3/ async_tx triggers the completion callback routine to set
sh->ops.complete and release the stripe
4/ handle_stripe runs again to finish the operation and optionally submit
new operations that were previously blocked
Note this patch just defines raid5_run_ops, subsequent commits (one per
major operation type) modify handle_stripe to take advantage of this
routine.
Changelog:
* removed ops_complete_biodrain in favor of ops_complete_postxor and
ops_complete_write.
* removed the raid5_run_ops workqueue
* call bi_end_io for reads in ops_complete_biofill, saves a call to
handle_stripe
* explicitly handle the 2-disk raid5 case (xor becomes memcpy), Neil Brown
* fix race between async engines and bi_end_io call for reads, Neil Brown
* remove unnecessary spin_lock from ops_complete_biofill
* remove test_and_set/test_and_clear BUG_ONs, Neil Brown
* remove explicit interrupt handling for channel switching, this feature
was absorbed (i.e. it is now implicit) by the async_tx api
* use return_io in ops_complete_biofill
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
Diffstat (limited to 'include/linux/raid')
-rw-r--r-- | include/linux/raid/raid5.h | 81 |
1 files changed, 78 insertions, 3 deletions
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index b99d354f6128..6fb9d94e6f2e 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h | |||
@@ -116,13 +116,46 @@ | |||
116 | * attach a request to an active stripe (add_stripe_bh()) | 116 | * attach a request to an active stripe (add_stripe_bh()) |
117 | * lockdev attach-buffer unlockdev | 117 | * lockdev attach-buffer unlockdev |
118 | * handle a stripe (handle_stripe()) | 118 | * handle a stripe (handle_stripe()) |
119 | * lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io | 119 | * lockstripe clrSTRIPE_HANDLE ... |
120 | * (lockdev check-buffers unlockdev) .. | ||
121 | * change-state .. | ||
122 | * record io/ops needed unlockstripe schedule io/ops | ||
120 | * release an active stripe (release_stripe()) | 123 | * release an active stripe (release_stripe()) |
121 | * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev | 124 | * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev |
122 | * | 125 | * |
123 | * The refcount counts each thread that have activated the stripe, | 126 | * The refcount counts each thread that have activated the stripe, |
124 | * plus raid5d if it is handling it, plus one for each active request | 127 | * plus raid5d if it is handling it, plus one for each active request |
125 | * on a cached buffer. | 128 | * on a cached buffer, and plus one if the stripe is undergoing stripe |
129 | * operations. | ||
130 | * | ||
131 | * Stripe operations are performed outside the stripe lock, | ||
132 | * the stripe operations are: | ||
133 | * -copying data between the stripe cache and user application buffers | ||
134 | * -computing blocks to save a disk access, or to recover a missing block | ||
135 | * -updating the parity on a write operation (reconstruct write and | ||
136 | * read-modify-write) | ||
137 | * -checking parity correctness | ||
138 | * -running i/o to disk | ||
139 | * These operations are carried out by raid5_run_ops which uses the async_tx | ||
140 | * api to (optionally) offload operations to dedicated hardware engines. | ||
141 | * When requesting an operation handle_stripe sets the pending bit for the | ||
142 | * operation and increments the count. raid5_run_ops is then run whenever | ||
143 | * the count is non-zero. | ||
144 | * There are some critical dependencies between the operations that prevent some | ||
145 | * from being requested while another is in flight. | ||
146 | * 1/ Parity check operations destroy the in cache version of the parity block, | ||
147 | * so we prevent parity dependent operations like writes and compute_blocks | ||
148 | * from starting while a check is in progress. Some dma engines can perform | ||
149 | * the check without damaging the parity block, in these cases the parity | ||
150 | * block is re-marked up to date (assuming the check was successful) and is | ||
151 | * not re-read from disk. | ||
152 | * 2/ When a write operation is requested we immediately lock the affected | ||
153 | * blocks, and mark them as not up to date. This causes new read requests | ||
154 | * to be held off, as well as parity checks and compute block operations. | ||
155 | * 3/ Once a compute block operation has been requested handle_stripe treats | ||
156 | * that block as if it is up to date. raid5_run_ops guarantees that any | ||
157 | * operation that is dependent on the compute block result is initiated after | ||
158 | * the compute block completes. | ||
126 | */ | 159 | */ |
127 | 160 | ||
128 | struct stripe_head { | 161 | struct stripe_head { |
@@ -136,11 +169,26 @@ struct stripe_head { | |||
136 | spinlock_t lock; | 169 | spinlock_t lock; |
137 | int bm_seq; /* sequence number for bitmap flushes */ | 170 | int bm_seq; /* sequence number for bitmap flushes */ |
138 | int disks; /* disks in stripe */ | 171 | int disks; /* disks in stripe */ |
172 | /* stripe_operations | ||
173 | * @pending - pending ops flags (set for request->issue->complete) | ||
174 | * @ack - submitted ops flags (set for issue->complete) | ||
175 | * @complete - completed ops flags (set for complete) | ||
176 | * @target - STRIPE_OP_COMPUTE_BLK target | ||
177 | * @count - raid5_run_ops is set to run when this is non-zero | ||
178 | */ | ||
179 | struct stripe_operations { | ||
180 | unsigned long pending; | ||
181 | unsigned long ack; | ||
182 | unsigned long complete; | ||
183 | int target; | ||
184 | int count; | ||
185 | u32 zero_sum_result; | ||
186 | } ops; | ||
139 | struct r5dev { | 187 | struct r5dev { |
140 | struct bio req; | 188 | struct bio req; |
141 | struct bio_vec vec; | 189 | struct bio_vec vec; |
142 | struct page *page; | 190 | struct page *page; |
143 | struct bio *toread, *towrite, *written; | 191 | struct bio *toread, *read, *towrite, *written; |
144 | sector_t sector; /* sector of this page */ | 192 | sector_t sector; /* sector of this page */ |
145 | unsigned long flags; | 193 | unsigned long flags; |
146 | } dev[1]; /* allocated with extra space depending of RAID geometry */ | 194 | } dev[1]; /* allocated with extra space depending of RAID geometry */ |
@@ -174,6 +222,15 @@ struct r6_state { | |||
174 | #define R5_ReWrite 9 /* have tried to over-write the readerror */ | 222 | #define R5_ReWrite 9 /* have tried to over-write the readerror */ |
175 | 223 | ||
176 | #define R5_Expanded 10 /* This block now has post-expand data */ | 224 | #define R5_Expanded 10 /* This block now has post-expand data */ |
225 | #define R5_Wantcompute 11 /* compute_block in progress treat as | ||
226 | * uptodate | ||
227 | */ | ||
228 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs | ||
229 | * filling | ||
230 | */ | ||
231 | #define R5_Wantprexor 13 /* distinguish blocks ready for rmw from | ||
232 | * other "towrites" | ||
233 | */ | ||
177 | /* | 234 | /* |
178 | * Write method | 235 | * Write method |
179 | */ | 236 | */ |
@@ -196,6 +253,24 @@ struct r6_state { | |||
196 | #define STRIPE_EXPAND_SOURCE 10 | 253 | #define STRIPE_EXPAND_SOURCE 10 |
197 | #define STRIPE_EXPAND_READY 11 | 254 | #define STRIPE_EXPAND_READY 11 |
198 | /* | 255 | /* |
256 | * Operations flags (in issue order) | ||
257 | */ | ||
258 | #define STRIPE_OP_BIOFILL 0 | ||
259 | #define STRIPE_OP_COMPUTE_BLK 1 | ||
260 | #define STRIPE_OP_PREXOR 2 | ||
261 | #define STRIPE_OP_BIODRAIN 3 | ||
262 | #define STRIPE_OP_POSTXOR 4 | ||
263 | #define STRIPE_OP_CHECK 5 | ||
264 | #define STRIPE_OP_IO 6 | ||
265 | |||
266 | /* modifiers to the base operations | ||
267 | * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back | ||
268 | * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check | ||
269 | */ | ||
270 | #define STRIPE_OP_MOD_REPAIR_PD 7 | ||
271 | #define STRIPE_OP_MOD_DMA_CHECK 8 | ||
272 | |||
273 | /* | ||
199 | * Plugging: | 274 | * Plugging: |
200 | * | 275 | * |
201 | * To improve write throughput, we need to delay the handling of some | 276 | * To improve write throughput, we need to delay the handling of some |