author    Dan Williams <dan.j.williams@intel.com>  2007-01-02 15:52:30 -0500
committer Dan Williams <dan.j.williams@intel.com>  2007-07-13 11:06:15 -0400
commit    91c00924846a0034020451c280c76baa4299f9dc
tree      7124ed6706937b793a10c37a861c5fc0f2e5b348
parent    45b4233caac05da0118b608a9fc2a40a9fc580cd
md: raid5_run_ops - run stripe operations outside sh->lock
When the raid acceleration work was proposed, Neil laid out the following
attack plan:

1/ move the xor and copy operations outside spin_lock(&sh->lock)
2/ find/implement an asynchronous offload api

The raid5_run_ops routine uses the asynchronous offload api (async_tx) and
the stripe_operations member of a stripe_head to carry out xor+copy
operations asynchronously, outside the lock.

To perform operations outside the lock a new set of state flags is needed
to track new requests, in-flight requests, and completed requests. In this
new model handle_stripe is tasked with scanning the stripe_head for work,
updating the stripe_operations structure, and finally dropping the lock
and calling raid5_run_ops for processing. The following flags outline the
requests that handle_stripe can make of raid5_run_ops:

STRIPE_OP_BIOFILL
 - copy data into request buffers to satisfy a read request
STRIPE_OP_COMPUTE_BLK
 - generate a missing block in the cache from the other blocks
STRIPE_OP_PREXOR
 - subtract existing data as part of the read-modify-write process
STRIPE_OP_BIODRAIN
 - copy data out of request buffers to satisfy a write request
STRIPE_OP_POSTXOR
 - recalculate parity for new data that has entered the cache
STRIPE_OP_CHECK
 - verify that the parity is correct
STRIPE_OP_IO
 - submit i/o to the member disks (note this was already performed outside
   the stripe lock, but it made sense to add it as an operation type)

The flow is:
1/ handle_stripe sets STRIPE_OP_* in sh->ops.pending
2/ raid5_run_ops reads sh->ops.pending, sets sh->ops.ack, and submits the
   operation to the async_tx api
3/ async_tx triggers the completion callback routine to set
   sh->ops.complete and release the stripe
4/ handle_stripe runs again to finish the operation and optionally submit
   new operations that were previously blocked

Note this patch just defines raid5_run_ops, subsequent commits (one per
major operation type) modify handle_stripe to take advantage of this
routine.

Changelog:
* removed ops_complete_biodrain in favor of ops_complete_postxor and
  ops_complete_write
* removed the raid5_run_ops workqueue
* call bi_end_io for reads in ops_complete_biofill, saves a call to
  handle_stripe
* explicitly handle the 2-disk raid5 case (xor becomes memcpy), Neil Brown
* fix race between async engines and bi_end_io call for reads, Neil Brown
* remove unnecessary spin_lock from ops_complete_biofill
* remove test_and_set/test_and_clear BUG_ONs, Neil Brown
* remove explicit interrupt handling for channel switching, this feature
  was absorbed (i.e. it is now implicit) by the async_tx api
* use return_io in ops_complete_biofill

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
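The pending -> ack -> complete transitions in the flow above are just three
bitmask updates per operation. The sketch below is illustrative only, using
the stripe_operations fields added by this patch and standard kernel bitops;
the helper names request_op/issue_op/complete_op do not exist in the patch,
which implements this flow inside handle_stripe, raid5_run_ops, and the
ops_complete_* callbacks:

static void request_op(struct stripe_head *sh, int op)
{
	/* step 1: handle_stripe, under sh->lock */
	set_bit(op, &sh->ops.pending);
	sh->ops.count++;	/* raid5_run_ops runs while count is non-zero */
}

static void issue_op(struct stripe_head *sh, int op)
{
	/* step 2: raid5_run_ops, outside sh->lock */
	if (test_bit(op, &sh->ops.pending) && !test_bit(op, &sh->ops.ack))
		set_bit(op, &sh->ops.ack);	/* submit to the async_tx api here */
}

static void complete_op(struct stripe_head *sh, int op)
{
	/* step 3: async_tx completion callback */
	set_bit(op, &sh->ops.complete);
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);	/* step 4: raid5d re-runs handle_stripe */
}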
Diffstat (limited to 'include/linux/raid')
-rw-r--r--  include/linux/raid/raid5.h | 81
1 file changed, 78 insertions(+), 3 deletions(-)
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index b99d354f6128..6fb9d94e6f2e 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -116,13 +116,46 @@
  *  attach a request to an active stripe (add_stripe_bh())
  *     lockdev attach-buffer unlockdev
  *  handle a stripe (handle_stripe())
- *     lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io
+ *     lockstripe clrSTRIPE_HANDLE ...
+ *             (lockdev check-buffers unlockdev) ..
+ *             change-state ..
+ *             record io/ops needed unlockstripe schedule io/ops
  *  release an active stripe (release_stripe())
  *     lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
  *
- * The refcount counts each thread that has activated the stripe,
+ * The refcount counts each thread that has activated the stripe,
  * plus raid5d if it is handling it, plus one for each active request
- * on a cached buffer.
+ * on a cached buffer, and plus one if the stripe is undergoing stripe
+ * operations.
+ *
+ * Stripe operations are performed outside the stripe lock,
+ * the stripe operations are:
+ * -copying data between the stripe cache and user application buffers
+ * -computing blocks to save a disk access, or to recover a missing block
+ * -updating the parity on a write operation (reconstruct write and
+ *  read-modify-write)
+ * -checking parity correctness
+ * -running i/o to disk
+ * These operations are carried out by raid5_run_ops which uses the async_tx
+ * api to (optionally) offload operations to dedicated hardware engines.
+ * When requesting an operation handle_stripe sets the pending bit for the
+ * operation and increments the count.  raid5_run_ops is then run whenever
+ * the count is non-zero.
+ * There are some critical dependencies between the operations that prevent some
+ * from being requested while another is in flight.
+ * 1/ Parity check operations destroy the in-cache version of the parity block,
+ *    so we prevent parity dependent operations like writes and compute_blocks
+ *    from starting while a check is in progress.  Some dma engines can perform
+ *    the check without damaging the parity block, in these cases the parity
+ *    block is re-marked up to date (assuming the check was successful) and is
+ *    not re-read from disk.
+ * 2/ When a write operation is requested we immediately lock the affected
+ *    blocks, and mark them as not up to date.  This causes new read requests
+ *    to be held off, as well as parity checks and compute block operations.
+ * 3/ Once a compute block operation has been requested handle_stripe treats
+ *    that block as if it is up to date.  raid5_run_ops guarantees that any
+ *    operation that is dependent on the compute block result is initiated
+ *    after the compute block completes.
  */
 
 struct stripe_head {
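Rules 1/ and 2/ in the comment above amount to a gate on new requests, while
rule 3/ is enforced by ordering rather than refusal. A hypothetical sketch of
that gate; only the STRIPE_OP_* and R5_LOCKED flag names come from the
kernel, the helper itself does not exist in the patch:

/* Sketch of the dependency rules above; illustrative only. */
static int op_may_start(struct stripe_head *sh, struct r5dev *dev, int op)
{
	switch (op) {
	case STRIPE_OP_POSTXOR:		/* writes */
	case STRIPE_OP_COMPUTE_BLK:
		/* 1/ a pending check owns (and may destroy) the cached
		 * parity block */
		return !test_bit(STRIPE_OP_CHECK, &sh->ops.pending);
	case STRIPE_OP_BIOFILL:		/* reads */
	case STRIPE_OP_CHECK:
		/* 2/ blocks locked by an in-flight write are not uptodate */
		return !test_bit(R5_LOCKED, &dev->flags);
	default:
		/* 3/ ordering after a compute is handled by raid5_run_ops
		 * via async_tx dependency chains, not by refusing requests */
		return 1;
	}
}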
@@ -136,11 +169,26 @@ struct stripe_head {
 	spinlock_t		lock;
 	int			bm_seq;	/* sequence number for bitmap flushes */
 	int			disks;	/* disks in stripe */
+	/* stripe_operations
+	 * @pending - pending ops flags (set for request->issue->complete)
+	 * @ack - submitted ops flags (set for issue->complete)
+	 * @complete - completed ops flags (set for complete)
+	 * @target - STRIPE_OP_COMPUTE_BLK target
+	 * @count - raid5_run_ops is set to run when this is non-zero
+	 */
+	struct stripe_operations {
+		unsigned long	pending;
+		unsigned long	ack;
+		unsigned long	complete;
+		int		target;
+		int		count;
+		u32		zero_sum_result;
+	} ops;
 	struct r5dev {
 		struct bio	req;
 		struct bio_vec	vec;
 		struct page	*page;
-		struct bio	*toread, *towrite, *written;
+		struct bio	*toread, *read, *towrite, *written;
 		sector_t	sector;		/* sector of this page */
 		unsigned long	flags;
 	} dev[1]; /* allocated with extra space depending on RAID geometry */
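The new dev->read pointer gives ops_complete_biofill a place to park bios
whose reads have been satisfied, matching the "use return_io in
ops_complete_biofill" changelog entry above. A sketch of that return path,
assuming the 2.6.22-era three-argument bi_end_io; the body shown here is
illustrative, not a quote of the patch:

static void return_io(struct bio *return_bi)
{
	struct bio *bi = return_bi;

	/* walk the bi_next-chained list of fully-filled read bios */
	while (bi) {
		int bytes = bi->bi_size;

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		bi->bi_end_io(bi, bytes, 0);	/* old 3-argument bi_end_io */
		bi = return_bi;
	}
}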
@@ -174,6 +222,15 @@ struct r6_state {
 #define	R5_ReWrite	9	/* have tried to over-write the readerror */
 
 #define	R5_Expanded	10	/* This block now has post-expand data */
+#define	R5_Wantcompute	11	/* compute_block in progress treat as
+				 * uptodate
+				 */
+#define	R5_Wantfill	12	/* dev->toread contains a bio that needs
+				 * filling
+				 */
+#define	R5_Wantprexor	13	/* distinguish blocks ready for rmw from
+				 * other "towrites"
+				 */
 /*
  * Write method
  */
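R5_Wantcompute is what lets handle_stripe apply rule 3/ from the comment
earlier in this header: a block with a compute in flight is consumed as if
it were already valid. A hypothetical one-liner to make the idea concrete
(r5dev_available is not in the patch; both flags are):

/* A block is usable if it is in the cache, or a compute_block
 * operation has already been requested for it. */
static inline int r5dev_available(struct r5dev *dev)
{
	return test_bit(R5_UPTODATE, &dev->flags) ||
	       test_bit(R5_Wantcompute, &dev->flags);
}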
@@ -196,6 +253,24 @@ struct r6_state {
 #define	STRIPE_EXPAND_SOURCE	10
 #define	STRIPE_EXPAND_READY	11
 /*
+ * Operations flags (in issue order)
+ */
+#define STRIPE_OP_BIOFILL	0
+#define STRIPE_OP_COMPUTE_BLK	1
+#define STRIPE_OP_PREXOR	2
+#define STRIPE_OP_BIODRAIN	3
+#define STRIPE_OP_POSTXOR	4
+#define STRIPE_OP_CHECK		5
+#define STRIPE_OP_IO		6
+
+/* modifiers to the base operations
+ * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
+ * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check
+ */
+#define STRIPE_OP_MOD_REPAIR_PD 7
+#define STRIPE_OP_MOD_DMA_CHECK 8
+
+/*
  * Plugging:
  *
  * To improve write throughput, we need to delay the handling of some
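The modifier bits qualify a base operation rather than standing alone;
STRIPE_OP_MOD_REPAIR_PD, for example, rides along with a compute of the
parity block. A hypothetical sketch of requesting such a repair after a
failed check (the helper and its pd_idx parameter are illustrative; the
flag and field names are from the patch):

static void request_parity_repair(struct stripe_head *sh, int pd_idx)
{
	sh->ops.target = pd_idx;	/* compute targets the parity disk */
	set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
	/* modifier: write the recomputed parity back out when done */
	set_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
	sh->ops.count++;
}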