aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-13 13:52:27 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-13 13:52:27 -0400
commite030dbf91a87da7e8be3be3ca781558695bea683 (patch)
tree4ff2e01621a888be4098ca48c404775e56a55a0d /drivers
parent12a22960549979c10a95cc97f8ec63b461c55692 (diff)
parent3039f0735a280b54c7364fbfe6a9287f7f0b510a (diff)
Merge branch 'ioat-md-accel-for-linus' of git://lost.foo-projects.org/~dwillia2/git/iop
* 'ioat-md-accel-for-linus' of git://lost.foo-projects.org/~dwillia2/git/iop: (28 commits)
  ioatdma: add the unisys "i/oat" pci vendor/device id
  ARM: Add drivers/dma to arch/arm/Kconfig
  iop3xx: surface the iop3xx DMA and AAU units to the iop-adma driver
  iop13xx: surface the iop13xx adma units to the iop-adma driver
  dmaengine: driver for the iop32x, iop33x, and iop13xx raid engines
  md: remove raid5 compute_block and compute_parity5
  md: handle_stripe5 - request io processing in raid5_run_ops
  md: handle_stripe5 - add request/completion logic for async expand ops
  md: handle_stripe5 - add request/completion logic for async read ops
  md: handle_stripe5 - add request/completion logic for async check ops
  md: handle_stripe5 - add request/completion logic for async compute ops
  md: handle_stripe5 - add request/completion logic for async write ops
  md: common infrastructure for running operations with raid5_run_ops
  md: raid5_run_ops - run stripe operations outside sh->lock
  raid5: replace custom debug PRINTKs with standard pr_debug
  raid5: refactor handle_stripe5 and handle_stripe6 (v3)
  async_tx: add the async_tx api
  xor: make 'xor_blocks' a library routine for use with async_tx
  dmaengine: make clients responsible for managing channels
  dmaengine: refactor dmaengine around dma_async_tx_descriptor
  ...
Diffstat (limited to 'drivers')
-rw-r--r--drivers/dma/Kconfig12
-rw-r--r--drivers/dma/Makefile1
-rw-r--r--drivers/dma/dmaengine.c419
-rw-r--r--drivers/dma/ioatdma.c369
-rw-r--r--drivers/dma/ioatdma.h16
-rw-r--r--drivers/dma/ioatdma_io.h118
-rw-r--r--drivers/dma/iop-adma.c1467
-rw-r--r--drivers/md/Kconfig2
-rw-r--r--drivers/md/Makefile4
-rw-r--r--drivers/md/md.c2
-rw-r--r--drivers/md/raid5.c2727
-rw-r--r--drivers/md/xor.c154
12 files changed, 3659 insertions, 1632 deletions
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 72be6c63edfc..b31756d59978 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -8,8 +8,8 @@ menu "DMA Engine support"
8config DMA_ENGINE 8config DMA_ENGINE
9 bool "Support for DMA engines" 9 bool "Support for DMA engines"
10 ---help--- 10 ---help---
11 DMA engines offload copy operations from the CPU to dedicated 11 DMA engines offload bulk memory operations from the CPU to dedicated
12 hardware, allowing the copies to happen asynchronously. 12 hardware, allowing the operations to happen asynchronously.
13 13
14comment "DMA Clients" 14comment "DMA Clients"
15 15
@@ -32,4 +32,12 @@ config INTEL_IOATDMA
32 ---help--- 32 ---help---
33 Enable support for the Intel(R) I/OAT DMA engine. 33 Enable support for the Intel(R) I/OAT DMA engine.
34 34
35config INTEL_IOP_ADMA
36 tristate "Intel IOP ADMA support"
37 depends on DMA_ENGINE && (ARCH_IOP32X || ARCH_IOP33X || ARCH_IOP13XX)
38 select ASYNC_CORE
39 default m
40 ---help---
41 Enable support for the Intel(R) IOP Series RAID engines.
42
35endmenu 43endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index bdcfdbdb1aec..b3839b687ae0 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,3 +1,4 @@
1obj-$(CONFIG_DMA_ENGINE) += dmaengine.o 1obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
2obj-$(CONFIG_NET_DMA) += iovlock.o 2obj-$(CONFIG_NET_DMA) += iovlock.o
3obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o 3obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
4obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 322ee2984e3d..82489923af09 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -37,11 +37,11 @@
37 * Each device has a channels list, which runs unlocked but is never modified 37 * Each device has a channels list, which runs unlocked but is never modified
38 * once the device is registered, it's just setup by the driver. 38 * once the device is registered, it's just setup by the driver.
39 * 39 *
40 * Each client has a channels list, it's only modified under the client->lock 40 * Each client is responsible for keeping track of the channels it uses. See
41 * and in an RCU callback, so it's safe to read under rcu_read_lock(). 41 * the definition of dma_event_callback in dmaengine.h.
42 * 42 *
43 * Each device has a kref, which is initialized to 1 when the device is 43 * Each device has a kref, which is initialized to 1 when the device is
44 * registered. A kref_put is done for each class_device registered. When the 44 * registered. A kref_get is done for each class_device registered. When the
45 * class_device is released, the corresponding kref_put is done in the release 45 * class_device is released, the corresponding kref_put is done in the release
46 * method. Every time one of the device's channels is allocated to a client, 46 * method. Every time one of the device's channels is allocated to a client,
47 * a kref_get occurs. When the channel is freed, the corresponding kref_put 47 * a kref_get occurs. When the channel is freed, the corresponding kref_put
@@ -51,14 +51,17 @@
51 * references to finish. 51 * references to finish.
52 * 52 *
53 * Each channel has an open-coded implementation of Rusty Russell's "bigref," 53 * Each channel has an open-coded implementation of Rusty Russell's "bigref,"
54 * with a kref and a per_cpu local_t. A single reference is set when on an 54 * with a kref and a per_cpu local_t. A dma_chan_get is called when a client
55 * ADDED event, and removed with a REMOVE event. Net DMA client takes an 55 * signals that it wants to use a channel, and dma_chan_put is called when
56 * extra reference per outstanding transaction. The relase function does a 56 * a channel is removed or a client using it is unregistered. A client can
57 * kref_put on the device. -ChrisL 57 * take extra references per outstanding transaction, as is the case with
58 * the NET DMA client. The release function does a kref_put on the device.
59 * -ChrisL, DanW
58 */ 60 */
59 61
60#include <linux/init.h> 62#include <linux/init.h>
61#include <linux/module.h> 63#include <linux/module.h>
64#include <linux/mm.h>
62#include <linux/device.h> 65#include <linux/device.h>
63#include <linux/dmaengine.h> 66#include <linux/dmaengine.h>
64#include <linux/hardirq.h> 67#include <linux/hardirq.h>
@@ -66,6 +69,7 @@
66#include <linux/percpu.h> 69#include <linux/percpu.h>
67#include <linux/rcupdate.h> 70#include <linux/rcupdate.h>
68#include <linux/mutex.h> 71#include <linux/mutex.h>
72#include <linux/jiffies.h>
69 73
70static DEFINE_MUTEX(dma_list_mutex); 74static DEFINE_MUTEX(dma_list_mutex);
71static LIST_HEAD(dma_device_list); 75static LIST_HEAD(dma_device_list);
@@ -100,8 +104,19 @@ static ssize_t show_bytes_transferred(struct class_device *cd, char *buf)
100static ssize_t show_in_use(struct class_device *cd, char *buf) 104static ssize_t show_in_use(struct class_device *cd, char *buf)
101{ 105{
102 struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev); 106 struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
107 int in_use = 0;
108
109 if (unlikely(chan->slow_ref) &&
110 atomic_read(&chan->refcount.refcount) > 1)
111 in_use = 1;
112 else {
113 if (local_read(&(per_cpu_ptr(chan->local,
114 get_cpu())->refcount)) > 0)
115 in_use = 1;
116 put_cpu();
117 }
103 118
104 return sprintf(buf, "%d\n", (chan->client ? 1 : 0)); 119 return sprintf(buf, "%d\n", in_use);
105} 120}
106 121
107static struct class_device_attribute dma_class_attrs[] = { 122static struct class_device_attribute dma_class_attrs[] = {
@@ -127,43 +142,72 @@ static struct class dma_devclass = {
127 142
128/* --- client and device registration --- */ 143/* --- client and device registration --- */
129 144
145#define dma_chan_satisfies_mask(chan, mask) \
146 __dma_chan_satisfies_mask((chan), &(mask))
147static int
148__dma_chan_satisfies_mask(struct dma_chan *chan, dma_cap_mask_t *want)
149{
150 dma_cap_mask_t has;
151
152 bitmap_and(has.bits, want->bits, chan->device->cap_mask.bits,
153 DMA_TX_TYPE_END);
154 return bitmap_equal(want->bits, has.bits, DMA_TX_TYPE_END);
155}
156
130/** 157/**
131 * dma_client_chan_alloc - try to allocate a channel to a client 158 * dma_client_chan_alloc - try to allocate channels to a client
132 * @client: &dma_client 159 * @client: &dma_client
133 * 160 *
134 * Called with dma_list_mutex held. 161 * Called with dma_list_mutex held.
135 */ 162 */
136static struct dma_chan *dma_client_chan_alloc(struct dma_client *client) 163static void dma_client_chan_alloc(struct dma_client *client)
137{ 164{
138 struct dma_device *device; 165 struct dma_device *device;
139 struct dma_chan *chan; 166 struct dma_chan *chan;
140 unsigned long flags;
141 int desc; /* allocated descriptor count */ 167 int desc; /* allocated descriptor count */
168 enum dma_state_client ack;
142 169
143 /* Find a channel, any DMA engine will do */ 170 /* Find a channel */
144 list_for_each_entry(device, &dma_device_list, global_node) { 171 list_for_each_entry(device, &dma_device_list, global_node)
145 list_for_each_entry(chan, &device->channels, device_node) { 172 list_for_each_entry(chan, &device->channels, device_node) {
146 if (chan->client) 173 if (!dma_chan_satisfies_mask(chan, client->cap_mask))
147 continue; 174 continue;
148 175
149 desc = chan->device->device_alloc_chan_resources(chan); 176 desc = chan->device->device_alloc_chan_resources(chan);
150 if (desc >= 0) { 177 if (desc >= 0) {
151 kref_get(&device->refcount); 178 ack = client->event_callback(client,
152 kref_init(&chan->refcount); 179 chan,
153 chan->slow_ref = 0; 180 DMA_RESOURCE_AVAILABLE);
154 INIT_RCU_HEAD(&chan->rcu); 181
155 chan->client = client; 182 /* we are done once this client rejects
156 spin_lock_irqsave(&client->lock, flags); 183 * an available resource
157 list_add_tail_rcu(&chan->client_node, 184 */
158 &client->channels); 185 if (ack == DMA_ACK) {
159 spin_unlock_irqrestore(&client->lock, flags); 186 dma_chan_get(chan);
160 return chan; 187 kref_get(&device->refcount);
188 } else if (ack == DMA_NAK)
189 return;
161 } 190 }
162 } 191 }
163 } 192}
193
194enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie)
195{
196 enum dma_status status;
197 unsigned long dma_sync_wait_timeout = jiffies + msecs_to_jiffies(5000);
198
199 dma_async_issue_pending(chan);
200 do {
201 status = dma_async_is_tx_complete(chan, cookie, NULL, NULL);
202 if (time_after_eq(jiffies, dma_sync_wait_timeout)) {
203 printk(KERN_ERR "dma_sync_wait_timeout!\n");
204 return DMA_ERROR;
205 }
206 } while (status == DMA_IN_PROGRESS);
164 207
165 return NULL; 208 return status;
166} 209}
210EXPORT_SYMBOL(dma_sync_wait);
167 211
168/** 212/**
169 * dma_chan_cleanup - release a DMA channel's resources 213 * dma_chan_cleanup - release a DMA channel's resources
@@ -173,7 +217,6 @@ void dma_chan_cleanup(struct kref *kref)
173{ 217{
174 struct dma_chan *chan = container_of(kref, struct dma_chan, refcount); 218 struct dma_chan *chan = container_of(kref, struct dma_chan, refcount);
175 chan->device->device_free_chan_resources(chan); 219 chan->device->device_free_chan_resources(chan);
176 chan->client = NULL;
177 kref_put(&chan->device->refcount, dma_async_device_cleanup); 220 kref_put(&chan->device->refcount, dma_async_device_cleanup);
178} 221}
179EXPORT_SYMBOL(dma_chan_cleanup); 222EXPORT_SYMBOL(dma_chan_cleanup);
@@ -189,7 +232,7 @@ static void dma_chan_free_rcu(struct rcu_head *rcu)
189 kref_put(&chan->refcount, dma_chan_cleanup); 232 kref_put(&chan->refcount, dma_chan_cleanup);
190} 233}
191 234
192static void dma_client_chan_free(struct dma_chan *chan) 235static void dma_chan_release(struct dma_chan *chan)
193{ 236{
194 atomic_add(0x7FFFFFFF, &chan->refcount.refcount); 237 atomic_add(0x7FFFFFFF, &chan->refcount.refcount);
195 chan->slow_ref = 1; 238 chan->slow_ref = 1;
@@ -197,70 +240,57 @@ static void dma_client_chan_free(struct dma_chan *chan)
197} 240}
198 241
199/** 242/**
200 * dma_chans_rebalance - reallocate channels to clients 243 * dma_chans_notify_available - broadcast available channels to the clients
201 *
202 * When the number of DMA channel in the system changes,
203 * channels need to be rebalanced among clients.
204 */ 244 */
205static void dma_chans_rebalance(void) 245static void dma_clients_notify_available(void)
206{ 246{
207 struct dma_client *client; 247 struct dma_client *client;
208 struct dma_chan *chan;
209 unsigned long flags;
210 248
211 mutex_lock(&dma_list_mutex); 249 mutex_lock(&dma_list_mutex);
212 250
213 list_for_each_entry(client, &dma_client_list, global_node) { 251 list_for_each_entry(client, &dma_client_list, global_node)
214 while (client->chans_desired > client->chan_count) { 252 dma_client_chan_alloc(client);
215 chan = dma_client_chan_alloc(client);
216 if (!chan)
217 break;
218 client->chan_count++;
219 client->event_callback(client,
220 chan,
221 DMA_RESOURCE_ADDED);
222 }
223 while (client->chans_desired < client->chan_count) {
224 spin_lock_irqsave(&client->lock, flags);
225 chan = list_entry(client->channels.next,
226 struct dma_chan,
227 client_node);
228 list_del_rcu(&chan->client_node);
229 spin_unlock_irqrestore(&client->lock, flags);
230 client->chan_count--;
231 client->event_callback(client,
232 chan,
233 DMA_RESOURCE_REMOVED);
234 dma_client_chan_free(chan);
235 }
236 }
237 253
238 mutex_unlock(&dma_list_mutex); 254 mutex_unlock(&dma_list_mutex);
239} 255}
240 256
241/** 257/**
242 * dma_async_client_register - allocate and register a &dma_client 258 * dma_clients_notify_removed - tell the clients that a channel is going away
243 * @event_callback: callback for notification of channel addition/removal 259 * @chan: channel on its way out
244 */ 260 */
245struct dma_client *dma_async_client_register(dma_event_callback event_callback) 261static void dma_clients_notify_removed(struct dma_chan *chan)
246{ 262{
247 struct dma_client *client; 263 struct dma_client *client;
264 enum dma_state_client ack;
248 265
249 client = kzalloc(sizeof(*client), GFP_KERNEL); 266 mutex_lock(&dma_list_mutex);
250 if (!client)
251 return NULL;
252 267
253 INIT_LIST_HEAD(&client->channels); 268 list_for_each_entry(client, &dma_client_list, global_node) {
254 spin_lock_init(&client->lock); 269 ack = client->event_callback(client, chan,
255 client->chans_desired = 0; 270 DMA_RESOURCE_REMOVED);
256 client->chan_count = 0; 271
257 client->event_callback = event_callback; 272 /* client was holding resources for this channel so
273 * free it
274 */
275 if (ack == DMA_ACK) {
276 dma_chan_put(chan);
277 kref_put(&chan->device->refcount,
278 dma_async_device_cleanup);
279 }
280 }
258 281
282 mutex_unlock(&dma_list_mutex);
283}
284
285/**
286 * dma_async_client_register - register a &dma_client
287 * @client: ptr to a client structure with valid 'event_callback' and 'cap_mask'
288 */
289void dma_async_client_register(struct dma_client *client)
290{
259 mutex_lock(&dma_list_mutex); 291 mutex_lock(&dma_list_mutex);
260 list_add_tail(&client->global_node, &dma_client_list); 292 list_add_tail(&client->global_node, &dma_client_list);
261 mutex_unlock(&dma_list_mutex); 293 mutex_unlock(&dma_list_mutex);
262
263 return client;
264} 294}
265EXPORT_SYMBOL(dma_async_client_register); 295EXPORT_SYMBOL(dma_async_client_register);
266 296
@@ -272,40 +302,42 @@ EXPORT_SYMBOL(dma_async_client_register);
272 */ 302 */
273void dma_async_client_unregister(struct dma_client *client) 303void dma_async_client_unregister(struct dma_client *client)
274{ 304{
305 struct dma_device *device;
275 struct dma_chan *chan; 306 struct dma_chan *chan;
307 enum dma_state_client ack;
276 308
277 if (!client) 309 if (!client)
278 return; 310 return;
279 311
280 rcu_read_lock();
281 list_for_each_entry_rcu(chan, &client->channels, client_node)
282 dma_client_chan_free(chan);
283 rcu_read_unlock();
284
285 mutex_lock(&dma_list_mutex); 312 mutex_lock(&dma_list_mutex);
313 /* free all channels the client is holding */
314 list_for_each_entry(device, &dma_device_list, global_node)
315 list_for_each_entry(chan, &device->channels, device_node) {
316 ack = client->event_callback(client, chan,
317 DMA_RESOURCE_REMOVED);
318
319 if (ack == DMA_ACK) {
320 dma_chan_put(chan);
321 kref_put(&chan->device->refcount,
322 dma_async_device_cleanup);
323 }
324 }
325
286 list_del(&client->global_node); 326 list_del(&client->global_node);
287 mutex_unlock(&dma_list_mutex); 327 mutex_unlock(&dma_list_mutex);
288
289 kfree(client);
290 dma_chans_rebalance();
291} 328}
292EXPORT_SYMBOL(dma_async_client_unregister); 329EXPORT_SYMBOL(dma_async_client_unregister);
293 330
294/** 331/**
295 * dma_async_client_chan_request - request DMA channels 332 * dma_async_client_chan_request - send all available channels to the
296 * @client: &dma_client 333 * client that satisfy the capability mask
297 * @number: count of DMA channels requested 334 * @client: requester
298 *
299 * Clients call dma_async_client_chan_request() to specify how many
300 * DMA channels they need, 0 to free all currently allocated.
301 * The resulting allocations/frees are indicated to the client via the
302 * event callback.
303 */ 335 */
304void dma_async_client_chan_request(struct dma_client *client, 336void dma_async_client_chan_request(struct dma_client *client)
305 unsigned int number)
306{ 337{
307 client->chans_desired = number; 338 mutex_lock(&dma_list_mutex);
308 dma_chans_rebalance(); 339 dma_client_chan_alloc(client);
340 mutex_unlock(&dma_list_mutex);
309} 341}
310EXPORT_SYMBOL(dma_async_client_chan_request); 342EXPORT_SYMBOL(dma_async_client_chan_request);
311 343
@@ -316,12 +348,31 @@ EXPORT_SYMBOL(dma_async_client_chan_request);
316int dma_async_device_register(struct dma_device *device) 348int dma_async_device_register(struct dma_device *device)
317{ 349{
318 static int id; 350 static int id;
319 int chancnt = 0; 351 int chancnt = 0, rc;
320 struct dma_chan* chan; 352 struct dma_chan* chan;
321 353
322 if (!device) 354 if (!device)
323 return -ENODEV; 355 return -ENODEV;
324 356
357 /* validate device routines */
358 BUG_ON(dma_has_cap(DMA_MEMCPY, device->cap_mask) &&
359 !device->device_prep_dma_memcpy);
360 BUG_ON(dma_has_cap(DMA_XOR, device->cap_mask) &&
361 !device->device_prep_dma_xor);
362 BUG_ON(dma_has_cap(DMA_ZERO_SUM, device->cap_mask) &&
363 !device->device_prep_dma_zero_sum);
364 BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) &&
365 !device->device_prep_dma_memset);
366 BUG_ON(dma_has_cap(DMA_ZERO_SUM, device->cap_mask) &&
367 !device->device_prep_dma_interrupt);
368
369 BUG_ON(!device->device_alloc_chan_resources);
370 BUG_ON(!device->device_free_chan_resources);
371 BUG_ON(!device->device_dependency_added);
372 BUG_ON(!device->device_is_tx_complete);
373 BUG_ON(!device->device_issue_pending);
374 BUG_ON(!device->dev);
375
325 init_completion(&device->done); 376 init_completion(&device->done);
326 kref_init(&device->refcount); 377 kref_init(&device->refcount);
327 device->dev_id = id++; 378 device->dev_id = id++;
@@ -338,17 +389,38 @@ int dma_async_device_register(struct dma_device *device)
338 snprintf(chan->class_dev.class_id, BUS_ID_SIZE, "dma%dchan%d", 389 snprintf(chan->class_dev.class_id, BUS_ID_SIZE, "dma%dchan%d",
339 device->dev_id, chan->chan_id); 390 device->dev_id, chan->chan_id);
340 391
392 rc = class_device_register(&chan->class_dev);
393 if (rc) {
394 chancnt--;
395 free_percpu(chan->local);
396 chan->local = NULL;
397 goto err_out;
398 }
399
341 kref_get(&device->refcount); 400 kref_get(&device->refcount);
342 class_device_register(&chan->class_dev); 401 kref_init(&chan->refcount);
402 chan->slow_ref = 0;
403 INIT_RCU_HEAD(&chan->rcu);
343 } 404 }
344 405
345 mutex_lock(&dma_list_mutex); 406 mutex_lock(&dma_list_mutex);
346 list_add_tail(&device->global_node, &dma_device_list); 407 list_add_tail(&device->global_node, &dma_device_list);
347 mutex_unlock(&dma_list_mutex); 408 mutex_unlock(&dma_list_mutex);
348 409
349 dma_chans_rebalance(); 410 dma_clients_notify_available();
350 411
351 return 0; 412 return 0;
413
414err_out:
415 list_for_each_entry(chan, &device->channels, device_node) {
416 if (chan->local == NULL)
417 continue;
418 kref_put(&device->refcount, dma_async_device_cleanup);
419 class_device_unregister(&chan->class_dev);
420 chancnt--;
421 free_percpu(chan->local);
422 }
423 return rc;
352} 424}
353EXPORT_SYMBOL(dma_async_device_register); 425EXPORT_SYMBOL(dma_async_device_register);
354 426
@@ -371,32 +443,165 @@ static void dma_async_device_cleanup(struct kref *kref)
371void dma_async_device_unregister(struct dma_device *device) 443void dma_async_device_unregister(struct dma_device *device)
372{ 444{
373 struct dma_chan *chan; 445 struct dma_chan *chan;
374 unsigned long flags;
375 446
376 mutex_lock(&dma_list_mutex); 447 mutex_lock(&dma_list_mutex);
377 list_del(&device->global_node); 448 list_del(&device->global_node);
378 mutex_unlock(&dma_list_mutex); 449 mutex_unlock(&dma_list_mutex);
379 450
380 list_for_each_entry(chan, &device->channels, device_node) { 451 list_for_each_entry(chan, &device->channels, device_node) {
381 if (chan->client) { 452 dma_clients_notify_removed(chan);
382 spin_lock_irqsave(&chan->client->lock, flags);
383 list_del(&chan->client_node);
384 chan->client->chan_count--;
385 spin_unlock_irqrestore(&chan->client->lock, flags);
386 chan->client->event_callback(chan->client,
387 chan,
388 DMA_RESOURCE_REMOVED);
389 dma_client_chan_free(chan);
390 }
391 class_device_unregister(&chan->class_dev); 453 class_device_unregister(&chan->class_dev);
454 dma_chan_release(chan);
392 } 455 }
393 dma_chans_rebalance();
394 456
395 kref_put(&device->refcount, dma_async_device_cleanup); 457 kref_put(&device->refcount, dma_async_device_cleanup);
396 wait_for_completion(&device->done); 458 wait_for_completion(&device->done);
397} 459}
398EXPORT_SYMBOL(dma_async_device_unregister); 460EXPORT_SYMBOL(dma_async_device_unregister);
399 461
462/**
463 * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
464 * @chan: DMA channel to offload copy to
465 * @dest: destination address (virtual)
466 * @src: source address (virtual)
467 * @len: length
468 *
469 * Both @dest and @src must be mappable to a bus address according to the
470 * DMA mapping API rules for streaming mappings.
471 * Both @dest and @src must stay memory resident (kernel memory or locked
472 * user space pages).
473 */
474dma_cookie_t
475dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest,
476 void *src, size_t len)
477{
478 struct dma_device *dev = chan->device;
479 struct dma_async_tx_descriptor *tx;
480 dma_addr_t addr;
481 dma_cookie_t cookie;
482 int cpu;
483
484 tx = dev->device_prep_dma_memcpy(chan, len, 0);
485 if (!tx)
486 return -ENOMEM;
487
488 tx->ack = 1;
489 tx->callback = NULL;
490 addr = dma_map_single(dev->dev, src, len, DMA_TO_DEVICE);
491 tx->tx_set_src(addr, tx, 0);
492 addr = dma_map_single(dev->dev, dest, len, DMA_FROM_DEVICE);
493 tx->tx_set_dest(addr, tx, 0);
494 cookie = tx->tx_submit(tx);
495
496 cpu = get_cpu();
497 per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
498 per_cpu_ptr(chan->local, cpu)->memcpy_count++;
499 put_cpu();
500
501 return cookie;
502}
503EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
504
505/**
506 * dma_async_memcpy_buf_to_pg - offloaded copy from address to page
507 * @chan: DMA channel to offload copy to
508 * @page: destination page
509 * @offset: offset in page to copy to
510 * @kdata: source address (virtual)
511 * @len: length
512 *
513 * Both @page/@offset and @kdata must be mappable to a bus address according
514 * to the DMA mapping API rules for streaming mappings.
515 * Both @page/@offset and @kdata must stay memory resident (kernel memory or
516 * locked user space pages)
517 */
518dma_cookie_t
519dma_async_memcpy_buf_to_pg(struct dma_chan *chan, struct page *page,
520 unsigned int offset, void *kdata, size_t len)
521{
522 struct dma_device *dev = chan->device;
523 struct dma_async_tx_descriptor *tx;
524 dma_addr_t addr;
525 dma_cookie_t cookie;
526 int cpu;
527
528 tx = dev->device_prep_dma_memcpy(chan, len, 0);
529 if (!tx)
530 return -ENOMEM;
531
532 tx->ack = 1;
533 tx->callback = NULL;
534 addr = dma_map_single(dev->dev, kdata, len, DMA_TO_DEVICE);
535 tx->tx_set_src(addr, tx, 0);
536 addr = dma_map_page(dev->dev, page, offset, len, DMA_FROM_DEVICE);
537 tx->tx_set_dest(addr, tx, 0);
538 cookie = tx->tx_submit(tx);
539
540 cpu = get_cpu();
541 per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
542 per_cpu_ptr(chan->local, cpu)->memcpy_count++;
543 put_cpu();
544
545 return cookie;
546}
547EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
548
549/**
550 * dma_async_memcpy_pg_to_pg - offloaded copy from page to page
551 * @chan: DMA channel to offload copy to
552 * @dest_pg: destination page
553 * @dest_off: offset in page to copy to
554 * @src_pg: source page
555 * @src_off: offset in page to copy from
556 * @len: length
557 *
558 * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus
559 * address according to the DMA mapping API rules for streaming mappings.
560 * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident
561 * (kernel memory or locked user space pages).
562 */
563dma_cookie_t
564dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg,
565 unsigned int dest_off, struct page *src_pg, unsigned int src_off,
566 size_t len)
567{
568 struct dma_device *dev = chan->device;
569 struct dma_async_tx_descriptor *tx;
570 dma_addr_t addr;
571 dma_cookie_t cookie;
572 int cpu;
573
574 tx = dev->device_prep_dma_memcpy(chan, len, 0);
575 if (!tx)
576 return -ENOMEM;
577
578 tx->ack = 1;
579 tx->callback = NULL;
580 addr = dma_map_page(dev->dev, src_pg, src_off, len, DMA_TO_DEVICE);
581 tx->tx_set_src(addr, tx, 0);
582 addr = dma_map_page(dev->dev, dest_pg, dest_off, len, DMA_FROM_DEVICE);
583 tx->tx_set_dest(addr, tx, 0);
584 cookie = tx->tx_submit(tx);
585
586 cpu = get_cpu();
587 per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
588 per_cpu_ptr(chan->local, cpu)->memcpy_count++;
589 put_cpu();
590
591 return cookie;
592}
593EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
594
595void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
596 struct dma_chan *chan)
597{
598 tx->chan = chan;
599 spin_lock_init(&tx->lock);
600 INIT_LIST_HEAD(&tx->depend_node);
601 INIT_LIST_HEAD(&tx->depend_list);
602}
603EXPORT_SYMBOL(dma_async_tx_descriptor_init);
604
400static int __init dma_bus_init(void) 605static int __init dma_bus_init(void)
401{ 606{
402 mutex_init(&dma_list_mutex); 607 mutex_init(&dma_list_mutex);
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 850014139556..5fbe56b5cea0 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -32,16 +32,17 @@
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
34#include "ioatdma.h" 34#include "ioatdma.h"
35#include "ioatdma_io.h"
36#include "ioatdma_registers.h" 35#include "ioatdma_registers.h"
37#include "ioatdma_hw.h" 36#include "ioatdma_hw.h"
38 37
39#define to_ioat_chan(chan) container_of(chan, struct ioat_dma_chan, common) 38#define to_ioat_chan(chan) container_of(chan, struct ioat_dma_chan, common)
40#define to_ioat_device(dev) container_of(dev, struct ioat_device, common) 39#define to_ioat_device(dev) container_of(dev, struct ioat_device, common)
41#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node) 40#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node)
41#define tx_to_ioat_desc(tx) container_of(tx, struct ioat_desc_sw, async_tx)
42 42
43/* internal functions */ 43/* internal functions */
44static int __devinit ioat_probe(struct pci_dev *pdev, const struct pci_device_id *ent); 44static int __devinit ioat_probe(struct pci_dev *pdev, const struct pci_device_id *ent);
45static void ioat_shutdown(struct pci_dev *pdev);
45static void __devexit ioat_remove(struct pci_dev *pdev); 46static void __devexit ioat_remove(struct pci_dev *pdev);
46 47
47static int enumerate_dma_channels(struct ioat_device *device) 48static int enumerate_dma_channels(struct ioat_device *device)
@@ -51,8 +52,8 @@ static int enumerate_dma_channels(struct ioat_device *device)
51 int i; 52 int i;
52 struct ioat_dma_chan *ioat_chan; 53 struct ioat_dma_chan *ioat_chan;
53 54
54 device->common.chancnt = ioatdma_read8(device, IOAT_CHANCNT_OFFSET); 55 device->common.chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET);
55 xfercap_scale = ioatdma_read8(device, IOAT_XFERCAP_OFFSET); 56 xfercap_scale = readb(device->reg_base + IOAT_XFERCAP_OFFSET);
56 xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale)); 57 xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale));
57 58
58 for (i = 0; i < device->common.chancnt; i++) { 59 for (i = 0; i < device->common.chancnt; i++) {
@@ -71,13 +72,79 @@ static int enumerate_dma_channels(struct ioat_device *device)
71 INIT_LIST_HEAD(&ioat_chan->used_desc); 72 INIT_LIST_HEAD(&ioat_chan->used_desc);
72 /* This should be made common somewhere in dmaengine.c */ 73 /* This should be made common somewhere in dmaengine.c */
73 ioat_chan->common.device = &device->common; 74 ioat_chan->common.device = &device->common;
74 ioat_chan->common.client = NULL;
75 list_add_tail(&ioat_chan->common.device_node, 75 list_add_tail(&ioat_chan->common.device_node,
76 &device->common.channels); 76 &device->common.channels);
77 } 77 }
78 return device->common.chancnt; 78 return device->common.chancnt;
79} 79}
80 80
81static void
82ioat_set_src(dma_addr_t addr, struct dma_async_tx_descriptor *tx, int index)
83{
84 struct ioat_desc_sw *iter, *desc = tx_to_ioat_desc(tx);
85 struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan);
86
87 pci_unmap_addr_set(desc, src, addr);
88
89 list_for_each_entry(iter, &desc->async_tx.tx_list, node) {
90 iter->hw->src_addr = addr;
91 addr += ioat_chan->xfercap;
92 }
93
94}
95
96static void
97ioat_set_dest(dma_addr_t addr, struct dma_async_tx_descriptor *tx, int index)
98{
99 struct ioat_desc_sw *iter, *desc = tx_to_ioat_desc(tx);
100 struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan);
101
102 pci_unmap_addr_set(desc, dst, addr);
103
104 list_for_each_entry(iter, &desc->async_tx.tx_list, node) {
105 iter->hw->dst_addr = addr;
106 addr += ioat_chan->xfercap;
107 }
108}
109
110static dma_cookie_t
111ioat_tx_submit(struct dma_async_tx_descriptor *tx)
112{
113 struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan);
114 struct ioat_desc_sw *desc = tx_to_ioat_desc(tx);
115 int append = 0;
116 dma_cookie_t cookie;
117 struct ioat_desc_sw *group_start;
118
119 group_start = list_entry(desc->async_tx.tx_list.next,
120 struct ioat_desc_sw, node);
121 spin_lock_bh(&ioat_chan->desc_lock);
122 /* cookie incr and addition to used_list must be atomic */
123 cookie = ioat_chan->common.cookie;
124 cookie++;
125 if (cookie < 0)
126 cookie = 1;
127 ioat_chan->common.cookie = desc->async_tx.cookie = cookie;
128
129 /* write address into NextDescriptor field of last desc in chain */
130 to_ioat_desc(ioat_chan->used_desc.prev)->hw->next =
131 group_start->async_tx.phys;
132 list_splice_init(&desc->async_tx.tx_list, ioat_chan->used_desc.prev);
133
134 ioat_chan->pending += desc->tx_cnt;
135 if (ioat_chan->pending >= 4) {
136 append = 1;
137 ioat_chan->pending = 0;
138 }
139 spin_unlock_bh(&ioat_chan->desc_lock);
140
141 if (append)
142 writeb(IOAT_CHANCMD_APPEND,
143 ioat_chan->reg_base + IOAT_CHANCMD_OFFSET);
144
145 return cookie;
146}
147
81static struct ioat_desc_sw *ioat_dma_alloc_descriptor( 148static struct ioat_desc_sw *ioat_dma_alloc_descriptor(
82 struct ioat_dma_chan *ioat_chan, 149 struct ioat_dma_chan *ioat_chan,
83 gfp_t flags) 150 gfp_t flags)
@@ -99,8 +166,13 @@ static struct ioat_desc_sw *ioat_dma_alloc_descriptor(
99 } 166 }
100 167
101 memset(desc, 0, sizeof(*desc)); 168 memset(desc, 0, sizeof(*desc));
169 dma_async_tx_descriptor_init(&desc_sw->async_tx, &ioat_chan->common);
170 desc_sw->async_tx.tx_set_src = ioat_set_src;
171 desc_sw->async_tx.tx_set_dest = ioat_set_dest;
172 desc_sw->async_tx.tx_submit = ioat_tx_submit;
173 INIT_LIST_HEAD(&desc_sw->async_tx.tx_list);
102 desc_sw->hw = desc; 174 desc_sw->hw = desc;
103 desc_sw->phys = phys; 175 desc_sw->async_tx.phys = phys;
104 176
105 return desc_sw; 177 return desc_sw;
106} 178}
@@ -123,7 +195,7 @@ static int ioat_dma_alloc_chan_resources(struct dma_chan *chan)
123 * In-use bit automatically set by reading chanctrl 195 * In-use bit automatically set by reading chanctrl
124 * If 0, we got it, if 1, someone else did 196 * If 0, we got it, if 1, someone else did
125 */ 197 */
126 chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET); 198 chanctrl = readw(ioat_chan->reg_base + IOAT_CHANCTRL_OFFSET);
127 if (chanctrl & IOAT_CHANCTRL_CHANNEL_IN_USE) 199 if (chanctrl & IOAT_CHANCTRL_CHANNEL_IN_USE)
128 return -EBUSY; 200 return -EBUSY;
129 201
@@ -132,12 +204,12 @@ static int ioat_dma_alloc_chan_resources(struct dma_chan *chan)
132 IOAT_CHANCTRL_ERR_INT_EN | 204 IOAT_CHANCTRL_ERR_INT_EN |
133 IOAT_CHANCTRL_ANY_ERR_ABORT_EN | 205 IOAT_CHANCTRL_ANY_ERR_ABORT_EN |
134 IOAT_CHANCTRL_ERR_COMPLETION_EN; 206 IOAT_CHANCTRL_ERR_COMPLETION_EN;
135 ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl); 207 writew(chanctrl, ioat_chan->reg_base + IOAT_CHANCTRL_OFFSET);
136 208
137 chanerr = ioatdma_chan_read32(ioat_chan, IOAT_CHANERR_OFFSET); 209 chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
138 if (chanerr) { 210 if (chanerr) {
139 printk("IOAT: CHANERR = %x, clearing\n", chanerr); 211 printk("IOAT: CHANERR = %x, clearing\n", chanerr);
140 ioatdma_chan_write32(ioat_chan, IOAT_CHANERR_OFFSET, chanerr); 212 writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
141 } 213 }
142 214
143 /* Allocate descriptors */ 215 /* Allocate descriptors */
@@ -161,10 +233,10 @@ static int ioat_dma_alloc_chan_resources(struct dma_chan *chan)
161 &ioat_chan->completion_addr); 233 &ioat_chan->completion_addr);
162 memset(ioat_chan->completion_virt, 0, 234 memset(ioat_chan->completion_virt, 0,
163 sizeof(*ioat_chan->completion_virt)); 235 sizeof(*ioat_chan->completion_virt));
164 ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_LOW, 236 writel(((u64) ioat_chan->completion_addr) & 0x00000000FFFFFFFF,
165 ((u64) ioat_chan->completion_addr) & 0x00000000FFFFFFFF); 237 ioat_chan->reg_base + IOAT_CHANCMP_OFFSET_LOW);
166 ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_HIGH, 238 writel(((u64) ioat_chan->completion_addr) >> 32,
167 ((u64) ioat_chan->completion_addr) >> 32); 239 ioat_chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH);
168 240
169 ioat_start_null_desc(ioat_chan); 241 ioat_start_null_desc(ioat_chan);
170 return i; 242 return i;
@@ -182,18 +254,20 @@ static void ioat_dma_free_chan_resources(struct dma_chan *chan)
182 254
183 ioat_dma_memcpy_cleanup(ioat_chan); 255 ioat_dma_memcpy_cleanup(ioat_chan);
184 256
185 ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_RESET); 257 writeb(IOAT_CHANCMD_RESET, ioat_chan->reg_base + IOAT_CHANCMD_OFFSET);
186 258
187 spin_lock_bh(&ioat_chan->desc_lock); 259 spin_lock_bh(&ioat_chan->desc_lock);
188 list_for_each_entry_safe(desc, _desc, &ioat_chan->used_desc, node) { 260 list_for_each_entry_safe(desc, _desc, &ioat_chan->used_desc, node) {
189 in_use_descs++; 261 in_use_descs++;
190 list_del(&desc->node); 262 list_del(&desc->node);
191 pci_pool_free(ioat_device->dma_pool, desc->hw, desc->phys); 263 pci_pool_free(ioat_device->dma_pool, desc->hw,
264 desc->async_tx.phys);
192 kfree(desc); 265 kfree(desc);
193 } 266 }
194 list_for_each_entry_safe(desc, _desc, &ioat_chan->free_desc, node) { 267 list_for_each_entry_safe(desc, _desc, &ioat_chan->free_desc, node) {
195 list_del(&desc->node); 268 list_del(&desc->node);
196 pci_pool_free(ioat_device->dma_pool, desc->hw, desc->phys); 269 pci_pool_free(ioat_device->dma_pool, desc->hw,
270 desc->async_tx.phys);
197 kfree(desc); 271 kfree(desc);
198 } 272 }
199 spin_unlock_bh(&ioat_chan->desc_lock); 273 spin_unlock_bh(&ioat_chan->desc_lock);
@@ -210,50 +284,30 @@ static void ioat_dma_free_chan_resources(struct dma_chan *chan)
210 ioat_chan->last_completion = ioat_chan->completion_addr = 0; 284 ioat_chan->last_completion = ioat_chan->completion_addr = 0;
211 285
212 /* Tell hw the chan is free */ 286 /* Tell hw the chan is free */
213 chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET); 287 chanctrl = readw(ioat_chan->reg_base + IOAT_CHANCTRL_OFFSET);
214 chanctrl &= ~IOAT_CHANCTRL_CHANNEL_IN_USE; 288 chanctrl &= ~IOAT_CHANCTRL_CHANNEL_IN_USE;
215 ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl); 289 writew(chanctrl, ioat_chan->reg_base + IOAT_CHANCTRL_OFFSET);
216} 290}
217 291
218/** 292static struct dma_async_tx_descriptor *
219 * do_ioat_dma_memcpy - actual function that initiates a IOAT DMA transaction 293ioat_dma_prep_memcpy(struct dma_chan *chan, size_t len, int int_en)
220 * @ioat_chan: IOAT DMA channel handle
221 * @dest: DMA destination address
222 * @src: DMA source address
223 * @len: transaction length in bytes
224 */
225
226static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan *ioat_chan,
227 dma_addr_t dest,
228 dma_addr_t src,
229 size_t len)
230{ 294{
231 struct ioat_desc_sw *first; 295 struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
232 struct ioat_desc_sw *prev; 296 struct ioat_desc_sw *first, *prev, *new;
233 struct ioat_desc_sw *new;
234 dma_cookie_t cookie;
235 LIST_HEAD(new_chain); 297 LIST_HEAD(new_chain);
236 u32 copy; 298 u32 copy;
237 size_t orig_len; 299 size_t orig_len;
238 dma_addr_t orig_src, orig_dst; 300 int desc_count = 0;
239 unsigned int desc_count = 0;
240 unsigned int append = 0;
241
242 if (!ioat_chan || !dest || !src)
243 return -EFAULT;
244 301
245 if (!len) 302 if (!len)
246 return ioat_chan->common.cookie; 303 return NULL;
247 304
248 orig_len = len; 305 orig_len = len;
249 orig_src = src;
250 orig_dst = dest;
251 306
252 first = NULL; 307 first = NULL;
253 prev = NULL; 308 prev = NULL;
254 309
255 spin_lock_bh(&ioat_chan->desc_lock); 310 spin_lock_bh(&ioat_chan->desc_lock);
256
257 while (len) { 311 while (len) {
258 if (!list_empty(&ioat_chan->free_desc)) { 312 if (!list_empty(&ioat_chan->free_desc)) {
259 new = to_ioat_desc(ioat_chan->free_desc.next); 313 new = to_ioat_desc(ioat_chan->free_desc.next);
@@ -270,141 +324,36 @@ static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan *ioat_chan,
270 324
271 new->hw->size = copy; 325 new->hw->size = copy;
272 new->hw->ctl = 0; 326 new->hw->ctl = 0;
273 new->hw->src_addr = src; 327 new->async_tx.cookie = 0;
274 new->hw->dst_addr = dest; 328 new->async_tx.ack = 1;
275 new->cookie = 0;
276 329
277 /* chain together the physical address list for the HW */ 330 /* chain together the physical address list for the HW */
278 if (!first) 331 if (!first)
279 first = new; 332 first = new;
280 else 333 else
281 prev->hw->next = (u64) new->phys; 334 prev->hw->next = (u64) new->async_tx.phys;
282 335
283 prev = new; 336 prev = new;
284
285 len -= copy; 337 len -= copy;
286 dest += copy;
287 src += copy;
288
289 list_add_tail(&new->node, &new_chain); 338 list_add_tail(&new->node, &new_chain);
290 desc_count++; 339 desc_count++;
291 } 340 }
292 new->hw->ctl = IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
293 new->hw->next = 0;
294 341
295 /* cookie incr and addition to used_list must be atomic */ 342 list_splice(&new_chain, &new->async_tx.tx_list);
296 343
297 cookie = ioat_chan->common.cookie; 344 new->hw->ctl = IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
298 cookie++; 345 new->hw->next = 0;
299 if (cookie < 0) 346 new->tx_cnt = desc_count;
300 cookie = 1; 347 new->async_tx.ack = 0; /* client is in control of this ack */
301 ioat_chan->common.cookie = new->cookie = cookie; 348 new->async_tx.cookie = -EBUSY;
302 349
303 pci_unmap_addr_set(new, src, orig_src);
304 pci_unmap_addr_set(new, dst, orig_dst);
305 pci_unmap_len_set(new, src_len, orig_len); 350 pci_unmap_len_set(new, src_len, orig_len);
306 pci_unmap_len_set(new, dst_len, orig_len); 351 pci_unmap_len_set(new, dst_len, orig_len);
307
308 /* write address into NextDescriptor field of last desc in chain */
309 to_ioat_desc(ioat_chan->used_desc.prev)->hw->next = first->phys;
310 list_splice_init(&new_chain, ioat_chan->used_desc.prev);
311
312 ioat_chan->pending += desc_count;
313 if (ioat_chan->pending >= 20) {
314 append = 1;
315 ioat_chan->pending = 0;
316 }
317
318 spin_unlock_bh(&ioat_chan->desc_lock); 352 spin_unlock_bh(&ioat_chan->desc_lock);
319 353
320 if (append) 354 return new ? &new->async_tx : NULL;
321 ioatdma_chan_write8(ioat_chan,
322 IOAT_CHANCMD_OFFSET,
323 IOAT_CHANCMD_APPEND);
324 return cookie;
325}
326
327/**
328 * ioat_dma_memcpy_buf_to_buf - wrapper that takes src & dest bufs
329 * @chan: IOAT DMA channel handle
330 * @dest: DMA destination address
331 * @src: DMA source address
332 * @len: transaction length in bytes
333 */
334
335static dma_cookie_t ioat_dma_memcpy_buf_to_buf(struct dma_chan *chan,
336 void *dest,
337 void *src,
338 size_t len)
339{
340 dma_addr_t dest_addr;
341 dma_addr_t src_addr;
342 struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
343
344 dest_addr = pci_map_single(ioat_chan->device->pdev,
345 dest, len, PCI_DMA_FROMDEVICE);
346 src_addr = pci_map_single(ioat_chan->device->pdev,
347 src, len, PCI_DMA_TODEVICE);
348
349 return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len);
350} 355}
351 356
352/**
353 * ioat_dma_memcpy_buf_to_pg - wrapper, copying from a buf to a page
354 * @chan: IOAT DMA channel handle
355 * @page: pointer to the page to copy to
356 * @offset: offset into that page
357 * @src: DMA source address
358 * @len: transaction length in bytes
359 */
360
361static dma_cookie_t ioat_dma_memcpy_buf_to_pg(struct dma_chan *chan,
362 struct page *page,
363 unsigned int offset,
364 void *src,
365 size_t len)
366{
367 dma_addr_t dest_addr;
368 dma_addr_t src_addr;
369 struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
370
371 dest_addr = pci_map_page(ioat_chan->device->pdev,
372 page, offset, len, PCI_DMA_FROMDEVICE);
373 src_addr = pci_map_single(ioat_chan->device->pdev,
374 src, len, PCI_DMA_TODEVICE);
375
376 return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len);
377}
378
379/**
380 * ioat_dma_memcpy_pg_to_pg - wrapper, copying between two pages
381 * @chan: IOAT DMA channel handle
382 * @dest_pg: pointer to the page to copy to
383 * @dest_off: offset into that page
384 * @src_pg: pointer to the page to copy from
385 * @src_off: offset into that page
386 * @len: transaction length in bytes. This is guaranteed not to make a copy
387 * across a page boundary.
388 */
389
390static dma_cookie_t ioat_dma_memcpy_pg_to_pg(struct dma_chan *chan,
391 struct page *dest_pg,
392 unsigned int dest_off,
393 struct page *src_pg,
394 unsigned int src_off,
395 size_t len)
396{
397 dma_addr_t dest_addr;
398 dma_addr_t src_addr;
399 struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
400
401 dest_addr = pci_map_page(ioat_chan->device->pdev,
402 dest_pg, dest_off, len, PCI_DMA_FROMDEVICE);
403 src_addr = pci_map_page(ioat_chan->device->pdev,
404 src_pg, src_off, len, PCI_DMA_TODEVICE);
405
406 return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len);
407}
408 357
409/** 358/**
410 * ioat_dma_memcpy_issue_pending - push potentially unrecognized appended descriptors to hw 359 * ioat_dma_memcpy_issue_pending - push potentially unrecognized appended descriptors to hw
@@ -417,9 +366,8 @@ static void ioat_dma_memcpy_issue_pending(struct dma_chan *chan)
417 366
418 if (ioat_chan->pending != 0) { 367 if (ioat_chan->pending != 0) {
419 ioat_chan->pending = 0; 368 ioat_chan->pending = 0;
420 ioatdma_chan_write8(ioat_chan, 369 writeb(IOAT_CHANCMD_APPEND,
421 IOAT_CHANCMD_OFFSET, 370 ioat_chan->reg_base + IOAT_CHANCMD_OFFSET);
422 IOAT_CHANCMD_APPEND);
423 } 371 }
424} 372}
425 373
@@ -449,7 +397,7 @@ static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan)
449 if ((chan->completion_virt->full & IOAT_CHANSTS_DMA_TRANSFER_STATUS) == 397 if ((chan->completion_virt->full & IOAT_CHANSTS_DMA_TRANSFER_STATUS) ==
450 IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED) { 398 IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED) {
451 printk("IOAT: Channel halted, chanerr = %x\n", 399 printk("IOAT: Channel halted, chanerr = %x\n",
452 ioatdma_chan_read32(chan, IOAT_CHANERR_OFFSET)); 400 readl(chan->reg_base + IOAT_CHANERR_OFFSET));
453 401
454 /* TODO do something to salvage the situation */ 402 /* TODO do something to salvage the situation */
455 } 403 }
@@ -467,8 +415,8 @@ static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan)
467 * exceeding xfercap, perhaps. If so, only the last one will 415 * exceeding xfercap, perhaps. If so, only the last one will
468 * have a cookie, and require unmapping. 416 * have a cookie, and require unmapping.
469 */ 417 */
470 if (desc->cookie) { 418 if (desc->async_tx.cookie) {
471 cookie = desc->cookie; 419 cookie = desc->async_tx.cookie;
472 420
473 /* yes we are unmapping both _page and _single alloc'd 421 /* yes we are unmapping both _page and _single alloc'd
474 regions with unmap_page. Is this *really* that bad? 422 regions with unmap_page. Is this *really* that bad?
@@ -483,14 +431,19 @@ static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan)
483 PCI_DMA_TODEVICE); 431 PCI_DMA_TODEVICE);
484 } 432 }
485 433
486 if (desc->phys != phys_complete) { 434 if (desc->async_tx.phys != phys_complete) {
487 /* a completed entry, but not the last, so cleanup */ 435 /* a completed entry, but not the last, so cleanup
488 list_del(&desc->node); 436 * if the client is done with the descriptor
489 list_add_tail(&desc->node, &chan->free_desc); 437 */
438 if (desc->async_tx.ack) {
439 list_del(&desc->node);
440 list_add_tail(&desc->node, &chan->free_desc);
441 } else
442 desc->async_tx.cookie = 0;
490 } else { 443 } else {
491 /* last used desc. Do not remove, so we can append from 444 /* last used desc. Do not remove, so we can append from
492 it, but don't look at it next time, either */ 445 it, but don't look at it next time, either */
493 desc->cookie = 0; 446 desc->async_tx.cookie = 0;
494 447
495 /* TODO check status bits? */ 448 /* TODO check status bits? */
496 break; 449 break;
@@ -506,6 +459,17 @@ static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan)
506 spin_unlock(&chan->cleanup_lock); 459 spin_unlock(&chan->cleanup_lock);
507} 460}
508 461
462static void ioat_dma_dependency_added(struct dma_chan *chan)
463{
464 struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
465 spin_lock_bh(&ioat_chan->desc_lock);
466 if (ioat_chan->pending == 0) {
467 spin_unlock_bh(&ioat_chan->desc_lock);
468 ioat_dma_memcpy_cleanup(ioat_chan);
469 } else
470 spin_unlock_bh(&ioat_chan->desc_lock);
471}
472
509/** 473/**
510 * ioat_dma_is_complete - poll the status of a IOAT DMA transaction 474 * ioat_dma_is_complete - poll the status of a IOAT DMA transaction
511 * @chan: IOAT DMA channel handle 475 * @chan: IOAT DMA channel handle
@@ -553,6 +517,8 @@ static enum dma_status ioat_dma_is_complete(struct dma_chan *chan,
553 517
554static struct pci_device_id ioat_pci_tbl[] = { 518static struct pci_device_id ioat_pci_tbl[] = {
555 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT) }, 519 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT) },
520 { PCI_DEVICE(PCI_VENDOR_ID_UNISYS,
521 PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) },
556 { 0, } 522 { 0, }
557}; 523};
558 524
@@ -560,6 +526,7 @@ static struct pci_driver ioat_pci_driver = {
560 .name = "ioatdma", 526 .name = "ioatdma",
561 .id_table = ioat_pci_tbl, 527 .id_table = ioat_pci_tbl,
562 .probe = ioat_probe, 528 .probe = ioat_probe,
529 .shutdown = ioat_shutdown,
563 .remove = __devexit_p(ioat_remove), 530 .remove = __devexit_p(ioat_remove),
564}; 531};
565 532
@@ -569,21 +536,21 @@ static irqreturn_t ioat_do_interrupt(int irq, void *data)
569 unsigned long attnstatus; 536 unsigned long attnstatus;
570 u8 intrctrl; 537 u8 intrctrl;
571 538
572 intrctrl = ioatdma_read8(instance, IOAT_INTRCTRL_OFFSET); 539 intrctrl = readb(instance->reg_base + IOAT_INTRCTRL_OFFSET);
573 540
574 if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN)) 541 if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN))
575 return IRQ_NONE; 542 return IRQ_NONE;
576 543
577 if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) { 544 if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) {
578 ioatdma_write8(instance, IOAT_INTRCTRL_OFFSET, intrctrl); 545 writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
579 return IRQ_NONE; 546 return IRQ_NONE;
580 } 547 }
581 548
582 attnstatus = ioatdma_read32(instance, IOAT_ATTNSTATUS_OFFSET); 549 attnstatus = readl(instance->reg_base + IOAT_ATTNSTATUS_OFFSET);
583 550
584 printk(KERN_ERR "ioatdma error: interrupt! status %lx\n", attnstatus); 551 printk(KERN_ERR "ioatdma error: interrupt! status %lx\n", attnstatus);
585 552
586 ioatdma_write8(instance, IOAT_INTRCTRL_OFFSET, intrctrl); 553 writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
587 return IRQ_HANDLED; 554 return IRQ_HANDLED;
588} 555}
589 556
@@ -607,19 +574,17 @@ static void ioat_start_null_desc(struct ioat_dma_chan *ioat_chan)
607 574
608 desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL; 575 desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL;
609 desc->hw->next = 0; 576 desc->hw->next = 0;
577 desc->async_tx.ack = 1;
610 578
611 list_add_tail(&desc->node, &ioat_chan->used_desc); 579 list_add_tail(&desc->node, &ioat_chan->used_desc);
612 spin_unlock_bh(&ioat_chan->desc_lock); 580 spin_unlock_bh(&ioat_chan->desc_lock);
613 581
614#if (BITS_PER_LONG == 64) 582 writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
615 ioatdma_chan_write64(ioat_chan, IOAT_CHAINADDR_OFFSET, desc->phys); 583 ioat_chan->reg_base + IOAT_CHAINADDR_OFFSET_LOW);
616#else 584 writel(((u64) desc->async_tx.phys) >> 32,
617 ioatdma_chan_write32(ioat_chan, 585 ioat_chan->reg_base + IOAT_CHAINADDR_OFFSET_HIGH);
618 IOAT_CHAINADDR_OFFSET_LOW, 586
619 (u32) desc->phys); 587 writeb(IOAT_CHANCMD_START, ioat_chan->reg_base + IOAT_CHANCMD_OFFSET);
620 ioatdma_chan_write32(ioat_chan, IOAT_CHAINADDR_OFFSET_HIGH, 0);
621#endif
622 ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_START);
623} 588}
624 589
625/* 590/*
@@ -633,6 +598,8 @@ static int ioat_self_test(struct ioat_device *device)
633 u8 *src; 598 u8 *src;
634 u8 *dest; 599 u8 *dest;
635 struct dma_chan *dma_chan; 600 struct dma_chan *dma_chan;
601 struct dma_async_tx_descriptor *tx;
602 dma_addr_t addr;
636 dma_cookie_t cookie; 603 dma_cookie_t cookie;
637 int err = 0; 604 int err = 0;
638 605
@@ -658,7 +625,15 @@ static int ioat_self_test(struct ioat_device *device)
658 goto out; 625 goto out;
659 } 626 }
660 627
661 cookie = ioat_dma_memcpy_buf_to_buf(dma_chan, dest, src, IOAT_TEST_SIZE); 628 tx = ioat_dma_prep_memcpy(dma_chan, IOAT_TEST_SIZE, 0);
629 async_tx_ack(tx);
630 addr = dma_map_single(dma_chan->device->dev, src, IOAT_TEST_SIZE,
631 DMA_TO_DEVICE);
632 ioat_set_src(addr, tx, 0);
633 addr = dma_map_single(dma_chan->device->dev, dest, IOAT_TEST_SIZE,
634 DMA_FROM_DEVICE);
635 ioat_set_dest(addr, tx, 0);
636 cookie = ioat_tx_submit(tx);
662 ioat_dma_memcpy_issue_pending(dma_chan); 637 ioat_dma_memcpy_issue_pending(dma_chan);
663 msleep(1); 638 msleep(1);
664 639
@@ -748,19 +723,20 @@ static int __devinit ioat_probe(struct pci_dev *pdev,
748 723
749 device->reg_base = reg_base; 724 device->reg_base = reg_base;
750 725
751 ioatdma_write8(device, IOAT_INTRCTRL_OFFSET, IOAT_INTRCTRL_MASTER_INT_EN); 726 writeb(IOAT_INTRCTRL_MASTER_INT_EN, device->reg_base + IOAT_INTRCTRL_OFFSET);
752 pci_set_master(pdev); 727 pci_set_master(pdev);
753 728
754 INIT_LIST_HEAD(&device->common.channels); 729 INIT_LIST_HEAD(&device->common.channels);
755 enumerate_dma_channels(device); 730 enumerate_dma_channels(device);
756 731
732 dma_cap_set(DMA_MEMCPY, device->common.cap_mask);
757 device->common.device_alloc_chan_resources = ioat_dma_alloc_chan_resources; 733 device->common.device_alloc_chan_resources = ioat_dma_alloc_chan_resources;
758 device->common.device_free_chan_resources = ioat_dma_free_chan_resources; 734 device->common.device_free_chan_resources = ioat_dma_free_chan_resources;
759 device->common.device_memcpy_buf_to_buf = ioat_dma_memcpy_buf_to_buf; 735 device->common.device_prep_dma_memcpy = ioat_dma_prep_memcpy;
760 device->common.device_memcpy_buf_to_pg = ioat_dma_memcpy_buf_to_pg; 736 device->common.device_is_tx_complete = ioat_dma_is_complete;
761 device->common.device_memcpy_pg_to_pg = ioat_dma_memcpy_pg_to_pg; 737 device->common.device_issue_pending = ioat_dma_memcpy_issue_pending;
762 device->common.device_memcpy_complete = ioat_dma_is_complete; 738 device->common.device_dependency_added = ioat_dma_dependency_added;
763 device->common.device_memcpy_issue_pending = ioat_dma_memcpy_issue_pending; 739 device->common.dev = &pdev->dev;
764 printk(KERN_INFO "Intel(R) I/OAT DMA Engine found, %d channels\n", 740 printk(KERN_INFO "Intel(R) I/OAT DMA Engine found, %d channels\n",
765 device->common.chancnt); 741 device->common.chancnt);
766 742
@@ -787,9 +763,20 @@ err_request_regions:
787err_set_dma_mask: 763err_set_dma_mask:
788 pci_disable_device(pdev); 764 pci_disable_device(pdev);
789err_enable_device: 765err_enable_device:
766
767 printk(KERN_ERR "Intel(R) I/OAT DMA Engine initialization failed\n");
768
790 return err; 769 return err;
791} 770}
792 771
772static void ioat_shutdown(struct pci_dev *pdev)
773{
774 struct ioat_device *device;
775 device = pci_get_drvdata(pdev);
776
777 dma_async_device_unregister(&device->common);
778}
779
793static void __devexit ioat_remove(struct pci_dev *pdev) 780static void __devexit ioat_remove(struct pci_dev *pdev)
794{ 781{
795 struct ioat_device *device; 782 struct ioat_device *device;
@@ -818,7 +805,7 @@ static void __devexit ioat_remove(struct pci_dev *pdev)
818} 805}
819 806
820/* MODULE API */ 807/* MODULE API */
821MODULE_VERSION("1.7"); 808MODULE_VERSION("1.9");
822MODULE_LICENSE("GPL"); 809MODULE_LICENSE("GPL");
823MODULE_AUTHOR("Intel Corporation"); 810MODULE_AUTHOR("Intel Corporation");
824 811
diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h
index 62b26a9be4c9..d3726478031a 100644
--- a/drivers/dma/ioatdma.h
+++ b/drivers/dma/ioatdma.h
@@ -30,9 +30,6 @@
30 30
31#define IOAT_LOW_COMPLETION_MASK 0xffffffc0 31#define IOAT_LOW_COMPLETION_MASK 0xffffffc0
32 32
33extern struct list_head dma_device_list;
34extern struct list_head dma_client_list;
35
36/** 33/**
37 * struct ioat_device - internal representation of a IOAT device 34 * struct ioat_device - internal representation of a IOAT device
38 * @pdev: PCI-Express device 35 * @pdev: PCI-Express device
@@ -105,21 +102,20 @@ struct ioat_dma_chan {
105/** 102/**
106 * struct ioat_desc_sw - wrapper around hardware descriptor 103 * struct ioat_desc_sw - wrapper around hardware descriptor
107 * @hw: hardware DMA descriptor 104 * @hw: hardware DMA descriptor
108 * @node: 105 * @node: this descriptor will either be on the free list,
109 * @cookie: 106 * or attached to a transaction list (async_tx.tx_list)
110 * @phys: 107 * @tx_cnt: number of descriptors required to complete the transaction
108 * @async_tx: the generic software descriptor for all engines
111 */ 109 */
112
113struct ioat_desc_sw { 110struct ioat_desc_sw {
114 struct ioat_dma_descriptor *hw; 111 struct ioat_dma_descriptor *hw;
115 struct list_head node; 112 struct list_head node;
116 dma_cookie_t cookie; 113 int tx_cnt;
117 dma_addr_t phys;
118 DECLARE_PCI_UNMAP_ADDR(src) 114 DECLARE_PCI_UNMAP_ADDR(src)
119 DECLARE_PCI_UNMAP_LEN(src_len) 115 DECLARE_PCI_UNMAP_LEN(src_len)
120 DECLARE_PCI_UNMAP_ADDR(dst) 116 DECLARE_PCI_UNMAP_ADDR(dst)
121 DECLARE_PCI_UNMAP_LEN(dst_len) 117 DECLARE_PCI_UNMAP_LEN(dst_len)
118 struct dma_async_tx_descriptor async_tx;
122}; 119};
123 120
124#endif /* IOATDMA_H */ 121#endif /* IOATDMA_H */
125
diff --git a/drivers/dma/ioatdma_io.h b/drivers/dma/ioatdma_io.h
deleted file mode 100644
index c0b4bf66c920..000000000000
--- a/drivers/dma/ioatdma_io.h
+++ /dev/null
@@ -1,118 +0,0 @@
1/*
2 * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the Free
6 * Software Foundation; either version 2 of the License, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59
16 * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * The full GNU General Public License is included in this distribution in the
19 * file called COPYING.
20 */
21#ifndef IOATDMA_IO_H
22#define IOATDMA_IO_H
23
24#include <asm/io.h>
25
26/*
27 * device and per-channel MMIO register read and write functions
28 * this is a lot of anoying inline functions, but it's typesafe
29 */
30
31static inline u8 ioatdma_read8(struct ioat_device *device,
32 unsigned int offset)
33{
34 return readb(device->reg_base + offset);
35}
36
37static inline u16 ioatdma_read16(struct ioat_device *device,
38 unsigned int offset)
39{
40 return readw(device->reg_base + offset);
41}
42
43static inline u32 ioatdma_read32(struct ioat_device *device,
44 unsigned int offset)
45{
46 return readl(device->reg_base + offset);
47}
48
49static inline void ioatdma_write8(struct ioat_device *device,
50 unsigned int offset, u8 value)
51{
52 writeb(value, device->reg_base + offset);
53}
54
55static inline void ioatdma_write16(struct ioat_device *device,
56 unsigned int offset, u16 value)
57{
58 writew(value, device->reg_base + offset);
59}
60
61static inline void ioatdma_write32(struct ioat_device *device,
62 unsigned int offset, u32 value)
63{
64 writel(value, device->reg_base + offset);
65}
66
67static inline u8 ioatdma_chan_read8(struct ioat_dma_chan *chan,
68 unsigned int offset)
69{
70 return readb(chan->reg_base + offset);
71}
72
73static inline u16 ioatdma_chan_read16(struct ioat_dma_chan *chan,
74 unsigned int offset)
75{
76 return readw(chan->reg_base + offset);
77}
78
79static inline u32 ioatdma_chan_read32(struct ioat_dma_chan *chan,
80 unsigned int offset)
81{
82 return readl(chan->reg_base + offset);
83}
84
85static inline void ioatdma_chan_write8(struct ioat_dma_chan *chan,
86 unsigned int offset, u8 value)
87{
88 writeb(value, chan->reg_base + offset);
89}
90
91static inline void ioatdma_chan_write16(struct ioat_dma_chan *chan,
92 unsigned int offset, u16 value)
93{
94 writew(value, chan->reg_base + offset);
95}
96
97static inline void ioatdma_chan_write32(struct ioat_dma_chan *chan,
98 unsigned int offset, u32 value)
99{
100 writel(value, chan->reg_base + offset);
101}
102
103#if (BITS_PER_LONG == 64)
104static inline u64 ioatdma_chan_read64(struct ioat_dma_chan *chan,
105 unsigned int offset)
106{
107 return readq(chan->reg_base + offset);
108}
109
110static inline void ioatdma_chan_write64(struct ioat_dma_chan *chan,
111 unsigned int offset, u64 value)
112{
113 writeq(value, chan->reg_base + offset);
114}
115#endif
116
117#endif /* IOATDMA_IO_H */
118
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
new file mode 100644
index 000000000000..5a1d426744d6
--- /dev/null
+++ b/drivers/dma/iop-adma.c
@@ -0,0 +1,1467 @@
1/*
2 * offload engine driver for the Intel Xscale series of i/o processors
3 * Copyright © 2006, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
17 *
18 */
19
20/*
21 * This driver supports the asynchronous DMA copy and RAID engines available
22 * on the Intel Xscale(R) family of I/O Processors (IOP 32x, 33x, 134x)
23 */
24
25#include <linux/init.h>
26#include <linux/module.h>
27#include <linux/async_tx.h>
28#include <linux/delay.h>
29#include <linux/dma-mapping.h>
30#include <linux/spinlock.h>
31#include <linux/interrupt.h>
32#include <linux/platform_device.h>
33#include <linux/memory.h>
34#include <linux/ioport.h>
35
36#include <asm/arch/adma.h>
37
/*
 * container_of() shorthands: recover the driver-private structure from a
 * pointer to the generic object embedded in it.
 *   to_iop_adma_chan    - 'common' member of struct iop_adma_chan
 *   to_iop_adma_device  - 'common' member of struct iop_adma_device
 *   tx_to_iop_adma_slot - 'async_tx' member of struct iop_adma_desc_slot
 */
#define to_iop_adma_chan(c) container_of(c, struct iop_adma_chan, common)
#define to_iop_adma_device(d) \
	container_of(d, struct iop_adma_device, common)
#define tx_to_iop_adma_slot(t) \
	container_of(t, struct iop_adma_desc_slot, async_tx)
43
44/**
45 * iop_adma_free_slots - flags descriptor slots for reuse
46 * @slot: Slot to free
47 * Caller must hold &iop_chan->lock while calling this function
48 */
49static void iop_adma_free_slots(struct iop_adma_desc_slot *slot)
50{
51 int stride = slot->slots_per_op;
52
53 while (stride--) {
54 slot->slots_per_op = 0;
55 slot = list_entry(slot->slot_node.next,
56 struct iop_adma_desc_slot,
57 slot_node);
58 }
59}
60
61static dma_cookie_t
62iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc,
63 struct iop_adma_chan *iop_chan, dma_cookie_t cookie)
64{
65 BUG_ON(desc->async_tx.cookie < 0);
66 spin_lock_bh(&desc->async_tx.lock);
67 if (desc->async_tx.cookie > 0) {
68 cookie = desc->async_tx.cookie;
69 desc->async_tx.cookie = 0;
70
71 /* call the callback (must not sleep or submit new
72 * operations to this channel)
73 */
74 if (desc->async_tx.callback)
75 desc->async_tx.callback(
76 desc->async_tx.callback_param);
77
78 /* unmap dma addresses
79 * (unmap_single vs unmap_page?)
80 */
81 if (desc->group_head && desc->unmap_len) {
82 struct iop_adma_desc_slot *unmap = desc->group_head;
83 struct device *dev =
84 &iop_chan->device->pdev->dev;
85 u32 len = unmap->unmap_len;
86 u32 src_cnt = unmap->unmap_src_cnt;
87 dma_addr_t addr = iop_desc_get_dest_addr(unmap,
88 iop_chan);
89
90 dma_unmap_page(dev, addr, len, DMA_FROM_DEVICE);
91 while (src_cnt--) {
92 addr = iop_desc_get_src_addr(unmap,
93 iop_chan,
94 src_cnt);
95 dma_unmap_page(dev, addr, len,
96 DMA_TO_DEVICE);
97 }
98 desc->group_head = NULL;
99 }
100 }
101
102 /* run dependent operations */
103 async_tx_run_dependencies(&desc->async_tx);
104 spin_unlock_bh(&desc->async_tx.lock);
105
106 return cookie;
107}
108
109static int
110iop_adma_clean_slot(struct iop_adma_desc_slot *desc,
111 struct iop_adma_chan *iop_chan)
112{
113 /* the client is allowed to attach dependent operations
114 * until 'ack' is set
115 */
116 if (!desc->async_tx.ack)
117 return 0;
118
119 /* leave the last descriptor in the chain
120 * so we can append to it
121 */
122 if (desc->chain_node.next == &iop_chan->chain)
123 return 1;
124
125 dev_dbg(iop_chan->device->common.dev,
126 "\tfree slot: %d slots_per_op: %d\n",
127 desc->idx, desc->slots_per_op);
128
129 list_del(&desc->chain_node);
130 iop_adma_free_slots(desc);
131
132 return 0;
133}
134
135static void __iop_adma_slot_cleanup(struct iop_adma_chan *iop_chan)
136{
137 struct iop_adma_desc_slot *iter, *_iter, *grp_start = NULL;
138 dma_cookie_t cookie = 0;
139 u32 current_desc = iop_chan_get_current_descriptor(iop_chan);
140 int busy = iop_chan_is_busy(iop_chan);
141 int seen_current = 0, slot_cnt = 0, slots_per_op = 0;
142
143 dev_dbg(iop_chan->device->common.dev, "%s\n", __FUNCTION__);
144 /* free completed slots from the chain starting with
145 * the oldest descriptor
146 */
147 list_for_each_entry_safe(iter, _iter, &iop_chan->chain,
148 chain_node) {
149 pr_debug("\tcookie: %d slot: %d busy: %d "
150 "this_desc: %#x next_desc: %#x ack: %d\n",
151 iter->async_tx.cookie, iter->idx, busy,
152 iter->async_tx.phys, iop_desc_get_next_desc(iter),
153 iter->async_tx.ack);
154 prefetch(_iter);
155 prefetch(&_iter->async_tx);
156
157 /* do not advance past the current descriptor loaded into the
158 * hardware channel, subsequent descriptors are either in
159 * process or have not been submitted
160 */
161 if (seen_current)
162 break;
163
164 /* stop the search if we reach the current descriptor and the
165 * channel is busy, or if it appears that the current descriptor
166 * needs to be re-read (i.e. has been appended to)
167 */
168 if (iter->async_tx.phys == current_desc) {
169 BUG_ON(seen_current++);
170 if (busy || iop_desc_get_next_desc(iter))
171 break;
172 }
173
174 /* detect the start of a group transaction */
175 if (!slot_cnt && !slots_per_op) {
176 slot_cnt = iter->slot_cnt;
177 slots_per_op = iter->slots_per_op;
178 if (slot_cnt <= slots_per_op) {
179 slot_cnt = 0;
180 slots_per_op = 0;
181 }
182 }
183
184 if (slot_cnt) {
185 pr_debug("\tgroup++\n");
186 if (!grp_start)
187 grp_start = iter;
188 slot_cnt -= slots_per_op;
189 }
190
191 /* all the members of a group are complete */
192 if (slots_per_op != 0 && slot_cnt == 0) {
193 struct iop_adma_desc_slot *grp_iter, *_grp_iter;
194 int end_of_chain = 0;
195 pr_debug("\tgroup end\n");
196
197 /* collect the total results */
198 if (grp_start->xor_check_result) {
199 u32 zero_sum_result = 0;
200 slot_cnt = grp_start->slot_cnt;
201 grp_iter = grp_start;
202
203 list_for_each_entry_from(grp_iter,
204 &iop_chan->chain, chain_node) {
205 zero_sum_result |=
206 iop_desc_get_zero_result(grp_iter);
207 pr_debug("\titer%d result: %d\n",
208 grp_iter->idx, zero_sum_result);
209 slot_cnt -= slots_per_op;
210 if (slot_cnt == 0)
211 break;
212 }
213 pr_debug("\tgrp_start->xor_check_result: %p\n",
214 grp_start->xor_check_result);
215 *grp_start->xor_check_result = zero_sum_result;
216 }
217
218 /* clean up the group */
219 slot_cnt = grp_start->slot_cnt;
220 grp_iter = grp_start;
221 list_for_each_entry_safe_from(grp_iter, _grp_iter,
222 &iop_chan->chain, chain_node) {
223 cookie = iop_adma_run_tx_complete_actions(
224 grp_iter, iop_chan, cookie);
225
226 slot_cnt -= slots_per_op;
227 end_of_chain = iop_adma_clean_slot(grp_iter,
228 iop_chan);
229
230 if (slot_cnt == 0 || end_of_chain)
231 break;
232 }
233
234 /* the group should be complete at this point */
235 BUG_ON(slot_cnt);
236
237 slots_per_op = 0;
238 grp_start = NULL;
239 if (end_of_chain)
240 break;
241 else
242 continue;
243 } else if (slots_per_op) /* wait for group completion */
244 continue;
245
246 /* write back zero sum results (single descriptor case) */
247 if (iter->xor_check_result && iter->async_tx.cookie)
248 *iter->xor_check_result =
249 iop_desc_get_zero_result(iter);
250
251 cookie = iop_adma_run_tx_complete_actions(
252 iter, iop_chan, cookie);
253
254 if (iop_adma_clean_slot(iter, iop_chan))
255 break;
256 }
257
258 BUG_ON(!seen_current);
259
260 iop_chan_idle(busy, iop_chan);
261
262 if (cookie > 0) {
263 iop_chan->completed_cookie = cookie;
264 pr_debug("\tcompleted cookie %d\n", cookie);
265 }
266}
267
268static void
269iop_adma_slot_cleanup(struct iop_adma_chan *iop_chan)
270{
271 spin_lock_bh(&iop_chan->lock);
272 __iop_adma_slot_cleanup(iop_chan);
273 spin_unlock_bh(&iop_chan->lock);
274}
275
276static void iop_adma_tasklet(unsigned long data)
277{
278 struct iop_adma_chan *chan = (struct iop_adma_chan *) data;
279 __iop_adma_slot_cleanup(chan);
280}
281
282static struct iop_adma_desc_slot *
283iop_adma_alloc_slots(struct iop_adma_chan *iop_chan, int num_slots,
284 int slots_per_op)
285{
286 struct iop_adma_desc_slot *iter, *_iter, *alloc_start = NULL;
287 struct list_head chain = LIST_HEAD_INIT(chain);
288 int slots_found, retry = 0;
289
290 /* start search from the last allocated descrtiptor
291 * if a contiguous allocation can not be found start searching
292 * from the beginning of the list
293 */
294retry:
295 slots_found = 0;
296 if (retry == 0)
297 iter = iop_chan->last_used;
298 else
299 iter = list_entry(&iop_chan->all_slots,
300 struct iop_adma_desc_slot,
301 slot_node);
302
303 list_for_each_entry_safe_continue(
304 iter, _iter, &iop_chan->all_slots, slot_node) {
305 prefetch(_iter);
306 prefetch(&_iter->async_tx);
307 if (iter->slots_per_op) {
308 /* give up after finding the first busy slot
309 * on the second pass through the list
310 */
311 if (retry)
312 break;
313
314 slots_found = 0;
315 continue;
316 }
317
318 /* start the allocation if the slot is correctly aligned */
319 if (!slots_found++) {
320 if (iop_desc_is_aligned(iter, slots_per_op))
321 alloc_start = iter;
322 else {
323 slots_found = 0;
324 continue;
325 }
326 }
327
328 if (slots_found == num_slots) {
329 struct iop_adma_desc_slot *alloc_tail = NULL;
330 struct iop_adma_desc_slot *last_used = NULL;
331 iter = alloc_start;
332 while (num_slots) {
333 int i;
334 dev_dbg(iop_chan->device->common.dev,
335 "allocated slot: %d "
336 "(desc %p phys: %#x) slots_per_op %d\n",
337 iter->idx, iter->hw_desc,
338 iter->async_tx.phys, slots_per_op);
339
340 /* pre-ack all but the last descriptor */
341 if (num_slots != slots_per_op)
342 iter->async_tx.ack = 1;
343 else
344 iter->async_tx.ack = 0;
345
346 list_add_tail(&iter->chain_node, &chain);
347 alloc_tail = iter;
348 iter->async_tx.cookie = 0;
349 iter->slot_cnt = num_slots;
350 iter->xor_check_result = NULL;
351 for (i = 0; i < slots_per_op; i++) {
352 iter->slots_per_op = slots_per_op - i;
353 last_used = iter;
354 iter = list_entry(iter->slot_node.next,
355 struct iop_adma_desc_slot,
356 slot_node);
357 }
358 num_slots -= slots_per_op;
359 }
360 alloc_tail->group_head = alloc_start;
361 alloc_tail->async_tx.cookie = -EBUSY;
362 list_splice(&chain, &alloc_tail->async_tx.tx_list);
363 iop_chan->last_used = last_used;
364 iop_desc_clear_next_desc(alloc_start);
365 iop_desc_clear_next_desc(alloc_tail);
366 return alloc_tail;
367 }
368 }
369 if (!retry++)
370 goto retry;
371
372 /* try to free some slots if the allocation fails */
373 tasklet_schedule(&iop_chan->irq_tasklet);
374
375 return NULL;
376}
377
378static dma_cookie_t
379iop_desc_assign_cookie(struct iop_adma_chan *iop_chan,
380 struct iop_adma_desc_slot *desc)
381{
382 dma_cookie_t cookie = iop_chan->common.cookie;
383 cookie++;
384 if (cookie < 0)
385 cookie = 1;
386 iop_chan->common.cookie = desc->async_tx.cookie = cookie;
387 return cookie;
388}
389
390static void iop_adma_check_threshold(struct iop_adma_chan *iop_chan)
391{
392 dev_dbg(iop_chan->device->common.dev, "pending: %d\n",
393 iop_chan->pending);
394
395 if (iop_chan->pending >= IOP_ADMA_THRESHOLD) {
396 iop_chan->pending = 0;
397 iop_chan_append(iop_chan);
398 }
399}
400
401static dma_cookie_t
402iop_adma_tx_submit(struct dma_async_tx_descriptor *tx)
403{
404 struct iop_adma_desc_slot *sw_desc = tx_to_iop_adma_slot(tx);
405 struct iop_adma_chan *iop_chan = to_iop_adma_chan(tx->chan);
406 struct iop_adma_desc_slot *grp_start, *old_chain_tail;
407 int slot_cnt;
408 int slots_per_op;
409 dma_cookie_t cookie;
410
411 grp_start = sw_desc->group_head;
412 slot_cnt = grp_start->slot_cnt;
413 slots_per_op = grp_start->slots_per_op;
414
415 spin_lock_bh(&iop_chan->lock);
416 cookie = iop_desc_assign_cookie(iop_chan, sw_desc);
417
418 old_chain_tail = list_entry(iop_chan->chain.prev,
419 struct iop_adma_desc_slot, chain_node);
420 list_splice_init(&sw_desc->async_tx.tx_list,
421 &old_chain_tail->chain_node);
422
423 /* fix up the hardware chain */
424 iop_desc_set_next_desc(old_chain_tail, grp_start->async_tx.phys);
425
426 /* 1/ don't add pre-chained descriptors
427 * 2/ dummy read to flush next_desc write
428 */
429 BUG_ON(iop_desc_get_next_desc(sw_desc));
430
431 /* increment the pending count by the number of slots
432 * memcpy operations have a 1:1 (slot:operation) relation
433 * other operations are heavier and will pop the threshold
434 * more often.
435 */
436 iop_chan->pending += slot_cnt;
437 iop_adma_check_threshold(iop_chan);
438 spin_unlock_bh(&iop_chan->lock);
439
440 dev_dbg(iop_chan->device->common.dev, "%s cookie: %d slot: %d\n",
441 __FUNCTION__, sw_desc->async_tx.cookie, sw_desc->idx);
442
443 return cookie;
444}
445
446static void
447iop_adma_set_dest(dma_addr_t addr, struct dma_async_tx_descriptor *tx,
448 int index)
449{
450 struct iop_adma_desc_slot *sw_desc = tx_to_iop_adma_slot(tx);
451 struct iop_adma_chan *iop_chan = to_iop_adma_chan(tx->chan);
452
453 /* to do: support transfers lengths > IOP_ADMA_MAX_BYTE_COUNT */
454 iop_desc_set_dest_addr(sw_desc->group_head, iop_chan, addr);
455}
456
457static void iop_chan_start_null_memcpy(struct iop_adma_chan *iop_chan);
458static void iop_chan_start_null_xor(struct iop_adma_chan *iop_chan);
459
460/* returns the number of allocated descriptors */
461static int iop_adma_alloc_chan_resources(struct dma_chan *chan)
462{
463 char *hw_desc;
464 int idx;
465 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
466 struct iop_adma_desc_slot *slot = NULL;
467 int init = iop_chan->slots_allocated ? 0 : 1;
468 struct iop_adma_platform_data *plat_data =
469 iop_chan->device->pdev->dev.platform_data;
470 int num_descs_in_pool = plat_data->pool_size/IOP_ADMA_SLOT_SIZE;
471
472 /* Allocate descriptor slots */
473 do {
474 idx = iop_chan->slots_allocated;
475 if (idx == num_descs_in_pool)
476 break;
477
478 slot = kzalloc(sizeof(*slot), GFP_KERNEL);
479 if (!slot) {
480 printk(KERN_INFO "IOP ADMA Channel only initialized"
481 " %d descriptor slots", idx);
482 break;
483 }
484 hw_desc = (char *) iop_chan->device->dma_desc_pool_virt;
485 slot->hw_desc = (void *) &hw_desc[idx * IOP_ADMA_SLOT_SIZE];
486
487 dma_async_tx_descriptor_init(&slot->async_tx, chan);
488 slot->async_tx.tx_submit = iop_adma_tx_submit;
489 slot->async_tx.tx_set_dest = iop_adma_set_dest;
490 INIT_LIST_HEAD(&slot->chain_node);
491 INIT_LIST_HEAD(&slot->slot_node);
492 INIT_LIST_HEAD(&slot->async_tx.tx_list);
493 hw_desc = (char *) iop_chan->device->dma_desc_pool;
494 slot->async_tx.phys =
495 (dma_addr_t) &hw_desc[idx * IOP_ADMA_SLOT_SIZE];
496 slot->idx = idx;
497
498 spin_lock_bh(&iop_chan->lock);
499 iop_chan->slots_allocated++;
500 list_add_tail(&slot->slot_node, &iop_chan->all_slots);
501 spin_unlock_bh(&iop_chan->lock);
502 } while (iop_chan->slots_allocated < num_descs_in_pool);
503
504 if (idx && !iop_chan->last_used)
505 iop_chan->last_used = list_entry(iop_chan->all_slots.next,
506 struct iop_adma_desc_slot,
507 slot_node);
508
509 dev_dbg(iop_chan->device->common.dev,
510 "allocated %d descriptor slots last_used: %p\n",
511 iop_chan->slots_allocated, iop_chan->last_used);
512
513 /* initialize the channel and the chain with a null operation */
514 if (init) {
515 if (dma_has_cap(DMA_MEMCPY,
516 iop_chan->device->common.cap_mask))
517 iop_chan_start_null_memcpy(iop_chan);
518 else if (dma_has_cap(DMA_XOR,
519 iop_chan->device->common.cap_mask))
520 iop_chan_start_null_xor(iop_chan);
521 else
522 BUG();
523 }
524
525 return (idx > 0) ? idx : -ENOMEM;
526}
527
528static struct dma_async_tx_descriptor *
529iop_adma_prep_dma_interrupt(struct dma_chan *chan)
530{
531 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
532 struct iop_adma_desc_slot *sw_desc, *grp_start;
533 int slot_cnt, slots_per_op;
534
535 dev_dbg(iop_chan->device->common.dev, "%s\n", __FUNCTION__);
536
537 spin_lock_bh(&iop_chan->lock);
538 slot_cnt = iop_chan_interrupt_slot_count(&slots_per_op, iop_chan);
539 sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
540 if (sw_desc) {
541 grp_start = sw_desc->group_head;
542 iop_desc_init_interrupt(grp_start, iop_chan);
543 grp_start->unmap_len = 0;
544 }
545 spin_unlock_bh(&iop_chan->lock);
546
547 return sw_desc ? &sw_desc->async_tx : NULL;
548}
549
550static void
551iop_adma_memcpy_set_src(dma_addr_t addr, struct dma_async_tx_descriptor *tx,
552 int index)
553{
554 struct iop_adma_desc_slot *sw_desc = tx_to_iop_adma_slot(tx);
555 struct iop_adma_desc_slot *grp_start = sw_desc->group_head;
556
557 iop_desc_set_memcpy_src_addr(grp_start, addr);
558}
559
560static struct dma_async_tx_descriptor *
561iop_adma_prep_dma_memcpy(struct dma_chan *chan, size_t len, int int_en)
562{
563 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
564 struct iop_adma_desc_slot *sw_desc, *grp_start;
565 int slot_cnt, slots_per_op;
566
567 if (unlikely(!len))
568 return NULL;
569 BUG_ON(unlikely(len > IOP_ADMA_MAX_BYTE_COUNT));
570
571 dev_dbg(iop_chan->device->common.dev, "%s len: %u\n",
572 __FUNCTION__, len);
573
574 spin_lock_bh(&iop_chan->lock);
575 slot_cnt = iop_chan_memcpy_slot_count(len, &slots_per_op);
576 sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
577 if (sw_desc) {
578 grp_start = sw_desc->group_head;
579 iop_desc_init_memcpy(grp_start, int_en);
580 iop_desc_set_byte_count(grp_start, iop_chan, len);
581 sw_desc->unmap_src_cnt = 1;
582 sw_desc->unmap_len = len;
583 sw_desc->async_tx.tx_set_src = iop_adma_memcpy_set_src;
584 }
585 spin_unlock_bh(&iop_chan->lock);
586
587 return sw_desc ? &sw_desc->async_tx : NULL;
588}
589
590static struct dma_async_tx_descriptor *
591iop_adma_prep_dma_memset(struct dma_chan *chan, int value, size_t len,
592 int int_en)
593{
594 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
595 struct iop_adma_desc_slot *sw_desc, *grp_start;
596 int slot_cnt, slots_per_op;
597
598 if (unlikely(!len))
599 return NULL;
600 BUG_ON(unlikely(len > IOP_ADMA_MAX_BYTE_COUNT));
601
602 dev_dbg(iop_chan->device->common.dev, "%s len: %u\n",
603 __FUNCTION__, len);
604
605 spin_lock_bh(&iop_chan->lock);
606 slot_cnt = iop_chan_memset_slot_count(len, &slots_per_op);
607 sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
608 if (sw_desc) {
609 grp_start = sw_desc->group_head;
610 iop_desc_init_memset(grp_start, int_en);
611 iop_desc_set_byte_count(grp_start, iop_chan, len);
612 iop_desc_set_block_fill_val(grp_start, value);
613 sw_desc->unmap_src_cnt = 1;
614 sw_desc->unmap_len = len;
615 }
616 spin_unlock_bh(&iop_chan->lock);
617
618 return sw_desc ? &sw_desc->async_tx : NULL;
619}
620
621static void
622iop_adma_xor_set_src(dma_addr_t addr, struct dma_async_tx_descriptor *tx,
623 int index)
624{
625 struct iop_adma_desc_slot *sw_desc = tx_to_iop_adma_slot(tx);
626 struct iop_adma_desc_slot *grp_start = sw_desc->group_head;
627
628 iop_desc_set_xor_src_addr(grp_start, index, addr);
629}
630
631static struct dma_async_tx_descriptor *
632iop_adma_prep_dma_xor(struct dma_chan *chan, unsigned int src_cnt, size_t len,
633 int int_en)
634{
635 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
636 struct iop_adma_desc_slot *sw_desc, *grp_start;
637 int slot_cnt, slots_per_op;
638
639 if (unlikely(!len))
640 return NULL;
641 BUG_ON(unlikely(len > IOP_ADMA_XOR_MAX_BYTE_COUNT));
642
643 dev_dbg(iop_chan->device->common.dev,
644 "%s src_cnt: %d len: %u int_en: %d\n",
645 __FUNCTION__, src_cnt, len, int_en);
646
647 spin_lock_bh(&iop_chan->lock);
648 slot_cnt = iop_chan_xor_slot_count(len, src_cnt, &slots_per_op);
649 sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
650 if (sw_desc) {
651 grp_start = sw_desc->group_head;
652 iop_desc_init_xor(grp_start, src_cnt, int_en);
653 iop_desc_set_byte_count(grp_start, iop_chan, len);
654 sw_desc->unmap_src_cnt = src_cnt;
655 sw_desc->unmap_len = len;
656 sw_desc->async_tx.tx_set_src = iop_adma_xor_set_src;
657 }
658 spin_unlock_bh(&iop_chan->lock);
659
660 return sw_desc ? &sw_desc->async_tx : NULL;
661}
662
663static void
664iop_adma_xor_zero_sum_set_src(dma_addr_t addr,
665 struct dma_async_tx_descriptor *tx,
666 int index)
667{
668 struct iop_adma_desc_slot *sw_desc = tx_to_iop_adma_slot(tx);
669 struct iop_adma_desc_slot *grp_start = sw_desc->group_head;
670
671 iop_desc_set_zero_sum_src_addr(grp_start, index, addr);
672}
673
674static struct dma_async_tx_descriptor *
675iop_adma_prep_dma_zero_sum(struct dma_chan *chan, unsigned int src_cnt,
676 size_t len, u32 *result, int int_en)
677{
678 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
679 struct iop_adma_desc_slot *sw_desc, *grp_start;
680 int slot_cnt, slots_per_op;
681
682 if (unlikely(!len))
683 return NULL;
684
685 dev_dbg(iop_chan->device->common.dev, "%s src_cnt: %d len: %u\n",
686 __FUNCTION__, src_cnt, len);
687
688 spin_lock_bh(&iop_chan->lock);
689 slot_cnt = iop_chan_zero_sum_slot_count(len, src_cnt, &slots_per_op);
690 sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
691 if (sw_desc) {
692 grp_start = sw_desc->group_head;
693 iop_desc_init_zero_sum(grp_start, src_cnt, int_en);
694 iop_desc_set_zero_sum_byte_count(grp_start, len);
695 grp_start->xor_check_result = result;
696 pr_debug("\t%s: grp_start->xor_check_result: %p\n",
697 __FUNCTION__, grp_start->xor_check_result);
698 sw_desc->unmap_src_cnt = src_cnt;
699 sw_desc->unmap_len = len;
700 sw_desc->async_tx.tx_set_src = iop_adma_xor_zero_sum_set_src;
701 }
702 spin_unlock_bh(&iop_chan->lock);
703
704 return sw_desc ? &sw_desc->async_tx : NULL;
705}
706
707static void iop_adma_dependency_added(struct dma_chan *chan)
708{
709 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
710 tasklet_schedule(&iop_chan->irq_tasklet);
711}
712
713static void iop_adma_free_chan_resources(struct dma_chan *chan)
714{
715 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
716 struct iop_adma_desc_slot *iter, *_iter;
717 int in_use_descs = 0;
718
719 iop_adma_slot_cleanup(iop_chan);
720
721 spin_lock_bh(&iop_chan->lock);
722 list_for_each_entry_safe(iter, _iter, &iop_chan->chain,
723 chain_node) {
724 in_use_descs++;
725 list_del(&iter->chain_node);
726 }
727 list_for_each_entry_safe_reverse(
728 iter, _iter, &iop_chan->all_slots, slot_node) {
729 list_del(&iter->slot_node);
730 kfree(iter);
731 iop_chan->slots_allocated--;
732 }
733 iop_chan->last_used = NULL;
734
735 dev_dbg(iop_chan->device->common.dev, "%s slots_allocated %d\n",
736 __FUNCTION__, iop_chan->slots_allocated);
737 spin_unlock_bh(&iop_chan->lock);
738
739 /* one is ok since we left it on there on purpose */
740 if (in_use_descs > 1)
741 printk(KERN_ERR "IOP: Freeing %d in use descriptors!\n",
742 in_use_descs - 1);
743}
744
745/**
746 * iop_adma_is_complete - poll the status of an ADMA transaction
747 * @chan: ADMA channel handle
748 * @cookie: ADMA transaction identifier
749 */
750static enum dma_status iop_adma_is_complete(struct dma_chan *chan,
751 dma_cookie_t cookie,
752 dma_cookie_t *done,
753 dma_cookie_t *used)
754{
755 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
756 dma_cookie_t last_used;
757 dma_cookie_t last_complete;
758 enum dma_status ret;
759
760 last_used = chan->cookie;
761 last_complete = iop_chan->completed_cookie;
762
763 if (done)
764 *done = last_complete;
765 if (used)
766 *used = last_used;
767
768 ret = dma_async_is_complete(cookie, last_complete, last_used);
769 if (ret == DMA_SUCCESS)
770 return ret;
771
772 iop_adma_slot_cleanup(iop_chan);
773
774 last_used = chan->cookie;
775 last_complete = iop_chan->completed_cookie;
776
777 if (done)
778 *done = last_complete;
779 if (used)
780 *used = last_used;
781
782 return dma_async_is_complete(cookie, last_complete, last_used);
783}
784
785static irqreturn_t iop_adma_eot_handler(int irq, void *data)
786{
787 struct iop_adma_chan *chan = data;
788
789 dev_dbg(chan->device->common.dev, "%s\n", __FUNCTION__);
790
791 tasklet_schedule(&chan->irq_tasklet);
792
793 iop_adma_device_clear_eot_status(chan);
794
795 return IRQ_HANDLED;
796}
797
798static irqreturn_t iop_adma_eoc_handler(int irq, void *data)
799{
800 struct iop_adma_chan *chan = data;
801
802 dev_dbg(chan->device->common.dev, "%s\n", __FUNCTION__);
803
804 tasklet_schedule(&chan->irq_tasklet);
805
806 iop_adma_device_clear_eoc_status(chan);
807
808 return IRQ_HANDLED;
809}
810
811static irqreturn_t iop_adma_err_handler(int irq, void *data)
812{
813 struct iop_adma_chan *chan = data;
814 unsigned long status = iop_chan_get_status(chan);
815
816 dev_printk(KERN_ERR, chan->device->common.dev,
817 "error ( %s%s%s%s%s%s%s)\n",
818 iop_is_err_int_parity(status, chan) ? "int_parity " : "",
819 iop_is_err_mcu_abort(status, chan) ? "mcu_abort " : "",
820 iop_is_err_int_tabort(status, chan) ? "int_tabort " : "",
821 iop_is_err_int_mabort(status, chan) ? "int_mabort " : "",
822 iop_is_err_pci_tabort(status, chan) ? "pci_tabort " : "",
823 iop_is_err_pci_mabort(status, chan) ? "pci_mabort " : "",
824 iop_is_err_split_tx(status, chan) ? "split_tx " : "");
825
826 iop_adma_device_clear_err_status(chan);
827
828 BUG();
829
830 return IRQ_HANDLED;
831}
832
833static void iop_adma_issue_pending(struct dma_chan *chan)
834{
835 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
836
837 if (iop_chan->pending) {
838 iop_chan->pending = 0;
839 iop_chan_append(iop_chan);
840 }
841}
842
843/*
844 * Perform a transaction to verify the HW works.
845 */
846#define IOP_ADMA_TEST_SIZE 2000
847
848static int __devinit iop_adma_memcpy_self_test(struct iop_adma_device *device)
849{
850 int i;
851 void *src, *dest;
852 dma_addr_t src_dma, dest_dma;
853 struct dma_chan *dma_chan;
854 dma_cookie_t cookie;
855 struct dma_async_tx_descriptor *tx;
856 int err = 0;
857 struct iop_adma_chan *iop_chan;
858
859 dev_dbg(device->common.dev, "%s\n", __FUNCTION__);
860
861 src = kzalloc(sizeof(u8) * IOP_ADMA_TEST_SIZE, GFP_KERNEL);
862 if (!src)
863 return -ENOMEM;
864 dest = kzalloc(sizeof(u8) * IOP_ADMA_TEST_SIZE, GFP_KERNEL);
865 if (!dest) {
866 kfree(src);
867 return -ENOMEM;
868 }
869
870 /* Fill in src buffer */
871 for (i = 0; i < IOP_ADMA_TEST_SIZE; i++)
872 ((u8 *) src)[i] = (u8)i;
873
874 memset(dest, 0, IOP_ADMA_TEST_SIZE);
875
876 /* Start copy, using first DMA channel */
877 dma_chan = container_of(device->common.channels.next,
878 struct dma_chan,
879 device_node);
880 if (iop_adma_alloc_chan_resources(dma_chan) < 1) {
881 err = -ENODEV;
882 goto out;
883 }
884
885 tx = iop_adma_prep_dma_memcpy(dma_chan, IOP_ADMA_TEST_SIZE, 1);
886 dest_dma = dma_map_single(dma_chan->device->dev, dest,
887 IOP_ADMA_TEST_SIZE, DMA_FROM_DEVICE);
888 iop_adma_set_dest(dest_dma, tx, 0);
889 src_dma = dma_map_single(dma_chan->device->dev, src,
890 IOP_ADMA_TEST_SIZE, DMA_TO_DEVICE);
891 iop_adma_memcpy_set_src(src_dma, tx, 0);
892
893 cookie = iop_adma_tx_submit(tx);
894 iop_adma_issue_pending(dma_chan);
895 async_tx_ack(tx);
896 msleep(1);
897
898 if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) !=
899 DMA_SUCCESS) {
900 dev_printk(KERN_ERR, dma_chan->device->dev,
901 "Self-test copy timed out, disabling\n");
902 err = -ENODEV;
903 goto free_resources;
904 }
905
906 iop_chan = to_iop_adma_chan(dma_chan);
907 dma_sync_single_for_cpu(&iop_chan->device->pdev->dev, dest_dma,
908 IOP_ADMA_TEST_SIZE, DMA_FROM_DEVICE);
909 if (memcmp(src, dest, IOP_ADMA_TEST_SIZE)) {
910 dev_printk(KERN_ERR, dma_chan->device->dev,
911 "Self-test copy failed compare, disabling\n");
912 err = -ENODEV;
913 goto free_resources;
914 }
915
916free_resources:
917 iop_adma_free_chan_resources(dma_chan);
918out:
919 kfree(src);
920 kfree(dest);
921 return err;
922}
923
924#define IOP_ADMA_NUM_SRC_TEST 4 /* must be <= 15 */
925static int __devinit
926iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device)
927{
928 int i, src_idx;
929 struct page *dest;
930 struct page *xor_srcs[IOP_ADMA_NUM_SRC_TEST];
931 struct page *zero_sum_srcs[IOP_ADMA_NUM_SRC_TEST + 1];
932 dma_addr_t dma_addr, dest_dma;
933 struct dma_async_tx_descriptor *tx;
934 struct dma_chan *dma_chan;
935 dma_cookie_t cookie;
936 u8 cmp_byte = 0;
937 u32 cmp_word;
938 u32 zero_sum_result;
939 int err = 0;
940 struct iop_adma_chan *iop_chan;
941
942 dev_dbg(device->common.dev, "%s\n", __FUNCTION__);
943
944 for (src_idx = 0; src_idx < IOP_ADMA_NUM_SRC_TEST; src_idx++) {
945 xor_srcs[src_idx] = alloc_page(GFP_KERNEL);
946 if (!xor_srcs[src_idx])
947 while (src_idx--) {
948 __free_page(xor_srcs[src_idx]);
949 return -ENOMEM;
950 }
951 }
952
953 dest = alloc_page(GFP_KERNEL);
954 if (!dest)
955 while (src_idx--) {
956 __free_page(xor_srcs[src_idx]);
957 return -ENOMEM;
958 }
959
960 /* Fill in src buffers */
961 for (src_idx = 0; src_idx < IOP_ADMA_NUM_SRC_TEST; src_idx++) {
962 u8 *ptr = page_address(xor_srcs[src_idx]);
963 for (i = 0; i < PAGE_SIZE; i++)
964 ptr[i] = (1 << src_idx);
965 }
966
967 for (src_idx = 0; src_idx < IOP_ADMA_NUM_SRC_TEST; src_idx++)
968 cmp_byte ^= (u8) (1 << src_idx);
969
970 cmp_word = (cmp_byte << 24) | (cmp_byte << 16) |
971 (cmp_byte << 8) | cmp_byte;
972
973 memset(page_address(dest), 0, PAGE_SIZE);
974
975 dma_chan = container_of(device->common.channels.next,
976 struct dma_chan,
977 device_node);
978 if (iop_adma_alloc_chan_resources(dma_chan) < 1) {
979 err = -ENODEV;
980 goto out;
981 }
982
983 /* test xor */
984 tx = iop_adma_prep_dma_xor(dma_chan, IOP_ADMA_NUM_SRC_TEST,
985 PAGE_SIZE, 1);
986 dest_dma = dma_map_page(dma_chan->device->dev, dest, 0,
987 PAGE_SIZE, DMA_FROM_DEVICE);
988 iop_adma_set_dest(dest_dma, tx, 0);
989
990 for (i = 0; i < IOP_ADMA_NUM_SRC_TEST; i++) {
991 dma_addr = dma_map_page(dma_chan->device->dev, xor_srcs[i], 0,
992 PAGE_SIZE, DMA_TO_DEVICE);
993 iop_adma_xor_set_src(dma_addr, tx, i);
994 }
995
996 cookie = iop_adma_tx_submit(tx);
997 iop_adma_issue_pending(dma_chan);
998 async_tx_ack(tx);
999 msleep(8);
1000
1001 if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) !=
1002 DMA_SUCCESS) {
1003 dev_printk(KERN_ERR, dma_chan->device->dev,
1004 "Self-test xor timed out, disabling\n");
1005 err = -ENODEV;
1006 goto free_resources;
1007 }
1008
1009 iop_chan = to_iop_adma_chan(dma_chan);
1010 dma_sync_single_for_cpu(&iop_chan->device->pdev->dev, dest_dma,
1011 PAGE_SIZE, DMA_FROM_DEVICE);
1012 for (i = 0; i < (PAGE_SIZE / sizeof(u32)); i++) {
1013 u32 *ptr = page_address(dest);
1014 if (ptr[i] != cmp_word) {
1015 dev_printk(KERN_ERR, dma_chan->device->dev,
1016 "Self-test xor failed compare, disabling\n");
1017 err = -ENODEV;
1018 goto free_resources;
1019 }
1020 }
1021 dma_sync_single_for_device(&iop_chan->device->pdev->dev, dest_dma,
1022 PAGE_SIZE, DMA_TO_DEVICE);
1023
1024 /* skip zero sum if the capability is not present */
1025 if (!dma_has_cap(DMA_ZERO_SUM, dma_chan->device->cap_mask))
1026 goto free_resources;
1027
1028 /* zero sum the sources with the destintation page */
1029 for (i = 0; i < IOP_ADMA_NUM_SRC_TEST; i++)
1030 zero_sum_srcs[i] = xor_srcs[i];
1031 zero_sum_srcs[i] = dest;
1032
1033 zero_sum_result = 1;
1034
1035 tx = iop_adma_prep_dma_zero_sum(dma_chan, IOP_ADMA_NUM_SRC_TEST + 1,
1036 PAGE_SIZE, &zero_sum_result, 1);
1037 for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 1; i++) {
1038 dma_addr = dma_map_page(dma_chan->device->dev, zero_sum_srcs[i],
1039 0, PAGE_SIZE, DMA_TO_DEVICE);
1040 iop_adma_xor_zero_sum_set_src(dma_addr, tx, i);
1041 }
1042
1043 cookie = iop_adma_tx_submit(tx);
1044 iop_adma_issue_pending(dma_chan);
1045 async_tx_ack(tx);
1046 msleep(8);
1047
1048 if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
1049 dev_printk(KERN_ERR, dma_chan->device->dev,
1050 "Self-test zero sum timed out, disabling\n");
1051 err = -ENODEV;
1052 goto free_resources;
1053 }
1054
1055 if (zero_sum_result != 0) {
1056 dev_printk(KERN_ERR, dma_chan->device->dev,
1057 "Self-test zero sum failed compare, disabling\n");
1058 err = -ENODEV;
1059 goto free_resources;
1060 }
1061
1062 /* test memset */
1063 tx = iop_adma_prep_dma_memset(dma_chan, 0, PAGE_SIZE, 1);
1064 dma_addr = dma_map_page(dma_chan->device->dev, dest, 0,
1065 PAGE_SIZE, DMA_FROM_DEVICE);
1066 iop_adma_set_dest(dma_addr, tx, 0);
1067
1068 cookie = iop_adma_tx_submit(tx);
1069 iop_adma_issue_pending(dma_chan);
1070 async_tx_ack(tx);
1071 msleep(8);
1072
1073 if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
1074 dev_printk(KERN_ERR, dma_chan->device->dev,
1075 "Self-test memset timed out, disabling\n");
1076 err = -ENODEV;
1077 goto free_resources;
1078 }
1079
1080 for (i = 0; i < PAGE_SIZE/sizeof(u32); i++) {
1081 u32 *ptr = page_address(dest);
1082 if (ptr[i]) {
1083 dev_printk(KERN_ERR, dma_chan->device->dev,
1084 "Self-test memset failed compare, disabling\n");
1085 err = -ENODEV;
1086 goto free_resources;
1087 }
1088 }
1089
1090 /* test for non-zero parity sum */
1091 zero_sum_result = 0;
1092 tx = iop_adma_prep_dma_zero_sum(dma_chan, IOP_ADMA_NUM_SRC_TEST + 1,
1093 PAGE_SIZE, &zero_sum_result, 1);
1094 for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 1; i++) {
1095 dma_addr = dma_map_page(dma_chan->device->dev, zero_sum_srcs[i],
1096 0, PAGE_SIZE, DMA_TO_DEVICE);
1097 iop_adma_xor_zero_sum_set_src(dma_addr, tx, i);
1098 }
1099
1100 cookie = iop_adma_tx_submit(tx);
1101 iop_adma_issue_pending(dma_chan);
1102 async_tx_ack(tx);
1103 msleep(8);
1104
1105 if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
1106 dev_printk(KERN_ERR, dma_chan->device->dev,
1107 "Self-test non-zero sum timed out, disabling\n");
1108 err = -ENODEV;
1109 goto free_resources;
1110 }
1111
1112 if (zero_sum_result != 1) {
1113 dev_printk(KERN_ERR, dma_chan->device->dev,
1114 "Self-test non-zero sum failed compare, disabling\n");
1115 err = -ENODEV;
1116 goto free_resources;
1117 }
1118
1119free_resources:
1120 iop_adma_free_chan_resources(dma_chan);
1121out:
1122 src_idx = IOP_ADMA_NUM_SRC_TEST;
1123 while (src_idx--)
1124 __free_page(xor_srcs[src_idx]);
1125 __free_page(dest);
1126 return err;
1127}
1128
1129static int __devexit iop_adma_remove(struct platform_device *dev)
1130{
1131 struct iop_adma_device *device = platform_get_drvdata(dev);
1132 struct dma_chan *chan, *_chan;
1133 struct iop_adma_chan *iop_chan;
1134 int i;
1135 struct iop_adma_platform_data *plat_data = dev->dev.platform_data;
1136
1137 dma_async_device_unregister(&device->common);
1138
1139 for (i = 0; i < 3; i++) {
1140 unsigned int irq;
1141 irq = platform_get_irq(dev, i);
1142 free_irq(irq, device);
1143 }
1144
1145 dma_free_coherent(&dev->dev, plat_data->pool_size,
1146 device->dma_desc_pool_virt, device->dma_desc_pool);
1147
1148 do {
1149 struct resource *res;
1150 res = platform_get_resource(dev, IORESOURCE_MEM, 0);
1151 release_mem_region(res->start, res->end - res->start);
1152 } while (0);
1153
1154 list_for_each_entry_safe(chan, _chan, &device->common.channels,
1155 device_node) {
1156 iop_chan = to_iop_adma_chan(chan);
1157 list_del(&chan->device_node);
1158 kfree(iop_chan);
1159 }
1160 kfree(device);
1161
1162 return 0;
1163}
1164
1165static int __devinit iop_adma_probe(struct platform_device *pdev)
1166{
1167 struct resource *res;
1168 int ret = 0, i;
1169 struct iop_adma_device *adev;
1170 struct iop_adma_chan *iop_chan;
1171 struct dma_device *dma_dev;
1172 struct iop_adma_platform_data *plat_data = pdev->dev.platform_data;
1173
1174 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
1175 if (!res)
1176 return -ENODEV;
1177
1178 if (!devm_request_mem_region(&pdev->dev, res->start,
1179 res->end - res->start, pdev->name))
1180 return -EBUSY;
1181
1182 adev = kzalloc(sizeof(*adev), GFP_KERNEL);
1183 if (!adev)
1184 return -ENOMEM;
1185 dma_dev = &adev->common;
1186
1187 /* allocate coherent memory for hardware descriptors
1188 * note: writecombine gives slightly better performance, but
1189 * requires that we explicitly flush the writes
1190 */
1191 if ((adev->dma_desc_pool_virt = dma_alloc_writecombine(&pdev->dev,
1192 plat_data->pool_size,
1193 &adev->dma_desc_pool,
1194 GFP_KERNEL)) == NULL) {
1195 ret = -ENOMEM;
1196 goto err_free_adev;
1197 }
1198
1199 dev_dbg(&pdev->dev, "%s: allocted descriptor pool virt %p phys %p\n",
1200 __FUNCTION__, adev->dma_desc_pool_virt,
1201 (void *) adev->dma_desc_pool);
1202
1203 adev->id = plat_data->hw_id;
1204
1205 /* discover transaction capabilites from the platform data */
1206 dma_dev->cap_mask = plat_data->cap_mask;
1207
1208 adev->pdev = pdev;
1209 platform_set_drvdata(pdev, adev);
1210
1211 INIT_LIST_HEAD(&dma_dev->channels);
1212
1213 /* set base routines */
1214 dma_dev->device_alloc_chan_resources = iop_adma_alloc_chan_resources;
1215 dma_dev->device_free_chan_resources = iop_adma_free_chan_resources;
1216 dma_dev->device_is_tx_complete = iop_adma_is_complete;
1217 dma_dev->device_issue_pending = iop_adma_issue_pending;
1218 dma_dev->device_dependency_added = iop_adma_dependency_added;
1219 dma_dev->dev = &pdev->dev;
1220
1221 /* set prep routines based on capability */
1222 if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask))
1223 dma_dev->device_prep_dma_memcpy = iop_adma_prep_dma_memcpy;
1224 if (dma_has_cap(DMA_MEMSET, dma_dev->cap_mask))
1225 dma_dev->device_prep_dma_memset = iop_adma_prep_dma_memset;
1226 if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
1227 dma_dev->max_xor = iop_adma_get_max_xor();
1228 dma_dev->device_prep_dma_xor = iop_adma_prep_dma_xor;
1229 }
1230 if (dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask))
1231 dma_dev->device_prep_dma_zero_sum =
1232 iop_adma_prep_dma_zero_sum;
1233 if (dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask))
1234 dma_dev->device_prep_dma_interrupt =
1235 iop_adma_prep_dma_interrupt;
1236
1237 iop_chan = kzalloc(sizeof(*iop_chan), GFP_KERNEL);
1238 if (!iop_chan) {
1239 ret = -ENOMEM;
1240 goto err_free_dma;
1241 }
1242 iop_chan->device = adev;
1243
1244 iop_chan->mmr_base = devm_ioremap(&pdev->dev, res->start,
1245 res->end - res->start);
1246 if (!iop_chan->mmr_base) {
1247 ret = -ENOMEM;
1248 goto err_free_iop_chan;
1249 }
1250 tasklet_init(&iop_chan->irq_tasklet, iop_adma_tasklet, (unsigned long)
1251 iop_chan);
1252
1253 /* clear errors before enabling interrupts */
1254 iop_adma_device_clear_err_status(iop_chan);
1255
1256 for (i = 0; i < 3; i++) {
1257 irq_handler_t handler[] = { iop_adma_eot_handler,
1258 iop_adma_eoc_handler,
1259 iop_adma_err_handler };
1260 int irq = platform_get_irq(pdev, i);
1261 if (irq < 0) {
1262 ret = -ENXIO;
1263 goto err_free_iop_chan;
1264 } else {
1265 ret = devm_request_irq(&pdev->dev, irq,
1266 handler[i], 0, pdev->name, iop_chan);
1267 if (ret)
1268 goto err_free_iop_chan;
1269 }
1270 }
1271
1272 spin_lock_init(&iop_chan->lock);
1273 init_timer(&iop_chan->cleanup_watchdog);
1274 iop_chan->cleanup_watchdog.data = (unsigned long) iop_chan;
1275 iop_chan->cleanup_watchdog.function = iop_adma_tasklet;
1276 INIT_LIST_HEAD(&iop_chan->chain);
1277 INIT_LIST_HEAD(&iop_chan->all_slots);
1278 INIT_RCU_HEAD(&iop_chan->common.rcu);
1279 iop_chan->common.device = dma_dev;
1280 list_add_tail(&iop_chan->common.device_node, &dma_dev->channels);
1281
1282 if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) {
1283 ret = iop_adma_memcpy_self_test(adev);
1284 dev_dbg(&pdev->dev, "memcpy self test returned %d\n", ret);
1285 if (ret)
1286 goto err_free_iop_chan;
1287 }
1288
1289 if (dma_has_cap(DMA_XOR, dma_dev->cap_mask) ||
1290 dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) {
1291 ret = iop_adma_xor_zero_sum_self_test(adev);
1292 dev_dbg(&pdev->dev, "xor self test returned %d\n", ret);
1293 if (ret)
1294 goto err_free_iop_chan;
1295 }
1296
1297 dev_printk(KERN_INFO, &pdev->dev, "Intel(R) IOP: "
1298 "( %s%s%s%s%s%s%s%s%s%s)\n",
1299 dma_has_cap(DMA_PQ_XOR, dma_dev->cap_mask) ? "pq_xor " : "",
1300 dma_has_cap(DMA_PQ_UPDATE, dma_dev->cap_mask) ? "pq_update " : "",
1301 dma_has_cap(DMA_PQ_ZERO_SUM, dma_dev->cap_mask) ? "pq_zero_sum " : "",
1302 dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "",
1303 dma_has_cap(DMA_DUAL_XOR, dma_dev->cap_mask) ? "dual_xor " : "",
1304 dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask) ? "xor_zero_sum " : "",
1305 dma_has_cap(DMA_MEMSET, dma_dev->cap_mask) ? "fill " : "",
1306 dma_has_cap(DMA_MEMCPY_CRC32C, dma_dev->cap_mask) ? "cpy+crc " : "",
1307 dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "",
1308 dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? "intr " : "");
1309
1310 dma_async_device_register(dma_dev);
1311 goto out;
1312
1313 err_free_iop_chan:
1314 kfree(iop_chan);
1315 err_free_dma:
1316 dma_free_coherent(&adev->pdev->dev, plat_data->pool_size,
1317 adev->dma_desc_pool_virt, adev->dma_desc_pool);
1318 err_free_adev:
1319 kfree(adev);
1320 out:
1321 return ret;
1322}
1323
1324static void iop_chan_start_null_memcpy(struct iop_adma_chan *iop_chan)
1325{
1326 struct iop_adma_desc_slot *sw_desc, *grp_start;
1327 dma_cookie_t cookie;
1328 int slot_cnt, slots_per_op;
1329
1330 dev_dbg(iop_chan->device->common.dev, "%s\n", __FUNCTION__);
1331
1332 spin_lock_bh(&iop_chan->lock);
1333 slot_cnt = iop_chan_memcpy_slot_count(0, &slots_per_op);
1334 sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
1335 if (sw_desc) {
1336 grp_start = sw_desc->group_head;
1337
1338 list_splice_init(&sw_desc->async_tx.tx_list, &iop_chan->chain);
1339 sw_desc->async_tx.ack = 1;
1340 iop_desc_init_memcpy(grp_start, 0);
1341 iop_desc_set_byte_count(grp_start, iop_chan, 0);
1342 iop_desc_set_dest_addr(grp_start, iop_chan, 0);
1343 iop_desc_set_memcpy_src_addr(grp_start, 0);
1344
1345 cookie = iop_chan->common.cookie;
1346 cookie++;
1347 if (cookie <= 1)
1348 cookie = 2;
1349
1350 /* initialize the completed cookie to be less than
1351 * the most recently used cookie
1352 */
1353 iop_chan->completed_cookie = cookie - 1;
1354 iop_chan->common.cookie = sw_desc->async_tx.cookie = cookie;
1355
1356 /* channel should not be busy */
1357 BUG_ON(iop_chan_is_busy(iop_chan));
1358
1359 /* clear any prior error-status bits */
1360 iop_adma_device_clear_err_status(iop_chan);
1361
1362 /* disable operation */
1363 iop_chan_disable(iop_chan);
1364
1365 /* set the descriptor address */
1366 iop_chan_set_next_descriptor(iop_chan, sw_desc->async_tx.phys);
1367
1368 /* 1/ don't add pre-chained descriptors
1369 * 2/ dummy read to flush next_desc write
1370 */
1371 BUG_ON(iop_desc_get_next_desc(sw_desc));
1372
1373 /* run the descriptor */
1374 iop_chan_enable(iop_chan);
1375 } else
1376 dev_printk(KERN_ERR, iop_chan->device->common.dev,
1377 "failed to allocate null descriptor\n");
1378 spin_unlock_bh(&iop_chan->lock);
1379}
1380
1381static void iop_chan_start_null_xor(struct iop_adma_chan *iop_chan)
1382{
1383 struct iop_adma_desc_slot *sw_desc, *grp_start;
1384 dma_cookie_t cookie;
1385 int slot_cnt, slots_per_op;
1386
1387 dev_dbg(iop_chan->device->common.dev, "%s\n", __FUNCTION__);
1388
1389 spin_lock_bh(&iop_chan->lock);
1390 slot_cnt = iop_chan_xor_slot_count(0, 2, &slots_per_op);
1391 sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
1392 if (sw_desc) {
1393 grp_start = sw_desc->group_head;
1394 list_splice_init(&sw_desc->async_tx.tx_list, &iop_chan->chain);
1395 sw_desc->async_tx.ack = 1;
1396 iop_desc_init_null_xor(grp_start, 2, 0);
1397 iop_desc_set_byte_count(grp_start, iop_chan, 0);
1398 iop_desc_set_dest_addr(grp_start, iop_chan, 0);
1399 iop_desc_set_xor_src_addr(grp_start, 0, 0);
1400 iop_desc_set_xor_src_addr(grp_start, 1, 0);
1401
1402 cookie = iop_chan->common.cookie;
1403 cookie++;
1404 if (cookie <= 1)
1405 cookie = 2;
1406
1407 /* initialize the completed cookie to be less than
1408 * the most recently used cookie
1409 */
1410 iop_chan->completed_cookie = cookie - 1;
1411 iop_chan->common.cookie = sw_desc->async_tx.cookie = cookie;
1412
1413 /* channel should not be busy */
1414 BUG_ON(iop_chan_is_busy(iop_chan));
1415
1416 /* clear any prior error-status bits */
1417 iop_adma_device_clear_err_status(iop_chan);
1418
1419 /* disable operation */
1420 iop_chan_disable(iop_chan);
1421
1422 /* set the descriptor address */
1423 iop_chan_set_next_descriptor(iop_chan, sw_desc->async_tx.phys);
1424
1425 /* 1/ don't add pre-chained descriptors
1426 * 2/ dummy read to flush next_desc write
1427 */
1428 BUG_ON(iop_desc_get_next_desc(sw_desc));
1429
1430 /* run the descriptor */
1431 iop_chan_enable(iop_chan);
1432 } else
1433 dev_printk(KERN_ERR, iop_chan->device->common.dev,
1434 "failed to allocate null descriptor\n");
1435 spin_unlock_bh(&iop_chan->lock);
1436}
1437
1438static struct platform_driver iop_adma_driver = {
1439 .probe = iop_adma_probe,
1440 .remove = iop_adma_remove,
1441 .driver = {
1442 .owner = THIS_MODULE,
1443 .name = "iop-adma",
1444 },
1445};
1446
1447static int __init iop_adma_init (void)
1448{
1449 /* it's currently unsafe to unload this module */
1450 /* if forced, worst case is that rmmod hangs */
1451 __unsafe(THIS_MODULE);
1452
1453 return platform_driver_register(&iop_adma_driver);
1454}
1455
1456static void __exit iop_adma_exit (void)
1457{
1458 platform_driver_unregister(&iop_adma_driver);
1459 return;
1460}
1461
1462module_init(iop_adma_init);
1463module_exit(iop_adma_exit);
1464
1465MODULE_AUTHOR("Intel Corporation");
1466MODULE_DESCRIPTION("IOP ADMA Engine Driver");
1467MODULE_LICENSE("GPL");
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 466909f38d98..64bf3a81db93 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -109,6 +109,8 @@ config MD_RAID10
109config MD_RAID456 109config MD_RAID456
110 tristate "RAID-4/RAID-5/RAID-6 mode" 110 tristate "RAID-4/RAID-5/RAID-6 mode"
111 depends on BLK_DEV_MD 111 depends on BLK_DEV_MD
112 select ASYNC_MEMCPY
113 select ASYNC_XOR
112 ---help--- 114 ---help---
113 A RAID-5 set of N drives with a capacity of C MB per drive provides 115 A RAID-5 set of N drives with a capacity of C MB per drive provides
114 the capacity of C * (N - 1) MB, and protects against a failure 116 the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 2c45d7683ae9..c49366cdc05d 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,7 +18,7 @@ raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \
18hostprogs-y := mktables 18hostprogs-y := mktables
19 19
20# Note: link order is important. All raid personalities 20# Note: link order is important. All raid personalities
21# and xor.o must come before md.o, as they each initialise 21# and must come before md.o, as they each initialise
22# themselves, and md.o may use the personalities when it 22# themselves, and md.o may use the personalities when it
23# auto-initialised. 23# auto-initialised.
24 24
@@ -26,7 +26,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o
26obj-$(CONFIG_MD_RAID0) += raid0.o 26obj-$(CONFIG_MD_RAID0) += raid0.o
27obj-$(CONFIG_MD_RAID1) += raid1.o 27obj-$(CONFIG_MD_RAID1) += raid1.o
28obj-$(CONFIG_MD_RAID10) += raid10.o 28obj-$(CONFIG_MD_RAID10) += raid10.o
29obj-$(CONFIG_MD_RAID456) += raid456.o xor.o 29obj-$(CONFIG_MD_RAID456) += raid456.o
30obj-$(CONFIG_MD_MULTIPATH) += multipath.o 30obj-$(CONFIG_MD_MULTIPATH) += multipath.o
31obj-$(CONFIG_MD_FAULTY) += faulty.o 31obj-$(CONFIG_MD_FAULTY) += faulty.o
32obj-$(CONFIG_BLK_DEV_MD) += md-mod.o 32obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1c54f3c1cca7..33beaa7da085 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5814,7 +5814,7 @@ static __exit void md_exit(void)
5814 } 5814 }
5815} 5815}
5816 5816
5817module_init(md_init) 5817subsys_initcall(md_init);
5818module_exit(md_exit) 5818module_exit(md_exit)
5819 5819
5820static int get_ro(char *buffer, struct kernel_param *kp) 5820static int get_ro(char *buffer, struct kernel_param *kp)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 061375ee6592..0b66afef2d82 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -52,6 +52,7 @@
52#include "raid6.h" 52#include "raid6.h"
53 53
54#include <linux/raid/bitmap.h> 54#include <linux/raid/bitmap.h>
55#include <linux/async_tx.h>
55 56
56/* 57/*
57 * Stripe cache 58 * Stripe cache
@@ -80,7 +81,6 @@
80/* 81/*
81 * The following can be used to debug the driver 82 * The following can be used to debug the driver
82 */ 83 */
83#define RAID5_DEBUG 0
84#define RAID5_PARANOIA 1 84#define RAID5_PARANOIA 1
85#if RAID5_PARANOIA && defined(CONFIG_SMP) 85#if RAID5_PARANOIA && defined(CONFIG_SMP)
86# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock) 86# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
@@ -88,8 +88,7 @@
88# define CHECK_DEVLOCK() 88# define CHECK_DEVLOCK()
89#endif 89#endif
90 90
91#define PRINTK(x...) ((void)(RAID5_DEBUG && printk(x))) 91#ifdef DEBUG
92#if RAID5_DEBUG
93#define inline 92#define inline
94#define __inline__ 93#define __inline__
95#endif 94#endif
@@ -104,6 +103,23 @@ static inline int raid6_next_disk(int disk, int raid_disks)
104 disk++; 103 disk++;
105 return (disk < raid_disks) ? disk : 0; 104 return (disk < raid_disks) ? disk : 0;
106} 105}
106
107static void return_io(struct bio *return_bi)
108{
109 struct bio *bi = return_bi;
110 while (bi) {
111 int bytes = bi->bi_size;
112
113 return_bi = bi->bi_next;
114 bi->bi_next = NULL;
115 bi->bi_size = 0;
116 bi->bi_end_io(bi, bytes,
117 test_bit(BIO_UPTODATE, &bi->bi_flags)
118 ? 0 : -EIO);
119 bi = return_bi;
120 }
121}
122
107static void print_raid5_conf (raid5_conf_t *conf); 123static void print_raid5_conf (raid5_conf_t *conf);
108 124
109static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 125static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
@@ -125,6 +141,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
125 } 141 }
126 md_wakeup_thread(conf->mddev->thread); 142 md_wakeup_thread(conf->mddev->thread);
127 } else { 143 } else {
144 BUG_ON(sh->ops.pending);
128 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 145 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
129 atomic_dec(&conf->preread_active_stripes); 146 atomic_dec(&conf->preread_active_stripes);
130 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 147 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -152,7 +169,8 @@ static void release_stripe(struct stripe_head *sh)
152 169
153static inline void remove_hash(struct stripe_head *sh) 170static inline void remove_hash(struct stripe_head *sh)
154{ 171{
155 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); 172 pr_debug("remove_hash(), stripe %llu\n",
173 (unsigned long long)sh->sector);
156 174
157 hlist_del_init(&sh->hash); 175 hlist_del_init(&sh->hash);
158} 176}
@@ -161,7 +179,8 @@ static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
161{ 179{
162 struct hlist_head *hp = stripe_hash(conf, sh->sector); 180 struct hlist_head *hp = stripe_hash(conf, sh->sector);
163 181
164 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); 182 pr_debug("insert_hash(), stripe %llu\n",
183 (unsigned long long)sh->sector);
165 184
166 CHECK_DEVLOCK(); 185 CHECK_DEVLOCK();
167 hlist_add_head(&sh->hash, hp); 186 hlist_add_head(&sh->hash, hp);
@@ -224,9 +243,10 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
224 243
225 BUG_ON(atomic_read(&sh->count) != 0); 244 BUG_ON(atomic_read(&sh->count) != 0);
226 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 245 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
227 246 BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete);
247
228 CHECK_DEVLOCK(); 248 CHECK_DEVLOCK();
229 PRINTK("init_stripe called, stripe %llu\n", 249 pr_debug("init_stripe called, stripe %llu\n",
230 (unsigned long long)sh->sector); 250 (unsigned long long)sh->sector);
231 251
232 remove_hash(sh); 252 remove_hash(sh);
@@ -240,11 +260,11 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
240 for (i = sh->disks; i--; ) { 260 for (i = sh->disks; i--; ) {
241 struct r5dev *dev = &sh->dev[i]; 261 struct r5dev *dev = &sh->dev[i];
242 262
243 if (dev->toread || dev->towrite || dev->written || 263 if (dev->toread || dev->read || dev->towrite || dev->written ||
244 test_bit(R5_LOCKED, &dev->flags)) { 264 test_bit(R5_LOCKED, &dev->flags)) {
245 printk("sector=%llx i=%d %p %p %p %d\n", 265 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
246 (unsigned long long)sh->sector, i, dev->toread, 266 (unsigned long long)sh->sector, i, dev->toread,
247 dev->towrite, dev->written, 267 dev->read, dev->towrite, dev->written,
248 test_bit(R5_LOCKED, &dev->flags)); 268 test_bit(R5_LOCKED, &dev->flags));
249 BUG(); 269 BUG();
250 } 270 }
@@ -260,11 +280,11 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
260 struct hlist_node *hn; 280 struct hlist_node *hn;
261 281
262 CHECK_DEVLOCK(); 282 CHECK_DEVLOCK();
263 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); 283 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
264 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 284 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
265 if (sh->sector == sector && sh->disks == disks) 285 if (sh->sector == sector && sh->disks == disks)
266 return sh; 286 return sh;
267 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); 287 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
268 return NULL; 288 return NULL;
269} 289}
270 290
@@ -276,7 +296,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
276{ 296{
277 struct stripe_head *sh; 297 struct stripe_head *sh;
278 298
279 PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector); 299 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
280 300
281 spin_lock_irq(&conf->device_lock); 301 spin_lock_irq(&conf->device_lock);
282 302
@@ -324,6 +344,579 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
324 return sh; 344 return sh;
325} 345}
326 346
/* test_and_ack_op() ensures that we only dequeue an operation once.
 *
 * Expects 'sh' (the stripe) and an integer 'ack' counter in the caller's
 * scope.  'op' stays set in 'pend' and 'ack' is incremented only when the
 * operation is pending, not complete, and its ack bit was newly set here;
 * in every other case 'op' is cleared from 'pend'.
 */
#define test_and_ack_op(op, pend) \
do {							\
	if (!test_bit(op, &sh->ops.pending) ||		\
	    test_bit(op, &sh->ops.complete))		\
		clear_bit(op, &pend);			\
	else if (test_and_set_bit(op, &sh->ops.ack))	\
		clear_bit(op, &pend);			\
	else						\
		ack++;					\
} while (0)
359
360/* find new work to run, do not resubmit work that is already
361 * in flight
362 */
363static unsigned long get_stripe_work(struct stripe_head *sh)
364{
365 unsigned long pending;
366 int ack = 0;
367
368 pending = sh->ops.pending;
369
370 test_and_ack_op(STRIPE_OP_BIOFILL, pending);
371 test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
372 test_and_ack_op(STRIPE_OP_PREXOR, pending);
373 test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
374 test_and_ack_op(STRIPE_OP_POSTXOR, pending);
375 test_and_ack_op(STRIPE_OP_CHECK, pending);
376 if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
377 ack++;
378
379 sh->ops.count -= ack;
380 BUG_ON(sh->ops.count < 0);
381
382 return pending;
383}
384
385static int
386raid5_end_read_request(struct bio *bi, unsigned int bytes_done, int error);
387static int
388raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error);
389
390static void ops_run_io(struct stripe_head *sh)
391{
392 raid5_conf_t *conf = sh->raid_conf;
393 int i, disks = sh->disks;
394
395 might_sleep();
396
397 for (i = disks; i--; ) {
398 int rw;
399 struct bio *bi;
400 mdk_rdev_t *rdev;
401 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
402 rw = WRITE;
403 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
404 rw = READ;
405 else
406 continue;
407
408 bi = &sh->dev[i].req;
409
410 bi->bi_rw = rw;
411 if (rw == WRITE)
412 bi->bi_end_io = raid5_end_write_request;
413 else
414 bi->bi_end_io = raid5_end_read_request;
415
416 rcu_read_lock();
417 rdev = rcu_dereference(conf->disks[i].rdev);
418 if (rdev && test_bit(Faulty, &rdev->flags))
419 rdev = NULL;
420 if (rdev)
421 atomic_inc(&rdev->nr_pending);
422 rcu_read_unlock();
423
424 if (rdev) {
425 if (test_bit(STRIPE_SYNCING, &sh->state) ||
426 test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
427 test_bit(STRIPE_EXPAND_READY, &sh->state))
428 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
429
430 bi->bi_bdev = rdev->bdev;
431 pr_debug("%s: for %llu schedule op %ld on disc %d\n",
432 __FUNCTION__, (unsigned long long)sh->sector,
433 bi->bi_rw, i);
434 atomic_inc(&sh->count);
435 bi->bi_sector = sh->sector + rdev->data_offset;
436 bi->bi_flags = 1 << BIO_UPTODATE;
437 bi->bi_vcnt = 1;
438 bi->bi_max_vecs = 1;
439 bi->bi_idx = 0;
440 bi->bi_io_vec = &sh->dev[i].vec;
441 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
442 bi->bi_io_vec[0].bv_offset = 0;
443 bi->bi_size = STRIPE_SIZE;
444 bi->bi_next = NULL;
445 if (rw == WRITE &&
446 test_bit(R5_ReWrite, &sh->dev[i].flags))
447 atomic_add(STRIPE_SECTORS,
448 &rdev->corrected_errors);
449 generic_make_request(bi);
450 } else {
451 if (rw == WRITE)
452 set_bit(STRIPE_DEGRADED, &sh->state);
453 pr_debug("skip op %ld on disc %d for sector %llu\n",
454 bi->bi_rw, i, (unsigned long long)sh->sector);
455 clear_bit(R5_LOCKED, &sh->dev[i].flags);
456 set_bit(STRIPE_HANDLE, &sh->state);
457 }
458 }
459}
460
461static struct dma_async_tx_descriptor *
462async_copy_data(int frombio, struct bio *bio, struct page *page,
463 sector_t sector, struct dma_async_tx_descriptor *tx)
464{
465 struct bio_vec *bvl;
466 struct page *bio_page;
467 int i;
468 int page_offset;
469
470 if (bio->bi_sector >= sector)
471 page_offset = (signed)(bio->bi_sector - sector) * 512;
472 else
473 page_offset = (signed)(sector - bio->bi_sector) * -512;
474 bio_for_each_segment(bvl, bio, i) {
475 int len = bio_iovec_idx(bio, i)->bv_len;
476 int clen;
477 int b_offset = 0;
478
479 if (page_offset < 0) {
480 b_offset = -page_offset;
481 page_offset += b_offset;
482 len -= b_offset;
483 }
484
485 if (len > 0 && page_offset + len > STRIPE_SIZE)
486 clen = STRIPE_SIZE - page_offset;
487 else
488 clen = len;
489
490 if (clen > 0) {
491 b_offset += bio_iovec_idx(bio, i)->bv_offset;
492 bio_page = bio_iovec_idx(bio, i)->bv_page;
493 if (frombio)
494 tx = async_memcpy(page, bio_page, page_offset,
495 b_offset, clen,
496 ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_SRC,
497 tx, NULL, NULL);
498 else
499 tx = async_memcpy(bio_page, page, b_offset,
500 page_offset, clen,
501 ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_DST,
502 tx, NULL, NULL);
503 }
504 if (clen < len) /* hit end of page */
505 break;
506 page_offset += len;
507 }
508
509 return tx;
510}
511
512static void ops_complete_biofill(void *stripe_head_ref)
513{
514 struct stripe_head *sh = stripe_head_ref;
515 struct bio *return_bi = NULL;
516 raid5_conf_t *conf = sh->raid_conf;
517 int i, more_to_read = 0;
518
519 pr_debug("%s: stripe %llu\n", __FUNCTION__,
520 (unsigned long long)sh->sector);
521
522 /* clear completed biofills */
523 for (i = sh->disks; i--; ) {
524 struct r5dev *dev = &sh->dev[i];
525 /* check if this stripe has new incoming reads */
526 if (dev->toread)
527 more_to_read++;
528
529 /* acknowledge completion of a biofill operation */
530 /* and check if we need to reply to a read request
531 */
532 if (test_bit(R5_Wantfill, &dev->flags) && !dev->toread) {
533 struct bio *rbi, *rbi2;
534 clear_bit(R5_Wantfill, &dev->flags);
535
536 /* The access to dev->read is outside of the
537 * spin_lock_irq(&conf->device_lock), but is protected
538 * by the STRIPE_OP_BIOFILL pending bit
539 */
540 BUG_ON(!dev->read);
541 rbi = dev->read;
542 dev->read = NULL;
543 while (rbi && rbi->bi_sector <
544 dev->sector + STRIPE_SECTORS) {
545 rbi2 = r5_next_bio(rbi, dev->sector);
546 spin_lock_irq(&conf->device_lock);
547 if (--rbi->bi_phys_segments == 0) {
548 rbi->bi_next = return_bi;
549 return_bi = rbi;
550 }
551 spin_unlock_irq(&conf->device_lock);
552 rbi = rbi2;
553 }
554 }
555 }
556 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
557 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
558
559 return_io(return_bi);
560
561 if (more_to_read)
562 set_bit(STRIPE_HANDLE, &sh->state);
563 release_stripe(sh);
564}
565
566static void ops_run_biofill(struct stripe_head *sh)
567{
568 struct dma_async_tx_descriptor *tx = NULL;
569 raid5_conf_t *conf = sh->raid_conf;
570 int i;
571
572 pr_debug("%s: stripe %llu\n", __FUNCTION__,
573 (unsigned long long)sh->sector);
574
575 for (i = sh->disks; i--; ) {
576 struct r5dev *dev = &sh->dev[i];
577 if (test_bit(R5_Wantfill, &dev->flags)) {
578 struct bio *rbi;
579 spin_lock_irq(&conf->device_lock);
580 dev->read = rbi = dev->toread;
581 dev->toread = NULL;
582 spin_unlock_irq(&conf->device_lock);
583 while (rbi && rbi->bi_sector <
584 dev->sector + STRIPE_SECTORS) {
585 tx = async_copy_data(0, rbi, dev->page,
586 dev->sector, tx);
587 rbi = r5_next_bio(rbi, dev->sector);
588 }
589 }
590 }
591
592 atomic_inc(&sh->count);
593 async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
594 ops_complete_biofill, sh);
595}
596
597static void ops_complete_compute5(void *stripe_head_ref)
598{
599 struct stripe_head *sh = stripe_head_ref;
600 int target = sh->ops.target;
601 struct r5dev *tgt = &sh->dev[target];
602
603 pr_debug("%s: stripe %llu\n", __FUNCTION__,
604 (unsigned long long)sh->sector);
605
606 set_bit(R5_UPTODATE, &tgt->flags);
607 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
608 clear_bit(R5_Wantcompute, &tgt->flags);
609 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
610 set_bit(STRIPE_HANDLE, &sh->state);
611 release_stripe(sh);
612}
613
614static struct dma_async_tx_descriptor *
615ops_run_compute5(struct stripe_head *sh, unsigned long pending)
616{
617 /* kernel stack size limits the total number of disks */
618 int disks = sh->disks;
619 struct page *xor_srcs[disks];
620 int target = sh->ops.target;
621 struct r5dev *tgt = &sh->dev[target];
622 struct page *xor_dest = tgt->page;
623 int count = 0;
624 struct dma_async_tx_descriptor *tx;
625 int i;
626
627 pr_debug("%s: stripe %llu block: %d\n",
628 __FUNCTION__, (unsigned long long)sh->sector, target);
629 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
630
631 for (i = disks; i--; )
632 if (i != target)
633 xor_srcs[count++] = sh->dev[i].page;
634
635 atomic_inc(&sh->count);
636
637 if (unlikely(count == 1))
638 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
639 0, NULL, ops_complete_compute5, sh);
640 else
641 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
642 ASYNC_TX_XOR_ZERO_DST, NULL,
643 ops_complete_compute5, sh);
644
645 /* ack now if postxor is not set to be run */
646 if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
647 async_tx_ack(tx);
648
649 return tx;
650}
651
652static void ops_complete_prexor(void *stripe_head_ref)
653{
654 struct stripe_head *sh = stripe_head_ref;
655
656 pr_debug("%s: stripe %llu\n", __FUNCTION__,
657 (unsigned long long)sh->sector);
658
659 set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
660}
661
662static struct dma_async_tx_descriptor *
663ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
664{
665 /* kernel stack size limits the total number of disks */
666 int disks = sh->disks;
667 struct page *xor_srcs[disks];
668 int count = 0, pd_idx = sh->pd_idx, i;
669
670 /* existing parity data subtracted */
671 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
672
673 pr_debug("%s: stripe %llu\n", __FUNCTION__,
674 (unsigned long long)sh->sector);
675
676 for (i = disks; i--; ) {
677 struct r5dev *dev = &sh->dev[i];
678 /* Only process blocks that are known to be uptodate */
679 if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags))
680 xor_srcs[count++] = dev->page;
681 }
682
683 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
684 ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
685 ops_complete_prexor, sh);
686
687 return tx;
688}
689
690static struct dma_async_tx_descriptor *
691ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
692{
693 int disks = sh->disks;
694 int pd_idx = sh->pd_idx, i;
695
696 /* check if prexor is active which means only process blocks
697 * that are part of a read-modify-write (Wantprexor)
698 */
699 int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
700
701 pr_debug("%s: stripe %llu\n", __FUNCTION__,
702 (unsigned long long)sh->sector);
703
704 for (i = disks; i--; ) {
705 struct r5dev *dev = &sh->dev[i];
706 struct bio *chosen;
707 int towrite;
708
709 towrite = 0;
710 if (prexor) { /* rmw */
711 if (dev->towrite &&
712 test_bit(R5_Wantprexor, &dev->flags))
713 towrite = 1;
714 } else { /* rcw */
715 if (i != pd_idx && dev->towrite &&
716 test_bit(R5_LOCKED, &dev->flags))
717 towrite = 1;
718 }
719
720 if (towrite) {
721 struct bio *wbi;
722
723 spin_lock(&sh->lock);
724 chosen = dev->towrite;
725 dev->towrite = NULL;
726 BUG_ON(dev->written);
727 wbi = dev->written = chosen;
728 spin_unlock(&sh->lock);
729
730 while (wbi && wbi->bi_sector <
731 dev->sector + STRIPE_SECTORS) {
732 tx = async_copy_data(1, wbi, dev->page,
733 dev->sector, tx);
734 wbi = r5_next_bio(wbi, dev->sector);
735 }
736 }
737 }
738
739 return tx;
740}
741
742static void ops_complete_postxor(void *stripe_head_ref)
743{
744 struct stripe_head *sh = stripe_head_ref;
745
746 pr_debug("%s: stripe %llu\n", __FUNCTION__,
747 (unsigned long long)sh->sector);
748
749 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
750 set_bit(STRIPE_HANDLE, &sh->state);
751 release_stripe(sh);
752}
753
754static void ops_complete_write(void *stripe_head_ref)
755{
756 struct stripe_head *sh = stripe_head_ref;
757 int disks = sh->disks, i, pd_idx = sh->pd_idx;
758
759 pr_debug("%s: stripe %llu\n", __FUNCTION__,
760 (unsigned long long)sh->sector);
761
762 for (i = disks; i--; ) {
763 struct r5dev *dev = &sh->dev[i];
764 if (dev->written || i == pd_idx)
765 set_bit(R5_UPTODATE, &dev->flags);
766 }
767
768 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
769 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
770
771 set_bit(STRIPE_HANDLE, &sh->state);
772 release_stripe(sh);
773}
774
775static void
776ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
777{
778 /* kernel stack size limits the total number of disks */
779 int disks = sh->disks;
780 struct page *xor_srcs[disks];
781
782 int count = 0, pd_idx = sh->pd_idx, i;
783 struct page *xor_dest;
784 int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
785 unsigned long flags;
786 dma_async_tx_callback callback;
787
788 pr_debug("%s: stripe %llu\n", __FUNCTION__,
789 (unsigned long long)sh->sector);
790
791 /* check if prexor is active which means only process blocks
792 * that are part of a read-modify-write (written)
793 */
794 if (prexor) {
795 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
796 for (i = disks; i--; ) {
797 struct r5dev *dev = &sh->dev[i];
798 if (dev->written)
799 xor_srcs[count++] = dev->page;
800 }
801 } else {
802 xor_dest = sh->dev[pd_idx].page;
803 for (i = disks; i--; ) {
804 struct r5dev *dev = &sh->dev[i];
805 if (i != pd_idx)
806 xor_srcs[count++] = dev->page;
807 }
808 }
809
810 /* check whether this postxor is part of a write */
811 callback = test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) ?
812 ops_complete_write : ops_complete_postxor;
813
814 /* 1/ if we prexor'd then the dest is reused as a source
815 * 2/ if we did not prexor then we are redoing the parity
816 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
817 * for the synchronous xor case
818 */
819 flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
820 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
821
822 atomic_inc(&sh->count);
823
824 if (unlikely(count == 1)) {
825 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
826 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
827 flags, tx, callback, sh);
828 } else
829 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
830 flags, tx, callback, sh);
831}
832
833static void ops_complete_check(void *stripe_head_ref)
834{
835 struct stripe_head *sh = stripe_head_ref;
836 int pd_idx = sh->pd_idx;
837
838 pr_debug("%s: stripe %llu\n", __FUNCTION__,
839 (unsigned long long)sh->sector);
840
841 if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
842 sh->ops.zero_sum_result == 0)
843 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
844
845 set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
846 set_bit(STRIPE_HANDLE, &sh->state);
847 release_stripe(sh);
848}
849
850static void ops_run_check(struct stripe_head *sh)
851{
852 /* kernel stack size limits the total number of disks */
853 int disks = sh->disks;
854 struct page *xor_srcs[disks];
855 struct dma_async_tx_descriptor *tx;
856
857 int count = 0, pd_idx = sh->pd_idx, i;
858 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
859
860 pr_debug("%s: stripe %llu\n", __FUNCTION__,
861 (unsigned long long)sh->sector);
862
863 for (i = disks; i--; ) {
864 struct r5dev *dev = &sh->dev[i];
865 if (i != pd_idx)
866 xor_srcs[count++] = dev->page;
867 }
868
869 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
870 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
871
872 if (tx)
873 set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
874 else
875 clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
876
877 atomic_inc(&sh->count);
878 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
879 ops_complete_check, sh);
880}
881
882static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
883{
884 int overlap_clear = 0, i, disks = sh->disks;
885 struct dma_async_tx_descriptor *tx = NULL;
886
887 if (test_bit(STRIPE_OP_BIOFILL, &pending)) {
888 ops_run_biofill(sh);
889 overlap_clear++;
890 }
891
892 if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
893 tx = ops_run_compute5(sh, pending);
894
895 if (test_bit(STRIPE_OP_PREXOR, &pending))
896 tx = ops_run_prexor(sh, tx);
897
898 if (test_bit(STRIPE_OP_BIODRAIN, &pending)) {
899 tx = ops_run_biodrain(sh, tx);
900 overlap_clear++;
901 }
902
903 if (test_bit(STRIPE_OP_POSTXOR, &pending))
904 ops_run_postxor(sh, tx);
905
906 if (test_bit(STRIPE_OP_CHECK, &pending))
907 ops_run_check(sh);
908
909 if (test_bit(STRIPE_OP_IO, &pending))
910 ops_run_io(sh);
911
912 if (overlap_clear)
913 for (i = disks; i--; ) {
914 struct r5dev *dev = &sh->dev[i];
915 if (test_and_clear_bit(R5_Overlap, &dev->flags))
916 wake_up(&sh->raid_conf->wait_for_overlap);
917 }
918}
919
327static int grow_one_stripe(raid5_conf_t *conf) 920static int grow_one_stripe(raid5_conf_t *conf)
328{ 921{
329 struct stripe_head *sh; 922 struct stripe_head *sh;
@@ -537,8 +1130,8 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
537 if (bi == &sh->dev[i].req) 1130 if (bi == &sh->dev[i].req)
538 break; 1131 break;
539 1132
540 PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1133 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
541 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1134 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
542 uptodate); 1135 uptodate);
543 if (i == disks) { 1136 if (i == disks) {
544 BUG(); 1137 BUG();
@@ -613,7 +1206,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
613 if (bi == &sh->dev[i].req) 1206 if (bi == &sh->dev[i].req)
614 break; 1207 break;
615 1208
616 PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1209 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
617 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1210 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
618 uptodate); 1211 uptodate);
619 if (i == disks) { 1212 if (i == disks) {
@@ -658,7 +1251,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
658{ 1251{
659 char b[BDEVNAME_SIZE]; 1252 char b[BDEVNAME_SIZE];
660 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 1253 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
661 PRINTK("raid5: error called\n"); 1254 pr_debug("raid5: error called\n");
662 1255
663 if (!test_bit(Faulty, &rdev->flags)) { 1256 if (!test_bit(Faulty, &rdev->flags)) {
664 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1257 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -916,137 +1509,13 @@ static void copy_data(int frombio, struct bio *bio,
916 } 1509 }
917} 1510}
918 1511
919#define check_xor() do { \ 1512#define check_xor() do { \
920 if (count == MAX_XOR_BLOCKS) { \ 1513 if (count == MAX_XOR_BLOCKS) { \
921 xor_block(count, STRIPE_SIZE, ptr); \ 1514 xor_blocks(count, STRIPE_SIZE, dest, ptr);\
922 count = 1; \ 1515 count = 0; \
923 } \ 1516 } \
924 } while(0) 1517 } while(0)
925 1518
926
/*
 * compute_block - rebuild the block at index dd_idx from the other blocks
 * of the stripe.
 *
 * The target page is zeroed and placed in ptr[0]; the page of every other
 * device whose R5_UPTODATE flag is set is appended to ptr[].  A device that
 * is not uptodate is reported with a KERN_ERR message and left out.  The
 * collected pages are handed to xor_block() and the target is then marked
 * R5_UPTODATE unconditionally.
 */
927static void compute_block(struct stripe_head *sh, int dd_idx)
928{
929 int i, count, disks = sh->disks;
930 void *ptr[MAX_XOR_BLOCKS], *p;
931
932 PRINTK("compute_block, stripe %llu, idx %d\n",
933 (unsigned long long)sh->sector, dd_idx);
934
/* ptr[0] is the block being rebuilt; it starts out as all zeroes */
935 ptr[0] = page_address(sh->dev[dd_idx].page);
936 memset(ptr[0], 0, STRIPE_SIZE);
937 count = 1;
938 for (i = disks ; i--; ) {
939 if (i == dd_idx)
940 continue;
941 p = page_address(sh->dev[i].page);
942 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
943 ptr[count++] = p;
944 else
945 printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
946 " not present\n", dd_idx,
947 (unsigned long long)sh->sector, i);
948
/* check_xor() is a macro defined outside this function that works on
 * the locals count and ptr; presumably it flushes ptr[] once it holds
 * MAX_XOR_BLOCKS entries - confirm against its definition
 */
949 check_xor();
950 }
/* count == 1 means only the destination is queued: nothing left to xor */
951 if (count != 1)
952 xor_block(count, STRIPE_SIZE, ptr);
953 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
954}
955
/*
 * compute_parity5 - update or check the parity block of a stripe.
 *
 * ptr[0] always refers to the parity page.  @method selects what happens:
 *
 *  READ_MODIFY_WRITE  the parity must already be R5_UPTODATE.  Every
 *                     uptodate non-parity block with a pending write is
 *                     passed to xor_block() together with the parity
 *                     before its new data is copied in, and again after.
 *  RECONSTRUCT_WRITE  the parity page is zeroed and, after the new data
 *                     has been copied in, every non-parity block is passed
 *                     to xor_block() together with the parity.
 *  CHECK_PARITY       no bios are touched; every non-parity block is
 *                     passed to xor_block() together with the parity.
 *
 * For both write methods the affected ->towrite lists are moved to
 * ->written (waking wait_for_overlap waiters when R5_Overlap was set), the
 * bio data is copied into the stripe pages with copy_data(), and the
 * written blocks are marked R5_LOCKED and R5_UPTODATE.  On exit the parity
 * block is marked R5_UPTODATE and R5_LOCKED, except for CHECK_PARITY where
 * its R5_UPTODATE flag is cleared instead.
 */
956static void compute_parity5(struct stripe_head *sh, int method)
957{
958 raid5_conf_t *conf = sh->raid_conf;
959 int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
960 void *ptr[MAX_XOR_BLOCKS];
961 struct bio *chosen;
962
963 PRINTK("compute_parity5, stripe %llu, method %d\n",
964 (unsigned long long)sh->sector, method);
965
966 count = 1;
967 ptr[0] = page_address(sh->dev[pd_idx].page);
/* phase 1: claim the pending writes; for rmw also queue the old data */
968 switch(method) {
969 case READ_MODIFY_WRITE:
970 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
971 for (i=disks ; i-- ;) {
972 if (i==pd_idx)
973 continue;
974 if (sh->dev[i].towrite &&
975 test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
976 ptr[count++] = page_address(sh->dev[i].page);
977 chosen = sh->dev[i].towrite;
978 sh->dev[i].towrite = NULL;
979
980 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
981 wake_up(&conf->wait_for_overlap);
982
983 BUG_ON(sh->dev[i].written);
984 sh->dev[i].written = chosen;
985 check_xor();
986 }
987 }
988 break;
989 case RECONSTRUCT_WRITE:
990 memset(ptr[0], 0, STRIPE_SIZE);
991 for (i= disks; i-- ;)
992 if (i!=pd_idx && sh->dev[i].towrite) {
993 chosen = sh->dev[i].towrite;
994 sh->dev[i].towrite = NULL;
995
996 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
997 wake_up(&conf->wait_for_overlap);
998
999 BUG_ON(sh->dev[i].written);
1000 sh->dev[i].written = chosen;
1001 }
1002 break;
1003 case CHECK_PARITY:
1004 break;
1005 }
/* flush whatever phase 1 queued before the pages are overwritten below */
1006 if (count>1) {
1007 xor_block(count, STRIPE_SIZE, ptr);
1008 count = 1;
1009 }
1010
/* phase 2: copy the new data from the claimed bios into the stripe pages */
1011 for (i = disks; i--;)
1012 if (sh->dev[i].written) {
1013 sector_t sector = sh->dev[i].sector;
1014 struct bio *wbi = sh->dev[i].written;
1015 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1016 copy_data(1, wbi, sh->dev[i].page, sector);
1017 wbi = r5_next_bio(wbi, sector);
1018 }
1019
1020 set_bit(R5_LOCKED, &sh->dev[i].flags);
1021 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1022 }
1023
/* phase 3: queue the blocks that make up the final parity */
1024 switch(method) {
1025 case RECONSTRUCT_WRITE:
1026 case CHECK_PARITY:
1027 for (i=disks; i--;)
1028 if (i != pd_idx) {
1029 ptr[count++] = page_address(sh->dev[i].page);
1030 check_xor();
1031 }
1032 break;
1033 case READ_MODIFY_WRITE:
1034 for (i = disks; i--;)
1035 if (sh->dev[i].written) {
1036 ptr[count++] = page_address(sh->dev[i].page);
1037 check_xor();
1038 }
1039 }
1040 if (count != 1)
1041 xor_block(count, STRIPE_SIZE, ptr);
1042
1043 if (method != CHECK_PARITY) {
1044 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1045 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1046 } else
1047 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1048}
1049
1050static void compute_parity6(struct stripe_head *sh, int method) 1519static void compute_parity6(struct stripe_head *sh, int method)
1051{ 1520{
1052 raid6_conf_t *conf = sh->raid_conf; 1521 raid6_conf_t *conf = sh->raid_conf;
@@ -1058,7 +1527,7 @@ static void compute_parity6(struct stripe_head *sh, int method)
1058 qd_idx = raid6_next_disk(pd_idx, disks); 1527 qd_idx = raid6_next_disk(pd_idx, disks);
1059 d0_idx = raid6_next_disk(qd_idx, disks); 1528 d0_idx = raid6_next_disk(qd_idx, disks);
1060 1529
1061 PRINTK("compute_parity, stripe %llu, method %d\n", 1530 pr_debug("compute_parity, stripe %llu, method %d\n",
1062 (unsigned long long)sh->sector, method); 1531 (unsigned long long)sh->sector, method);
1063 1532
1064 switch(method) { 1533 switch(method) {
@@ -1132,20 +1601,20 @@ static void compute_parity6(struct stripe_head *sh, int method)
1132static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) 1601static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1133{ 1602{
1134 int i, count, disks = sh->disks; 1603 int i, count, disks = sh->disks;
1135 void *ptr[MAX_XOR_BLOCKS], *p; 1604 void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1136 int pd_idx = sh->pd_idx; 1605 int pd_idx = sh->pd_idx;
1137 int qd_idx = raid6_next_disk(pd_idx, disks); 1606 int qd_idx = raid6_next_disk(pd_idx, disks);
1138 1607
1139 PRINTK("compute_block_1, stripe %llu, idx %d\n", 1608 pr_debug("compute_block_1, stripe %llu, idx %d\n",
1140 (unsigned long long)sh->sector, dd_idx); 1609 (unsigned long long)sh->sector, dd_idx);
1141 1610
1142 if ( dd_idx == qd_idx ) { 1611 if ( dd_idx == qd_idx ) {
1143 /* We're actually computing the Q drive */ 1612 /* We're actually computing the Q drive */
1144 compute_parity6(sh, UPDATE_PARITY); 1613 compute_parity6(sh, UPDATE_PARITY);
1145 } else { 1614 } else {
1146 ptr[0] = page_address(sh->dev[dd_idx].page); 1615 dest = page_address(sh->dev[dd_idx].page);
1147 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE); 1616 if (!nozero) memset(dest, 0, STRIPE_SIZE);
1148 count = 1; 1617 count = 0;
1149 for (i = disks ; i--; ) { 1618 for (i = disks ; i--; ) {
1150 if (i == dd_idx || i == qd_idx) 1619 if (i == dd_idx || i == qd_idx)
1151 continue; 1620 continue;
@@ -1159,8 +1628,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1159 1628
1160 check_xor(); 1629 check_xor();
1161 } 1630 }
1162 if (count != 1) 1631 if (count)
1163 xor_block(count, STRIPE_SIZE, ptr); 1632 xor_blocks(count, STRIPE_SIZE, dest, ptr);
1164 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 1633 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1165 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 1634 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1166 } 1635 }
@@ -1183,7 +1652,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1183 BUG_ON(faila == failb); 1652 BUG_ON(faila == failb);
1184 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } 1653 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1185 1654
1186 PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", 1655 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1187 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); 1656 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
1188 1657
1189 if ( failb == disks-1 ) { 1658 if ( failb == disks-1 ) {
@@ -1229,7 +1698,79 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1229 } 1698 }
1230} 1699}
1231 1700
1701static int
1702handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1703{
1704 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1705 int locked = 0;
1232 1706
1707 if (rcw) {
1708 /* if we are not expanding this is a proper write request, and
1709 * there will be bios with new data to be drained into the
1710 * stripe cache
1711 */
1712 if (!expand) {
1713 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
1714 sh->ops.count++;
1715 }
1716
1717 set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
1718 sh->ops.count++;
1719
1720 for (i = disks; i--; ) {
1721 struct r5dev *dev = &sh->dev[i];
1722
1723 if (dev->towrite) {
1724 set_bit(R5_LOCKED, &dev->flags);
1725 if (!expand)
1726 clear_bit(R5_UPTODATE, &dev->flags);
1727 locked++;
1728 }
1729 }
1730 } else {
1731 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1732 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1733
1734 set_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
1735 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
1736 set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
1737
1738 sh->ops.count += 3;
1739
1740 for (i = disks; i--; ) {
1741 struct r5dev *dev = &sh->dev[i];
1742 if (i == pd_idx)
1743 continue;
1744
1745 /* For a read-modify write there may be blocks that are
1746 * locked for reading while others are ready to be
1747 * written so we distinguish these blocks by the
1748 * R5_Wantprexor bit
1749 */
1750 if (dev->towrite &&
1751 (test_bit(R5_UPTODATE, &dev->flags) ||
1752 test_bit(R5_Wantcompute, &dev->flags))) {
1753 set_bit(R5_Wantprexor, &dev->flags);
1754 set_bit(R5_LOCKED, &dev->flags);
1755 clear_bit(R5_UPTODATE, &dev->flags);
1756 locked++;
1757 }
1758 }
1759 }
1760
1761 /* keep the parity disk locked while asynchronous operations
1762 * are in flight
1763 */
1764 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1765 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1766 locked++;
1767
1768 pr_debug("%s: stripe %llu locked: %d pending: %lx\n",
1769 __FUNCTION__, (unsigned long long)sh->sector,
1770 locked, sh->ops.pending);
1771
1772 return locked;
1773}
1233 1774
1234/* 1775/*
1235 * Each stripe/dev can have one or more bion attached. 1776 * Each stripe/dev can have one or more bion attached.
@@ -1242,7 +1783,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1242 raid5_conf_t *conf = sh->raid_conf; 1783 raid5_conf_t *conf = sh->raid_conf;
1243 int firstwrite=0; 1784 int firstwrite=0;
1244 1785
1245 PRINTK("adding bh b#%llu to stripe s#%llu\n", 1786 pr_debug("adding bh b#%llu to stripe s#%llu\n",
1246 (unsigned long long)bi->bi_sector, 1787 (unsigned long long)bi->bi_sector,
1247 (unsigned long long)sh->sector); 1788 (unsigned long long)sh->sector);
1248 1789
@@ -1271,7 +1812,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1271 spin_unlock_irq(&conf->device_lock); 1812 spin_unlock_irq(&conf->device_lock);
1272 spin_unlock(&sh->lock); 1813 spin_unlock(&sh->lock);
1273 1814
1274 PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n", 1815 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
1275 (unsigned long long)bi->bi_sector, 1816 (unsigned long long)bi->bi_sector,
1276 (unsigned long long)sh->sector, dd_idx); 1817 (unsigned long long)sh->sector, dd_idx);
1277 1818
@@ -1326,6 +1867,729 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1326 return pd_idx; 1867 return pd_idx;
1327} 1868}
1328 1869
/*
 * handle_requests_to_failed_array - fail the bios queued on a stripe
 * @conf: array configuration (device_lock, wait_for_overlap, mddev, disks)
 * @sh: stripe whose requests are failed
 * @s: per-pass stripe state; to_write / to_read are decremented for each
 *	device whose towrite / toread list is taken away here
 * @disks: number of devices to walk
 * @return_bi: head of the list that collects bios whose bi_phys_segments
 *	count dropped to zero
 *
 * For every device: a recorded R5_ReadError is escalated with md_error()
 * when the rdev is still In_sync; all ->towrite and ->written bios have
 * BIO_UPTODATE cleared; ->toread bios are failed the same way, but only
 * when R5_Wantfill is not set and the device is not R5_Insync or has
 * R5_ReadError.  bitmap_endwrite() is called for the stripe whenever a
 * towrite or written list was present on the device.
 */
1870static void
1871handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
1872 struct stripe_head_state *s, int disks,
1873 struct bio **return_bi)
1874{
1875 int i;
1876 for (i = disks; i--; ) {
1877 struct bio *bi;
1878 int bitmap_end = 0;
1879
1880 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1881 mdk_rdev_t *rdev;
1882 rcu_read_lock();
1883 rdev = rcu_dereference(conf->disks[i].rdev);
1884 if (rdev && test_bit(In_sync, &rdev->flags))
1885 /* multiple read failures in one stripe */
1886 md_error(conf->mddev, rdev);
1887 rcu_read_unlock();
1888 }
/* the bio lists below are detached and walked under device_lock */
1889 spin_lock_irq(&conf->device_lock);
1890 /* fail all writes first */
1891 bi = sh->dev[i].towrite;
1892 sh->dev[i].towrite = NULL;
1893 if (bi) {
1894 s->to_write--;
1895 bitmap_end = 1;
1896 }
1897
1898 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1899 wake_up(&conf->wait_for_overlap);
1900
1901 while (bi && bi->bi_sector <
1902 sh->dev[i].sector + STRIPE_SECTORS) {
1903 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1904 clear_bit(BIO_UPTODATE, &bi->bi_flags);
/* last outstanding segment: hand the bio back through *return_bi */
1905 if (--bi->bi_phys_segments == 0) {
1906 md_write_end(conf->mddev);
1907 bi->bi_next = *return_bi;
1908 *return_bi = bi;
1909 }
1910 bi = nextbi;
1911 }
1912 /* and fail all 'written' */
1913 bi = sh->dev[i].written;
1914 sh->dev[i].written = NULL;
1915 if (bi) bitmap_end = 1;
1916 while (bi && bi->bi_sector <
1917 sh->dev[i].sector + STRIPE_SECTORS) {
1918 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1919 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1920 if (--bi->bi_phys_segments == 0) {
1921 md_write_end(conf->mddev);
1922 bi->bi_next = *return_bi;
1923 *return_bi = bi;
1924 }
1925 bi = bi2;
1926 }
1927
1928 /* fail any reads if this device is non-operational and
1929 * the data has not reached the cache yet.
1930 */
1931 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
1932 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1933 test_bit(R5_ReadError, &sh->dev[i].flags))) {
1934 bi = sh->dev[i].toread;
1935 sh->dev[i].toread = NULL;
1936 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1937 wake_up(&conf->wait_for_overlap);
1938 if (bi) s->to_read--;
1939 while (bi && bi->bi_sector <
1940 sh->dev[i].sector + STRIPE_SECTORS) {
1941 struct bio *nextbi =
1942 r5_next_bio(bi, sh->dev[i].sector);
1943 clear_bit(BIO_UPTODATE, &bi->bi_flags);
/* unlike the write paths, no md_write_end() for a failed read */
1944 if (--bi->bi_phys_segments == 0) {
1945 bi->bi_next = *return_bi;
1946 *return_bi = bi;
1947 }
1948 bi = nextbi;
1949 }
1950 }
1951 spin_unlock_irq(&conf->device_lock);
/* bitmap_endwrite() is called only after device_lock has been dropped */
1952 if (bitmap_end)
1953 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1954 STRIPE_SECTORS, 0, 0);
1955 }
1956
1957}
1958
1959/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
1960 * to process
 *
 * @sh: stripe being handled
 * @s: per-pass stripe state (uptodate, locked, failed, req_compute, ...)
 * @disk_idx: index of the block to consider
 * @disks: number of devices in the stripe
 *
 * Decides, for one block, whether to schedule a compute operation or a
 * read.  Returns 0 only when a compute was scheduled for this block, which
 * tells the caller to stop walking the remaining disks; returns ~0 in
 * every other case, including when a read was scheduled or nothing was
 * done.
1961 */
1962static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
1963 struct stripe_head_state *s, int disk_idx, int disks)
1964{
1965 struct r5dev *dev = &sh->dev[disk_idx];
/* only dereferenced below when s->failed is non-zero */
1966 struct r5dev *failed_dev = &sh->dev[s->failed_num];
1967
1968 /* don't schedule compute operations or reads on the parity block while
1969 * a check is in flight
1970 */
1971 if ((disk_idx == sh->pd_idx) &&
1972 test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
1973 return ~0;
1974
1975 /* is the data in this block needed, and can we get it? */
1976 if (!test_bit(R5_LOCKED, &dev->flags) &&
1977 !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread ||
1978 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1979 s->syncing || s->expanding || (s->failed &&
1980 (failed_dev->toread || (failed_dev->towrite &&
1981 !test_bit(R5_OVERWRITE, &failed_dev->flags)
1982 ))))) {
1983 /* 1/ We would like to get this block, possibly by computing it,
1984 * but we might not be able to.
1985 *
1986 * 2/ Since parity check operations potentially make the parity
1987 * block !uptodate it will need to be refreshed before any
1988 * compute operations on data disks are scheduled.
1989 *
1990 * 3/ We hold off parity block re-reads until check operations
1991 * have quiesced.
1992 */
1993 if ((s->uptodate == disks - 1) &&
1994 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
/* exactly one block missing and no check running: compute it */
1995 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
1996 set_bit(R5_Wantcompute, &dev->flags);
1997 sh->ops.target = disk_idx;
1998 s->req_compute = 1;
1999 sh->ops.count++;
2000 /* Careful: from this point on 'uptodate' is in the eye
2001 * of raid5_run_ops which services 'compute' operations
2002 * before writes. R5_Wantcompute flags a block that will
2003 * be R5_UPTODATE by the time it is needed for a
2004 * subsequent operation.
2005 */
2006 s->uptodate++;
2007 return 0; /* uptodate + compute == disks */
2008 } else if ((s->uptodate < disks - 1) &&
2009 test_bit(R5_Insync, &dev->flags)) {
2010 /* Note: we hold off compute operations while checks are
2011 * in flight, but we still prefer 'compute' over 'read'
2012 * hence we only read if (uptodate < disks-1)
2013 */
2014 set_bit(R5_LOCKED, &dev->flags);
2015 set_bit(R5_Wantread, &dev->flags);
/* ops.count is bumped only the first time STRIPE_OP_IO becomes pending */
2016 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2017 sh->ops.count++;
2018 s->locked++;
2019 pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2020 s->syncing);
2021 }
2022 }
2023
2024 return ~0;
2025}
2026
2027static void handle_issuing_new_read_requests5(struct stripe_head *sh,
2028 struct stripe_head_state *s, int disks)
2029{
2030 int i;
2031
2032 /* Clear completed compute operations. Parity recovery
2033 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
2034 * later on in this routine
2035 */
2036 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2037 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2038 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2039 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2040 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2041 }
2042
2043 /* look for blocks to read/compute, skip this if a compute
2044 * is already in flight, or if the stripe contents are in the
2045 * midst of changing due to a write
2046 */
2047 if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
2048 !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
2049 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2050 for (i = disks; i--; )
2051 if (__handle_issuing_new_read_requests5(
2052 sh, s, i, disks) == 0)
2053 break;
2054 }
2055 set_bit(STRIPE_HANDLE, &sh->state);
2056}
2057
/*
 * handle_issuing_new_read_requests6 - read or compute wanted blocks
 * @sh: stripe being handled
 * @s: per-pass stripe state (uptodate, locked, failed, syncing, ...)
 * @r6s: state holding the indices of up to two failed devices
 * @disks: number of devices in the stripe
 *
 * For every block that is neither R5_LOCKED nor R5_UPTODATE and is wanted,
 * the block is either computed in place with compute_block_1() /
 * compute_block_2() or marked R5_LOCKED | R5_Wantread so that it gets
 * read.  s->uptodate and s->locked are adjusted to match.  STRIPE_HANDLE
 * is always set on return.
 */
2058static void handle_issuing_new_read_requests6(struct stripe_head *sh,
2059 struct stripe_head_state *s, struct r6_state *r6s,
2060 int disks)
2061{
2062 int i;
2063 for (i = disks; i--; ) {
2064 struct r5dev *dev = &sh->dev[i];
/* wanted: pending read, partial write, sync/expand, or a failed
 * device has a pending read or there are writes to service
 */
2065 if (!test_bit(R5_LOCKED, &dev->flags) &&
2066 !test_bit(R5_UPTODATE, &dev->flags) &&
2067 (dev->toread || (dev->towrite &&
2068 !test_bit(R5_OVERWRITE, &dev->flags)) ||
2069 s->syncing || s->expanding ||
2070 (s->failed >= 1 &&
2071 (sh->dev[r6s->failed_num[0]].toread ||
2072 s->to_write)) ||
2073 (s->failed >= 2 &&
2074 (sh->dev[r6s->failed_num[1]].toread ||
2075 s->to_write)))) {
2076 /* we would like to get this block, possibly
2077 * by computing it, but we might not be able to
2078 */
2079 if (s->uptodate == disks-1) {
2080 pr_debug("Computing stripe %llu block %d\n",
2081 (unsigned long long)sh->sector, i);
2082 compute_block_1(sh, i, 0);
2083 s->uptodate++;
2084 } else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
2085 /* Computing 2-failure is *very* expensive; only
2086 * do it if failed >= 2
2087 */
2088 int other;
/* find the second block, besides i, that is not uptodate */
2089 for (other = disks; other--; ) {
2090 if (other == i)
2091 continue;
2092 if (!test_bit(R5_UPTODATE,
2093 &sh->dev[other].flags))
2094 break;
2095 }
/* other is -1 here only if the loop ran out without a match */
2096 BUG_ON(other < 0);
2097 pr_debug("Computing stripe %llu blocks %d,%d\n",
2098 (unsigned long long)sh->sector,
2099 i, other);
2100 compute_block_2(sh, i, other);
2101 s->uptodate += 2;
2102 } else if (test_bit(R5_Insync, &dev->flags)) {
2103 set_bit(R5_LOCKED, &dev->flags);
2104 set_bit(R5_Wantread, &dev->flags);
2105 s->locked++;
2106 pr_debug("Reading block %d (sync=%d)\n",
2107 i, s->syncing);
2108 }
2109 }
2110 }
2111 set_bit(STRIPE_HANDLE, &sh->state);
2112}
2113
2114
2115/* handle_completed_write_requests
2116 * any written block on an uptodate or failed drive can be returned.
2117 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2118 * never LOCKED, so we don't need to test 'failed' directly.
 *
 * @conf: array configuration (device_lock, mddev)
 * @sh: stripe whose ->written lists are examined
 * @disks: number of devices to walk
 * @return_bi: head of the list that collects bios whose bi_phys_segments
 *	count dropped to zero
 *
 * For each device with a ->written list that is R5_UPTODATE and not
 * R5_LOCKED, the list is detached under device_lock and its bios are
 * completed.  bitmap_endwrite() is called afterwards when the device has
 * no further ->towrite bios queued.
2119 */
2120static void handle_completed_write_requests(raid5_conf_t *conf,
2121 struct stripe_head *sh, int disks, struct bio **return_bi)
2122{
2123 int i;
2124 struct r5dev *dev;
2125
2126 for (i = disks; i--; )
2127 if (sh->dev[i].written) {
2128 dev = &sh->dev[i];
2129 if (!test_bit(R5_LOCKED, &dev->flags) &&
2130 test_bit(R5_UPTODATE, &dev->flags)) {
2131 /* We can return any write requests */
2132 struct bio *wbi, *wbi2;
2133 int bitmap_end = 0;
2134 pr_debug("Return write for disc %d\n", i);
2135 spin_lock_irq(&conf->device_lock);
2136 wbi = dev->written;
2137 dev->written = NULL;
2138 while (wbi && wbi->bi_sector <
2139 dev->sector + STRIPE_SECTORS) {
2140 wbi2 = r5_next_bio(wbi, dev->sector);
/* last outstanding segment: hand the bio back through *return_bi */
2141 if (--wbi->bi_phys_segments == 0) {
2142 md_write_end(conf->mddev);
2143 wbi->bi_next = *return_bi;
2144 *return_bi = wbi;
2145 }
2146 wbi = wbi2;
2147 }
/* sampled while device_lock is still held */
2148 if (dev->towrite == NULL)
2149 bitmap_end = 1;
2150 spin_unlock_irq(&conf->device_lock);
2151 if (bitmap_end)
2152 bitmap_endwrite(conf->mddev->bitmap,
2153 sh->sector,
2154 STRIPE_SECTORS,
2155 !test_bit(STRIPE_DEGRADED, &sh->state),
2156 0);
2157 }
2158 }
2159}
2160
/*
 * handle_issuing_new_write_requests5 - choose and start a write strategy
 * @conf: array configuration (not referenced in this function)
 * @sh: stripe being handled
 * @s: per-pass stripe state (locked, req_compute, ...)
 * @disks: number of devices in the stripe
 *
 * Counts how many blocks would have to be read for a read-modify-write
 * (rmw) and for a reconstruct-write (rcw).  A block that would have to be
 * read but is not R5_Insync adds 2*disks to the count, which pushes that
 * strategy out of contention.  Reads for the cheaper strategy are then
 * scheduled if STRIPE_PREREAD_ACTIVE is set; otherwise the stripe is
 * marked STRIPE_DELAYED.  Once nothing is locked and one of the counts is
 * zero, handle_write_operations5() is called, selecting reconstruct-write
 * when rcw == 0 and read-modify-write otherwise.
 */
2161static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2162 struct stripe_head *sh, struct stripe_head_state *s, int disks)
2163{
2164 int rmw = 0, rcw = 0, i;
2165 for (i = disks; i--; ) {
2166 /* would I have to read this buffer for read_modify_write */
2167 struct r5dev *dev = &sh->dev[i];
2168 if ((dev->towrite || i == sh->pd_idx) &&
2169 !test_bit(R5_LOCKED, &dev->flags) &&
2170 !(test_bit(R5_UPTODATE, &dev->flags) ||
2171 test_bit(R5_Wantcompute, &dev->flags))) {
2172 if (test_bit(R5_Insync, &dev->flags))
2173 rmw++;
2174 else
2175 rmw += 2*disks; /* cannot read it */
2176 }
2177 /* Would I have to read this buffer for reconstruct_write */
2178 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2179 !test_bit(R5_LOCKED, &dev->flags) &&
2180 !(test_bit(R5_UPTODATE, &dev->flags) ||
2181 test_bit(R5_Wantcompute, &dev->flags))) {
2182 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2183 else
2184 rcw += 2*disks;
2185 }
2186 }
2187 pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2188 (unsigned long long)sh->sector, rmw, rcw);
2189 set_bit(STRIPE_HANDLE, &sh->state);
2190 if (rmw < rcw && rmw > 0)
2191 /* prefer read-modify-write, but need to get some data */
2192 for (i = disks; i--; ) {
2193 struct r5dev *dev = &sh->dev[i];
2194 if ((dev->towrite || i == sh->pd_idx) &&
2195 !test_bit(R5_LOCKED, &dev->flags) &&
2196 !(test_bit(R5_UPTODATE, &dev->flags) ||
2197 test_bit(R5_Wantcompute, &dev->flags)) &&
2198 test_bit(R5_Insync, &dev->flags)) {
2199 if (
2200 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2201 pr_debug("Read_old block "
2202 "%d for r-m-w\n", i);
2203 set_bit(R5_LOCKED, &dev->flags);
2204 set_bit(R5_Wantread, &dev->flags);
/* ops.count is bumped only the first time STRIPE_OP_IO becomes pending */
2205 if (!test_and_set_bit(
2206 STRIPE_OP_IO, &sh->ops.pending))
2207 sh->ops.count++;
2208 s->locked++;
2209 } else {
2210 set_bit(STRIPE_DELAYED, &sh->state);
2211 set_bit(STRIPE_HANDLE, &sh->state);
2212 }
2213 }
2214 }
/* ties go to reconstruct-write (rcw <= rmw) */
2215 if (rcw <= rmw && rcw > 0)
2216 /* want reconstruct write, but need to get some data */
2217 for (i = disks; i--; ) {
2218 struct r5dev *dev = &sh->dev[i];
2219 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2220 i != sh->pd_idx &&
2221 !test_bit(R5_LOCKED, &dev->flags) &&
2222 !(test_bit(R5_UPTODATE, &dev->flags) ||
2223 test_bit(R5_Wantcompute, &dev->flags)) &&
2224 test_bit(R5_Insync, &dev->flags)) {
2225 if (
2226 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2227 pr_debug("Read_old block "
2228 "%d for Reconstruct\n", i);
2229 set_bit(R5_LOCKED, &dev->flags);
2230 set_bit(R5_Wantread, &dev->flags);
2231 if (!test_and_set_bit(
2232 STRIPE_OP_IO, &sh->ops.pending))
2233 sh->ops.count++;
2234 s->locked++;
2235 } else {
2236 set_bit(STRIPE_DELAYED, &sh->state);
2237 set_bit(STRIPE_HANDLE, &sh->state);
2238 }
2239 }
2240 }
2241 /* now if nothing is locked, and if we have enough data,
2242 * we can start a write request
2243 */
2244 /* since handle_stripe can be called at any time we need to handle the
2245 * case where a compute block operation has been submitted and then a
2246 * subsequent call wants to start a write request. raid5_run_ops only
2247 * handles the case where compute block and postxor are requested
2248 * simultaneously. If this is not the case then new writes need to be
2249 * held off until the compute completes.
2250 */
2251 if ((s->req_compute ||
2252 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
2253 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2254 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
/* second argument selects reconstruct-write when rcw == 0; third is expand = 0 */
2255 s->locked += handle_write_operations5(sh, rcw == 0, 0);
2256}
2257
2258static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
2259 struct stripe_head *sh, struct stripe_head_state *s,
2260 struct r6_state *r6s, int disks)
2261{
2262 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
2263 int qd_idx = r6s->qd_idx;
2264 for (i = disks; i--; ) {
2265 struct r5dev *dev = &sh->dev[i];
2266 /* Would I have to read this buffer for reconstruct_write */
2267 if (!test_bit(R5_OVERWRITE, &dev->flags)
2268 && i != pd_idx && i != qd_idx
2269 && (!test_bit(R5_LOCKED, &dev->flags)
2270 ) &&
2271 !test_bit(R5_UPTODATE, &dev->flags)) {
2272 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2273 else {
2274 pr_debug("raid6: must_compute: "
2275 "disk %d flags=%#lx\n", i, dev->flags);
2276 must_compute++;
2277 }
2278 }
2279 }
2280 pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
2281 (unsigned long long)sh->sector, rcw, must_compute);
2282 set_bit(STRIPE_HANDLE, &sh->state);
2283
2284 if (rcw > 0)
2285 /* want reconstruct write, but need to get some data */
2286 for (i = disks; i--; ) {
2287 struct r5dev *dev = &sh->dev[i];
2288 if (!test_bit(R5_OVERWRITE, &dev->flags)
2289 && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
2290 && !test_bit(R5_LOCKED, &dev->flags) &&
2291 !test_bit(R5_UPTODATE, &dev->flags) &&
2292 test_bit(R5_Insync, &dev->flags)) {
2293 if (
2294 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2295 pr_debug("Read_old stripe %llu "
2296 "block %d for Reconstruct\n",
2297 (unsigned long long)sh->sector, i);
2298 set_bit(R5_LOCKED, &dev->flags);
2299 set_bit(R5_Wantread, &dev->flags);
2300 s->locked++;
2301 } else {
2302 pr_debug("Request delayed stripe %llu "
2303 "block %d for Reconstruct\n",
2304 (unsigned long long)sh->sector, i);
2305 set_bit(STRIPE_DELAYED, &sh->state);
2306 set_bit(STRIPE_HANDLE, &sh->state);
2307 }
2308 }
2309 }
2310 /* now if nothing is locked, and if we have enough data, we can start a
2311 * write request
2312 */
2313 if (s->locked == 0 && rcw == 0 &&
2314 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2315 if (must_compute > 0) {
2316 /* We have failed blocks and need to compute them */
2317 switch (s->failed) {
2318 case 0:
2319 BUG();
2320 case 1:
2321 compute_block_1(sh, r6s->failed_num[0], 0);
2322 break;
2323 case 2:
2324 compute_block_2(sh, r6s->failed_num[0],
2325 r6s->failed_num[1]);
2326 break;
2327 default: /* This request should have been failed? */
2328 BUG();
2329 }
2330 }
2331
2332 pr_debug("Computing parity for stripe %llu\n",
2333 (unsigned long long)sh->sector);
2334 compute_parity6(sh, RECONSTRUCT_WRITE);
2335 /* now every locked buffer is ready to be written */
2336 for (i = disks; i--; )
2337 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2338 pr_debug("Writing stripe %llu block %d\n",
2339 (unsigned long long)sh->sector, i);
2340 s->locked++;
2341 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2342 }
2343 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2344 set_bit(STRIPE_INSYNC, &sh->state);
2345
2346 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2347 atomic_dec(&conf->preread_active_stripes);
2348 if (atomic_read(&conf->preread_active_stripes) <
2349 IO_THRESHOLD)
2350 md_wakeup_thread(conf->mddev->thread);
2351 }
2352 }
2353}
2354
2355static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2356 struct stripe_head_state *s, int disks)
2357{
2358 set_bit(STRIPE_HANDLE, &sh->state);
2359 /* Take one of the following actions:
2360 * 1/ start a check parity operation if (uptodate == disks)
2361 * 2/ finish a check parity operation and act on the result
2362 * 3/ skip to the writeback section if we previously
2363 * initiated a recovery operation
2364 */
2365 if (s->failed == 0 &&
2366 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2367 if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
2368 BUG_ON(s->uptodate != disks);
2369 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2370 sh->ops.count++;
2371 s->uptodate--;
2372 } else if (
2373 test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
2374 clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
2375 clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
2376
2377 if (sh->ops.zero_sum_result == 0)
2378 /* parity is correct (on disc,
2379 * not in buffer any more)
2380 */
2381 set_bit(STRIPE_INSYNC, &sh->state);
2382 else {
2383 conf->mddev->resync_mismatches +=
2384 STRIPE_SECTORS;
2385 if (test_bit(
2386 MD_RECOVERY_CHECK, &conf->mddev->recovery))
2387 /* don't try to repair!! */
2388 set_bit(STRIPE_INSYNC, &sh->state);
2389 else {
2390 set_bit(STRIPE_OP_COMPUTE_BLK,
2391 &sh->ops.pending);
2392 set_bit(STRIPE_OP_MOD_REPAIR_PD,
2393 &sh->ops.pending);
2394 set_bit(R5_Wantcompute,
2395 &sh->dev[sh->pd_idx].flags);
2396 sh->ops.target = sh->pd_idx;
2397 sh->ops.count++;
2398 s->uptodate++;
2399 }
2400 }
2401 }
2402 }
2403
2404 /* check if we can clear a parity disk reconstruct */
2405 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2406 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2407
2408 clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
2409 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2410 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2411 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2412 }
2413
2414 /* Wait for check parity and compute block operations to complete
2415 * before write-back
2416 */
2417 if (!test_bit(STRIPE_INSYNC, &sh->state) &&
2418 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
2419 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
2420 struct r5dev *dev;
2421 /* either failed parity check, or recovery is happening */
2422 if (s->failed == 0)
2423 s->failed_num = sh->pd_idx;
2424 dev = &sh->dev[s->failed_num];
2425 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2426 BUG_ON(s->uptodate != disks);
2427
2428 set_bit(R5_LOCKED, &dev->flags);
2429 set_bit(R5_Wantwrite, &dev->flags);
2430 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2431 sh->ops.count++;
2432
2433 clear_bit(STRIPE_DEGRADED, &sh->state);
2434 s->locked++;
2435 set_bit(STRIPE_INSYNC, &sh->state);
2436 }
2437}
2438
2439
2440static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2441 struct stripe_head_state *s,
2442 struct r6_state *r6s, struct page *tmp_page,
2443 int disks)
2444{
2445 int update_p = 0, update_q = 0;
2446 struct r5dev *dev;
2447 int pd_idx = sh->pd_idx;
2448 int qd_idx = r6s->qd_idx;
2449
2450 set_bit(STRIPE_HANDLE, &sh->state);
2451
2452 BUG_ON(s->failed > 2);
2453 BUG_ON(s->uptodate < disks);
2454 /* Want to check and possibly repair P and Q.
2455 * However there could be one 'failed' device, in which
2456 * case we can only check one of them, possibly using the
2457 * other to generate missing data
2458 */
2459
2460 /* If !tmp_page, we cannot do the calculations,
2461 * but as we have set STRIPE_HANDLE, we will soon be called
2462 * by stripe_handle with a tmp_page - just wait until then.
2463 */
2464 if (tmp_page) {
2465 if (s->failed == r6s->q_failed) {
2466 /* The only possible failed device holds 'Q', so it
2467 * makes sense to check P (If anything else were failed,
2468 * we would have used P to recreate it).
2469 */
2470 compute_block_1(sh, pd_idx, 1);
2471 if (!page_is_zero(sh->dev[pd_idx].page)) {
2472 compute_block_1(sh, pd_idx, 0);
2473 update_p = 1;
2474 }
2475 }
2476 if (!r6s->q_failed && s->failed < 2) {
2477 /* q is not failed, and we didn't use it to generate
2478 * anything, so it makes sense to check it
2479 */
2480 memcpy(page_address(tmp_page),
2481 page_address(sh->dev[qd_idx].page),
2482 STRIPE_SIZE);
2483 compute_parity6(sh, UPDATE_PARITY);
2484 if (memcmp(page_address(tmp_page),
2485 page_address(sh->dev[qd_idx].page),
2486 STRIPE_SIZE) != 0) {
2487 clear_bit(STRIPE_INSYNC, &sh->state);
2488 update_q = 1;
2489 }
2490 }
2491 if (update_p || update_q) {
2492 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2493 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2494 /* don't try to repair!! */
2495 update_p = update_q = 0;
2496 }
2497
2498 /* now write out any block on a failed drive,
2499 * or P or Q if they need it
2500 */
2501
2502 if (s->failed == 2) {
2503 dev = &sh->dev[r6s->failed_num[1]];
2504 s->locked++;
2505 set_bit(R5_LOCKED, &dev->flags);
2506 set_bit(R5_Wantwrite, &dev->flags);
2507 }
2508 if (s->failed >= 1) {
2509 dev = &sh->dev[r6s->failed_num[0]];
2510 s->locked++;
2511 set_bit(R5_LOCKED, &dev->flags);
2512 set_bit(R5_Wantwrite, &dev->flags);
2513 }
2514
2515 if (update_p) {
2516 dev = &sh->dev[pd_idx];
2517 s->locked++;
2518 set_bit(R5_LOCKED, &dev->flags);
2519 set_bit(R5_Wantwrite, &dev->flags);
2520 }
2521 if (update_q) {
2522 dev = &sh->dev[qd_idx];
2523 s->locked++;
2524 set_bit(R5_LOCKED, &dev->flags);
2525 set_bit(R5_Wantwrite, &dev->flags);
2526 }
2527 clear_bit(STRIPE_DEGRADED, &sh->state);
2528
2529 set_bit(STRIPE_INSYNC, &sh->state);
2530 }
2531}
2532
2533static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2534 struct r6_state *r6s)
2535{
2536 int i;
2537
2538 /* We have read all the blocks in this stripe and now we need to
2539 * copy some of them into a target stripe for expand.
2540 */
2541 struct dma_async_tx_descriptor *tx = NULL;
2542 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2543 for (i = 0; i < sh->disks; i++)
2544 if (i != sh->pd_idx && (r6s && i != r6s->qd_idx)) {
2545 int dd_idx, pd_idx, j;
2546 struct stripe_head *sh2;
2547
2548 sector_t bn = compute_blocknr(sh, i);
2549 sector_t s = raid5_compute_sector(bn, conf->raid_disks,
2550 conf->raid_disks -
2551 conf->max_degraded, &dd_idx,
2552 &pd_idx, conf);
2553 sh2 = get_active_stripe(conf, s, conf->raid_disks,
2554 pd_idx, 1);
2555 if (sh2 == NULL)
2556 /* so far only the early blocks of this stripe
2557 * have been requested. When later blocks
2558 * get requested, we will try again
2559 */
2560 continue;
2561 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2562 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2563 /* must have already done this block */
2564 release_stripe(sh2);
2565 continue;
2566 }
2567
2568 /* place all the copies on one channel */
2569 tx = async_memcpy(sh2->dev[dd_idx].page,
2570 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2571 ASYNC_TX_DEP_ACK, tx, NULL, NULL);
2572
2573 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2574 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2575 for (j = 0; j < conf->raid_disks; j++)
2576 if (j != sh2->pd_idx &&
2577 (r6s && j != r6s->qd_idx) &&
2578 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2579 break;
2580 if (j == conf->raid_disks) {
2581 set_bit(STRIPE_EXPAND_READY, &sh2->state);
2582 set_bit(STRIPE_HANDLE, &sh2->state);
2583 }
2584 release_stripe(sh2);
2585
2586 /* done submitting copies, wait for them to complete */
2587 if (i + 1 >= sh->disks) {
2588 async_tx_ack(tx);
2589 dma_wait_for_async_tx(tx);
2590 }
2591 }
2592}
1329 2593
1330/* 2594/*
1331 * handle_stripe - do things to a stripe. 2595 * handle_stripe - do things to a stripe.
@@ -1339,81 +2603,70 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1339 * schedule a write of some buffers 2603 * schedule a write of some buffers
1340 * return confirmation of parity correctness 2604 * return confirmation of parity correctness
1341 * 2605 *
1342 * Parity calculations are done inside the stripe lock
1343 * buffers are taken off read_list or write_list, and bh_cache buffers 2606 * buffers are taken off read_list or write_list, and bh_cache buffers
1344 * get BH_Lock set before the stripe lock is released. 2607 * get BH_Lock set before the stripe lock is released.
1345 * 2608 *
1346 */ 2609 */
1347 2610
1348static void handle_stripe5(struct stripe_head *sh) 2611static void handle_stripe5(struct stripe_head *sh)
1349{ 2612{
1350 raid5_conf_t *conf = sh->raid_conf; 2613 raid5_conf_t *conf = sh->raid_conf;
1351 int disks = sh->disks; 2614 int disks = sh->disks, i;
1352 struct bio *return_bi= NULL; 2615 struct bio *return_bi = NULL;
1353 struct bio *bi; 2616 struct stripe_head_state s;
1354 int i;
1355 int syncing, expanding, expanded;
1356 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1357 int non_overwrite = 0;
1358 int failed_num=0;
1359 struct r5dev *dev; 2617 struct r5dev *dev;
2618 unsigned long pending = 0;
1360 2619
1361 PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n", 2620 memset(&s, 0, sizeof(s));
1362 (unsigned long long)sh->sector, atomic_read(&sh->count), 2621 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
1363 sh->pd_idx); 2622 "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state,
2623 atomic_read(&sh->count), sh->pd_idx,
2624 sh->ops.pending, sh->ops.ack, sh->ops.complete);
1364 2625
1365 spin_lock(&sh->lock); 2626 spin_lock(&sh->lock);
1366 clear_bit(STRIPE_HANDLE, &sh->state); 2627 clear_bit(STRIPE_HANDLE, &sh->state);
1367 clear_bit(STRIPE_DELAYED, &sh->state); 2628 clear_bit(STRIPE_DELAYED, &sh->state);
1368 2629
1369 syncing = test_bit(STRIPE_SYNCING, &sh->state); 2630 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
1370 expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2631 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1371 expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2632 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
1372 /* Now to look around and see what can be done */ 2633 /* Now to look around and see what can be done */
1373 2634
1374 rcu_read_lock(); 2635 rcu_read_lock();
1375 for (i=disks; i--; ) { 2636 for (i=disks; i--; ) {
1376 mdk_rdev_t *rdev; 2637 mdk_rdev_t *rdev;
1377 dev = &sh->dev[i]; 2638 struct r5dev *dev = &sh->dev[i];
1378 clear_bit(R5_Insync, &dev->flags); 2639 clear_bit(R5_Insync, &dev->flags);
1379 2640
1380 PRINTK("check %d: state 0x%lx read %p write %p written %p\n", 2641 pr_debug("check %d: state 0x%lx toread %p read %p write %p "
1381 i, dev->flags, dev->toread, dev->towrite, dev->written); 2642 "written %p\n", i, dev->flags, dev->toread, dev->read,
1382 /* maybe we can reply to a read */ 2643 dev->towrite, dev->written);
1383 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1384 struct bio *rbi, *rbi2;
1385 PRINTK("Return read for disc %d\n", i);
1386 spin_lock_irq(&conf->device_lock);
1387 rbi = dev->toread;
1388 dev->toread = NULL;
1389 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1390 wake_up(&conf->wait_for_overlap);
1391 spin_unlock_irq(&conf->device_lock);
1392 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1393 copy_data(0, rbi, dev->page, dev->sector);
1394 rbi2 = r5_next_bio(rbi, dev->sector);
1395 spin_lock_irq(&conf->device_lock);
1396 if (--rbi->bi_phys_segments == 0) {
1397 rbi->bi_next = return_bi;
1398 return_bi = rbi;
1399 }
1400 spin_unlock_irq(&conf->device_lock);
1401 rbi = rbi2;
1402 }
1403 }
1404 2644
1405 /* now count some things */ 2645 /* maybe we can request a biofill operation
1406 if (test_bit(R5_LOCKED, &dev->flags)) locked++; 2646 *
1407 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; 2647 * new wantfill requests are only permitted while
2648 * STRIPE_OP_BIOFILL is clear
2649 */
2650 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
2651 !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
2652 set_bit(R5_Wantfill, &dev->flags);
1408 2653
1409 2654 /* now count some things */
1410 if (dev->toread) to_read++; 2655 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
2656 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
2657 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
2658
2659 if (test_bit(R5_Wantfill, &dev->flags))
2660 s.to_fill++;
2661 else if (dev->toread)
2662 s.to_read++;
1411 if (dev->towrite) { 2663 if (dev->towrite) {
1412 to_write++; 2664 s.to_write++;
1413 if (!test_bit(R5_OVERWRITE, &dev->flags)) 2665 if (!test_bit(R5_OVERWRITE, &dev->flags))
1414 non_overwrite++; 2666 s.non_overwrite++;
1415 } 2667 }
1416 if (dev->written) written++; 2668 if (dev->written)
2669 s.written++;
1417 rdev = rcu_dereference(conf->disks[i].rdev); 2670 rdev = rcu_dereference(conf->disks[i].rdev);
1418 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 2671 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1419 /* The ReadError flag will just be confusing now */ 2672 /* The ReadError flag will just be confusing now */
@@ -1422,306 +2675,131 @@ static void handle_stripe5(struct stripe_head *sh)
1422 } 2675 }
1423 if (!rdev || !test_bit(In_sync, &rdev->flags) 2676 if (!rdev || !test_bit(In_sync, &rdev->flags)
1424 || test_bit(R5_ReadError, &dev->flags)) { 2677 || test_bit(R5_ReadError, &dev->flags)) {
1425 failed++; 2678 s.failed++;
1426 failed_num = i; 2679 s.failed_num = i;
1427 } else 2680 } else
1428 set_bit(R5_Insync, &dev->flags); 2681 set_bit(R5_Insync, &dev->flags);
1429 } 2682 }
1430 rcu_read_unlock(); 2683 rcu_read_unlock();
1431 PRINTK("locked=%d uptodate=%d to_read=%d" 2684
2685 if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
2686 sh->ops.count++;
2687
2688 pr_debug("locked=%d uptodate=%d to_read=%d"
1432 " to_write=%d failed=%d failed_num=%d\n", 2689 " to_write=%d failed=%d failed_num=%d\n",
1433 locked, uptodate, to_read, to_write, failed, failed_num); 2690 s.locked, s.uptodate, s.to_read, s.to_write,
2691 s.failed, s.failed_num);
1434 /* check if the array has lost two devices and, if so, some requests might 2692 /* check if the array has lost two devices and, if so, some requests might
1435 * need to be failed 2693 * need to be failed
1436 */ 2694 */
1437 if (failed > 1 && to_read+to_write+written) { 2695 if (s.failed > 1 && s.to_read+s.to_write+s.written)
1438 for (i=disks; i--; ) { 2696 handle_requests_to_failed_array(conf, sh, &s, disks,
1439 int bitmap_end = 0; 2697 &return_bi);
1440 2698 if (s.failed > 1 && s.syncing) {
1441 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1442 mdk_rdev_t *rdev;
1443 rcu_read_lock();
1444 rdev = rcu_dereference(conf->disks[i].rdev);
1445 if (rdev && test_bit(In_sync, &rdev->flags))
1446 /* multiple read failures in one stripe */
1447 md_error(conf->mddev, rdev);
1448 rcu_read_unlock();
1449 }
1450
1451 spin_lock_irq(&conf->device_lock);
1452 /* fail all writes first */
1453 bi = sh->dev[i].towrite;
1454 sh->dev[i].towrite = NULL;
1455 if (bi) { to_write--; bitmap_end = 1; }
1456
1457 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1458 wake_up(&conf->wait_for_overlap);
1459
1460 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1461 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1462 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1463 if (--bi->bi_phys_segments == 0) {
1464 md_write_end(conf->mddev);
1465 bi->bi_next = return_bi;
1466 return_bi = bi;
1467 }
1468 bi = nextbi;
1469 }
1470 /* and fail all 'written' */
1471 bi = sh->dev[i].written;
1472 sh->dev[i].written = NULL;
1473 if (bi) bitmap_end = 1;
1474 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
1475 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1476 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1477 if (--bi->bi_phys_segments == 0) {
1478 md_write_end(conf->mddev);
1479 bi->bi_next = return_bi;
1480 return_bi = bi;
1481 }
1482 bi = bi2;
1483 }
1484
1485 /* fail any reads if this device is non-operational */
1486 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1487 test_bit(R5_ReadError, &sh->dev[i].flags)) {
1488 bi = sh->dev[i].toread;
1489 sh->dev[i].toread = NULL;
1490 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1491 wake_up(&conf->wait_for_overlap);
1492 if (bi) to_read--;
1493 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1494 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1495 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1496 if (--bi->bi_phys_segments == 0) {
1497 bi->bi_next = return_bi;
1498 return_bi = bi;
1499 }
1500 bi = nextbi;
1501 }
1502 }
1503 spin_unlock_irq(&conf->device_lock);
1504 if (bitmap_end)
1505 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1506 STRIPE_SECTORS, 0, 0);
1507 }
1508 }
1509 if (failed > 1 && syncing) {
1510 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2699 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
1511 clear_bit(STRIPE_SYNCING, &sh->state); 2700 clear_bit(STRIPE_SYNCING, &sh->state);
1512 syncing = 0; 2701 s.syncing = 0;
1513 } 2702 }
1514 2703
1515 /* might be able to return some write requests if the parity block 2704 /* might be able to return some write requests if the parity block
1516 * is safe, or on a failed drive 2705 * is safe, or on a failed drive
1517 */ 2706 */
1518 dev = &sh->dev[sh->pd_idx]; 2707 dev = &sh->dev[sh->pd_idx];
1519 if ( written && 2708 if ( s.written &&
1520 ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && 2709 ((test_bit(R5_Insync, &dev->flags) &&
1521 test_bit(R5_UPTODATE, &dev->flags)) 2710 !test_bit(R5_LOCKED, &dev->flags) &&
1522 || (failed == 1 && failed_num == sh->pd_idx)) 2711 test_bit(R5_UPTODATE, &dev->flags)) ||
1523 ) { 2712 (s.failed == 1 && s.failed_num == sh->pd_idx)))
1524 /* any written block on an uptodate or failed drive can be returned. 2713 handle_completed_write_requests(conf, sh, disks, &return_bi);
1525 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
1526 * never LOCKED, so we don't need to test 'failed' directly.
1527 */
1528 for (i=disks; i--; )
1529 if (sh->dev[i].written) {
1530 dev = &sh->dev[i];
1531 if (!test_bit(R5_LOCKED, &dev->flags) &&
1532 test_bit(R5_UPTODATE, &dev->flags) ) {
1533 /* We can return any write requests */
1534 struct bio *wbi, *wbi2;
1535 int bitmap_end = 0;
1536 PRINTK("Return write for disc %d\n", i);
1537 spin_lock_irq(&conf->device_lock);
1538 wbi = dev->written;
1539 dev->written = NULL;
1540 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1541 wbi2 = r5_next_bio(wbi, dev->sector);
1542 if (--wbi->bi_phys_segments == 0) {
1543 md_write_end(conf->mddev);
1544 wbi->bi_next = return_bi;
1545 return_bi = wbi;
1546 }
1547 wbi = wbi2;
1548 }
1549 if (dev->towrite == NULL)
1550 bitmap_end = 1;
1551 spin_unlock_irq(&conf->device_lock);
1552 if (bitmap_end)
1553 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1554 STRIPE_SECTORS,
1555 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
1556 }
1557 }
1558 }
1559 2714
1560 /* Now we might consider reading some blocks, either to check/generate 2715 /* Now we might consider reading some blocks, either to check/generate
1561 * parity, or to satisfy requests 2716 * parity, or to satisfy requests
1562 * or to load a block that is being partially written. 2717 * or to load a block that is being partially written.
1563 */ 2718 */
1564 if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { 2719 if (s.to_read || s.non_overwrite ||
1565 for (i=disks; i--;) { 2720 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding ||
1566 dev = &sh->dev[i]; 2721 test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
1567 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && 2722 handle_issuing_new_read_requests5(sh, &s, disks);
1568 (dev->toread || 2723
1569 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2724 /* Now we check to see if any write operations have recently
1570 syncing || 2725 * completed
1571 expanding || 2726 */
1572 (failed && (sh->dev[failed_num].toread || 2727
1573 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) 2728 /* leave prexor set until postxor is done, allows us to distinguish
1574 ) 2729 * a rmw from a rcw during biodrain
1575 ) { 2730 */
1576 /* we would like to get this block, possibly 2731 if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
1577 * by computing it, but we might not be able to 2732 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
1578 */ 2733
1579 if (uptodate == disks-1) { 2734 clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
1580 PRINTK("Computing block %d\n", i); 2735 clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
1581 compute_block(sh, i); 2736 clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
1582 uptodate++; 2737
1583 } else if (test_bit(R5_Insync, &dev->flags)) { 2738 for (i = disks; i--; )
1584 set_bit(R5_LOCKED, &dev->flags); 2739 clear_bit(R5_Wantprexor, &sh->dev[i].flags);
1585 set_bit(R5_Wantread, &dev->flags);
1586 locked++;
1587 PRINTK("Reading block %d (sync=%d)\n",
1588 i, syncing);
1589 }
1590 }
1591 }
1592 set_bit(STRIPE_HANDLE, &sh->state);
1593 } 2740 }
1594 2741
1595 /* now to consider writing and what else, if anything should be read */ 2742 /* if only POSTXOR is set then this is an 'expand' postxor */
1596 if (to_write) { 2743 if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
1597 int rmw=0, rcw=0; 2744 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
1598 for (i=disks ; i--;) { 2745
1599 /* would I have to read this buffer for read_modify_write */ 2746 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
2747 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
2748 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
2749
2750 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2751 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2752 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2753
2754 /* All the 'written' buffers and the parity block are ready to
2755 * be written back to disk
2756 */
2757 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
2758 for (i = disks; i--; ) {
1600 dev = &sh->dev[i]; 2759 dev = &sh->dev[i];
1601 if ((dev->towrite || i == sh->pd_idx) && 2760 if (test_bit(R5_LOCKED, &dev->flags) &&
1602 (!test_bit(R5_LOCKED, &dev->flags) 2761 (i == sh->pd_idx || dev->written)) {
1603 ) && 2762 pr_debug("Writing block %d\n", i);
1604 !test_bit(R5_UPTODATE, &dev->flags)) { 2763 set_bit(R5_Wantwrite, &dev->flags);
1605 if (test_bit(R5_Insync, &dev->flags) 2764 if (!test_and_set_bit(
1606/* && !(!mddev->insync && i == sh->pd_idx) */ 2765 STRIPE_OP_IO, &sh->ops.pending))
1607 ) 2766 sh->ops.count++;
1608 rmw++; 2767 if (!test_bit(R5_Insync, &dev->flags) ||
1609 else rmw += 2*disks; /* cannot read it */ 2768 (i == sh->pd_idx && s.failed == 0))
1610 } 2769 set_bit(STRIPE_INSYNC, &sh->state);
1611 /* Would I have to read this buffer for reconstruct_write */
1612 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1613 (!test_bit(R5_LOCKED, &dev->flags)
1614 ) &&
1615 !test_bit(R5_UPTODATE, &dev->flags)) {
1616 if (test_bit(R5_Insync, &dev->flags)) rcw++;
1617 else rcw += 2*disks;
1618 } 2770 }
1619 } 2771 }
1620 PRINTK("for sector %llu, rmw=%d rcw=%d\n", 2772 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1621 (unsigned long long)sh->sector, rmw, rcw); 2773 atomic_dec(&conf->preread_active_stripes);
1622 set_bit(STRIPE_HANDLE, &sh->state); 2774 if (atomic_read(&conf->preread_active_stripes) <
1623 if (rmw < rcw && rmw > 0) 2775 IO_THRESHOLD)
1624 /* prefer read-modify-write, but need to get some data */ 2776 md_wakeup_thread(conf->mddev->thread);
1625 for (i=disks; i--;) {
1626 dev = &sh->dev[i];
1627 if ((dev->towrite || i == sh->pd_idx) &&
1628 !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1629 test_bit(R5_Insync, &dev->flags)) {
1630 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1631 {
1632 PRINTK("Read_old block %d for r-m-w\n", i);
1633 set_bit(R5_LOCKED, &dev->flags);
1634 set_bit(R5_Wantread, &dev->flags);
1635 locked++;
1636 } else {
1637 set_bit(STRIPE_DELAYED, &sh->state);
1638 set_bit(STRIPE_HANDLE, &sh->state);
1639 }
1640 }
1641 }
1642 if (rcw <= rmw && rcw > 0)
1643 /* want reconstruct write, but need to get some data */
1644 for (i=disks; i--;) {
1645 dev = &sh->dev[i];
1646 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1647 !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1648 test_bit(R5_Insync, &dev->flags)) {
1649 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1650 {
1651 PRINTK("Read_old block %d for Reconstruct\n", i);
1652 set_bit(R5_LOCKED, &dev->flags);
1653 set_bit(R5_Wantread, &dev->flags);
1654 locked++;
1655 } else {
1656 set_bit(STRIPE_DELAYED, &sh->state);
1657 set_bit(STRIPE_HANDLE, &sh->state);
1658 }
1659 }
1660 }
1661 /* now if nothing is locked, and if we have enough data, we can start a write request */
1662 if (locked == 0 && (rcw == 0 ||rmw == 0) &&
1663 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1664 PRINTK("Computing parity...\n");
1665 compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1666 /* now every locked buffer is ready to be written */
1667 for (i=disks; i--;)
1668 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1669 PRINTK("Writing block %d\n", i);
1670 locked++;
1671 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1672 if (!test_bit(R5_Insync, &sh->dev[i].flags)
1673 || (i==sh->pd_idx && failed == 0))
1674 set_bit(STRIPE_INSYNC, &sh->state);
1675 }
1676 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1677 atomic_dec(&conf->preread_active_stripes);
1678 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1679 md_wakeup_thread(conf->mddev->thread);
1680 }
1681 } 2777 }
1682 } 2778 }
1683 2779
1684 /* maybe we need to check and possibly fix the parity for this stripe 2780 /* Now to consider new write requests and what else, if anything
1685 * Any reads will already have been scheduled, so we just see if enough data 2781 * should be read. We do not handle new writes when:
1686 * is available 2782 * 1/ A 'write' operation (copy+xor) is already in flight.
2783 * 2/ A 'check' operation is in flight, as it may clobber the parity
2784 * block.
1687 */ 2785 */
1688 if (syncing && locked == 0 && 2786 if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
1689 !test_bit(STRIPE_INSYNC, &sh->state)) { 2787 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
1690 set_bit(STRIPE_HANDLE, &sh->state); 2788 handle_issuing_new_write_requests5(conf, sh, &s, disks);
1691 if (failed == 0) {
1692 BUG_ON(uptodate != disks);
1693 compute_parity5(sh, CHECK_PARITY);
1694 uptodate--;
1695 if (page_is_zero(sh->dev[sh->pd_idx].page)) {
1696 /* parity is correct (on disc, not in buffer any more) */
1697 set_bit(STRIPE_INSYNC, &sh->state);
1698 } else {
1699 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1700 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1701 /* don't try to repair!! */
1702 set_bit(STRIPE_INSYNC, &sh->state);
1703 else {
1704 compute_block(sh, sh->pd_idx);
1705 uptodate++;
1706 }
1707 }
1708 }
1709 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1710 /* either failed parity check, or recovery is happening */
1711 if (failed==0)
1712 failed_num = sh->pd_idx;
1713 dev = &sh->dev[failed_num];
1714 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
1715 BUG_ON(uptodate != disks);
1716 2789
1717 set_bit(R5_LOCKED, &dev->flags); 2790 /* maybe we need to check and possibly fix the parity for this stripe
1718 set_bit(R5_Wantwrite, &dev->flags); 2791 * Any reads will already have been scheduled, so we just see if enough
1719 clear_bit(STRIPE_DEGRADED, &sh->state); 2792 * data is available. The parity check is held off while parity
1720 locked++; 2793 * dependent operations are in flight.
1721 set_bit(STRIPE_INSYNC, &sh->state); 2794 */
1722 } 2795 if ((s.syncing && s.locked == 0 &&
1723 } 2796 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
1724 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 2797 !test_bit(STRIPE_INSYNC, &sh->state)) ||
2798 test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
2799 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
2800 handle_parity_checks5(conf, sh, &s, disks);
2801
2802 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1725 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 2803 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1726 clear_bit(STRIPE_SYNCING, &sh->state); 2804 clear_bit(STRIPE_SYNCING, &sh->state);
1727 } 2805 }
@@ -1729,186 +2807,102 @@ static void handle_stripe5(struct stripe_head *sh)
1729 /* If the failed drive is just a ReadError, then we might need to progress 2807 /* If the failed drive is just a ReadError, then we might need to progress
1730 * the repair/check process 2808 * the repair/check process
1731 */ 2809 */
1732 if (failed == 1 && ! conf->mddev->ro && 2810 if (s.failed == 1 && !conf->mddev->ro &&
1733 test_bit(R5_ReadError, &sh->dev[failed_num].flags) 2811 test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
1734 && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags) 2812 && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
1735 && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags) 2813 && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
1736 ) { 2814 ) {
1737 dev = &sh->dev[failed_num]; 2815 dev = &sh->dev[s.failed_num];
1738 if (!test_bit(R5_ReWrite, &dev->flags)) { 2816 if (!test_bit(R5_ReWrite, &dev->flags)) {
1739 set_bit(R5_Wantwrite, &dev->flags); 2817 set_bit(R5_Wantwrite, &dev->flags);
2818 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2819 sh->ops.count++;
1740 set_bit(R5_ReWrite, &dev->flags); 2820 set_bit(R5_ReWrite, &dev->flags);
1741 set_bit(R5_LOCKED, &dev->flags); 2821 set_bit(R5_LOCKED, &dev->flags);
1742 locked++; 2822 s.locked++;
1743 } else { 2823 } else {
1744 /* let's read it back */ 2824 /* let's read it back */
1745 set_bit(R5_Wantread, &dev->flags); 2825 set_bit(R5_Wantread, &dev->flags);
2826 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2827 sh->ops.count++;
1746 set_bit(R5_LOCKED, &dev->flags); 2828 set_bit(R5_LOCKED, &dev->flags);
1747 locked++; 2829 s.locked++;
1748 } 2830 }
1749 } 2831 }
1750 2832
1751 if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 2833 /* Finish postxor operations initiated by the expansion
1752 /* Need to write out all blocks after computing parity */ 2834 * process
1753 sh->disks = conf->raid_disks; 2835 */
1754 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); 2836 if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
1755 compute_parity5(sh, RECONSTRUCT_WRITE); 2837 !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
1756 for (i= conf->raid_disks; i--;) { 2838
1757 set_bit(R5_LOCKED, &sh->dev[i].flags); 2839 clear_bit(STRIPE_EXPANDING, &sh->state);
1758 locked++; 2840
2841 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2842 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2843 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2844
2845 for (i = conf->raid_disks; i--; ) {
1759 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2846 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2847 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2848 sh->ops.count++;
1760 } 2849 }
1761 clear_bit(STRIPE_EXPANDING, &sh->state); 2850 }
1762 } else if (expanded) { 2851
2852 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
2853 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2854 /* Need to write out all blocks after computing parity */
2855 sh->disks = conf->raid_disks;
2856 sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
2857 conf->raid_disks);
2858 s.locked += handle_write_operations5(sh, 0, 1);
2859 } else if (s.expanded &&
2860 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
1763 clear_bit(STRIPE_EXPAND_READY, &sh->state); 2861 clear_bit(STRIPE_EXPAND_READY, &sh->state);
1764 atomic_dec(&conf->reshape_stripes); 2862 atomic_dec(&conf->reshape_stripes);
1765 wake_up(&conf->wait_for_overlap); 2863 wake_up(&conf->wait_for_overlap);
1766 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 2864 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
1767 } 2865 }
1768 2866
1769 if (expanding && locked == 0) { 2867 if (s.expanding && s.locked == 0)
1770 /* We have read all the blocks in this stripe and now we need to 2868 handle_stripe_expansion(conf, sh, NULL);
1771 * copy some of them into a target stripe for expand. 2869
1772 */ 2870 if (sh->ops.count)
1773 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2871 pending = get_stripe_work(sh);
1774 for (i=0; i< sh->disks; i++)
1775 if (i != sh->pd_idx) {
1776 int dd_idx, pd_idx, j;
1777 struct stripe_head *sh2;
1778
1779 sector_t bn = compute_blocknr(sh, i);
1780 sector_t s = raid5_compute_sector(bn, conf->raid_disks,
1781 conf->raid_disks-1,
1782 &dd_idx, &pd_idx, conf);
1783 sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1);
1784 if (sh2 == NULL)
1785 /* so far only the early blocks of this stripe
1786 * have been requested. When later blocks
1787 * get requested, we will try again
1788 */
1789 continue;
1790 if(!test_bit(STRIPE_EXPANDING, &sh2->state) ||
1791 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
1792 /* must have already done this block */
1793 release_stripe(sh2);
1794 continue;
1795 }
1796 memcpy(page_address(sh2->dev[dd_idx].page),
1797 page_address(sh->dev[i].page),
1798 STRIPE_SIZE);
1799 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
1800 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
1801 for (j=0; j<conf->raid_disks; j++)
1802 if (j != sh2->pd_idx &&
1803 !test_bit(R5_Expanded, &sh2->dev[j].flags))
1804 break;
1805 if (j == conf->raid_disks) {
1806 set_bit(STRIPE_EXPAND_READY, &sh2->state);
1807 set_bit(STRIPE_HANDLE, &sh2->state);
1808 }
1809 release_stripe(sh2);
1810 }
1811 }
1812 2872
1813 spin_unlock(&sh->lock); 2873 spin_unlock(&sh->lock);
1814 2874
1815 while ((bi=return_bi)) { 2875 if (pending)
1816 int bytes = bi->bi_size; 2876 raid5_run_ops(sh, pending);
1817 2877
1818 return_bi = bi->bi_next; 2878 return_io(return_bi);
1819 bi->bi_next = NULL;
1820 bi->bi_size = 0;
1821 bi->bi_end_io(bi, bytes,
1822 test_bit(BIO_UPTODATE, &bi->bi_flags)
1823 ? 0 : -EIO);
1824 }
1825 for (i=disks; i-- ;) {
1826 int rw;
1827 struct bio *bi;
1828 mdk_rdev_t *rdev;
1829 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
1830 rw = WRITE;
1831 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1832 rw = READ;
1833 else
1834 continue;
1835
1836 bi = &sh->dev[i].req;
1837
1838 bi->bi_rw = rw;
1839 if (rw == WRITE)
1840 bi->bi_end_io = raid5_end_write_request;
1841 else
1842 bi->bi_end_io = raid5_end_read_request;
1843
1844 rcu_read_lock();
1845 rdev = rcu_dereference(conf->disks[i].rdev);
1846 if (rdev && test_bit(Faulty, &rdev->flags))
1847 rdev = NULL;
1848 if (rdev)
1849 atomic_inc(&rdev->nr_pending);
1850 rcu_read_unlock();
1851
1852 if (rdev) {
1853 if (syncing || expanding || expanded)
1854 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1855 2879
1856 bi->bi_bdev = rdev->bdev;
1857 PRINTK("for %llu schedule op %ld on disc %d\n",
1858 (unsigned long long)sh->sector, bi->bi_rw, i);
1859 atomic_inc(&sh->count);
1860 bi->bi_sector = sh->sector + rdev->data_offset;
1861 bi->bi_flags = 1 << BIO_UPTODATE;
1862 bi->bi_vcnt = 1;
1863 bi->bi_max_vecs = 1;
1864 bi->bi_idx = 0;
1865 bi->bi_io_vec = &sh->dev[i].vec;
1866 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1867 bi->bi_io_vec[0].bv_offset = 0;
1868 bi->bi_size = STRIPE_SIZE;
1869 bi->bi_next = NULL;
1870 if (rw == WRITE &&
1871 test_bit(R5_ReWrite, &sh->dev[i].flags))
1872 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1873 generic_make_request(bi);
1874 } else {
1875 if (rw == WRITE)
1876 set_bit(STRIPE_DEGRADED, &sh->state);
1877 PRINTK("skip op %ld on disc %d for sector %llu\n",
1878 bi->bi_rw, i, (unsigned long long)sh->sector);
1879 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1880 set_bit(STRIPE_HANDLE, &sh->state);
1881 }
1882 }
1883} 2880}
1884 2881
1885static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 2882static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1886{ 2883{
1887 raid6_conf_t *conf = sh->raid_conf; 2884 raid6_conf_t *conf = sh->raid_conf;
1888 int disks = sh->disks; 2885 int disks = sh->disks;
1889 struct bio *return_bi= NULL; 2886 struct bio *return_bi = NULL;
1890 struct bio *bi; 2887 int i, pd_idx = sh->pd_idx;
1891 int i; 2888 struct stripe_head_state s;
1892 int syncing, expanding, expanded; 2889 struct r6_state r6s;
1893 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1894 int non_overwrite = 0;
1895 int failed_num[2] = {0, 0};
1896 struct r5dev *dev, *pdev, *qdev; 2890 struct r5dev *dev, *pdev, *qdev;
1897 int pd_idx = sh->pd_idx;
1898 int qd_idx = raid6_next_disk(pd_idx, disks);
1899 int p_failed, q_failed;
1900 2891
1901 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n", 2892 r6s.qd_idx = raid6_next_disk(pd_idx, disks);
1902 (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), 2893 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
1903 pd_idx, qd_idx); 2894 "pd_idx=%d, qd_idx=%d\n",
2895 (unsigned long long)sh->sector, sh->state,
2896 atomic_read(&sh->count), pd_idx, r6s.qd_idx);
2897 memset(&s, 0, sizeof(s));
1904 2898
1905 spin_lock(&sh->lock); 2899 spin_lock(&sh->lock);
1906 clear_bit(STRIPE_HANDLE, &sh->state); 2900 clear_bit(STRIPE_HANDLE, &sh->state);
1907 clear_bit(STRIPE_DELAYED, &sh->state); 2901 clear_bit(STRIPE_DELAYED, &sh->state);
1908 2902
1909 syncing = test_bit(STRIPE_SYNCING, &sh->state); 2903 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
1910 expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2904 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1911 expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2905 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
1912 /* Now to look around and see what can be done */ 2906 /* Now to look around and see what can be done */
1913 2907
1914 rcu_read_lock(); 2908 rcu_read_lock();
@@ -1917,12 +2911,12 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1917 dev = &sh->dev[i]; 2911 dev = &sh->dev[i];
1918 clear_bit(R5_Insync, &dev->flags); 2912 clear_bit(R5_Insync, &dev->flags);
1919 2913
1920 PRINTK("check %d: state 0x%lx read %p write %p written %p\n", 2914 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
1921 i, dev->flags, dev->toread, dev->towrite, dev->written); 2915 i, dev->flags, dev->toread, dev->towrite, dev->written);
1922 /* maybe we can reply to a read */ 2916 /* maybe we can reply to a read */
1923 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { 2917 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1924 struct bio *rbi, *rbi2; 2918 struct bio *rbi, *rbi2;
1925 PRINTK("Return read for disc %d\n", i); 2919 pr_debug("Return read for disc %d\n", i);
1926 spin_lock_irq(&conf->device_lock); 2920 spin_lock_irq(&conf->device_lock);
1927 rbi = dev->toread; 2921 rbi = dev->toread;
1928 dev->toread = NULL; 2922 dev->toread = NULL;
@@ -1943,17 +2937,19 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1943 } 2937 }
1944 2938
1945 /* now count some things */ 2939 /* now count some things */
1946 if (test_bit(R5_LOCKED, &dev->flags)) locked++; 2940 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
1947 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; 2941 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
1948 2942
1949 2943
1950 if (dev->toread) to_read++; 2944 if (dev->toread)
2945 s.to_read++;
1951 if (dev->towrite) { 2946 if (dev->towrite) {
1952 to_write++; 2947 s.to_write++;
1953 if (!test_bit(R5_OVERWRITE, &dev->flags)) 2948 if (!test_bit(R5_OVERWRITE, &dev->flags))
1954 non_overwrite++; 2949 s.non_overwrite++;
1955 } 2950 }
1956 if (dev->written) written++; 2951 if (dev->written)
2952 s.written++;
1957 rdev = rcu_dereference(conf->disks[i].rdev); 2953 rdev = rcu_dereference(conf->disks[i].rdev);
1958 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 2954 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1959 /* The ReadError flag will just be confusing now */ 2955 /* The ReadError flag will just be confusing now */
@@ -1962,96 +2958,27 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1962 } 2958 }
1963 if (!rdev || !test_bit(In_sync, &rdev->flags) 2959 if (!rdev || !test_bit(In_sync, &rdev->flags)
1964 || test_bit(R5_ReadError, &dev->flags)) { 2960 || test_bit(R5_ReadError, &dev->flags)) {
1965 if ( failed < 2 ) 2961 if (s.failed < 2)
1966 failed_num[failed] = i; 2962 r6s.failed_num[s.failed] = i;
1967 failed++; 2963 s.failed++;
1968 } else 2964 } else
1969 set_bit(R5_Insync, &dev->flags); 2965 set_bit(R5_Insync, &dev->flags);
1970 } 2966 }
1971 rcu_read_unlock(); 2967 rcu_read_unlock();
1972 PRINTK("locked=%d uptodate=%d to_read=%d" 2968 pr_debug("locked=%d uptodate=%d to_read=%d"
1973 " to_write=%d failed=%d failed_num=%d,%d\n", 2969 " to_write=%d failed=%d failed_num=%d,%d\n",
1974 locked, uptodate, to_read, to_write, failed, 2970 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
1975 failed_num[0], failed_num[1]); 2971 r6s.failed_num[0], r6s.failed_num[1]);
1976 /* check if the array has lost >2 devices and, if so, some requests might 2972 /* check if the array has lost >2 devices and, if so, some requests
1977 * need to be failed 2973 * might need to be failed
1978 */ 2974 */
1979 if (failed > 2 && to_read+to_write+written) { 2975 if (s.failed > 2 && s.to_read+s.to_write+s.written)
1980 for (i=disks; i--; ) { 2976 handle_requests_to_failed_array(conf, sh, &s, disks,
1981 int bitmap_end = 0; 2977 &return_bi);
1982 2978 if (s.failed > 2 && s.syncing) {
1983 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1984 mdk_rdev_t *rdev;
1985 rcu_read_lock();
1986 rdev = rcu_dereference(conf->disks[i].rdev);
1987 if (rdev && test_bit(In_sync, &rdev->flags))
1988 /* multiple read failures in one stripe */
1989 md_error(conf->mddev, rdev);
1990 rcu_read_unlock();
1991 }
1992
1993 spin_lock_irq(&conf->device_lock);
1994 /* fail all writes first */
1995 bi = sh->dev[i].towrite;
1996 sh->dev[i].towrite = NULL;
1997 if (bi) { to_write--; bitmap_end = 1; }
1998
1999 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2000 wake_up(&conf->wait_for_overlap);
2001
2002 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2003 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2004 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2005 if (--bi->bi_phys_segments == 0) {
2006 md_write_end(conf->mddev);
2007 bi->bi_next = return_bi;
2008 return_bi = bi;
2009 }
2010 bi = nextbi;
2011 }
2012 /* and fail all 'written' */
2013 bi = sh->dev[i].written;
2014 sh->dev[i].written = NULL;
2015 if (bi) bitmap_end = 1;
2016 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
2017 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2018 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2019 if (--bi->bi_phys_segments == 0) {
2020 md_write_end(conf->mddev);
2021 bi->bi_next = return_bi;
2022 return_bi = bi;
2023 }
2024 bi = bi2;
2025 }
2026
2027 /* fail any reads if this device is non-operational */
2028 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2029 test_bit(R5_ReadError, &sh->dev[i].flags)) {
2030 bi = sh->dev[i].toread;
2031 sh->dev[i].toread = NULL;
2032 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2033 wake_up(&conf->wait_for_overlap);
2034 if (bi) to_read--;
2035 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2036 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2037 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2038 if (--bi->bi_phys_segments == 0) {
2039 bi->bi_next = return_bi;
2040 return_bi = bi;
2041 }
2042 bi = nextbi;
2043 }
2044 }
2045 spin_unlock_irq(&conf->device_lock);
2046 if (bitmap_end)
2047 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2048 STRIPE_SECTORS, 0, 0);
2049 }
2050 }
2051 if (failed > 2 && syncing) {
2052 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2979 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2053 clear_bit(STRIPE_SYNCING, &sh->state); 2980 clear_bit(STRIPE_SYNCING, &sh->state);
2054 syncing = 0; 2981 s.syncing = 0;
2055 } 2982 }
2056 2983
2057 /* 2984 /*
@@ -2059,279 +2986,41 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2059 * are safe, or on a failed drive 2986 * are safe, or on a failed drive
2060 */ 2987 */
2061 pdev = &sh->dev[pd_idx]; 2988 pdev = &sh->dev[pd_idx];
2062 p_failed = (failed >= 1 && failed_num[0] == pd_idx) 2989 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
2063 || (failed >= 2 && failed_num[1] == pd_idx); 2990 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
2064 qdev = &sh->dev[qd_idx]; 2991 qdev = &sh->dev[r6s.qd_idx];
2065 q_failed = (failed >= 1 && failed_num[0] == qd_idx) 2992 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx)
2066 || (failed >= 2 && failed_num[1] == qd_idx); 2993 || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx);
2067 2994
2068 if ( written && 2995 if ( s.written &&
2069 ( p_failed || ((test_bit(R5_Insync, &pdev->flags) 2996 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
2070 && !test_bit(R5_LOCKED, &pdev->flags) 2997 && !test_bit(R5_LOCKED, &pdev->flags)
2071 && test_bit(R5_UPTODATE, &pdev->flags))) ) && 2998 && test_bit(R5_UPTODATE, &pdev->flags)))) &&
2072 ( q_failed || ((test_bit(R5_Insync, &qdev->flags) 2999 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
2073 && !test_bit(R5_LOCKED, &qdev->flags) 3000 && !test_bit(R5_LOCKED, &qdev->flags)
2074 && test_bit(R5_UPTODATE, &qdev->flags))) ) ) { 3001 && test_bit(R5_UPTODATE, &qdev->flags)))))
2075 /* any written block on an uptodate or failed drive can be 3002 handle_completed_write_requests(conf, sh, disks, &return_bi);
2076 * returned. Note that if we 'wrote' to a failed drive,
2077 * it will be UPTODATE, but never LOCKED, so we don't need
2078 * to test 'failed' directly.
2079 */
2080 for (i=disks; i--; )
2081 if (sh->dev[i].written) {
2082 dev = &sh->dev[i];
2083 if (!test_bit(R5_LOCKED, &dev->flags) &&
2084 test_bit(R5_UPTODATE, &dev->flags) ) {
2085 /* We can return any write requests */
2086 int bitmap_end = 0;
2087 struct bio *wbi, *wbi2;
2088 PRINTK("Return write for stripe %llu disc %d\n",
2089 (unsigned long long)sh->sector, i);
2090 spin_lock_irq(&conf->device_lock);
2091 wbi = dev->written;
2092 dev->written = NULL;
2093 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
2094 wbi2 = r5_next_bio(wbi, dev->sector);
2095 if (--wbi->bi_phys_segments == 0) {
2096 md_write_end(conf->mddev);
2097 wbi->bi_next = return_bi;
2098 return_bi = wbi;
2099 }
2100 wbi = wbi2;
2101 }
2102 if (dev->towrite == NULL)
2103 bitmap_end = 1;
2104 spin_unlock_irq(&conf->device_lock);
2105 if (bitmap_end)
2106 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2107 STRIPE_SECTORS,
2108 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
2109 }
2110 }
2111 }
2112 3003
2113 /* Now we might consider reading some blocks, either to check/generate 3004 /* Now we might consider reading some blocks, either to check/generate
2114 * parity, or to satisfy requests 3005 * parity, or to satisfy requests
2115 * or to load a block that is being partially written. 3006 * or to load a block that is being partially written.
2116 */ 3007 */
2117 if (to_read || non_overwrite || (to_write && failed) || 3008 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
2118 (syncing && (uptodate < disks)) || expanding) { 3009 (s.syncing && (s.uptodate < disks)) || s.expanding)
2119 for (i=disks; i--;) { 3010 handle_issuing_new_read_requests6(sh, &s, &r6s, disks);
2120 dev = &sh->dev[i];
2121 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2122 (dev->toread ||
2123 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2124 syncing ||
2125 expanding ||
2126 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
2127 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
2128 )
2129 ) {
2130 /* we would like to get this block, possibly
2131 * by computing it, but we might not be able to
2132 */
2133 if (uptodate == disks-1) {
2134 PRINTK("Computing stripe %llu block %d\n",
2135 (unsigned long long)sh->sector, i);
2136 compute_block_1(sh, i, 0);
2137 uptodate++;
2138 } else if ( uptodate == disks-2 && failed >= 2 ) {
2139 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
2140 int other;
2141 for (other=disks; other--;) {
2142 if ( other == i )
2143 continue;
2144 if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
2145 break;
2146 }
2147 BUG_ON(other < 0);
2148 PRINTK("Computing stripe %llu blocks %d,%d\n",
2149 (unsigned long long)sh->sector, i, other);
2150 compute_block_2(sh, i, other);
2151 uptodate += 2;
2152 } else if (test_bit(R5_Insync, &dev->flags)) {
2153 set_bit(R5_LOCKED, &dev->flags);
2154 set_bit(R5_Wantread, &dev->flags);
2155 locked++;
2156 PRINTK("Reading block %d (sync=%d)\n",
2157 i, syncing);
2158 }
2159 }
2160 }
2161 set_bit(STRIPE_HANDLE, &sh->state);
2162 }
2163 3011
2164 /* now to consider writing and what else, if anything should be read */ 3012 /* now to consider writing and what else, if anything should be read */
2165 if (to_write) { 3013 if (s.to_write)
2166 int rcw=0, must_compute=0; 3014 handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks);
2167 for (i=disks ; i--;) {
2168 dev = &sh->dev[i];
2169 /* Would I have to read this buffer for reconstruct_write */
2170 if (!test_bit(R5_OVERWRITE, &dev->flags)
2171 && i != pd_idx && i != qd_idx
2172 && (!test_bit(R5_LOCKED, &dev->flags)
2173 ) &&
2174 !test_bit(R5_UPTODATE, &dev->flags)) {
2175 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2176 else {
2177 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
2178 must_compute++;
2179 }
2180 }
2181 }
2182 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
2183 (unsigned long long)sh->sector, rcw, must_compute);
2184 set_bit(STRIPE_HANDLE, &sh->state);
2185
2186 if (rcw > 0)
2187 /* want reconstruct write, but need to get some data */
2188 for (i=disks; i--;) {
2189 dev = &sh->dev[i];
2190 if (!test_bit(R5_OVERWRITE, &dev->flags)
2191 && !(failed == 0 && (i == pd_idx || i == qd_idx))
2192 && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2193 test_bit(R5_Insync, &dev->flags)) {
2194 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
2195 {
2196 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
2197 (unsigned long long)sh->sector, i);
2198 set_bit(R5_LOCKED, &dev->flags);
2199 set_bit(R5_Wantread, &dev->flags);
2200 locked++;
2201 } else {
2202 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
2203 (unsigned long long)sh->sector, i);
2204 set_bit(STRIPE_DELAYED, &sh->state);
2205 set_bit(STRIPE_HANDLE, &sh->state);
2206 }
2207 }
2208 }
2209 /* now if nothing is locked, and if we have enough data, we can start a write request */
2210 if (locked == 0 && rcw == 0 &&
2211 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2212 if ( must_compute > 0 ) {
2213 /* We have failed blocks and need to compute them */
2214 switch ( failed ) {
2215 case 0: BUG();
2216 case 1: compute_block_1(sh, failed_num[0], 0); break;
2217 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
2218 default: BUG(); /* This request should have been failed? */
2219 }
2220 }
2221
2222 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
2223 compute_parity6(sh, RECONSTRUCT_WRITE);
2224 /* now every locked buffer is ready to be written */
2225 for (i=disks; i--;)
2226 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2227 PRINTK("Writing stripe %llu block %d\n",
2228 (unsigned long long)sh->sector, i);
2229 locked++;
2230 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2231 }
2232 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2233 set_bit(STRIPE_INSYNC, &sh->state);
2234
2235 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2236 atomic_dec(&conf->preread_active_stripes);
2237 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
2238 md_wakeup_thread(conf->mddev->thread);
2239 }
2240 }
2241 }
2242 3015
2243 /* maybe we need to check and possibly fix the parity for this stripe 3016 /* maybe we need to check and possibly fix the parity for this stripe
2244 * Any reads will already have been scheduled, so we just see if enough data 3017 * Any reads will already have been scheduled, so we just see if enough
2245 * is available 3018 * data is available
2246 */ 3019 */
2247 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) { 3020 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
2248 int update_p = 0, update_q = 0; 3021 handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
2249 struct r5dev *dev;
2250
2251 set_bit(STRIPE_HANDLE, &sh->state);
2252
2253 BUG_ON(failed>2);
2254 BUG_ON(uptodate < disks);
2255 /* Want to check and possibly repair P and Q.
2256 * However there could be one 'failed' device, in which
2257 * case we can only check one of them, possibly using the
2258 * other to generate missing data
2259 */
2260
2261 /* If !tmp_page, we cannot do the calculations,
2262 * but as we have set STRIPE_HANDLE, we will soon be called
2263 * by stripe_handle with a tmp_page - just wait until then.
2264 */
2265 if (tmp_page) {
2266 if (failed == q_failed) {
2267 /* The only possible failed device holds 'Q', so it makes
2268 * sense to check P (If anything else were failed, we would
2269 * have used P to recreate it).
2270 */
2271 compute_block_1(sh, pd_idx, 1);
2272 if (!page_is_zero(sh->dev[pd_idx].page)) {
2273 compute_block_1(sh,pd_idx,0);
2274 update_p = 1;
2275 }
2276 }
2277 if (!q_failed && failed < 2) {
2278 /* q is not failed, and we didn't use it to generate
2279 * anything, so it makes sense to check it
2280 */
2281 memcpy(page_address(tmp_page),
2282 page_address(sh->dev[qd_idx].page),
2283 STRIPE_SIZE);
2284 compute_parity6(sh, UPDATE_PARITY);
2285 if (memcmp(page_address(tmp_page),
2286 page_address(sh->dev[qd_idx].page),
2287 STRIPE_SIZE)!= 0) {
2288 clear_bit(STRIPE_INSYNC, &sh->state);
2289 update_q = 1;
2290 }
2291 }
2292 if (update_p || update_q) {
2293 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2294 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2295 /* don't try to repair!! */
2296 update_p = update_q = 0;
2297 }
2298
2299 /* now write out any block on a failed drive,
2300 * or P or Q if they need it
2301 */
2302 3022
2303 if (failed == 2) { 3023 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2304 dev = &sh->dev[failed_num[1]];
2305 locked++;
2306 set_bit(R5_LOCKED, &dev->flags);
2307 set_bit(R5_Wantwrite, &dev->flags);
2308 }
2309 if (failed >= 1) {
2310 dev = &sh->dev[failed_num[0]];
2311 locked++;
2312 set_bit(R5_LOCKED, &dev->flags);
2313 set_bit(R5_Wantwrite, &dev->flags);
2314 }
2315
2316 if (update_p) {
2317 dev = &sh->dev[pd_idx];
2318 locked ++;
2319 set_bit(R5_LOCKED, &dev->flags);
2320 set_bit(R5_Wantwrite, &dev->flags);
2321 }
2322 if (update_q) {
2323 dev = &sh->dev[qd_idx];
2324 locked++;
2325 set_bit(R5_LOCKED, &dev->flags);
2326 set_bit(R5_Wantwrite, &dev->flags);
2327 }
2328 clear_bit(STRIPE_DEGRADED, &sh->state);
2329
2330 set_bit(STRIPE_INSYNC, &sh->state);
2331 }
2332 }
2333
2334 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2335 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3024 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
2336 clear_bit(STRIPE_SYNCING, &sh->state); 3025 clear_bit(STRIPE_SYNCING, &sh->state);
2337 } 3026 }
@@ -2339,9 +3028,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2339 /* If the failed drives are just a ReadError, then we might need 3028 /* If the failed drives are just a ReadError, then we might need
2340 * to progress the repair/check process 3029 * to progress the repair/check process
2341 */ 3030 */
2342 if (failed <= 2 && ! conf->mddev->ro) 3031 if (s.failed <= 2 && !conf->mddev->ro)
2343 for (i=0; i<failed;i++) { 3032 for (i = 0; i < s.failed; i++) {
2344 dev = &sh->dev[failed_num[i]]; 3033 dev = &sh->dev[r6s.failed_num[i]];
2345 if (test_bit(R5_ReadError, &dev->flags) 3034 if (test_bit(R5_ReadError, &dev->flags)
2346 && !test_bit(R5_LOCKED, &dev->flags) 3035 && !test_bit(R5_LOCKED, &dev->flags)
2347 && test_bit(R5_UPTODATE, &dev->flags) 3036 && test_bit(R5_UPTODATE, &dev->flags)
@@ -2358,7 +3047,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2358 } 3047 }
2359 } 3048 }
2360 3049
2361 if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 3050 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
2362 /* Need to write out all blocks after computing P&Q */ 3051 /* Need to write out all blocks after computing P&Q */
2363 sh->disks = conf->raid_disks; 3052 sh->disks = conf->raid_disks;
2364 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 3053 sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
@@ -2366,82 +3055,24 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2366 compute_parity6(sh, RECONSTRUCT_WRITE); 3055 compute_parity6(sh, RECONSTRUCT_WRITE);
2367 for (i = conf->raid_disks ; i-- ; ) { 3056 for (i = conf->raid_disks ; i-- ; ) {
2368 set_bit(R5_LOCKED, &sh->dev[i].flags); 3057 set_bit(R5_LOCKED, &sh->dev[i].flags);
2369 locked++; 3058 s.locked++;
2370 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3059 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2371 } 3060 }
2372 clear_bit(STRIPE_EXPANDING, &sh->state); 3061 clear_bit(STRIPE_EXPANDING, &sh->state);
2373 } else if (expanded) { 3062 } else if (s.expanded) {
2374 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3063 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2375 atomic_dec(&conf->reshape_stripes); 3064 atomic_dec(&conf->reshape_stripes);
2376 wake_up(&conf->wait_for_overlap); 3065 wake_up(&conf->wait_for_overlap);
2377 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3066 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
2378 } 3067 }
2379 3068
2380 if (expanding && locked == 0) { 3069 if (s.expanding && s.locked == 0)
2381 /* We have read all the blocks in this stripe and now we need to 3070 handle_stripe_expansion(conf, sh, &r6s);
2382 * copy some of them into a target stripe for expand.
2383 */
2384 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2385 for (i = 0; i < sh->disks ; i++)
2386 if (i != pd_idx && i != qd_idx) {
2387 int dd_idx2, pd_idx2, j;
2388 struct stripe_head *sh2;
2389
2390 sector_t bn = compute_blocknr(sh, i);
2391 sector_t s = raid5_compute_sector(
2392 bn, conf->raid_disks,
2393 conf->raid_disks - conf->max_degraded,
2394 &dd_idx2, &pd_idx2, conf);
2395 sh2 = get_active_stripe(conf, s,
2396 conf->raid_disks,
2397 pd_idx2, 1);
2398 if (sh2 == NULL)
2399 /* so for only the early blocks of
2400 * this stripe have been requests.
2401 * When later blocks get requests, we
2402 * will try again
2403 */
2404 continue;
2405 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2406 test_bit(R5_Expanded,
2407 &sh2->dev[dd_idx2].flags)) {
2408 /* must have already done this block */
2409 release_stripe(sh2);
2410 continue;
2411 }
2412 memcpy(page_address(sh2->dev[dd_idx2].page),
2413 page_address(sh->dev[i].page),
2414 STRIPE_SIZE);
2415 set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags);
2416 set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags);
2417 for (j = 0 ; j < conf->raid_disks ; j++)
2418 if (j != sh2->pd_idx &&
2419 j != raid6_next_disk(sh2->pd_idx,
2420 sh2->disks) &&
2421 !test_bit(R5_Expanded,
2422 &sh2->dev[j].flags))
2423 break;
2424 if (j == conf->raid_disks) {
2425 set_bit(STRIPE_EXPAND_READY,
2426 &sh2->state);
2427 set_bit(STRIPE_HANDLE, &sh2->state);
2428 }
2429 release_stripe(sh2);
2430 }
2431 }
2432 3071
2433 spin_unlock(&sh->lock); 3072 spin_unlock(&sh->lock);
2434 3073
2435 while ((bi=return_bi)) { 3074 return_io(return_bi);
2436 int bytes = bi->bi_size;
2437 3075
2438 return_bi = bi->bi_next;
2439 bi->bi_next = NULL;
2440 bi->bi_size = 0;
2441 bi->bi_end_io(bi, bytes,
2442 test_bit(BIO_UPTODATE, &bi->bi_flags)
2443 ? 0 : -EIO);
2444 }
2445 for (i=disks; i-- ;) { 3076 for (i=disks; i-- ;) {
2446 int rw; 3077 int rw;
2447 struct bio *bi; 3078 struct bio *bi;
@@ -2470,11 +3101,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2470 rcu_read_unlock(); 3101 rcu_read_unlock();
2471 3102
2472 if (rdev) { 3103 if (rdev) {
2473 if (syncing || expanding || expanded) 3104 if (s.syncing || s.expanding || s.expanded)
2474 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 3105 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
2475 3106
2476 bi->bi_bdev = rdev->bdev; 3107 bi->bi_bdev = rdev->bdev;
2477 PRINTK("for %llu schedule op %ld on disc %d\n", 3108 pr_debug("for %llu schedule op %ld on disc %d\n",
2478 (unsigned long long)sh->sector, bi->bi_rw, i); 3109 (unsigned long long)sh->sector, bi->bi_rw, i);
2479 atomic_inc(&sh->count); 3110 atomic_inc(&sh->count);
2480 bi->bi_sector = sh->sector + rdev->data_offset; 3111 bi->bi_sector = sh->sector + rdev->data_offset;
@@ -2494,7 +3125,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2494 } else { 3125 } else {
2495 if (rw == WRITE) 3126 if (rw == WRITE)
2496 set_bit(STRIPE_DEGRADED, &sh->state); 3127 set_bit(STRIPE_DEGRADED, &sh->state);
2497 PRINTK("skip op %ld on disc %d for sector %llu\n", 3128 pr_debug("skip op %ld on disc %d for sector %llu\n",
2498 bi->bi_rw, i, (unsigned long long)sh->sector); 3129 bi->bi_rw, i, (unsigned long long)sh->sector);
2499 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3130 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2500 set_bit(STRIPE_HANDLE, &sh->state); 3131 set_bit(STRIPE_HANDLE, &sh->state);
@@ -2738,7 +3369,7 @@ static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
2738 } 3369 }
2739 3370
2740 3371
2741 PRINTK("raid5_align_endio : io error...handing IO for a retry\n"); 3372 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
2742 3373
2743 add_bio_to_retry(raid_bi, conf); 3374 add_bio_to_retry(raid_bi, conf);
2744 return 0; 3375 return 0;
@@ -2776,7 +3407,7 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
2776 mdk_rdev_t *rdev; 3407 mdk_rdev_t *rdev;
2777 3408
2778 if (!in_chunk_boundary(mddev, raid_bio)) { 3409 if (!in_chunk_boundary(mddev, raid_bio)) {
2779 PRINTK("chunk_aligned_read : non aligned\n"); 3410 pr_debug("chunk_aligned_read : non aligned\n");
2780 return 0; 3411 return 0;
2781 } 3412 }
2782 /* 3413 /*
@@ -2900,7 +3531,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
2900 3531
2901 new_sector = raid5_compute_sector(logical_sector, disks, data_disks, 3532 new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
2902 &dd_idx, &pd_idx, conf); 3533 &dd_idx, &pd_idx, conf);
2903 PRINTK("raid5: make_request, sector %llu logical %llu\n", 3534 pr_debug("raid5: make_request, sector %llu logical %llu\n",
2904 (unsigned long long)new_sector, 3535 (unsigned long long)new_sector,
2905 (unsigned long long)logical_sector); 3536 (unsigned long long)logical_sector);
2906 3537
@@ -3273,7 +3904,7 @@ static void raid5d (mddev_t *mddev)
3273 raid5_conf_t *conf = mddev_to_conf(mddev); 3904 raid5_conf_t *conf = mddev_to_conf(mddev);
3274 int handled; 3905 int handled;
3275 3906
3276 PRINTK("+++ raid5d active\n"); 3907 pr_debug("+++ raid5d active\n");
3277 3908
3278 md_check_recovery(mddev); 3909 md_check_recovery(mddev);
3279 3910
@@ -3308,8 +3939,10 @@ static void raid5d (mddev_t *mddev)
3308 handled++; 3939 handled++;
3309 } 3940 }
3310 3941
3311 if (list_empty(&conf->handle_list)) 3942 if (list_empty(&conf->handle_list)) {
3943 async_tx_issue_pending_all();
3312 break; 3944 break;
3945 }
3313 3946
3314 first = conf->handle_list.next; 3947 first = conf->handle_list.next;
3315 sh = list_entry(first, struct stripe_head, lru); 3948 sh = list_entry(first, struct stripe_head, lru);
@@ -3325,13 +3958,13 @@ static void raid5d (mddev_t *mddev)
3325 3958
3326 spin_lock_irq(&conf->device_lock); 3959 spin_lock_irq(&conf->device_lock);
3327 } 3960 }
3328 PRINTK("%d stripes handled\n", handled); 3961 pr_debug("%d stripes handled\n", handled);
3329 3962
3330 spin_unlock_irq(&conf->device_lock); 3963 spin_unlock_irq(&conf->device_lock);
3331 3964
3332 unplug_slaves(mddev); 3965 unplug_slaves(mddev);
3333 3966
3334 PRINTK("--- raid5d inactive\n"); 3967 pr_debug("--- raid5d inactive\n");
3335} 3968}
3336 3969
3337static ssize_t 3970static ssize_t
@@ -3507,7 +4140,7 @@ static int run(mddev_t *mddev)
3507 atomic_set(&conf->preread_active_stripes, 0); 4140 atomic_set(&conf->preread_active_stripes, 0);
3508 atomic_set(&conf->active_aligned_reads, 0); 4141 atomic_set(&conf->active_aligned_reads, 0);
3509 4142
3510 PRINTK("raid5: run(%s) called.\n", mdname(mddev)); 4143 pr_debug("raid5: run(%s) called.\n", mdname(mddev));
3511 4144
3512 ITERATE_RDEV(mddev,rdev,tmp) { 4145 ITERATE_RDEV(mddev,rdev,tmp) {
3513 raid_disk = rdev->raid_disk; 4146 raid_disk = rdev->raid_disk;
@@ -3690,7 +4323,7 @@ static int stop(mddev_t *mddev)
3690 return 0; 4323 return 0;
3691} 4324}
3692 4325
3693#if RAID5_DEBUG 4326#ifdef DEBUG
3694static void print_sh (struct seq_file *seq, struct stripe_head *sh) 4327static void print_sh (struct seq_file *seq, struct stripe_head *sh)
3695{ 4328{
3696 int i; 4329 int i;
@@ -3737,7 +4370,7 @@ static void status (struct seq_file *seq, mddev_t *mddev)
3737 conf->disks[i].rdev && 4370 conf->disks[i].rdev &&
3738 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 4371 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
3739 seq_printf (seq, "]"); 4372 seq_printf (seq, "]");
3740#if RAID5_DEBUG 4373#ifdef DEBUG
3741 seq_printf (seq, "\n"); 4374 seq_printf (seq, "\n");
3742 printall(seq, conf); 4375 printall(seq, conf);
3743#endif 4376#endif
diff --git a/drivers/md/xor.c b/drivers/md/xor.c
deleted file mode 100644
index 324897c4be4e..000000000000
--- a/drivers/md/xor.c
+++ /dev/null
@@ -1,154 +0,0 @@
1/*
2 * xor.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 1996, 1997, 1998, 1999, 2000,
5 * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
6 *
7 * Dispatch optimized RAID-5 checksumming functions.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2, or (at your option)
12 * any later version.
13 *
14 * You should have received a copy of the GNU General Public License
15 * (for example /usr/src/linux/COPYING); if not, write to the Free
16 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#define BH_TRACE 0
20#include <linux/module.h>
21#include <linux/raid/md.h>
22#include <linux/raid/xor.h>
23#include <asm/xor.h>
24
25/* The xor routines to use. */
26static struct xor_block_template *active_template;
27
28void
29xor_block(unsigned int count, unsigned int bytes, void **ptr)
30{
31 unsigned long *p0, *p1, *p2, *p3, *p4;
32
33 p0 = (unsigned long *) ptr[0];
34 p1 = (unsigned long *) ptr[1];
35 if (count == 2) {
36 active_template->do_2(bytes, p0, p1);
37 return;
38 }
39
40 p2 = (unsigned long *) ptr[2];
41 if (count == 3) {
42 active_template->do_3(bytes, p0, p1, p2);
43 return;
44 }
45
46 p3 = (unsigned long *) ptr[3];
47 if (count == 4) {
48 active_template->do_4(bytes, p0, p1, p2, p3);
49 return;
50 }
51
52 p4 = (unsigned long *) ptr[4];
53 active_template->do_5(bytes, p0, p1, p2, p3, p4);
54}
55
56/* Set of all registered templates. */
57static struct xor_block_template *template_list;
58
59#define BENCH_SIZE (PAGE_SIZE)
60
61static void
62do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
63{
64 int speed;
65 unsigned long now;
66 int i, count, max;
67
68 tmpl->next = template_list;
69 template_list = tmpl;
70
71 /*
72 * Count the number of XORs done during a whole jiffy, and use
73 * this to calculate the speed of checksumming. We use a 2-page
74 * allocation to have guaranteed color L1-cache layout.
75 */
76 max = 0;
77 for (i = 0; i < 5; i++) {
78 now = jiffies;
79 count = 0;
80 while (jiffies == now) {
81 mb();
82 tmpl->do_2(BENCH_SIZE, b1, b2);
83 mb();
84 count++;
85 mb();
86 }
87 if (count > max)
88 max = count;
89 }
90
91 speed = max * (HZ * BENCH_SIZE / 1024);
92 tmpl->speed = speed;
93
94 printk(" %-10s: %5d.%03d MB/sec\n", tmpl->name,
95 speed / 1000, speed % 1000);
96}
97
98static int
99calibrate_xor_block(void)
100{
101 void *b1, *b2;
102 struct xor_block_template *f, *fastest;
103
104 b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
105 if (! b1) {
106 printk("raid5: Yikes! No memory available.\n");
107 return -ENOMEM;
108 }
109 b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
110
111 /*
112 * If this arch/cpu has a short-circuited selection, don't loop through all
113 * the possible functions, just test the best one
114 */
115
116 fastest = NULL;
117
118#ifdef XOR_SELECT_TEMPLATE
119 fastest = XOR_SELECT_TEMPLATE(fastest);
120#endif
121
122#define xor_speed(templ) do_xor_speed((templ), b1, b2)
123
124 if (fastest) {
125 printk(KERN_INFO "raid5: automatically using best checksumming function: %s\n",
126 fastest->name);
127 xor_speed(fastest);
128 } else {
129 printk(KERN_INFO "raid5: measuring checksumming speed\n");
130 XOR_TRY_TEMPLATES;
131 fastest = template_list;
132 for (f = fastest; f; f = f->next)
133 if (f->speed > fastest->speed)
134 fastest = f;
135 }
136
137 printk("raid5: using function: %s (%d.%03d MB/sec)\n",
138 fastest->name, fastest->speed / 1000, fastest->speed % 1000);
139
140#undef xor_speed
141
142 free_pages((unsigned long)b1, 2);
143
144 active_template = fastest;
145 return 0;
146}
147
148static __exit void xor_exit(void) { }
149
150EXPORT_SYMBOL(xor_block);
151MODULE_LICENSE("GPL");
152
153module_init(calibrate_xor_block);
154module_exit(xor_exit);