Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig      2
-rw-r--r--  drivers/md/Makefile     4
-rw-r--r--  drivers/md/md.c         2
-rw-r--r--  drivers/md/raid5.c   2727
-rw-r--r--  drivers/md/xor.c      154
5 files changed, 1685 insertions, 1204 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 466909f38d98..64bf3a81db93 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -109,6 +109,8 @@ config MD_RAID10
109config MD_RAID456 109config MD_RAID456
110 tristate "RAID-4/RAID-5/RAID-6 mode" 110 tristate "RAID-4/RAID-5/RAID-6 mode"
111 depends on BLK_DEV_MD 111 depends on BLK_DEV_MD
112 select ASYNC_MEMCPY
113 select ASYNC_XOR
112 ---help--- 114 ---help---
113 A RAID-5 set of N drives with a capacity of C MB per drive provides 115 A RAID-5 set of N drives with a capacity of C MB per drive provides
114 the capacity of C * (N - 1) MB, and protects against a failure 116 the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 2c45d7683ae9..c49366cdc05d 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,7 +18,7 @@ raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \
18hostprogs-y := mktables 18hostprogs-y := mktables
19 19
20# Note: link order is important. All raid personalities 20# Note: link order is important. All raid personalities
21# and xor.o must come before md.o, as they each initialise 21# and must come before md.o, as they each initialise
22# themselves, and md.o may use the personalities when it 22# themselves, and md.o may use the personalities when it
23# auto-initialised. 23# auto-initialised.
24 24
@@ -26,7 +26,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o
26obj-$(CONFIG_MD_RAID0) += raid0.o 26obj-$(CONFIG_MD_RAID0) += raid0.o
27obj-$(CONFIG_MD_RAID1) += raid1.o 27obj-$(CONFIG_MD_RAID1) += raid1.o
28obj-$(CONFIG_MD_RAID10) += raid10.o 28obj-$(CONFIG_MD_RAID10) += raid10.o
29obj-$(CONFIG_MD_RAID456) += raid456.o xor.o 29obj-$(CONFIG_MD_RAID456) += raid456.o
30obj-$(CONFIG_MD_MULTIPATH) += multipath.o 30obj-$(CONFIG_MD_MULTIPATH) += multipath.o
31obj-$(CONFIG_MD_FAULTY) += faulty.o 31obj-$(CONFIG_MD_FAULTY) += faulty.o
32obj-$(CONFIG_BLK_DEV_MD) += md-mod.o 32obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1c54f3c1cca7..33beaa7da085 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5814,7 +5814,7 @@ static __exit void md_exit(void)
5814 } 5814 }
5815} 5815}
5816 5816
5817module_init(md_init) 5817subsys_initcall(md_init);
5818module_exit(md_exit) 5818module_exit(md_exit)
5819 5819
5820static int get_ro(char *buffer, struct kernel_param *kp) 5820static int get_ro(char *buffer, struct kernel_param *kp)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 061375ee6592..0b66afef2d82 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -52,6 +52,7 @@
52#include "raid6.h" 52#include "raid6.h"
53 53
54#include <linux/raid/bitmap.h> 54#include <linux/raid/bitmap.h>
55#include <linux/async_tx.h>
55 56
56/* 57/*
57 * Stripe cache 58 * Stripe cache
@@ -80,7 +81,6 @@
80/* 81/*
81 * The following can be used to debug the driver 82 * The following can be used to debug the driver
82 */ 83 */
83#define RAID5_DEBUG 0
84#define RAID5_PARANOIA 1 84#define RAID5_PARANOIA 1
85#if RAID5_PARANOIA && defined(CONFIG_SMP) 85#if RAID5_PARANOIA && defined(CONFIG_SMP)
86# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock) 86# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
@@ -88,8 +88,7 @@
88# define CHECK_DEVLOCK() 88# define CHECK_DEVLOCK()
89#endif 89#endif
90 90
91#define PRINTK(x...) ((void)(RAID5_DEBUG && printk(x))) 91#ifdef DEBUG
92#if RAID5_DEBUG
93#define inline 92#define inline
94#define __inline__ 93#define __inline__
95#endif 94#endif
@@ -104,6 +103,23 @@ static inline int raid6_next_disk(int disk, int raid_disks)
104 disk++; 103 disk++;
105 return (disk < raid_disks) ? disk : 0; 104 return (disk < raid_disks) ? disk : 0;
106} 105}
106
107static void return_io(struct bio *return_bi)
108{
109 struct bio *bi = return_bi;
110 while (bi) {
111 int bytes = bi->bi_size;
112
113 return_bi = bi->bi_next;
114 bi->bi_next = NULL;
115 bi->bi_size = 0;
116 bi->bi_end_io(bi, bytes,
117 test_bit(BIO_UPTODATE, &bi->bi_flags)
118 ? 0 : -EIO);
119 bi = return_bi;
120 }
121}
122
107static void print_raid5_conf (raid5_conf_t *conf); 123static void print_raid5_conf (raid5_conf_t *conf);
108 124
109static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 125static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
@@ -125,6 +141,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
125 } 141 }
126 md_wakeup_thread(conf->mddev->thread); 142 md_wakeup_thread(conf->mddev->thread);
127 } else { 143 } else {
144 BUG_ON(sh->ops.pending);
128 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 145 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
129 atomic_dec(&conf->preread_active_stripes); 146 atomic_dec(&conf->preread_active_stripes);
130 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 147 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -152,7 +169,8 @@ static void release_stripe(struct stripe_head *sh)
152 169
153static inline void remove_hash(struct stripe_head *sh) 170static inline void remove_hash(struct stripe_head *sh)
154{ 171{
155 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); 172 pr_debug("remove_hash(), stripe %llu\n",
173 (unsigned long long)sh->sector);
156 174
157 hlist_del_init(&sh->hash); 175 hlist_del_init(&sh->hash);
158} 176}
@@ -161,7 +179,8 @@ static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
161{ 179{
162 struct hlist_head *hp = stripe_hash(conf, sh->sector); 180 struct hlist_head *hp = stripe_hash(conf, sh->sector);
163 181
164 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); 182 pr_debug("insert_hash(), stripe %llu\n",
183 (unsigned long long)sh->sector);
165 184
166 CHECK_DEVLOCK(); 185 CHECK_DEVLOCK();
167 hlist_add_head(&sh->hash, hp); 186 hlist_add_head(&sh->hash, hp);
@@ -224,9 +243,10 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
224 243
225 BUG_ON(atomic_read(&sh->count) != 0); 244 BUG_ON(atomic_read(&sh->count) != 0);
226 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 245 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
227 246 BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete);
247
228 CHECK_DEVLOCK(); 248 CHECK_DEVLOCK();
229 PRINTK("init_stripe called, stripe %llu\n", 249 pr_debug("init_stripe called, stripe %llu\n",
230 (unsigned long long)sh->sector); 250 (unsigned long long)sh->sector);
231 251
232 remove_hash(sh); 252 remove_hash(sh);
@@ -240,11 +260,11 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
240 for (i = sh->disks; i--; ) { 260 for (i = sh->disks; i--; ) {
241 struct r5dev *dev = &sh->dev[i]; 261 struct r5dev *dev = &sh->dev[i];
242 262
243 if (dev->toread || dev->towrite || dev->written || 263 if (dev->toread || dev->read || dev->towrite || dev->written ||
244 test_bit(R5_LOCKED, &dev->flags)) { 264 test_bit(R5_LOCKED, &dev->flags)) {
245 printk("sector=%llx i=%d %p %p %p %d\n", 265 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
246 (unsigned long long)sh->sector, i, dev->toread, 266 (unsigned long long)sh->sector, i, dev->toread,
247 dev->towrite, dev->written, 267 dev->read, dev->towrite, dev->written,
248 test_bit(R5_LOCKED, &dev->flags)); 268 test_bit(R5_LOCKED, &dev->flags));
249 BUG(); 269 BUG();
250 } 270 }
@@ -260,11 +280,11 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
260 struct hlist_node *hn; 280 struct hlist_node *hn;
261 281
262 CHECK_DEVLOCK(); 282 CHECK_DEVLOCK();
263 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); 283 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
264 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 284 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
265 if (sh->sector == sector && sh->disks == disks) 285 if (sh->sector == sector && sh->disks == disks)
266 return sh; 286 return sh;
267 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); 287 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
268 return NULL; 288 return NULL;
269} 289}
270 290
@@ -276,7 +296,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
276{ 296{
277 struct stripe_head *sh; 297 struct stripe_head *sh;
278 298
279 PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector); 299 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
280 300
281 spin_lock_irq(&conf->device_lock); 301 spin_lock_irq(&conf->device_lock);
282 302
@@ -324,6 +344,579 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
324 return sh; 344 return sh;
325} 345}
326 346
347/* test_and_ack_op() ensures that we only dequeue an operation once */
348#define test_and_ack_op(op, pend) \
349do { \
350 if (test_bit(op, &sh->ops.pending) && \
351 !test_bit(op, &sh->ops.complete)) { \
352 if (test_and_set_bit(op, &sh->ops.ack)) \
353 clear_bit(op, &pend); \
354 else \
355 ack++; \
356 } else \
357 clear_bit(op, &pend); \
358} while (0)
359
360/* find new work to run, do not resubmit work that is already
361 * in flight
362 */
363static unsigned long get_stripe_work(struct stripe_head *sh)
364{
365 unsigned long pending;
366 int ack = 0;
367
368 pending = sh->ops.pending;
369
370 test_and_ack_op(STRIPE_OP_BIOFILL, pending);
371 test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
372 test_and_ack_op(STRIPE_OP_PREXOR, pending);
373 test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
374 test_and_ack_op(STRIPE_OP_POSTXOR, pending);
375 test_and_ack_op(STRIPE_OP_CHECK, pending);
376 if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
377 ack++;
378
379 sh->ops.count -= ack;
380 BUG_ON(sh->ops.count < 0);
381
382 return pending;
383}
384
385static int
386raid5_end_read_request(struct bio *bi, unsigned int bytes_done, int error);
387static int
388raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error);
389
390static void ops_run_io(struct stripe_head *sh)
391{
392 raid5_conf_t *conf = sh->raid_conf;
393 int i, disks = sh->disks;
394
395 might_sleep();
396
397 for (i = disks; i--; ) {
398 int rw;
399 struct bio *bi;
400 mdk_rdev_t *rdev;
401 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
402 rw = WRITE;
403 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
404 rw = READ;
405 else
406 continue;
407
408 bi = &sh->dev[i].req;
409
410 bi->bi_rw = rw;
411 if (rw == WRITE)
412 bi->bi_end_io = raid5_end_write_request;
413 else
414 bi->bi_end_io = raid5_end_read_request;
415
416 rcu_read_lock();
417 rdev = rcu_dereference(conf->disks[i].rdev);
418 if (rdev && test_bit(Faulty, &rdev->flags))
419 rdev = NULL;
420 if (rdev)
421 atomic_inc(&rdev->nr_pending);
422 rcu_read_unlock();
423
424 if (rdev) {
425 if (test_bit(STRIPE_SYNCING, &sh->state) ||
426 test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
427 test_bit(STRIPE_EXPAND_READY, &sh->state))
428 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
429
430 bi->bi_bdev = rdev->bdev;
431 pr_debug("%s: for %llu schedule op %ld on disc %d\n",
432 __FUNCTION__, (unsigned long long)sh->sector,
433 bi->bi_rw, i);
434 atomic_inc(&sh->count);
435 bi->bi_sector = sh->sector + rdev->data_offset;
436 bi->bi_flags = 1 << BIO_UPTODATE;
437 bi->bi_vcnt = 1;
438 bi->bi_max_vecs = 1;
439 bi->bi_idx = 0;
440 bi->bi_io_vec = &sh->dev[i].vec;
441 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
442 bi->bi_io_vec[0].bv_offset = 0;
443 bi->bi_size = STRIPE_SIZE;
444 bi->bi_next = NULL;
445 if (rw == WRITE &&
446 test_bit(R5_ReWrite, &sh->dev[i].flags))
447 atomic_add(STRIPE_SECTORS,
448 &rdev->corrected_errors);
449 generic_make_request(bi);
450 } else {
451 if (rw == WRITE)
452 set_bit(STRIPE_DEGRADED, &sh->state);
453 pr_debug("skip op %ld on disc %d for sector %llu\n",
454 bi->bi_rw, i, (unsigned long long)sh->sector);
455 clear_bit(R5_LOCKED, &sh->dev[i].flags);
456 set_bit(STRIPE_HANDLE, &sh->state);
457 }
458 }
459}
460
461static struct dma_async_tx_descriptor *
462async_copy_data(int frombio, struct bio *bio, struct page *page,
463 sector_t sector, struct dma_async_tx_descriptor *tx)
464{
465 struct bio_vec *bvl;
466 struct page *bio_page;
467 int i;
468 int page_offset;
469
470 if (bio->bi_sector >= sector)
471 page_offset = (signed)(bio->bi_sector - sector) * 512;
472 else
473 page_offset = (signed)(sector - bio->bi_sector) * -512;
474 bio_for_each_segment(bvl, bio, i) {
475 int len = bio_iovec_idx(bio, i)->bv_len;
476 int clen;
477 int b_offset = 0;
478
479 if (page_offset < 0) {
480 b_offset = -page_offset;
481 page_offset += b_offset;
482 len -= b_offset;
483 }
484
485 if (len > 0 && page_offset + len > STRIPE_SIZE)
486 clen = STRIPE_SIZE - page_offset;
487 else
488 clen = len;
489
490 if (clen > 0) {
491 b_offset += bio_iovec_idx(bio, i)->bv_offset;
492 bio_page = bio_iovec_idx(bio, i)->bv_page;
493 if (frombio)
494 tx = async_memcpy(page, bio_page, page_offset,
495 b_offset, clen,
496 ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_SRC,
497 tx, NULL, NULL);
498 else
499 tx = async_memcpy(bio_page, page, b_offset,
500 page_offset, clen,
501 ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_DST,
502 tx, NULL, NULL);
503 }
504 if (clen < len) /* hit end of page */
505 break;
506 page_offset += len;
507 }
508
509 return tx;
510}
511
512static void ops_complete_biofill(void *stripe_head_ref)
513{
514 struct stripe_head *sh = stripe_head_ref;
515 struct bio *return_bi = NULL;
516 raid5_conf_t *conf = sh->raid_conf;
517 int i, more_to_read = 0;
518
519 pr_debug("%s: stripe %llu\n", __FUNCTION__,
520 (unsigned long long)sh->sector);
521
522 /* clear completed biofills */
523 for (i = sh->disks; i--; ) {
524 struct r5dev *dev = &sh->dev[i];
525 /* check if this stripe has new incoming reads */
526 if (dev->toread)
527 more_to_read++;
528
529 /* acknowledge completion of a biofill operation */
530 /* and check if we need to reply to a read request
531 */
532 if (test_bit(R5_Wantfill, &dev->flags) && !dev->toread) {
533 struct bio *rbi, *rbi2;
534 clear_bit(R5_Wantfill, &dev->flags);
535
536 /* The access to dev->read is outside of the
537 * spin_lock_irq(&conf->device_lock), but is protected
538 * by the STRIPE_OP_BIOFILL pending bit
539 */
540 BUG_ON(!dev->read);
541 rbi = dev->read;
542 dev->read = NULL;
543 while (rbi && rbi->bi_sector <
544 dev->sector + STRIPE_SECTORS) {
545 rbi2 = r5_next_bio(rbi, dev->sector);
546 spin_lock_irq(&conf->device_lock);
547 if (--rbi->bi_phys_segments == 0) {
548 rbi->bi_next = return_bi;
549 return_bi = rbi;
550 }
551 spin_unlock_irq(&conf->device_lock);
552 rbi = rbi2;
553 }
554 }
555 }
556 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
557 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
558
559 return_io(return_bi);
560
561 if (more_to_read)
562 set_bit(STRIPE_HANDLE, &sh->state);
563 release_stripe(sh);
564}
565
566static void ops_run_biofill(struct stripe_head *sh)
567{
568 struct dma_async_tx_descriptor *tx = NULL;
569 raid5_conf_t *conf = sh->raid_conf;
570 int i;
571
572 pr_debug("%s: stripe %llu\n", __FUNCTION__,
573 (unsigned long long)sh->sector);
574
575 for (i = sh->disks; i--; ) {
576 struct r5dev *dev = &sh->dev[i];
577 if (test_bit(R5_Wantfill, &dev->flags)) {
578 struct bio *rbi;
579 spin_lock_irq(&conf->device_lock);
580 dev->read = rbi = dev->toread;
581 dev->toread = NULL;
582 spin_unlock_irq(&conf->device_lock);
583 while (rbi && rbi->bi_sector <
584 dev->sector + STRIPE_SECTORS) {
585 tx = async_copy_data(0, rbi, dev->page,
586 dev->sector, tx);
587 rbi = r5_next_bio(rbi, dev->sector);
588 }
589 }
590 }
591
592 atomic_inc(&sh->count);
593 async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
594 ops_complete_biofill, sh);
595}
596
597static void ops_complete_compute5(void *stripe_head_ref)
598{
599 struct stripe_head *sh = stripe_head_ref;
600 int target = sh->ops.target;
601 struct r5dev *tgt = &sh->dev[target];
602
603 pr_debug("%s: stripe %llu\n", __FUNCTION__,
604 (unsigned long long)sh->sector);
605
606 set_bit(R5_UPTODATE, &tgt->flags);
607 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
608 clear_bit(R5_Wantcompute, &tgt->flags);
609 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
610 set_bit(STRIPE_HANDLE, &sh->state);
611 release_stripe(sh);
612}
613
614static struct dma_async_tx_descriptor *
615ops_run_compute5(struct stripe_head *sh, unsigned long pending)
616{
617 /* kernel stack size limits the total number of disks */
618 int disks = sh->disks;
619 struct page *xor_srcs[disks];
620 int target = sh->ops.target;
621 struct r5dev *tgt = &sh->dev[target];
622 struct page *xor_dest = tgt->page;
623 int count = 0;
624 struct dma_async_tx_descriptor *tx;
625 int i;
626
627 pr_debug("%s: stripe %llu block: %d\n",
628 __FUNCTION__, (unsigned long long)sh->sector, target);
629 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
630
631 for (i = disks; i--; )
632 if (i != target)
633 xor_srcs[count++] = sh->dev[i].page;
634
635 atomic_inc(&sh->count);
636
637 if (unlikely(count == 1))
638 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
639 0, NULL, ops_complete_compute5, sh);
640 else
641 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
642 ASYNC_TX_XOR_ZERO_DST, NULL,
643 ops_complete_compute5, sh);
644
645 /* ack now if postxor is not set to be run */
646 if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
647 async_tx_ack(tx);
648
649 return tx;
650}
651
652static void ops_complete_prexor(void *stripe_head_ref)
653{
654 struct stripe_head *sh = stripe_head_ref;
655
656 pr_debug("%s: stripe %llu\n", __FUNCTION__,
657 (unsigned long long)sh->sector);
658
659 set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
660}
661
662static struct dma_async_tx_descriptor *
663ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
664{
665 /* kernel stack size limits the total number of disks */
666 int disks = sh->disks;
667 struct page *xor_srcs[disks];
668 int count = 0, pd_idx = sh->pd_idx, i;
669
670 /* existing parity data subtracted */
671 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
672
673 pr_debug("%s: stripe %llu\n", __FUNCTION__,
674 (unsigned long long)sh->sector);
675
676 for (i = disks; i--; ) {
677 struct r5dev *dev = &sh->dev[i];
678 /* Only process blocks that are known to be uptodate */
679 if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags))
680 xor_srcs[count++] = dev->page;
681 }
682
683 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
684 ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
685 ops_complete_prexor, sh);
686
687 return tx;
688}
689
690static struct dma_async_tx_descriptor *
691ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
692{
693 int disks = sh->disks;
694 int pd_idx = sh->pd_idx, i;
695
696 /* check if prexor is active which means only process blocks
697 * that are part of a read-modify-write (Wantprexor)
698 */
699 int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
700
701 pr_debug("%s: stripe %llu\n", __FUNCTION__,
702 (unsigned long long)sh->sector);
703
704 for (i = disks; i--; ) {
705 struct r5dev *dev = &sh->dev[i];
706 struct bio *chosen;
707 int towrite;
708
709 towrite = 0;
710 if (prexor) { /* rmw */
711 if (dev->towrite &&
712 test_bit(R5_Wantprexor, &dev->flags))
713 towrite = 1;
714 } else { /* rcw */
715 if (i != pd_idx && dev->towrite &&
716 test_bit(R5_LOCKED, &dev->flags))
717 towrite = 1;
718 }
719
720 if (towrite) {
721 struct bio *wbi;
722
723 spin_lock(&sh->lock);
724 chosen = dev->towrite;
725 dev->towrite = NULL;
726 BUG_ON(dev->written);
727 wbi = dev->written = chosen;
728 spin_unlock(&sh->lock);
729
730 while (wbi && wbi->bi_sector <
731 dev->sector + STRIPE_SECTORS) {
732 tx = async_copy_data(1, wbi, dev->page,
733 dev->sector, tx);
734 wbi = r5_next_bio(wbi, dev->sector);
735 }
736 }
737 }
738
739 return tx;
740}
741
742static void ops_complete_postxor(void *stripe_head_ref)
743{
744 struct stripe_head *sh = stripe_head_ref;
745
746 pr_debug("%s: stripe %llu\n", __FUNCTION__,
747 (unsigned long long)sh->sector);
748
749 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
750 set_bit(STRIPE_HANDLE, &sh->state);
751 release_stripe(sh);
752}
753
754static void ops_complete_write(void *stripe_head_ref)
755{
756 struct stripe_head *sh = stripe_head_ref;
757 int disks = sh->disks, i, pd_idx = sh->pd_idx;
758
759 pr_debug("%s: stripe %llu\n", __FUNCTION__,
760 (unsigned long long)sh->sector);
761
762 for (i = disks; i--; ) {
763 struct r5dev *dev = &sh->dev[i];
764 if (dev->written || i == pd_idx)
765 set_bit(R5_UPTODATE, &dev->flags);
766 }
767
768 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
769 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
770
771 set_bit(STRIPE_HANDLE, &sh->state);
772 release_stripe(sh);
773}
774
775static void
776ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
777{
778 /* kernel stack size limits the total number of disks */
779 int disks = sh->disks;
780 struct page *xor_srcs[disks];
781
782 int count = 0, pd_idx = sh->pd_idx, i;
783 struct page *xor_dest;
784 int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
785 unsigned long flags;
786 dma_async_tx_callback callback;
787
788 pr_debug("%s: stripe %llu\n", __FUNCTION__,
789 (unsigned long long)sh->sector);
790
791 /* check if prexor is active which means only process blocks
792 * that are part of a read-modify-write (written)
793 */
794 if (prexor) {
795 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
796 for (i = disks; i--; ) {
797 struct r5dev *dev = &sh->dev[i];
798 if (dev->written)
799 xor_srcs[count++] = dev->page;
800 }
801 } else {
802 xor_dest = sh->dev[pd_idx].page;
803 for (i = disks; i--; ) {
804 struct r5dev *dev = &sh->dev[i];
805 if (i != pd_idx)
806 xor_srcs[count++] = dev->page;
807 }
808 }
809
810 /* check whether this postxor is part of a write */
811 callback = test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) ?
812 ops_complete_write : ops_complete_postxor;
813
814 /* 1/ if we prexor'd then the dest is reused as a source
815 * 2/ if we did not prexor then we are redoing the parity
816 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
817 * for the synchronous xor case
818 */
819 flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
820 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
821
822 atomic_inc(&sh->count);
823
824 if (unlikely(count == 1)) {
825 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
826 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
827 flags, tx, callback, sh);
828 } else
829 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
830 flags, tx, callback, sh);
831}
832
833static void ops_complete_check(void *stripe_head_ref)
834{
835 struct stripe_head *sh = stripe_head_ref;
836 int pd_idx = sh->pd_idx;
837
838 pr_debug("%s: stripe %llu\n", __FUNCTION__,
839 (unsigned long long)sh->sector);
840
841 if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
842 sh->ops.zero_sum_result == 0)
843 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
844
845 set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
846 set_bit(STRIPE_HANDLE, &sh->state);
847 release_stripe(sh);
848}
849
850static void ops_run_check(struct stripe_head *sh)
851{
852 /* kernel stack size limits the total number of disks */
853 int disks = sh->disks;
854 struct page *xor_srcs[disks];
855 struct dma_async_tx_descriptor *tx;
856
857 int count = 0, pd_idx = sh->pd_idx, i;
858 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
859
860 pr_debug("%s: stripe %llu\n", __FUNCTION__,
861 (unsigned long long)sh->sector);
862
863 for (i = disks; i--; ) {
864 struct r5dev *dev = &sh->dev[i];
865 if (i != pd_idx)
866 xor_srcs[count++] = dev->page;
867 }
868
869 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
870 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
871
872 if (tx)
873 set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
874 else
875 clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
876
877 atomic_inc(&sh->count);
878 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
879 ops_complete_check, sh);
880}
881
882static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
883{
884 int overlap_clear = 0, i, disks = sh->disks;
885 struct dma_async_tx_descriptor *tx = NULL;
886
887 if (test_bit(STRIPE_OP_BIOFILL, &pending)) {
888 ops_run_biofill(sh);
889 overlap_clear++;
890 }
891
892 if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
893 tx = ops_run_compute5(sh, pending);
894
895 if (test_bit(STRIPE_OP_PREXOR, &pending))
896 tx = ops_run_prexor(sh, tx);
897
898 if (test_bit(STRIPE_OP_BIODRAIN, &pending)) {
899 tx = ops_run_biodrain(sh, tx);
900 overlap_clear++;
901 }
902
903 if (test_bit(STRIPE_OP_POSTXOR, &pending))
904 ops_run_postxor(sh, tx);
905
906 if (test_bit(STRIPE_OP_CHECK, &pending))
907 ops_run_check(sh);
908
909 if (test_bit(STRIPE_OP_IO, &pending))
910 ops_run_io(sh);
911
912 if (overlap_clear)
913 for (i = disks; i--; ) {
914 struct r5dev *dev = &sh->dev[i];
915 if (test_and_clear_bit(R5_Overlap, &dev->flags))
916 wake_up(&sh->raid_conf->wait_for_overlap);
917 }
918}
919
327static int grow_one_stripe(raid5_conf_t *conf) 920static int grow_one_stripe(raid5_conf_t *conf)
328{ 921{
329 struct stripe_head *sh; 922 struct stripe_head *sh;
@@ -537,8 +1130,8 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
537 if (bi == &sh->dev[i].req) 1130 if (bi == &sh->dev[i].req)
538 break; 1131 break;
539 1132
540 PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1133 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
541 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1134 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
542 uptodate); 1135 uptodate);
543 if (i == disks) { 1136 if (i == disks) {
544 BUG(); 1137 BUG();
@@ -613,7 +1206,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
613 if (bi == &sh->dev[i].req) 1206 if (bi == &sh->dev[i].req)
614 break; 1207 break;
615 1208
616 PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1209 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
617 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1210 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
618 uptodate); 1211 uptodate);
619 if (i == disks) { 1212 if (i == disks) {
@@ -658,7 +1251,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
658{ 1251{
659 char b[BDEVNAME_SIZE]; 1252 char b[BDEVNAME_SIZE];
660 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 1253 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
661 PRINTK("raid5: error called\n"); 1254 pr_debug("raid5: error called\n");
662 1255
663 if (!test_bit(Faulty, &rdev->flags)) { 1256 if (!test_bit(Faulty, &rdev->flags)) {
664 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1257 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -916,137 +1509,13 @@ static void copy_data(int frombio, struct bio *bio,
916 } 1509 }
917} 1510}
918 1511
919#define check_xor() do { \ 1512#define check_xor() do { \
920 if (count == MAX_XOR_BLOCKS) { \ 1513 if (count == MAX_XOR_BLOCKS) { \
921 xor_block(count, STRIPE_SIZE, ptr); \ 1514 xor_blocks(count, STRIPE_SIZE, dest, ptr);\
922 count = 1; \ 1515 count = 0; \
923 } \ 1516 } \
924 } while(0) 1517 } while(0)
925 1518
926
927static void compute_block(struct stripe_head *sh, int dd_idx)
928{
929 int i, count, disks = sh->disks;
930 void *ptr[MAX_XOR_BLOCKS], *p;
931
932 PRINTK("compute_block, stripe %llu, idx %d\n",
933 (unsigned long long)sh->sector, dd_idx);
934
935 ptr[0] = page_address(sh->dev[dd_idx].page);
936 memset(ptr[0], 0, STRIPE_SIZE);
937 count = 1;
938 for (i = disks ; i--; ) {
939 if (i == dd_idx)
940 continue;
941 p = page_address(sh->dev[i].page);
942 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
943 ptr[count++] = p;
944 else
945 printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
946 " not present\n", dd_idx,
947 (unsigned long long)sh->sector, i);
948
949 check_xor();
950 }
951 if (count != 1)
952 xor_block(count, STRIPE_SIZE, ptr);
953 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
954}
955
956static void compute_parity5(struct stripe_head *sh, int method)
957{
958 raid5_conf_t *conf = sh->raid_conf;
959 int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
960 void *ptr[MAX_XOR_BLOCKS];
961 struct bio *chosen;
962
963 PRINTK("compute_parity5, stripe %llu, method %d\n",
964 (unsigned long long)sh->sector, method);
965
966 count = 1;
967 ptr[0] = page_address(sh->dev[pd_idx].page);
968 switch(method) {
969 case READ_MODIFY_WRITE:
970 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
971 for (i=disks ; i-- ;) {
972 if (i==pd_idx)
973 continue;
974 if (sh->dev[i].towrite &&
975 test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
976 ptr[count++] = page_address(sh->dev[i].page);
977 chosen = sh->dev[i].towrite;
978 sh->dev[i].towrite = NULL;
979
980 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
981 wake_up(&conf->wait_for_overlap);
982
983 BUG_ON(sh->dev[i].written);
984 sh->dev[i].written = chosen;
985 check_xor();
986 }
987 }
988 break;
989 case RECONSTRUCT_WRITE:
990 memset(ptr[0], 0, STRIPE_SIZE);
991 for (i= disks; i-- ;)
992 if (i!=pd_idx && sh->dev[i].towrite) {
993 chosen = sh->dev[i].towrite;
994 sh->dev[i].towrite = NULL;
995
996 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
997 wake_up(&conf->wait_for_overlap);
998
999 BUG_ON(sh->dev[i].written);
1000 sh->dev[i].written = chosen;
1001 }
1002 break;
1003 case CHECK_PARITY:
1004 break;
1005 }
1006 if (count>1) {
1007 xor_block(count, STRIPE_SIZE, ptr);
1008 count = 1;
1009 }
1010
1011 for (i = disks; i--;)
1012 if (sh->dev[i].written) {
1013 sector_t sector = sh->dev[i].sector;
1014 struct bio *wbi = sh->dev[i].written;
1015 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1016 copy_data(1, wbi, sh->dev[i].page, sector);
1017 wbi = r5_next_bio(wbi, sector);
1018 }
1019
1020 set_bit(R5_LOCKED, &sh->dev[i].flags);
1021 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1022 }
1023
1024 switch(method) {
1025 case RECONSTRUCT_WRITE:
1026 case CHECK_PARITY:
1027 for (i=disks; i--;)
1028 if (i != pd_idx) {
1029 ptr[count++] = page_address(sh->dev[i].page);
1030 check_xor();
1031 }
1032 break;
1033 case READ_MODIFY_WRITE:
1034 for (i = disks; i--;)
1035 if (sh->dev[i].written) {
1036 ptr[count++] = page_address(sh->dev[i].page);
1037 check_xor();
1038 }
1039 }
1040 if (count != 1)
1041 xor_block(count, STRIPE_SIZE, ptr);
1042
1043 if (method != CHECK_PARITY) {
1044 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1045 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1046 } else
1047 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1048}
1049
1050static void compute_parity6(struct stripe_head *sh, int method) 1519static void compute_parity6(struct stripe_head *sh, int method)
1051{ 1520{
1052 raid6_conf_t *conf = sh->raid_conf; 1521 raid6_conf_t *conf = sh->raid_conf;
@@ -1058,7 +1527,7 @@ static void compute_parity6(struct stripe_head *sh, int method)
1058 qd_idx = raid6_next_disk(pd_idx, disks); 1527 qd_idx = raid6_next_disk(pd_idx, disks);
1059 d0_idx = raid6_next_disk(qd_idx, disks); 1528 d0_idx = raid6_next_disk(qd_idx, disks);
1060 1529
1061 PRINTK("compute_parity, stripe %llu, method %d\n", 1530 pr_debug("compute_parity, stripe %llu, method %d\n",
1062 (unsigned long long)sh->sector, method); 1531 (unsigned long long)sh->sector, method);
1063 1532
1064 switch(method) { 1533 switch(method) {
@@ -1132,20 +1601,20 @@ static void compute_parity6(struct stripe_head *sh, int method)
1132static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) 1601static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1133{ 1602{
1134 int i, count, disks = sh->disks; 1603 int i, count, disks = sh->disks;
1135 void *ptr[MAX_XOR_BLOCKS], *p; 1604 void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1136 int pd_idx = sh->pd_idx; 1605 int pd_idx = sh->pd_idx;
1137 int qd_idx = raid6_next_disk(pd_idx, disks); 1606 int qd_idx = raid6_next_disk(pd_idx, disks);
1138 1607
1139 PRINTK("compute_block_1, stripe %llu, idx %d\n", 1608 pr_debug("compute_block_1, stripe %llu, idx %d\n",
1140 (unsigned long long)sh->sector, dd_idx); 1609 (unsigned long long)sh->sector, dd_idx);
1141 1610
1142 if ( dd_idx == qd_idx ) { 1611 if ( dd_idx == qd_idx ) {
1143 /* We're actually computing the Q drive */ 1612 /* We're actually computing the Q drive */
1144 compute_parity6(sh, UPDATE_PARITY); 1613 compute_parity6(sh, UPDATE_PARITY);
1145 } else { 1614 } else {
1146 ptr[0] = page_address(sh->dev[dd_idx].page); 1615 dest = page_address(sh->dev[dd_idx].page);
1147 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE); 1616 if (!nozero) memset(dest, 0, STRIPE_SIZE);
1148 count = 1; 1617 count = 0;
1149 for (i = disks ; i--; ) { 1618 for (i = disks ; i--; ) {
1150 if (i == dd_idx || i == qd_idx) 1619 if (i == dd_idx || i == qd_idx)
1151 continue; 1620 continue;
@@ -1159,8 +1628,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1159 1628
1160 check_xor(); 1629 check_xor();
1161 } 1630 }
1162 if (count != 1) 1631 if (count)
1163 xor_block(count, STRIPE_SIZE, ptr); 1632 xor_blocks(count, STRIPE_SIZE, dest, ptr);
1164 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 1633 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1165 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 1634 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1166 } 1635 }
@@ -1183,7 +1652,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1183 BUG_ON(faila == failb); 1652 BUG_ON(faila == failb);
1184 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } 1653 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1185 1654
1186 PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", 1655 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1187 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); 1656 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
1188 1657
1189 if ( failb == disks-1 ) { 1658 if ( failb == disks-1 ) {
@@ -1229,7 +1698,79 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1229 } 1698 }
1230} 1699}
1231 1700
1701static int
1702handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1703{
1704 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1705 int locked = 0;
1232 1706
1707 if (rcw) {
1708 /* if we are not expanding this is a proper write request, and
1709 * there will be bios with new data to be drained into the
1710 * stripe cache
1711 */
1712 if (!expand) {
1713 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
1714 sh->ops.count++;
1715 }
1716
1717 set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
1718 sh->ops.count++;
1719
1720 for (i = disks; i--; ) {
1721 struct r5dev *dev = &sh->dev[i];
1722
1723 if (dev->towrite) {
1724 set_bit(R5_LOCKED, &dev->flags);
1725 if (!expand)
1726 clear_bit(R5_UPTODATE, &dev->flags);
1727 locked++;
1728 }
1729 }
1730 } else {
1731 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1732 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1733
1734 set_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
1735 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
1736 set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
1737
1738 sh->ops.count += 3;
1739
1740 for (i = disks; i--; ) {
1741 struct r5dev *dev = &sh->dev[i];
1742 if (i == pd_idx)
1743 continue;
1744
1745 /* For a read-modify write there may be blocks that are
1746 * locked for reading while others are ready to be
1747 * written so we distinguish these blocks by the
1748 * R5_Wantprexor bit
1749 */
1750 if (dev->towrite &&
1751 (test_bit(R5_UPTODATE, &dev->flags) ||
1752 test_bit(R5_Wantcompute, &dev->flags))) {
1753 set_bit(R5_Wantprexor, &dev->flags);
1754 set_bit(R5_LOCKED, &dev->flags);
1755 clear_bit(R5_UPTODATE, &dev->flags);
1756 locked++;
1757 }
1758 }
1759 }
1760
1761 /* keep the parity disk locked while asynchronous operations
1762 * are in flight
1763 */
1764 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1765 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1766 locked++;
1767
1768 pr_debug("%s: stripe %llu locked: %d pending: %lx\n",
1769 __FUNCTION__, (unsigned long long)sh->sector,
1770 locked, sh->ops.pending);
1771
1772 return locked;
1773}
1233 1774
1234/* 1775/*
1235 * Each stripe/dev can have one or more bion attached. 1776 * Each stripe/dev can have one or more bion attached.
@@ -1242,7 +1783,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1242 raid5_conf_t *conf = sh->raid_conf; 1783 raid5_conf_t *conf = sh->raid_conf;
1243 int firstwrite=0; 1784 int firstwrite=0;
1244 1785
1245 PRINTK("adding bh b#%llu to stripe s#%llu\n", 1786 pr_debug("adding bh b#%llu to stripe s#%llu\n",
1246 (unsigned long long)bi->bi_sector, 1787 (unsigned long long)bi->bi_sector,
1247 (unsigned long long)sh->sector); 1788 (unsigned long long)sh->sector);
1248 1789
@@ -1271,7 +1812,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1271 spin_unlock_irq(&conf->device_lock); 1812 spin_unlock_irq(&conf->device_lock);
1272 spin_unlock(&sh->lock); 1813 spin_unlock(&sh->lock);
1273 1814
1274 PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n", 1815 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
1275 (unsigned long long)bi->bi_sector, 1816 (unsigned long long)bi->bi_sector,
1276 (unsigned long long)sh->sector, dd_idx); 1817 (unsigned long long)sh->sector, dd_idx);
1277 1818
@@ -1326,6 +1867,729 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1326 return pd_idx; 1867 return pd_idx;
1327} 1868}
1328 1869
1870static void
1871handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
1872 struct stripe_head_state *s, int disks,
1873 struct bio **return_bi)
1874{
1875 int i;
1876 for (i = disks; i--; ) {
1877 struct bio *bi;
1878 int bitmap_end = 0;
1879
1880 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1881 mdk_rdev_t *rdev;
1882 rcu_read_lock();
1883 rdev = rcu_dereference(conf->disks[i].rdev);
1884 if (rdev && test_bit(In_sync, &rdev->flags))
1885 /* multiple read failures in one stripe */
1886 md_error(conf->mddev, rdev);
1887 rcu_read_unlock();
1888 }
1889 spin_lock_irq(&conf->device_lock);
1890 /* fail all writes first */
1891 bi = sh->dev[i].towrite;
1892 sh->dev[i].towrite = NULL;
1893 if (bi) {
1894 s->to_write--;
1895 bitmap_end = 1;
1896 }
1897
1898 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1899 wake_up(&conf->wait_for_overlap);
1900
1901 while (bi && bi->bi_sector <
1902 sh->dev[i].sector + STRIPE_SECTORS) {
1903 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1904 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1905 if (--bi->bi_phys_segments == 0) {
1906 md_write_end(conf->mddev);
1907 bi->bi_next = *return_bi;
1908 *return_bi = bi;
1909 }
1910 bi = nextbi;
1911 }
1912 /* and fail all 'written' */
1913 bi = sh->dev[i].written;
1914 sh->dev[i].written = NULL;
1915 if (bi) bitmap_end = 1;
1916 while (bi && bi->bi_sector <
1917 sh->dev[i].sector + STRIPE_SECTORS) {
1918 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1919 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1920 if (--bi->bi_phys_segments == 0) {
1921 md_write_end(conf->mddev);
1922 bi->bi_next = *return_bi;
1923 *return_bi = bi;
1924 }
1925 bi = bi2;
1926 }
1927
1928 /* fail any reads if this device is non-operational and
1929 * the data has not reached the cache yet.
1930 */
1931 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
1932 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1933 test_bit(R5_ReadError, &sh->dev[i].flags))) {
1934 bi = sh->dev[i].toread;
1935 sh->dev[i].toread = NULL;
1936 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1937 wake_up(&conf->wait_for_overlap);
1938 if (bi) s->to_read--;
1939 while (bi && bi->bi_sector <
1940 sh->dev[i].sector + STRIPE_SECTORS) {
1941 struct bio *nextbi =
1942 r5_next_bio(bi, sh->dev[i].sector);
1943 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1944 if (--bi->bi_phys_segments == 0) {
1945 bi->bi_next = *return_bi;
1946 *return_bi = bi;
1947 }
1948 bi = nextbi;
1949 }
1950 }
1951 spin_unlock_irq(&conf->device_lock);
1952 if (bitmap_end)
1953 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1954 STRIPE_SECTORS, 0, 0);
1955 }
1956
1957}
1958
1959/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
1960 * to process
1961 */
1962static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
1963 struct stripe_head_state *s, int disk_idx, int disks)
1964{
1965 struct r5dev *dev = &sh->dev[disk_idx];
1966 struct r5dev *failed_dev = &sh->dev[s->failed_num];
1967
1968 /* don't schedule compute operations or reads on the parity block while
1969 * a check is in flight
1970 */
1971 if ((disk_idx == sh->pd_idx) &&
1972 test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
1973 return ~0;
1974
1975 /* is the data in this block needed, and can we get it? */
1976 if (!test_bit(R5_LOCKED, &dev->flags) &&
1977 !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread ||
1978 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1979 s->syncing || s->expanding || (s->failed &&
1980 (failed_dev->toread || (failed_dev->towrite &&
1981 !test_bit(R5_OVERWRITE, &failed_dev->flags)
1982 ))))) {
1983 /* 1/ We would like to get this block, possibly by computing it,
1984 * but we might not be able to.
1985 *
1986 * 2/ Since parity check operations potentially make the parity
1987 * block !uptodate it will need to be refreshed before any
1988 * compute operations on data disks are scheduled.
1989 *
1990 * 3/ We hold off parity block re-reads until check operations
1991 * have quiesced.
1992 */
1993 if ((s->uptodate == disks - 1) &&
1994 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
1995 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
1996 set_bit(R5_Wantcompute, &dev->flags);
1997 sh->ops.target = disk_idx;
1998 s->req_compute = 1;
1999 sh->ops.count++;
2000 /* Careful: from this point on 'uptodate' is in the eye
2001 * of raid5_run_ops which services 'compute' operations
2002 * before writes. R5_Wantcompute flags a block that will
2003 * be R5_UPTODATE by the time it is needed for a
2004 * subsequent operation.
2005 */
2006 s->uptodate++;
2007 return 0; /* uptodate + compute == disks */
2008 } else if ((s->uptodate < disks - 1) &&
2009 test_bit(R5_Insync, &dev->flags)) {
2010 /* Note: we hold off compute operations while checks are
2011 * in flight, but we still prefer 'compute' over 'read'
2012 * hence we only read if (uptodate < * disks-1)
2013 */
2014 set_bit(R5_LOCKED, &dev->flags);
2015 set_bit(R5_Wantread, &dev->flags);
2016 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2017 sh->ops.count++;
2018 s->locked++;
2019 pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2020 s->syncing);
2021 }
2022 }
2023
2024 return ~0;
2025}
2026
2027static void handle_issuing_new_read_requests5(struct stripe_head *sh,
2028 struct stripe_head_state *s, int disks)
2029{
2030 int i;
2031
2032 /* Clear completed compute operations. Parity recovery
2033 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
2034 * later on in this routine
2035 */
2036 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2037 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2038 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2039 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2040 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2041 }
2042
2043 /* look for blocks to read/compute, skip this if a compute
2044 * is already in flight, or if the stripe contents are in the
2045 * midst of changing due to a write
2046 */
2047 if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
2048 !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
2049 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2050 for (i = disks; i--; )
2051 if (__handle_issuing_new_read_requests5(
2052 sh, s, i, disks) == 0)
2053 break;
2054 }
2055 set_bit(STRIPE_HANDLE, &sh->state);
2056}
2057
2058static void handle_issuing_new_read_requests6(struct stripe_head *sh,
2059 struct stripe_head_state *s, struct r6_state *r6s,
2060 int disks)
2061{
2062 int i;
2063 for (i = disks; i--; ) {
2064 struct r5dev *dev = &sh->dev[i];
2065 if (!test_bit(R5_LOCKED, &dev->flags) &&
2066 !test_bit(R5_UPTODATE, &dev->flags) &&
2067 (dev->toread || (dev->towrite &&
2068 !test_bit(R5_OVERWRITE, &dev->flags)) ||
2069 s->syncing || s->expanding ||
2070 (s->failed >= 1 &&
2071 (sh->dev[r6s->failed_num[0]].toread ||
2072 s->to_write)) ||
2073 (s->failed >= 2 &&
2074 (sh->dev[r6s->failed_num[1]].toread ||
2075 s->to_write)))) {
2076 /* we would like to get this block, possibly
2077 * by computing it, but we might not be able to
2078 */
2079 if (s->uptodate == disks-1) {
2080 pr_debug("Computing stripe %llu block %d\n",
2081 (unsigned long long)sh->sector, i);
2082 compute_block_1(sh, i, 0);
2083 s->uptodate++;
2084 } else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
2085 /* Computing 2-failure is *very* expensive; only
2086 * do it if failed >= 2
2087 */
2088 int other;
2089 for (other = disks; other--; ) {
2090 if (other == i)
2091 continue;
2092 if (!test_bit(R5_UPTODATE,
2093 &sh->dev[other].flags))
2094 break;
2095 }
2096 BUG_ON(other < 0);
2097 pr_debug("Computing stripe %llu blocks %d,%d\n",
2098 (unsigned long long)sh->sector,
2099 i, other);
2100 compute_block_2(sh, i, other);
2101 s->uptodate += 2;
2102 } else if (test_bit(R5_Insync, &dev->flags)) {
2103 set_bit(R5_LOCKED, &dev->flags);
2104 set_bit(R5_Wantread, &dev->flags);
2105 s->locked++;
2106 pr_debug("Reading block %d (sync=%d)\n",
2107 i, s->syncing);
2108 }
2109 }
2110 }
2111 set_bit(STRIPE_HANDLE, &sh->state);
2112}
2113
2114
2115/* handle_completed_write_requests
2116 * any written block on an uptodate or failed drive can be returned.
2117 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2118 * never LOCKED, so we don't need to test 'failed' directly.
2119 */
2120static void handle_completed_write_requests(raid5_conf_t *conf,
2121 struct stripe_head *sh, int disks, struct bio **return_bi)
2122{
2123 int i;
2124 struct r5dev *dev;
2125
2126 for (i = disks; i--; )
2127 if (sh->dev[i].written) {
2128 dev = &sh->dev[i];
2129 if (!test_bit(R5_LOCKED, &dev->flags) &&
2130 test_bit(R5_UPTODATE, &dev->flags)) {
2131 /* We can return any write requests */
2132 struct bio *wbi, *wbi2;
2133 int bitmap_end = 0;
2134 pr_debug("Return write for disc %d\n", i);
2135 spin_lock_irq(&conf->device_lock);
2136 wbi = dev->written;
2137 dev->written = NULL;
2138 while (wbi && wbi->bi_sector <
2139 dev->sector + STRIPE_SECTORS) {
2140 wbi2 = r5_next_bio(wbi, dev->sector);
2141 if (--wbi->bi_phys_segments == 0) {
2142 md_write_end(conf->mddev);
2143 wbi->bi_next = *return_bi;
2144 *return_bi = wbi;
2145 }
2146 wbi = wbi2;
2147 }
2148 if (dev->towrite == NULL)
2149 bitmap_end = 1;
2150 spin_unlock_irq(&conf->device_lock);
2151 if (bitmap_end)
2152 bitmap_endwrite(conf->mddev->bitmap,
2153 sh->sector,
2154 STRIPE_SECTORS,
2155 !test_bit(STRIPE_DEGRADED, &sh->state),
2156 0);
2157 }
2158 }
2159}
2160
2161static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2162 struct stripe_head *sh, struct stripe_head_state *s, int disks)
2163{
2164 int rmw = 0, rcw = 0, i;
2165 for (i = disks; i--; ) {
2166 /* would I have to read this buffer for read_modify_write */
2167 struct r5dev *dev = &sh->dev[i];
2168 if ((dev->towrite || i == sh->pd_idx) &&
2169 !test_bit(R5_LOCKED, &dev->flags) &&
2170 !(test_bit(R5_UPTODATE, &dev->flags) ||
2171 test_bit(R5_Wantcompute, &dev->flags))) {
2172 if (test_bit(R5_Insync, &dev->flags))
2173 rmw++;
2174 else
2175 rmw += 2*disks; /* cannot read it */
2176 }
2177 /* Would I have to read this buffer for reconstruct_write */
2178 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2179 !test_bit(R5_LOCKED, &dev->flags) &&
2180 !(test_bit(R5_UPTODATE, &dev->flags) ||
2181 test_bit(R5_Wantcompute, &dev->flags))) {
2182 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2183 else
2184 rcw += 2*disks;
2185 }
2186 }
2187 pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2188 (unsigned long long)sh->sector, rmw, rcw);
2189 set_bit(STRIPE_HANDLE, &sh->state);
2190 if (rmw < rcw && rmw > 0)
2191 /* prefer read-modify-write, but need to get some data */
2192 for (i = disks; i--; ) {
2193 struct r5dev *dev = &sh->dev[i];
2194 if ((dev->towrite || i == sh->pd_idx) &&
2195 !test_bit(R5_LOCKED, &dev->flags) &&
2196 !(test_bit(R5_UPTODATE, &dev->flags) ||
2197 test_bit(R5_Wantcompute, &dev->flags)) &&
2198 test_bit(R5_Insync, &dev->flags)) {
2199 if (
2200 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2201 pr_debug("Read_old block "
2202 "%d for r-m-w\n", i);
2203 set_bit(R5_LOCKED, &dev->flags);
2204 set_bit(R5_Wantread, &dev->flags);
2205 if (!test_and_set_bit(
2206 STRIPE_OP_IO, &sh->ops.pending))
2207 sh->ops.count++;
2208 s->locked++;
2209 } else {
2210 set_bit(STRIPE_DELAYED, &sh->state);
2211 set_bit(STRIPE_HANDLE, &sh->state);
2212 }
2213 }
2214 }
2215 if (rcw <= rmw && rcw > 0)
2216 /* want reconstruct write, but need to get some data */
2217 for (i = disks; i--; ) {
2218 struct r5dev *dev = &sh->dev[i];
2219 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2220 i != sh->pd_idx &&
2221 !test_bit(R5_LOCKED, &dev->flags) &&
2222 !(test_bit(R5_UPTODATE, &dev->flags) ||
2223 test_bit(R5_Wantcompute, &dev->flags)) &&
2224 test_bit(R5_Insync, &dev->flags)) {
2225 if (
2226 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2227 pr_debug("Read_old block "
2228 "%d for Reconstruct\n", i);
2229 set_bit(R5_LOCKED, &dev->flags);
2230 set_bit(R5_Wantread, &dev->flags);
2231 if (!test_and_set_bit(
2232 STRIPE_OP_IO, &sh->ops.pending))
2233 sh->ops.count++;
2234 s->locked++;
2235 } else {
2236 set_bit(STRIPE_DELAYED, &sh->state);
2237 set_bit(STRIPE_HANDLE, &sh->state);
2238 }
2239 }
2240 }
2241 /* now if nothing is locked, and if we have enough data,
2242 * we can start a write request
2243 */
2244 /* since handle_stripe can be called at any time we need to handle the
2245 * case where a compute block operation has been submitted and then a
2246 * subsequent call wants to start a write request. raid5_run_ops only
2247 * handles the case where compute block and postxor are requested
2248 * simultaneously. If this is not the case then new writes need to be
2249 * held off until the compute completes.
2250 */
2251 if ((s->req_compute ||
2252 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
2253 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2254 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2255 s->locked += handle_write_operations5(sh, rcw == 0, 0);
2256}
2257
2258static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
2259 struct stripe_head *sh, struct stripe_head_state *s,
2260 struct r6_state *r6s, int disks)
2261{
2262 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
2263 int qd_idx = r6s->qd_idx;
2264 for (i = disks; i--; ) {
2265 struct r5dev *dev = &sh->dev[i];
2266 /* Would I have to read this buffer for reconstruct_write */
2267 if (!test_bit(R5_OVERWRITE, &dev->flags)
2268 && i != pd_idx && i != qd_idx
2269 && (!test_bit(R5_LOCKED, &dev->flags)
2270 ) &&
2271 !test_bit(R5_UPTODATE, &dev->flags)) {
2272 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2273 else {
2274 pr_debug("raid6: must_compute: "
2275 "disk %d flags=%#lx\n", i, dev->flags);
2276 must_compute++;
2277 }
2278 }
2279 }
2280 pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
2281 (unsigned long long)sh->sector, rcw, must_compute);
2282 set_bit(STRIPE_HANDLE, &sh->state);
2283
2284 if (rcw > 0)
2285 /* want reconstruct write, but need to get some data */
2286 for (i = disks; i--; ) {
2287 struct r5dev *dev = &sh->dev[i];
2288 if (!test_bit(R5_OVERWRITE, &dev->flags)
2289 && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
2290 && !test_bit(R5_LOCKED, &dev->flags) &&
2291 !test_bit(R5_UPTODATE, &dev->flags) &&
2292 test_bit(R5_Insync, &dev->flags)) {
2293 if (
2294 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2295 pr_debug("Read_old stripe %llu "
2296 "block %d for Reconstruct\n",
2297 (unsigned long long)sh->sector, i);
2298 set_bit(R5_LOCKED, &dev->flags);
2299 set_bit(R5_Wantread, &dev->flags);
2300 s->locked++;
2301 } else {
2302 pr_debug("Request delayed stripe %llu "
2303 "block %d for Reconstruct\n",
2304 (unsigned long long)sh->sector, i);
2305 set_bit(STRIPE_DELAYED, &sh->state);
2306 set_bit(STRIPE_HANDLE, &sh->state);
2307 }
2308 }
2309 }
2310 /* now if nothing is locked, and if we have enough data, we can start a
2311 * write request
2312 */
2313 if (s->locked == 0 && rcw == 0 &&
2314 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2315 if (must_compute > 0) {
2316 /* We have failed blocks and need to compute them */
2317 switch (s->failed) {
2318 case 0:
2319 BUG();
2320 case 1:
2321 compute_block_1(sh, r6s->failed_num[0], 0);
2322 break;
2323 case 2:
2324 compute_block_2(sh, r6s->failed_num[0],
2325 r6s->failed_num[1]);
2326 break;
2327 default: /* This request should have been failed? */
2328 BUG();
2329 }
2330 }
2331
2332 pr_debug("Computing parity for stripe %llu\n",
2333 (unsigned long long)sh->sector);
2334 compute_parity6(sh, RECONSTRUCT_WRITE);
2335 /* now every locked buffer is ready to be written */
2336 for (i = disks; i--; )
2337 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2338 pr_debug("Writing stripe %llu block %d\n",
2339 (unsigned long long)sh->sector, i);
2340 s->locked++;
2341 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2342 }
2343 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2344 set_bit(STRIPE_INSYNC, &sh->state);
2345
2346 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2347 atomic_dec(&conf->preread_active_stripes);
2348 if (atomic_read(&conf->preread_active_stripes) <
2349 IO_THRESHOLD)
2350 md_wakeup_thread(conf->mddev->thread);
2351 }
2352 }
2353}
2354
2355static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2356 struct stripe_head_state *s, int disks)
2357{
2358 set_bit(STRIPE_HANDLE, &sh->state);
2359 /* Take one of the following actions:
2360 * 1/ start a check parity operation if (uptodate == disks)
2361 * 2/ finish a check parity operation and act on the result
2362 * 3/ skip to the writeback section if we previously
2363 * initiated a recovery operation
2364 */
2365 if (s->failed == 0 &&
2366 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2367 if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
2368 BUG_ON(s->uptodate != disks);
2369 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2370 sh->ops.count++;
2371 s->uptodate--;
2372 } else if (
2373 test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
2374 clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
2375 clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
2376
2377 if (sh->ops.zero_sum_result == 0)
2378 /* parity is correct (on disc,
2379 * not in buffer any more)
2380 */
2381 set_bit(STRIPE_INSYNC, &sh->state);
2382 else {
2383 conf->mddev->resync_mismatches +=
2384 STRIPE_SECTORS;
2385 if (test_bit(
2386 MD_RECOVERY_CHECK, &conf->mddev->recovery))
2387 /* don't try to repair!! */
2388 set_bit(STRIPE_INSYNC, &sh->state);
2389 else {
2390 set_bit(STRIPE_OP_COMPUTE_BLK,
2391 &sh->ops.pending);
2392 set_bit(STRIPE_OP_MOD_REPAIR_PD,
2393 &sh->ops.pending);
2394 set_bit(R5_Wantcompute,
2395 &sh->dev[sh->pd_idx].flags);
2396 sh->ops.target = sh->pd_idx;
2397 sh->ops.count++;
2398 s->uptodate++;
2399 }
2400 }
2401 }
2402 }
2403
2404 /* check if we can clear a parity disk reconstruct */
2405 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2406 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2407
2408 clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
2409 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2410 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2411 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2412 }
2413
2414 /* Wait for check parity and compute block operations to complete
2415 * before write-back
2416 */
2417 if (!test_bit(STRIPE_INSYNC, &sh->state) &&
2418 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
2419 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
2420 struct r5dev *dev;
2421 /* either failed parity check, or recovery is happening */
2422 if (s->failed == 0)
2423 s->failed_num = sh->pd_idx;
2424 dev = &sh->dev[s->failed_num];
2425 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2426 BUG_ON(s->uptodate != disks);
2427
2428 set_bit(R5_LOCKED, &dev->flags);
2429 set_bit(R5_Wantwrite, &dev->flags);
2430 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2431 sh->ops.count++;
2432
2433 clear_bit(STRIPE_DEGRADED, &sh->state);
2434 s->locked++;
2435 set_bit(STRIPE_INSYNC, &sh->state);
2436 }
2437}
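
handle_parity_checks5() above is driven entirely by the new per-stripe operation bits: a request raises a bit in sh->ops.pending and bumps sh->ops.count, the worker acknowledges it through ops.ack, completion is reported in ops.complete, and the handler clears all three before acting on the result (sh->ops.zero_sum_result here). A small userspace model of that three-stage lifecycle follows; the names toy_ops, request_op(), run_op() and reap_op() are illustrative, not the driver's API.

/* Toy model of the pending -> ack -> complete lifecycle used by STRIPE_OP_* bits. */
#include <stdio.h>

#define OP_CHECK 0

struct toy_ops {
        unsigned long pending, ack, complete;
        int count;
};

static int request_op(struct toy_ops *ops, int op)
{
        unsigned long bit = 1UL << op;

        if (ops->pending & bit)
                return 0;               /* already requested, like test_and_set_bit() */
        ops->pending |= bit;
        ops->count++;
        return 1;
}

static void run_op(struct toy_ops *ops, int op)
{
        unsigned long bit = 1UL << op;

        ops->ack |= bit;                /* worker picked the request up */
        /* ... the xor/check work would happen here ... */
        ops->complete |= bit;           /* worker reports completion */
}

static int reap_op(struct toy_ops *ops, int op)
{
        unsigned long bit = 1UL << op;

        if (!(ops->complete & bit))
                return 0;
        ops->complete &= ~bit;          /* handler retires all three stages */
        ops->ack &= ~bit;
        ops->pending &= ~bit;
        return 1;
}

int main(void)
{
        struct toy_ops ops = { 0 };

        request_op(&ops, OP_CHECK);
        run_op(&ops, OP_CHECK);
        printf("reaped: %d\n", reap_op(&ops, OP_CHECK));   /* prints 1 */
        return 0;
}
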
2438
2439
2440static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2441 struct stripe_head_state *s,
2442 struct r6_state *r6s, struct page *tmp_page,
2443 int disks)
2444{
2445 int update_p = 0, update_q = 0;
2446 struct r5dev *dev;
2447 int pd_idx = sh->pd_idx;
2448 int qd_idx = r6s->qd_idx;
2449
2450 set_bit(STRIPE_HANDLE, &sh->state);
2451
2452 BUG_ON(s->failed > 2);
2453 BUG_ON(s->uptodate < disks);
2454 /* Want to check and possibly repair P and Q.
2455 * However there could be one 'failed' device, in which
2456 * case we can only check one of them, possibly using the
2457 * other to generate missing data
2458 */
2459
2460 /* If !tmp_page, we cannot do the calculations,
2461 * but as we have set STRIPE_HANDLE, we will soon be called
2462 * by stripe_handle with a tmp_page - just wait until then.
2463 */
2464 if (tmp_page) {
2465 if (s->failed == r6s->q_failed) {
2466 /* The only possible failed device holds 'Q', so it
2467 * makes sense to check P (If anything else were failed,
2468 * we would have used P to recreate it).
2469 */
2470 compute_block_1(sh, pd_idx, 1);
2471 if (!page_is_zero(sh->dev[pd_idx].page)) {
2472 compute_block_1(sh, pd_idx, 0);
2473 update_p = 1;
2474 }
2475 }
2476 if (!r6s->q_failed && s->failed < 2) {
2477 /* q is not failed, and we didn't use it to generate
2478 * anything, so it makes sense to check it
2479 */
2480 memcpy(page_address(tmp_page),
2481 page_address(sh->dev[qd_idx].page),
2482 STRIPE_SIZE);
2483 compute_parity6(sh, UPDATE_PARITY);
2484 if (memcmp(page_address(tmp_page),
2485 page_address(sh->dev[qd_idx].page),
2486 STRIPE_SIZE) != 0) {
2487 clear_bit(STRIPE_INSYNC, &sh->state);
2488 update_q = 1;
2489 }
2490 }
2491 if (update_p || update_q) {
2492 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2493 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2494 /* don't try to repair!! */
2495 update_p = update_q = 0;
2496 }
2497
2498 /* now write out any block on a failed drive,
2499 * or P or Q if they need it
2500 */
2501
2502 if (s->failed == 2) {
2503 dev = &sh->dev[r6s->failed_num[1]];
2504 s->locked++;
2505 set_bit(R5_LOCKED, &dev->flags);
2506 set_bit(R5_Wantwrite, &dev->flags);
2507 }
2508 if (s->failed >= 1) {
2509 dev = &sh->dev[r6s->failed_num[0]];
2510 s->locked++;
2511 set_bit(R5_LOCKED, &dev->flags);
2512 set_bit(R5_Wantwrite, &dev->flags);
2513 }
2514
2515 if (update_p) {
2516 dev = &sh->dev[pd_idx];
2517 s->locked++;
2518 set_bit(R5_LOCKED, &dev->flags);
2519 set_bit(R5_Wantwrite, &dev->flags);
2520 }
2521 if (update_q) {
2522 dev = &sh->dev[qd_idx];
2523 s->locked++;
2524 set_bit(R5_LOCKED, &dev->flags);
2525 set_bit(R5_Wantwrite, &dev->flags);
2526 }
2527 clear_bit(STRIPE_DEGRADED, &sh->state);
2528
2529 set_bit(STRIPE_INSYNC, &sh->state);
2530 }
2531}
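
With two parity blocks the check above is asymmetric: P is verified by recomputing it in place and testing the page for zero, while Q is first saved into the caller-supplied tmp_page, regenerated by compute_parity6(sh, UPDATE_PARITY), and compared back with memcmp(); a mismatch leaves the freshly computed value in the stripe cache ready to be written out. Below is a compact sketch of that save/recompute/compare idiom; verify_q(), recompute() and BLOCK_SIZE are illustrative, and the real Q computation is a Galois-field syndrome rather than the placeholder copy used here.

#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 4096

/* Placeholder for compute_parity6(sh, UPDATE_PARITY): regenerate q from data. */
static void recompute(unsigned char *q, const unsigned char *data, size_t n)
{
        memcpy(q, data, n);
}

/* Returns 1 if the stored Q matches a freshly computed one; on mismatch the
 * recomputed value is already in place, mirroring the driver's update_q case. */
static int verify_q(unsigned char *q, const unsigned char *data, unsigned char *tmp)
{
        memcpy(tmp, q, BLOCK_SIZE);          /* save what is on disk */
        recompute(q, data, BLOCK_SIZE);      /* rebuild from the data blocks */
        return memcmp(tmp, q, BLOCK_SIZE) == 0;
}

int main(void)
{
        static unsigned char q[BLOCK_SIZE], data[BLOCK_SIZE], tmp[BLOCK_SIZE];

        recompute(q, data, BLOCK_SIZE);      /* start consistent */
        printf("in sync: %d\n", verify_q(q, data, tmp));   /* prints 1 */
        q[0] ^= 0xff;                        /* corrupt the stored Q */
        printf("in sync: %d\n", verify_q(q, data, tmp));   /* prints 0 */
        return 0;
}
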
2532
2533static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2534 struct r6_state *r6s)
2535{
2536 int i;
2537
2538 /* We have read all the blocks in this stripe and now we need to
2539 * copy some of them into a target stripe for expand.
2540 */
2541 struct dma_async_tx_descriptor *tx = NULL;
2542 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2543 for (i = 0; i < sh->disks; i++)
2544 if (i != sh->pd_idx && (r6s && i != r6s->qd_idx)) {
2545 int dd_idx, pd_idx, j;
2546 struct stripe_head *sh2;
2547
2548 sector_t bn = compute_blocknr(sh, i);
2549 sector_t s = raid5_compute_sector(bn, conf->raid_disks,
2550 conf->raid_disks -
2551 conf->max_degraded, &dd_idx,
2552 &pd_idx, conf);
2553 sh2 = get_active_stripe(conf, s, conf->raid_disks,
2554 pd_idx, 1);
2555 if (sh2 == NULL)
2556 /* so far only the early blocks of this stripe
2557 * have been requested. When later blocks
2558 * get requested, we will try again
2559 */
2560 continue;
2561 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2562 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2563 /* must have already done this block */
2564 release_stripe(sh2);
2565 continue;
2566 }
2567
2568 /* place all the copies on one channel */
2569 tx = async_memcpy(sh2->dev[dd_idx].page,
2570 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2571 ASYNC_TX_DEP_ACK, tx, NULL, NULL);
2572
2573 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2574 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2575 for (j = 0; j < conf->raid_disks; j++)
2576 if (j != sh2->pd_idx &&
2577 (r6s && j != r6s->qd_idx) &&
2578 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2579 break;
2580 if (j == conf->raid_disks) {
2581 set_bit(STRIPE_EXPAND_READY, &sh2->state);
2582 set_bit(STRIPE_HANDLE, &sh2->state);
2583 }
2584 release_stripe(sh2);
2585
2586 /* done submitting copies, wait for them to complete */
2587 if (i + 1 >= sh->disks) {
2588 async_tx_ack(tx);
2589 dma_wait_for_async_tx(tx);
2590 }
2591 }
2592}
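
handle_stripe_expansion() is one of the first users of the async_tx interface this series introduces: each block copy is submitted with async_memcpy(), every submission names the previous descriptor as its dependency via ASYNC_TX_DEP_ACK so the whole set stays on one channel, and only the final descriptor is acknowledged and waited on. The kernel-style sketch below uses the same calls and signatures as the hunk above; the function name, page arrays and count are illustrative.

/* Sketch only: chain a series of page copies on one channel and wait for the last. */
#include <linux/mm.h>
#include <linux/async_tx.h>

static void copy_pages_chained(struct page **dst, struct page **src, int npages)
{
        struct dma_async_tx_descriptor *tx = NULL;
        int i;

        for (i = 0; i < npages; i++)
                /* each copy depends on the previous one: one ordered chain */
                tx = async_memcpy(dst[i], src[i], 0, 0, PAGE_SIZE,
                                  ASYNC_TX_DEP_ACK, tx, NULL, NULL);

        if (tx) {
                async_tx_ack(tx);            /* no further dependents will be added */
                dma_wait_for_async_tx(tx);   /* block until the whole chain completes */
        }
}
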
1329 2593
1330/* 2594/*
1331 * handle_stripe - do things to a stripe. 2595 * handle_stripe - do things to a stripe.
@@ -1339,81 +2603,70 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1339 * schedule a write of some buffers 2603 * schedule a write of some buffers
1340 * return confirmation of parity correctness 2604 * return confirmation of parity correctness
1341 * 2605 *
1342 * Parity calculations are done inside the stripe lock
1343 * buffers are taken off read_list or write_list, and bh_cache buffers 2606 * buffers are taken off read_list or write_list, and bh_cache buffers
1344 * get BH_Lock set before the stripe lock is released. 2607 * get BH_Lock set before the stripe lock is released.
1345 * 2608 *
1346 */ 2609 */
1347 2610
1348static void handle_stripe5(struct stripe_head *sh) 2611static void handle_stripe5(struct stripe_head *sh)
1349{ 2612{
1350 raid5_conf_t *conf = sh->raid_conf; 2613 raid5_conf_t *conf = sh->raid_conf;
1351 int disks = sh->disks; 2614 int disks = sh->disks, i;
1352 struct bio *return_bi= NULL; 2615 struct bio *return_bi = NULL;
1353 struct bio *bi; 2616 struct stripe_head_state s;
1354 int i;
1355 int syncing, expanding, expanded;
1356 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1357 int non_overwrite = 0;
1358 int failed_num=0;
1359 struct r5dev *dev; 2617 struct r5dev *dev;
2618 unsigned long pending = 0;
1360 2619
1361 PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n", 2620 memset(&s, 0, sizeof(s));
1362 (unsigned long long)sh->sector, atomic_read(&sh->count), 2621 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
1363 sh->pd_idx); 2622 "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state,
2623 atomic_read(&sh->count), sh->pd_idx,
2624 sh->ops.pending, sh->ops.ack, sh->ops.complete);
1364 2625
1365 spin_lock(&sh->lock); 2626 spin_lock(&sh->lock);
1366 clear_bit(STRIPE_HANDLE, &sh->state); 2627 clear_bit(STRIPE_HANDLE, &sh->state);
1367 clear_bit(STRIPE_DELAYED, &sh->state); 2628 clear_bit(STRIPE_DELAYED, &sh->state);
1368 2629
1369 syncing = test_bit(STRIPE_SYNCING, &sh->state); 2630 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
1370 expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2631 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1371 expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2632 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
1372 /* Now to look around and see what can be done */ 2633 /* Now to look around and see what can be done */
1373 2634
1374 rcu_read_lock(); 2635 rcu_read_lock();
1375 for (i=disks; i--; ) { 2636 for (i=disks; i--; ) {
1376 mdk_rdev_t *rdev; 2637 mdk_rdev_t *rdev;
1377 dev = &sh->dev[i]; 2638 struct r5dev *dev = &sh->dev[i];
1378 clear_bit(R5_Insync, &dev->flags); 2639 clear_bit(R5_Insync, &dev->flags);
1379 2640
1380 PRINTK("check %d: state 0x%lx read %p write %p written %p\n", 2641 pr_debug("check %d: state 0x%lx toread %p read %p write %p "
1381 i, dev->flags, dev->toread, dev->towrite, dev->written); 2642 "written %p\n", i, dev->flags, dev->toread, dev->read,
1382 /* maybe we can reply to a read */ 2643 dev->towrite, dev->written);
1383 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1384 struct bio *rbi, *rbi2;
1385 PRINTK("Return read for disc %d\n", i);
1386 spin_lock_irq(&conf->device_lock);
1387 rbi = dev->toread;
1388 dev->toread = NULL;
1389 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1390 wake_up(&conf->wait_for_overlap);
1391 spin_unlock_irq(&conf->device_lock);
1392 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1393 copy_data(0, rbi, dev->page, dev->sector);
1394 rbi2 = r5_next_bio(rbi, dev->sector);
1395 spin_lock_irq(&conf->device_lock);
1396 if (--rbi->bi_phys_segments == 0) {
1397 rbi->bi_next = return_bi;
1398 return_bi = rbi;
1399 }
1400 spin_unlock_irq(&conf->device_lock);
1401 rbi = rbi2;
1402 }
1403 }
1404 2644
1405 /* now count some things */ 2645 /* maybe we can request a biofill operation
1406 if (test_bit(R5_LOCKED, &dev->flags)) locked++; 2646 *
1407 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; 2647 * new wantfill requests are only permitted while
2648 * STRIPE_OP_BIOFILL is clear
2649 */
2650 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
2651 !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
2652 set_bit(R5_Wantfill, &dev->flags);
1408 2653
1409 2654 /* now count some things */
1410 if (dev->toread) to_read++; 2655 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
2656 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
2657 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
2658
2659 if (test_bit(R5_Wantfill, &dev->flags))
2660 s.to_fill++;
2661 else if (dev->toread)
2662 s.to_read++;
1411 if (dev->towrite) { 2663 if (dev->towrite) {
1412 to_write++; 2664 s.to_write++;
1413 if (!test_bit(R5_OVERWRITE, &dev->flags)) 2665 if (!test_bit(R5_OVERWRITE, &dev->flags))
1414 non_overwrite++; 2666 s.non_overwrite++;
1415 } 2667 }
1416 if (dev->written) written++; 2668 if (dev->written)
2669 s.written++;
1417 rdev = rcu_dereference(conf->disks[i].rdev); 2670 rdev = rcu_dereference(conf->disks[i].rdev);
1418 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 2671 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1419 /* The ReadError flag will just be confusing now */ 2672 /* The ReadError flag will just be confusing now */
@@ -1422,306 +2675,131 @@ static void handle_stripe5(struct stripe_head *sh)
1422 } 2675 }
1423 if (!rdev || !test_bit(In_sync, &rdev->flags) 2676 if (!rdev || !test_bit(In_sync, &rdev->flags)
1424 || test_bit(R5_ReadError, &dev->flags)) { 2677 || test_bit(R5_ReadError, &dev->flags)) {
1425 failed++; 2678 s.failed++;
1426 failed_num = i; 2679 s.failed_num = i;
1427 } else 2680 } else
1428 set_bit(R5_Insync, &dev->flags); 2681 set_bit(R5_Insync, &dev->flags);
1429 } 2682 }
1430 rcu_read_unlock(); 2683 rcu_read_unlock();
1431 PRINTK("locked=%d uptodate=%d to_read=%d" 2684
2685 if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
2686 sh->ops.count++;
2687
2688 pr_debug("locked=%d uptodate=%d to_read=%d"
1432 " to_write=%d failed=%d failed_num=%d\n", 2689 " to_write=%d failed=%d failed_num=%d\n",
1433 locked, uptodate, to_read, to_write, failed, failed_num); 2690 s.locked, s.uptodate, s.to_read, s.to_write,
2691 s.failed, s.failed_num);
1434 /* check if the array has lost two devices and, if so, some requests might 2692 /* check if the array has lost two devices and, if so, some requests might
1435 * need to be failed 2693 * need to be failed
1436 */ 2694 */
1437 if (failed > 1 && to_read+to_write+written) { 2695 if (s.failed > 1 && s.to_read+s.to_write+s.written)
1438 for (i=disks; i--; ) { 2696 handle_requests_to_failed_array(conf, sh, &s, disks,
1439 int bitmap_end = 0; 2697 &return_bi);
1440 2698 if (s.failed > 1 && s.syncing) {
1441 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1442 mdk_rdev_t *rdev;
1443 rcu_read_lock();
1444 rdev = rcu_dereference(conf->disks[i].rdev);
1445 if (rdev && test_bit(In_sync, &rdev->flags))
1446 /* multiple read failures in one stripe */
1447 md_error(conf->mddev, rdev);
1448 rcu_read_unlock();
1449 }
1450
1451 spin_lock_irq(&conf->device_lock);
1452 /* fail all writes first */
1453 bi = sh->dev[i].towrite;
1454 sh->dev[i].towrite = NULL;
1455 if (bi) { to_write--; bitmap_end = 1; }
1456
1457 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1458 wake_up(&conf->wait_for_overlap);
1459
1460 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1461 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1462 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1463 if (--bi->bi_phys_segments == 0) {
1464 md_write_end(conf->mddev);
1465 bi->bi_next = return_bi;
1466 return_bi = bi;
1467 }
1468 bi = nextbi;
1469 }
1470 /* and fail all 'written' */
1471 bi = sh->dev[i].written;
1472 sh->dev[i].written = NULL;
1473 if (bi) bitmap_end = 1;
1474 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
1475 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1476 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1477 if (--bi->bi_phys_segments == 0) {
1478 md_write_end(conf->mddev);
1479 bi->bi_next = return_bi;
1480 return_bi = bi;
1481 }
1482 bi = bi2;
1483 }
1484
1485 /* fail any reads if this device is non-operational */
1486 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1487 test_bit(R5_ReadError, &sh->dev[i].flags)) {
1488 bi = sh->dev[i].toread;
1489 sh->dev[i].toread = NULL;
1490 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1491 wake_up(&conf->wait_for_overlap);
1492 if (bi) to_read--;
1493 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1494 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1495 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1496 if (--bi->bi_phys_segments == 0) {
1497 bi->bi_next = return_bi;
1498 return_bi = bi;
1499 }
1500 bi = nextbi;
1501 }
1502 }
1503 spin_unlock_irq(&conf->device_lock);
1504 if (bitmap_end)
1505 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1506 STRIPE_SECTORS, 0, 0);
1507 }
1508 }
1509 if (failed > 1 && syncing) {
1510 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2699 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
1511 clear_bit(STRIPE_SYNCING, &sh->state); 2700 clear_bit(STRIPE_SYNCING, &sh->state);
1512 syncing = 0; 2701 s.syncing = 0;
1513 } 2702 }
1514 2703
1515 /* might be able to return some write requests if the parity block 2704 /* might be able to return some write requests if the parity block
1516 * is safe, or on a failed drive 2705 * is safe, or on a failed drive
1517 */ 2706 */
1518 dev = &sh->dev[sh->pd_idx]; 2707 dev = &sh->dev[sh->pd_idx];
1519 if ( written && 2708 if ( s.written &&
1520 ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && 2709 ((test_bit(R5_Insync, &dev->flags) &&
1521 test_bit(R5_UPTODATE, &dev->flags)) 2710 !test_bit(R5_LOCKED, &dev->flags) &&
1522 || (failed == 1 && failed_num == sh->pd_idx)) 2711 test_bit(R5_UPTODATE, &dev->flags)) ||
1523 ) { 2712 (s.failed == 1 && s.failed_num == sh->pd_idx)))
1524 /* any written block on an uptodate or failed drive can be returned. 2713 handle_completed_write_requests(conf, sh, disks, &return_bi);
1525 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
1526 * never LOCKED, so we don't need to test 'failed' directly.
1527 */
1528 for (i=disks; i--; )
1529 if (sh->dev[i].written) {
1530 dev = &sh->dev[i];
1531 if (!test_bit(R5_LOCKED, &dev->flags) &&
1532 test_bit(R5_UPTODATE, &dev->flags) ) {
1533 /* We can return any write requests */
1534 struct bio *wbi, *wbi2;
1535 int bitmap_end = 0;
1536 PRINTK("Return write for disc %d\n", i);
1537 spin_lock_irq(&conf->device_lock);
1538 wbi = dev->written;
1539 dev->written = NULL;
1540 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1541 wbi2 = r5_next_bio(wbi, dev->sector);
1542 if (--wbi->bi_phys_segments == 0) {
1543 md_write_end(conf->mddev);
1544 wbi->bi_next = return_bi;
1545 return_bi = wbi;
1546 }
1547 wbi = wbi2;
1548 }
1549 if (dev->towrite == NULL)
1550 bitmap_end = 1;
1551 spin_unlock_irq(&conf->device_lock);
1552 if (bitmap_end)
1553 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1554 STRIPE_SECTORS,
1555 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
1556 }
1557 }
1558 }
1559 2714
1560 /* Now we might consider reading some blocks, either to check/generate 2715 /* Now we might consider reading some blocks, either to check/generate
1561 * parity, or to satisfy requests 2716 * parity, or to satisfy requests
1562 * or to load a block that is being partially written. 2717 * or to load a block that is being partially written.
1563 */ 2718 */
1564 if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { 2719 if (s.to_read || s.non_overwrite ||
1565 for (i=disks; i--;) { 2720 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding ||
1566 dev = &sh->dev[i]; 2721 test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
1567 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && 2722 handle_issuing_new_read_requests5(sh, &s, disks);
1568 (dev->toread || 2723
1569 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2724 /* Now we check to see if any write operations have recently
1570 syncing || 2725 * completed
1571 expanding || 2726 */
1572 (failed && (sh->dev[failed_num].toread || 2727
1573 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) 2728 /* leave prexor set until postxor is done, allows us to distinguish
1574 ) 2729 * a rmw from a rcw during biodrain
1575 ) { 2730 */
1576 /* we would like to get this block, possibly 2731 if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
1577 * by computing it, but we might not be able to 2732 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
1578 */ 2733
1579 if (uptodate == disks-1) { 2734 clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
1580 PRINTK("Computing block %d\n", i); 2735 clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
1581 compute_block(sh, i); 2736 clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
1582 uptodate++; 2737
1583 } else if (test_bit(R5_Insync, &dev->flags)) { 2738 for (i = disks; i--; )
1584 set_bit(R5_LOCKED, &dev->flags); 2739 clear_bit(R5_Wantprexor, &sh->dev[i].flags);
1585 set_bit(R5_Wantread, &dev->flags);
1586 locked++;
1587 PRINTK("Reading block %d (sync=%d)\n",
1588 i, syncing);
1589 }
1590 }
1591 }
1592 set_bit(STRIPE_HANDLE, &sh->state);
1593 } 2740 }
1594 2741
1595 /* now to consider writing and what else, if anything should be read */ 2742 /* if only POSTXOR is set then this is an 'expand' postxor */
1596 if (to_write) { 2743 if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
1597 int rmw=0, rcw=0; 2744 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
1598 for (i=disks ; i--;) { 2745
1599 /* would I have to read this buffer for read_modify_write */ 2746 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
2747 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
2748 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
2749
2750 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2751 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2752 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2753
2754 /* All the 'written' buffers and the parity block are ready to
2755 * be written back to disk
2756 */
2757 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
2758 for (i = disks; i--; ) {
1600 dev = &sh->dev[i]; 2759 dev = &sh->dev[i];
1601 if ((dev->towrite || i == sh->pd_idx) && 2760 if (test_bit(R5_LOCKED, &dev->flags) &&
1602 (!test_bit(R5_LOCKED, &dev->flags) 2761 (i == sh->pd_idx || dev->written)) {
1603 ) && 2762 pr_debug("Writing block %d\n", i);
1604 !test_bit(R5_UPTODATE, &dev->flags)) { 2763 set_bit(R5_Wantwrite, &dev->flags);
1605 if (test_bit(R5_Insync, &dev->flags) 2764 if (!test_and_set_bit(
1606/* && !(!mddev->insync && i == sh->pd_idx) */ 2765 STRIPE_OP_IO, &sh->ops.pending))
1607 ) 2766 sh->ops.count++;
1608 rmw++; 2767 if (!test_bit(R5_Insync, &dev->flags) ||
1609 else rmw += 2*disks; /* cannot read it */ 2768 (i == sh->pd_idx && s.failed == 0))
1610 } 2769 set_bit(STRIPE_INSYNC, &sh->state);
1611 /* Would I have to read this buffer for reconstruct_write */
1612 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1613 (!test_bit(R5_LOCKED, &dev->flags)
1614 ) &&
1615 !test_bit(R5_UPTODATE, &dev->flags)) {
1616 if (test_bit(R5_Insync, &dev->flags)) rcw++;
1617 else rcw += 2*disks;
1618 } 2770 }
1619 } 2771 }
1620 PRINTK("for sector %llu, rmw=%d rcw=%d\n", 2772 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1621 (unsigned long long)sh->sector, rmw, rcw); 2773 atomic_dec(&conf->preread_active_stripes);
1622 set_bit(STRIPE_HANDLE, &sh->state); 2774 if (atomic_read(&conf->preread_active_stripes) <
1623 if (rmw < rcw && rmw > 0) 2775 IO_THRESHOLD)
1624 /* prefer read-modify-write, but need to get some data */ 2776 md_wakeup_thread(conf->mddev->thread);
1625 for (i=disks; i--;) {
1626 dev = &sh->dev[i];
1627 if ((dev->towrite || i == sh->pd_idx) &&
1628 !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1629 test_bit(R5_Insync, &dev->flags)) {
1630 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1631 {
1632 PRINTK("Read_old block %d for r-m-w\n", i);
1633 set_bit(R5_LOCKED, &dev->flags);
1634 set_bit(R5_Wantread, &dev->flags);
1635 locked++;
1636 } else {
1637 set_bit(STRIPE_DELAYED, &sh->state);
1638 set_bit(STRIPE_HANDLE, &sh->state);
1639 }
1640 }
1641 }
1642 if (rcw <= rmw && rcw > 0)
1643 /* want reconstruct write, but need to get some data */
1644 for (i=disks; i--;) {
1645 dev = &sh->dev[i];
1646 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1647 !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1648 test_bit(R5_Insync, &dev->flags)) {
1649 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1650 {
1651 PRINTK("Read_old block %d for Reconstruct\n", i);
1652 set_bit(R5_LOCKED, &dev->flags);
1653 set_bit(R5_Wantread, &dev->flags);
1654 locked++;
1655 } else {
1656 set_bit(STRIPE_DELAYED, &sh->state);
1657 set_bit(STRIPE_HANDLE, &sh->state);
1658 }
1659 }
1660 }
1661 /* now if nothing is locked, and if we have enough data, we can start a write request */
1662 if (locked == 0 && (rcw == 0 ||rmw == 0) &&
1663 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1664 PRINTK("Computing parity...\n");
1665 compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1666 /* now every locked buffer is ready to be written */
1667 for (i=disks; i--;)
1668 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1669 PRINTK("Writing block %d\n", i);
1670 locked++;
1671 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1672 if (!test_bit(R5_Insync, &sh->dev[i].flags)
1673 || (i==sh->pd_idx && failed == 0))
1674 set_bit(STRIPE_INSYNC, &sh->state);
1675 }
1676 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1677 atomic_dec(&conf->preread_active_stripes);
1678 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1679 md_wakeup_thread(conf->mddev->thread);
1680 }
1681 } 2777 }
1682 } 2778 }
1683 2779
1684 /* maybe we need to check and possibly fix the parity for this stripe 2780 /* Now to consider new write requests and what else, if anything
1685 * Any reads will already have been scheduled, so we just see if enough data 2781 * should be read. We do not handle new writes when:
1686 * is available 2782 * 1/ A 'write' operation (copy+xor) is already in flight.
2783 * 2/ A 'check' operation is in flight, as it may clobber the parity
2784 * block.
1687 */ 2785 */
1688 if (syncing && locked == 0 && 2786 if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
1689 !test_bit(STRIPE_INSYNC, &sh->state)) { 2787 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
1690 set_bit(STRIPE_HANDLE, &sh->state); 2788 handle_issuing_new_write_requests5(conf, sh, &s, disks);
1691 if (failed == 0) {
1692 BUG_ON(uptodate != disks);
1693 compute_parity5(sh, CHECK_PARITY);
1694 uptodate--;
1695 if (page_is_zero(sh->dev[sh->pd_idx].page)) {
1696 /* parity is correct (on disc, not in buffer any more) */
1697 set_bit(STRIPE_INSYNC, &sh->state);
1698 } else {
1699 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1700 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1701 /* don't try to repair!! */
1702 set_bit(STRIPE_INSYNC, &sh->state);
1703 else {
1704 compute_block(sh, sh->pd_idx);
1705 uptodate++;
1706 }
1707 }
1708 }
1709 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1710 /* either failed parity check, or recovery is happening */
1711 if (failed==0)
1712 failed_num = sh->pd_idx;
1713 dev = &sh->dev[failed_num];
1714 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
1715 BUG_ON(uptodate != disks);
1716 2789
1717 set_bit(R5_LOCKED, &dev->flags); 2790 /* maybe we need to check and possibly fix the parity for this stripe
1718 set_bit(R5_Wantwrite, &dev->flags); 2791 * Any reads will already have been scheduled, so we just see if enough
1719 clear_bit(STRIPE_DEGRADED, &sh->state); 2792 * data is available. The parity check is held off while parity
1720 locked++; 2793 * dependent operations are in flight.
1721 set_bit(STRIPE_INSYNC, &sh->state); 2794 */
1722 } 2795 if ((s.syncing && s.locked == 0 &&
1723 } 2796 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
1724 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 2797 !test_bit(STRIPE_INSYNC, &sh->state)) ||
2798 test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
2799 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
2800 handle_parity_checks5(conf, sh, &s, disks);
2801
2802 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1725 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 2803 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1726 clear_bit(STRIPE_SYNCING, &sh->state); 2804 clear_bit(STRIPE_SYNCING, &sh->state);
1727 } 2805 }
@@ -1729,186 +2807,102 @@ static void handle_stripe5(struct stripe_head *sh)
1729 /* If the failed drive is just a ReadError, then we might need to progress 2807 /* If the failed drive is just a ReadError, then we might need to progress
1730 * the repair/check process 2808 * the repair/check process
1731 */ 2809 */
1732 if (failed == 1 && ! conf->mddev->ro && 2810 if (s.failed == 1 && !conf->mddev->ro &&
1733 test_bit(R5_ReadError, &sh->dev[failed_num].flags) 2811 test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
1734 && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags) 2812 && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
1735 && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags) 2813 && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
1736 ) { 2814 ) {
1737 dev = &sh->dev[failed_num]; 2815 dev = &sh->dev[s.failed_num];
1738 if (!test_bit(R5_ReWrite, &dev->flags)) { 2816 if (!test_bit(R5_ReWrite, &dev->flags)) {
1739 set_bit(R5_Wantwrite, &dev->flags); 2817 set_bit(R5_Wantwrite, &dev->flags);
2818 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2819 sh->ops.count++;
1740 set_bit(R5_ReWrite, &dev->flags); 2820 set_bit(R5_ReWrite, &dev->flags);
1741 set_bit(R5_LOCKED, &dev->flags); 2821 set_bit(R5_LOCKED, &dev->flags);
1742 locked++; 2822 s.locked++;
1743 } else { 2823 } else {
1744 /* let's read it back */ 2824 /* let's read it back */
1745 set_bit(R5_Wantread, &dev->flags); 2825 set_bit(R5_Wantread, &dev->flags);
2826 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2827 sh->ops.count++;
1746 set_bit(R5_LOCKED, &dev->flags); 2828 set_bit(R5_LOCKED, &dev->flags);
1747 locked++; 2829 s.locked++;
1748 } 2830 }
1749 } 2831 }
1750 2832
1751 if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 2833 /* Finish postxor operations initiated by the expansion
1752 /* Need to write out all blocks after computing parity */ 2834 * process
1753 sh->disks = conf->raid_disks; 2835 */
1754 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); 2836 if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
1755 compute_parity5(sh, RECONSTRUCT_WRITE); 2837 !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
1756 for (i= conf->raid_disks; i--;) { 2838
1757 set_bit(R5_LOCKED, &sh->dev[i].flags); 2839 clear_bit(STRIPE_EXPANDING, &sh->state);
1758 locked++; 2840
2841 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2842 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2843 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2844
2845 for (i = conf->raid_disks; i--; ) {
1759 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2846 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2847 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2848 sh->ops.count++;
1760 } 2849 }
1761 clear_bit(STRIPE_EXPANDING, &sh->state); 2850 }
1762 } else if (expanded) { 2851
2852 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
2853 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2854 /* Need to write out all blocks after computing parity */
2855 sh->disks = conf->raid_disks;
2856 sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
2857 conf->raid_disks);
2858 s.locked += handle_write_operations5(sh, 0, 1);
2859 } else if (s.expanded &&
2860 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
1763 clear_bit(STRIPE_EXPAND_READY, &sh->state); 2861 clear_bit(STRIPE_EXPAND_READY, &sh->state);
1764 atomic_dec(&conf->reshape_stripes); 2862 atomic_dec(&conf->reshape_stripes);
1765 wake_up(&conf->wait_for_overlap); 2863 wake_up(&conf->wait_for_overlap);
1766 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 2864 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
1767 } 2865 }
1768 2866
1769 if (expanding && locked == 0) { 2867 if (s.expanding && s.locked == 0)
1770 /* We have read all the blocks in this stripe and now we need to 2868 handle_stripe_expansion(conf, sh, NULL);
1771 * copy some of them into a target stripe for expand. 2869
1772 */ 2870 if (sh->ops.count)
1773 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2871 pending = get_stripe_work(sh);
1774 for (i=0; i< sh->disks; i++)
1775 if (i != sh->pd_idx) {
1776 int dd_idx, pd_idx, j;
1777 struct stripe_head *sh2;
1778
1779 sector_t bn = compute_blocknr(sh, i);
1780 sector_t s = raid5_compute_sector(bn, conf->raid_disks,
1781 conf->raid_disks-1,
1782 &dd_idx, &pd_idx, conf);
1783 sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1);
1784 if (sh2 == NULL)
1785 /* so far only the early blocks of this stripe
1786 * have been requested. When later blocks
1787 * get requested, we will try again
1788 */
1789 continue;
1790 if(!test_bit(STRIPE_EXPANDING, &sh2->state) ||
1791 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
1792 /* must have already done this block */
1793 release_stripe(sh2);
1794 continue;
1795 }
1796 memcpy(page_address(sh2->dev[dd_idx].page),
1797 page_address(sh->dev[i].page),
1798 STRIPE_SIZE);
1799 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
1800 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
1801 for (j=0; j<conf->raid_disks; j++)
1802 if (j != sh2->pd_idx &&
1803 !test_bit(R5_Expanded, &sh2->dev[j].flags))
1804 break;
1805 if (j == conf->raid_disks) {
1806 set_bit(STRIPE_EXPAND_READY, &sh2->state);
1807 set_bit(STRIPE_HANDLE, &sh2->state);
1808 }
1809 release_stripe(sh2);
1810 }
1811 }
1812 2872
1813 spin_unlock(&sh->lock); 2873 spin_unlock(&sh->lock);
1814 2874
1815 while ((bi=return_bi)) { 2875 if (pending)
1816 int bytes = bi->bi_size; 2876 raid5_run_ops(sh, pending);
1817 2877
1818 return_bi = bi->bi_next; 2878 return_io(return_bi);
1819 bi->bi_next = NULL;
1820 bi->bi_size = 0;
1821 bi->bi_end_io(bi, bytes,
1822 test_bit(BIO_UPTODATE, &bi->bi_flags)
1823 ? 0 : -EIO);
1824 }
1825 for (i=disks; i-- ;) {
1826 int rw;
1827 struct bio *bi;
1828 mdk_rdev_t *rdev;
1829 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
1830 rw = WRITE;
1831 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1832 rw = READ;
1833 else
1834 continue;
1835
1836 bi = &sh->dev[i].req;
1837
1838 bi->bi_rw = rw;
1839 if (rw == WRITE)
1840 bi->bi_end_io = raid5_end_write_request;
1841 else
1842 bi->bi_end_io = raid5_end_read_request;
1843
1844 rcu_read_lock();
1845 rdev = rcu_dereference(conf->disks[i].rdev);
1846 if (rdev && test_bit(Faulty, &rdev->flags))
1847 rdev = NULL;
1848 if (rdev)
1849 atomic_inc(&rdev->nr_pending);
1850 rcu_read_unlock();
1851
1852 if (rdev) {
1853 if (syncing || expanding || expanded)
1854 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1855 2879
1856 bi->bi_bdev = rdev->bdev;
1857 PRINTK("for %llu schedule op %ld on disc %d\n",
1858 (unsigned long long)sh->sector, bi->bi_rw, i);
1859 atomic_inc(&sh->count);
1860 bi->bi_sector = sh->sector + rdev->data_offset;
1861 bi->bi_flags = 1 << BIO_UPTODATE;
1862 bi->bi_vcnt = 1;
1863 bi->bi_max_vecs = 1;
1864 bi->bi_idx = 0;
1865 bi->bi_io_vec = &sh->dev[i].vec;
1866 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1867 bi->bi_io_vec[0].bv_offset = 0;
1868 bi->bi_size = STRIPE_SIZE;
1869 bi->bi_next = NULL;
1870 if (rw == WRITE &&
1871 test_bit(R5_ReWrite, &sh->dev[i].flags))
1872 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1873 generic_make_request(bi);
1874 } else {
1875 if (rw == WRITE)
1876 set_bit(STRIPE_DEGRADED, &sh->state);
1877 PRINTK("skip op %ld on disc %d for sector %llu\n",
1878 bi->bi_rw, i, (unsigned long long)sh->sector);
1879 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1880 set_bit(STRIPE_HANDLE, &sh->state);
1881 }
1882 }
1883} 2880}
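
The rewritten handle_stripe5() above no longer copies data or computes parity under sh->lock: it only records work by raising STRIPE_OP_* bits and bumping sh->ops.count, snapshots the newly requested bits with get_stripe_work() while still holding the lock, and then calls raid5_run_ops() and return_io() after the lock is dropped. A toy model of that accumulate-under-lock / execute-outside-lock shape follows; a pthread mutex stands in for the stripe spinlock, and get_work() only approximates what get_stripe_work() does.

#include <pthread.h>
#include <stdio.h>

struct toy_stripe {
        pthread_mutex_t lock;
        unsigned long ops_pending;      /* requested work, STRIPE_OP_* style */
        unsigned long ops_ack;          /* work already handed to the engine */
        int ops_count;
};

/* Snapshot not-yet-acknowledged requests while the lock is held. */
static unsigned long get_work(struct toy_stripe *sh)
{
        unsigned long new_work = sh->ops_pending & ~sh->ops_ack;

        sh->ops_ack |= new_work;
        sh->ops_count = 0;
        return new_work;
}

/* Models raid5_run_ops(): issue the snapshotted work without the stripe lock. */
static void run_ops(unsigned long work)
{
        printf("running ops mask %#lx outside the lock\n", work);
}

static void handle(struct toy_stripe *sh)
{
        unsigned long work = 0;

        pthread_mutex_lock(&sh->lock);
        sh->ops_pending |= 1UL << 0;    /* e.g. request a biofill */
        sh->ops_count++;
        if (sh->ops_count)
                work = get_work(sh);
        pthread_mutex_unlock(&sh->lock);

        if (work)
                run_ops(work);
}

int main(void)
{
        struct toy_stripe sh = { .lock = PTHREAD_MUTEX_INITIALIZER };

        handle(&sh);
        return 0;
}
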
1884 2881
1885static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 2882static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1886{ 2883{
1887 raid6_conf_t *conf = sh->raid_conf; 2884 raid6_conf_t *conf = sh->raid_conf;
1888 int disks = sh->disks; 2885 int disks = sh->disks;
1889 struct bio *return_bi= NULL; 2886 struct bio *return_bi = NULL;
1890 struct bio *bi; 2887 int i, pd_idx = sh->pd_idx;
1891 int i; 2888 struct stripe_head_state s;
1892 int syncing, expanding, expanded; 2889 struct r6_state r6s;
1893 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1894 int non_overwrite = 0;
1895 int failed_num[2] = {0, 0};
1896 struct r5dev *dev, *pdev, *qdev; 2890 struct r5dev *dev, *pdev, *qdev;
1897 int pd_idx = sh->pd_idx;
1898 int qd_idx = raid6_next_disk(pd_idx, disks);
1899 int p_failed, q_failed;
1900 2891
1901 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n", 2892 r6s.qd_idx = raid6_next_disk(pd_idx, disks);
1902 (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), 2893 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
1903 pd_idx, qd_idx); 2894 "pd_idx=%d, qd_idx=%d\n",
2895 (unsigned long long)sh->sector, sh->state,
2896 atomic_read(&sh->count), pd_idx, r6s.qd_idx);
2897 memset(&s, 0, sizeof(s));
1904 2898
1905 spin_lock(&sh->lock); 2899 spin_lock(&sh->lock);
1906 clear_bit(STRIPE_HANDLE, &sh->state); 2900 clear_bit(STRIPE_HANDLE, &sh->state);
1907 clear_bit(STRIPE_DELAYED, &sh->state); 2901 clear_bit(STRIPE_DELAYED, &sh->state);
1908 2902
1909 syncing = test_bit(STRIPE_SYNCING, &sh->state); 2903 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
1910 expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2904 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1911 expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2905 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
1912 /* Now to look around and see what can be done */ 2906 /* Now to look around and see what can be done */
1913 2907
1914 rcu_read_lock(); 2908 rcu_read_lock();
@@ -1917,12 +2911,12 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1917 dev = &sh->dev[i]; 2911 dev = &sh->dev[i];
1918 clear_bit(R5_Insync, &dev->flags); 2912 clear_bit(R5_Insync, &dev->flags);
1919 2913
1920 PRINTK("check %d: state 0x%lx read %p write %p written %p\n", 2914 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
1921 i, dev->flags, dev->toread, dev->towrite, dev->written); 2915 i, dev->flags, dev->toread, dev->towrite, dev->written);
1922 /* maybe we can reply to a read */ 2916 /* maybe we can reply to a read */
1923 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { 2917 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1924 struct bio *rbi, *rbi2; 2918 struct bio *rbi, *rbi2;
1925 PRINTK("Return read for disc %d\n", i); 2919 pr_debug("Return read for disc %d\n", i);
1926 spin_lock_irq(&conf->device_lock); 2920 spin_lock_irq(&conf->device_lock);
1927 rbi = dev->toread; 2921 rbi = dev->toread;
1928 dev->toread = NULL; 2922 dev->toread = NULL;
@@ -1943,17 +2937,19 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1943 } 2937 }
1944 2938
1945 /* now count some things */ 2939 /* now count some things */
1946 if (test_bit(R5_LOCKED, &dev->flags)) locked++; 2940 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
1947 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; 2941 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
1948 2942
1949 2943
1950 if (dev->toread) to_read++; 2944 if (dev->toread)
2945 s.to_read++;
1951 if (dev->towrite) { 2946 if (dev->towrite) {
1952 to_write++; 2947 s.to_write++;
1953 if (!test_bit(R5_OVERWRITE, &dev->flags)) 2948 if (!test_bit(R5_OVERWRITE, &dev->flags))
1954 non_overwrite++; 2949 s.non_overwrite++;
1955 } 2950 }
1956 if (dev->written) written++; 2951 if (dev->written)
2952 s.written++;
1957 rdev = rcu_dereference(conf->disks[i].rdev); 2953 rdev = rcu_dereference(conf->disks[i].rdev);
1958 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 2954 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1959 /* The ReadError flag will just be confusing now */ 2955 /* The ReadError flag will just be confusing now */
@@ -1962,96 +2958,27 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1962 } 2958 }
1963 if (!rdev || !test_bit(In_sync, &rdev->flags) 2959 if (!rdev || !test_bit(In_sync, &rdev->flags)
1964 || test_bit(R5_ReadError, &dev->flags)) { 2960 || test_bit(R5_ReadError, &dev->flags)) {
1965 if ( failed < 2 ) 2961 if (s.failed < 2)
1966 failed_num[failed] = i; 2962 r6s.failed_num[s.failed] = i;
1967 failed++; 2963 s.failed++;
1968 } else 2964 } else
1969 set_bit(R5_Insync, &dev->flags); 2965 set_bit(R5_Insync, &dev->flags);
1970 } 2966 }
1971 rcu_read_unlock(); 2967 rcu_read_unlock();
1972 PRINTK("locked=%d uptodate=%d to_read=%d" 2968 pr_debug("locked=%d uptodate=%d to_read=%d"
1973 " to_write=%d failed=%d failed_num=%d,%d\n", 2969 " to_write=%d failed=%d failed_num=%d,%d\n",
1974 locked, uptodate, to_read, to_write, failed, 2970 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
1975 failed_num[0], failed_num[1]); 2971 r6s.failed_num[0], r6s.failed_num[1]);
1976 /* check if the array has lost >2 devices and, if so, some requests might 2972 /* check if the array has lost >2 devices and, if so, some requests
1977 * need to be failed 2973 * might need to be failed
1978 */ 2974 */
1979 if (failed > 2 && to_read+to_write+written) { 2975 if (s.failed > 2 && s.to_read+s.to_write+s.written)
1980 for (i=disks; i--; ) { 2976 handle_requests_to_failed_array(conf, sh, &s, disks,
1981 int bitmap_end = 0; 2977 &return_bi);
1982 2978 if (s.failed > 2 && s.syncing) {
1983 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1984 mdk_rdev_t *rdev;
1985 rcu_read_lock();
1986 rdev = rcu_dereference(conf->disks[i].rdev);
1987 if (rdev && test_bit(In_sync, &rdev->flags))
1988 /* multiple read failures in one stripe */
1989 md_error(conf->mddev, rdev);
1990 rcu_read_unlock();
1991 }
1992
1993 spin_lock_irq(&conf->device_lock);
1994 /* fail all writes first */
1995 bi = sh->dev[i].towrite;
1996 sh->dev[i].towrite = NULL;
1997 if (bi) { to_write--; bitmap_end = 1; }
1998
1999 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2000 wake_up(&conf->wait_for_overlap);
2001
2002 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2003 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2004 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2005 if (--bi->bi_phys_segments == 0) {
2006 md_write_end(conf->mddev);
2007 bi->bi_next = return_bi;
2008 return_bi = bi;
2009 }
2010 bi = nextbi;
2011 }
2012 /* and fail all 'written' */
2013 bi = sh->dev[i].written;
2014 sh->dev[i].written = NULL;
2015 if (bi) bitmap_end = 1;
2016 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
2017 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2018 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2019 if (--bi->bi_phys_segments == 0) {
2020 md_write_end(conf->mddev);
2021 bi->bi_next = return_bi;
2022 return_bi = bi;
2023 }
2024 bi = bi2;
2025 }
2026
2027 /* fail any reads if this device is non-operational */
2028 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2029 test_bit(R5_ReadError, &sh->dev[i].flags)) {
2030 bi = sh->dev[i].toread;
2031 sh->dev[i].toread = NULL;
2032 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2033 wake_up(&conf->wait_for_overlap);
2034 if (bi) to_read--;
2035 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2036 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2037 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2038 if (--bi->bi_phys_segments == 0) {
2039 bi->bi_next = return_bi;
2040 return_bi = bi;
2041 }
2042 bi = nextbi;
2043 }
2044 }
2045 spin_unlock_irq(&conf->device_lock);
2046 if (bitmap_end)
2047 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2048 STRIPE_SECTORS, 0, 0);
2049 }
2050 }
2051 if (failed > 2 && syncing) {
2052 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2979 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2053 clear_bit(STRIPE_SYNCING, &sh->state); 2980 clear_bit(STRIPE_SYNCING, &sh->state);
2054 syncing = 0; 2981 s.syncing = 0;
2055 } 2982 }
2056 2983
2057 /* 2984 /*
@@ -2059,279 +2986,41 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2059 * are safe, or on a failed drive 2986 * are safe, or on a failed drive
2060 */ 2987 */
2061 pdev = &sh->dev[pd_idx]; 2988 pdev = &sh->dev[pd_idx];
2062 p_failed = (failed >= 1 && failed_num[0] == pd_idx) 2989 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
2063 || (failed >= 2 && failed_num[1] == pd_idx); 2990 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
2064 qdev = &sh->dev[qd_idx]; 2991 qdev = &sh->dev[r6s.qd_idx];
2065 q_failed = (failed >= 1 && failed_num[0] == qd_idx) 2992 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx)
2066 || (failed >= 2 && failed_num[1] == qd_idx); 2993 || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx);
2067 2994
2068 if ( written && 2995 if ( s.written &&
2069 ( p_failed || ((test_bit(R5_Insync, &pdev->flags) 2996 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
2070 && !test_bit(R5_LOCKED, &pdev->flags) 2997 && !test_bit(R5_LOCKED, &pdev->flags)
2071 && test_bit(R5_UPTODATE, &pdev->flags))) ) && 2998 && test_bit(R5_UPTODATE, &pdev->flags)))) &&
2072 ( q_failed || ((test_bit(R5_Insync, &qdev->flags) 2999 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
2073 && !test_bit(R5_LOCKED, &qdev->flags) 3000 && !test_bit(R5_LOCKED, &qdev->flags)
2074 && test_bit(R5_UPTODATE, &qdev->flags))) ) ) { 3001 && test_bit(R5_UPTODATE, &qdev->flags)))))
2075 /* any written block on an uptodate or failed drive can be 3002 handle_completed_write_requests(conf, sh, disks, &return_bi);
2076 * returned. Note that if we 'wrote' to a failed drive,
2077 * it will be UPTODATE, but never LOCKED, so we don't need
2078 * to test 'failed' directly.
2079 */
2080 for (i=disks; i--; )
2081 if (sh->dev[i].written) {
2082 dev = &sh->dev[i];
2083 if (!test_bit(R5_LOCKED, &dev->flags) &&
2084 test_bit(R5_UPTODATE, &dev->flags) ) {
2085 /* We can return any write requests */
2086 int bitmap_end = 0;
2087 struct bio *wbi, *wbi2;
2088 PRINTK("Return write for stripe %llu disc %d\n",
2089 (unsigned long long)sh->sector, i);
2090 spin_lock_irq(&conf->device_lock);
2091 wbi = dev->written;
2092 dev->written = NULL;
2093 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
2094 wbi2 = r5_next_bio(wbi, dev->sector);
2095 if (--wbi->bi_phys_segments == 0) {
2096 md_write_end(conf->mddev);
2097 wbi->bi_next = return_bi;
2098 return_bi = wbi;
2099 }
2100 wbi = wbi2;
2101 }
2102 if (dev->towrite == NULL)
2103 bitmap_end = 1;
2104 spin_unlock_irq(&conf->device_lock);
2105 if (bitmap_end)
2106 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2107 STRIPE_SECTORS,
2108 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
2109 }
2110 }
2111 }
2112 3003
2113 /* Now we might consider reading some blocks, either to check/generate 3004 /* Now we might consider reading some blocks, either to check/generate
2114 * parity, or to satisfy requests 3005 * parity, or to satisfy requests
2115 * or to load a block that is being partially written. 3006 * or to load a block that is being partially written.
2116 */ 3007 */
2117 if (to_read || non_overwrite || (to_write && failed) || 3008 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
2118 (syncing && (uptodate < disks)) || expanding) { 3009 (s.syncing && (s.uptodate < disks)) || s.expanding)
2119 for (i=disks; i--;) { 3010 handle_issuing_new_read_requests6(sh, &s, &r6s, disks);
2120 dev = &sh->dev[i];
2121 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2122 (dev->toread ||
2123 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2124 syncing ||
2125 expanding ||
2126 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
2127 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
2128 )
2129 ) {
2130 /* we would like to get this block, possibly
2131 * by computing it, but we might not be able to
2132 */
2133 if (uptodate == disks-1) {
2134 PRINTK("Computing stripe %llu block %d\n",
2135 (unsigned long long)sh->sector, i);
2136 compute_block_1(sh, i, 0);
2137 uptodate++;
2138 } else if ( uptodate == disks-2 && failed >= 2 ) {
2139 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
2140 int other;
2141 for (other=disks; other--;) {
2142 if ( other == i )
2143 continue;
2144 if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
2145 break;
2146 }
2147 BUG_ON(other < 0);
2148 PRINTK("Computing stripe %llu blocks %d,%d\n",
2149 (unsigned long long)sh->sector, i, other);
2150 compute_block_2(sh, i, other);
2151 uptodate += 2;
2152 } else if (test_bit(R5_Insync, &dev->flags)) {
2153 set_bit(R5_LOCKED, &dev->flags);
2154 set_bit(R5_Wantread, &dev->flags);
2155 locked++;
2156 PRINTK("Reading block %d (sync=%d)\n",
2157 i, syncing);
2158 }
2159 }
2160 }
2161 set_bit(STRIPE_HANDLE, &sh->state);
2162 }
2163 3011
2164 /* now to consider writing and what else, if anything should be read */ 3012 /* now to consider writing and what else, if anything should be read */
2165 if (to_write) { 3013 if (s.to_write)
2166 int rcw=0, must_compute=0; 3014 handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks);
2167 for (i=disks ; i--;) {
2168 dev = &sh->dev[i];
2169 /* Would I have to read this buffer for reconstruct_write */
2170 if (!test_bit(R5_OVERWRITE, &dev->flags)
2171 && i != pd_idx && i != qd_idx
2172 && (!test_bit(R5_LOCKED, &dev->flags)
2173 ) &&
2174 !test_bit(R5_UPTODATE, &dev->flags)) {
2175 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2176 else {
2177 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
2178 must_compute++;
2179 }
2180 }
2181 }
2182 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
2183 (unsigned long long)sh->sector, rcw, must_compute);
2184 set_bit(STRIPE_HANDLE, &sh->state);
2185
2186 if (rcw > 0)
2187 /* want reconstruct write, but need to get some data */
2188 for (i=disks; i--;) {
2189 dev = &sh->dev[i];
2190 if (!test_bit(R5_OVERWRITE, &dev->flags)
2191 && !(failed == 0 && (i == pd_idx || i == qd_idx))
2192 && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2193 test_bit(R5_Insync, &dev->flags)) {
2194 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
2195 {
2196 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
2197 (unsigned long long)sh->sector, i);
2198 set_bit(R5_LOCKED, &dev->flags);
2199 set_bit(R5_Wantread, &dev->flags);
2200 locked++;
2201 } else {
2202 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
2203 (unsigned long long)sh->sector, i);
2204 set_bit(STRIPE_DELAYED, &sh->state);
2205 set_bit(STRIPE_HANDLE, &sh->state);
2206 }
2207 }
2208 }
2209 /* now if nothing is locked, and if we have enough data, we can start a write request */
2210 if (locked == 0 && rcw == 0 &&
2211 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2212 if ( must_compute > 0 ) {
2213 /* We have failed blocks and need to compute them */
2214 switch ( failed ) {
2215 case 0: BUG();
2216 case 1: compute_block_1(sh, failed_num[0], 0); break;
2217 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
2218 default: BUG(); /* This request should have been failed? */
2219 }
2220 }
2221
2222 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
2223 compute_parity6(sh, RECONSTRUCT_WRITE);
2224 /* now every locked buffer is ready to be written */
2225 for (i=disks; i--;)
2226 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2227 PRINTK("Writing stripe %llu block %d\n",
2228 (unsigned long long)sh->sector, i);
2229 locked++;
2230 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2231 }
2232 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2233 set_bit(STRIPE_INSYNC, &sh->state);
2234
2235 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2236 atomic_dec(&conf->preread_active_stripes);
2237 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
2238 md_wakeup_thread(conf->mddev->thread);
2239 }
2240 }
2241 }
2242 3015
2243 /* maybe we need to check and possibly fix the parity for this stripe 3016 /* maybe we need to check and possibly fix the parity for this stripe
2244 * Any reads will already have been scheduled, so we just see if enough data 3017 * Any reads will already have been scheduled, so we just see if enough
2245 * is available 3018 * data is available
2246 */ 3019 */
2247 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) { 3020 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
2248 int update_p = 0, update_q = 0; 3021 handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
2249 struct r5dev *dev;
2250
2251 set_bit(STRIPE_HANDLE, &sh->state);
2252
2253 BUG_ON(failed>2);
2254 BUG_ON(uptodate < disks);
2255 /* Want to check and possibly repair P and Q.
2256 * However there could be one 'failed' device, in which
2257 * case we can only check one of them, possibly using the
2258 * other to generate missing data
2259 */
2260
2261 /* If !tmp_page, we cannot do the calculations,
2262 * but as we have set STRIPE_HANDLE, we will soon be called
2263 * by stripe_handle with a tmp_page - just wait until then.
2264 */
2265 if (tmp_page) {
2266 if (failed == q_failed) {
2267 /* The only possible failed device holds 'Q', so it makes
2268 * sense to check P (If anything else were failed, we would
2269 * have used P to recreate it).
2270 */
2271 compute_block_1(sh, pd_idx, 1);
2272 if (!page_is_zero(sh->dev[pd_idx].page)) {
2273 compute_block_1(sh,pd_idx,0);
2274 update_p = 1;
2275 }
2276 }
2277 if (!q_failed && failed < 2) {
2278 /* q is not failed, and we didn't use it to generate
2279 * anything, so it makes sense to check it
2280 */
2281 memcpy(page_address(tmp_page),
2282 page_address(sh->dev[qd_idx].page),
2283 STRIPE_SIZE);
2284 compute_parity6(sh, UPDATE_PARITY);
2285 if (memcmp(page_address(tmp_page),
2286 page_address(sh->dev[qd_idx].page),
2287 STRIPE_SIZE)!= 0) {
2288 clear_bit(STRIPE_INSYNC, &sh->state);
2289 update_q = 1;
2290 }
2291 }
2292 if (update_p || update_q) {
2293 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2294 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2295 /* don't try to repair!! */
2296 update_p = update_q = 0;
2297 }
2298
2299 /* now write out any block on a failed drive,
2300 * or P or Q if they need it
2301 */
2302 3022
2303 if (failed == 2) { 3023 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2304 dev = &sh->dev[failed_num[1]];
2305 locked++;
2306 set_bit(R5_LOCKED, &dev->flags);
2307 set_bit(R5_Wantwrite, &dev->flags);
2308 }
2309 if (failed >= 1) {
2310 dev = &sh->dev[failed_num[0]];
2311 locked++;
2312 set_bit(R5_LOCKED, &dev->flags);
2313 set_bit(R5_Wantwrite, &dev->flags);
2314 }
2315
2316 if (update_p) {
2317 dev = &sh->dev[pd_idx];
2318 locked ++;
2319 set_bit(R5_LOCKED, &dev->flags);
2320 set_bit(R5_Wantwrite, &dev->flags);
2321 }
2322 if (update_q) {
2323 dev = &sh->dev[qd_idx];
2324 locked++;
2325 set_bit(R5_LOCKED, &dev->flags);
2326 set_bit(R5_Wantwrite, &dev->flags);
2327 }
2328 clear_bit(STRIPE_DEGRADED, &sh->state);
2329
2330 set_bit(STRIPE_INSYNC, &sh->state);
2331 }
2332 }
2333
2334 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2335 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3024 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
2336 clear_bit(STRIPE_SYNCING, &sh->state); 3025 clear_bit(STRIPE_SYNCING, &sh->state);
2337 } 3026 }
@@ -2339,9 +3028,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2339 /* If the failed drives are just a ReadError, then we might need 3028 /* If the failed drives are just a ReadError, then we might need
2340 * to progress the repair/check process 3029 * to progress the repair/check process
2341 */ 3030 */
2342 if (failed <= 2 && ! conf->mddev->ro) 3031 if (s.failed <= 2 && !conf->mddev->ro)
2343 for (i=0; i<failed;i++) { 3032 for (i = 0; i < s.failed; i++) {
2344 dev = &sh->dev[failed_num[i]]; 3033 dev = &sh->dev[r6s.failed_num[i]];
2345 if (test_bit(R5_ReadError, &dev->flags) 3034 if (test_bit(R5_ReadError, &dev->flags)
2346 && !test_bit(R5_LOCKED, &dev->flags) 3035 && !test_bit(R5_LOCKED, &dev->flags)
2347 && test_bit(R5_UPTODATE, &dev->flags) 3036 && test_bit(R5_UPTODATE, &dev->flags)
@@ -2358,7 +3047,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2358 } 3047 }
2359 } 3048 }
2360 3049
2361 if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 3050 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
2362 /* Need to write out all blocks after computing P&Q */ 3051 /* Need to write out all blocks after computing P&Q */
2363 sh->disks = conf->raid_disks; 3052 sh->disks = conf->raid_disks;
2364 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 3053 sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
@@ -2366,82 +3055,24 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2366 compute_parity6(sh, RECONSTRUCT_WRITE); 3055 compute_parity6(sh, RECONSTRUCT_WRITE);
2367 for (i = conf->raid_disks ; i-- ; ) { 3056 for (i = conf->raid_disks ; i-- ; ) {
2368 set_bit(R5_LOCKED, &sh->dev[i].flags); 3057 set_bit(R5_LOCKED, &sh->dev[i].flags);
2369 locked++; 3058 s.locked++;
2370 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3059 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2371 } 3060 }
2372 clear_bit(STRIPE_EXPANDING, &sh->state); 3061 clear_bit(STRIPE_EXPANDING, &sh->state);
2373 } else if (expanded) { 3062 } else if (s.expanded) {
2374 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3063 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2375 atomic_dec(&conf->reshape_stripes); 3064 atomic_dec(&conf->reshape_stripes);
2376 wake_up(&conf->wait_for_overlap); 3065 wake_up(&conf->wait_for_overlap);
2377 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3066 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
2378 } 3067 }
2379 3068
2380 if (expanding && locked == 0) { 3069 if (s.expanding && s.locked == 0)
2381 /* We have read all the blocks in this stripe and now we need to 3070 handle_stripe_expansion(conf, sh, &r6s);
2382 * copy some of them into a target stripe for expand.
2383 */
2384 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2385 for (i = 0; i < sh->disks ; i++)
2386 if (i != pd_idx && i != qd_idx) {
2387 int dd_idx2, pd_idx2, j;
2388 struct stripe_head *sh2;
2389
2390 sector_t bn = compute_blocknr(sh, i);
2391 sector_t s = raid5_compute_sector(
2392 bn, conf->raid_disks,
2393 conf->raid_disks - conf->max_degraded,
2394 &dd_idx2, &pd_idx2, conf);
2395 sh2 = get_active_stripe(conf, s,
2396 conf->raid_disks,
2397 pd_idx2, 1);
2398 if (sh2 == NULL)
2399 /* so for only the early blocks of
2400 * this stripe have been requests.
2401 * When later blocks get requests, we
2402 * will try again
2403 */
2404 continue;
2405 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2406 test_bit(R5_Expanded,
2407 &sh2->dev[dd_idx2].flags)) {
2408 /* must have already done this block */
2409 release_stripe(sh2);
2410 continue;
2411 }
2412 memcpy(page_address(sh2->dev[dd_idx2].page),
2413 page_address(sh->dev[i].page),
2414 STRIPE_SIZE);
2415 set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags);
2416 set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags);
2417 for (j = 0 ; j < conf->raid_disks ; j++)
2418 if (j != sh2->pd_idx &&
2419 j != raid6_next_disk(sh2->pd_idx,
2420 sh2->disks) &&
2421 !test_bit(R5_Expanded,
2422 &sh2->dev[j].flags))
2423 break;
2424 if (j == conf->raid_disks) {
2425 set_bit(STRIPE_EXPAND_READY,
2426 &sh2->state);
2427 set_bit(STRIPE_HANDLE, &sh2->state);
2428 }
2429 release_stripe(sh2);
2430 }
2431 }
2432 3071
2433 spin_unlock(&sh->lock); 3072 spin_unlock(&sh->lock);
2434 3073
2435 while ((bi=return_bi)) { 3074 return_io(return_bi);
2436 int bytes = bi->bi_size;
2437 3075
2438 return_bi = bi->bi_next;
2439 bi->bi_next = NULL;
2440 bi->bi_size = 0;
2441 bi->bi_end_io(bi, bytes,
2442 test_bit(BIO_UPTODATE, &bi->bi_flags)
2443 ? 0 : -EIO);
2444 }
2445 for (i=disks; i-- ;) { 3076 for (i=disks; i-- ;) {
2446 int rw; 3077 int rw;
2447 struct bio *bi; 3078 struct bio *bi;
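
The open-coded bio completion loop removed above is collapsed into the single return_io(return_bi) call. A minimal sketch of what such a helper presumably does, reconstructed from the removed loop (the bi_end_io() convention of this kernel era still takes the completed byte count); this body is an illustration, not necessarily the exact helper added elsewhere in the series:

#include <linux/bio.h>

/* Sketch only: completes and unlinks every bio on the singly linked
 * return list, exactly as the removed open-coded loop did. */
static void return_io(struct bio *return_bi)
{
	struct bio *bi;

	while ((bi = return_bi)) {
		int bytes = bi->bi_size;

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		bi->bi_end_io(bi, bytes,
			      test_bit(BIO_UPTODATE, &bi->bi_flags)
			      ? 0 : -EIO);
	}
}
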
@@ -2470,11 +3101,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2470 rcu_read_unlock(); 3101 rcu_read_unlock();
2471 3102
2472 if (rdev) { 3103 if (rdev) {
2473 if (syncing || expanding || expanded) 3104 if (s.syncing || s.expanding || s.expanded)
2474 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 3105 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
2475 3106
2476 bi->bi_bdev = rdev->bdev; 3107 bi->bi_bdev = rdev->bdev;
2477 PRINTK("for %llu schedule op %ld on disc %d\n", 3108 pr_debug("for %llu schedule op %ld on disc %d\n",
2478 (unsigned long long)sh->sector, bi->bi_rw, i); 3109 (unsigned long long)sh->sector, bi->bi_rw, i);
2479 atomic_inc(&sh->count); 3110 atomic_inc(&sh->count);
2480 bi->bi_sector = sh->sector + rdev->data_offset; 3111 bi->bi_sector = sh->sector + rdev->data_offset;
@@ -2494,7 +3125,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2494 } else { 3125 } else {
2495 if (rw == WRITE) 3126 if (rw == WRITE)
2496 set_bit(STRIPE_DEGRADED, &sh->state); 3127 set_bit(STRIPE_DEGRADED, &sh->state);
2497 PRINTK("skip op %ld on disc %d for sector %llu\n", 3128 pr_debug("skip op %ld on disc %d for sector %llu\n",
2498 bi->bi_rw, i, (unsigned long long)sh->sector); 3129 bi->bi_rw, i, (unsigned long long)sh->sector);
2499 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3130 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2500 set_bit(STRIPE_HANDLE, &sh->state); 3131 set_bit(STRIPE_HANDLE, &sh->state);
@@ -2738,7 +3369,7 @@ static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
2738 } 3369 }
2739 3370
2740 3371
2741 PRINTK("raid5_align_endio : io error...handing IO for a retry\n"); 3372 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
2742 3373
2743 add_bio_to_retry(raid_bi, conf); 3374 add_bio_to_retry(raid_bi, conf);
2744 return 0; 3375 return 0;
@@ -2776,7 +3407,7 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
2776 mdk_rdev_t *rdev; 3407 mdk_rdev_t *rdev;
2777 3408
2778 if (!in_chunk_boundary(mddev, raid_bio)) { 3409 if (!in_chunk_boundary(mddev, raid_bio)) {
2779 PRINTK("chunk_aligned_read : non aligned\n"); 3410 pr_debug("chunk_aligned_read : non aligned\n");
2780 return 0; 3411 return 0;
2781 } 3412 }
2782 /* 3413 /*
@@ -2900,7 +3531,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
2900 3531
2901 new_sector = raid5_compute_sector(logical_sector, disks, data_disks, 3532 new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
2902 &dd_idx, &pd_idx, conf); 3533 &dd_idx, &pd_idx, conf);
2903 PRINTK("raid5: make_request, sector %llu logical %llu\n", 3534 pr_debug("raid5: make_request, sector %llu logical %llu\n",
2904 (unsigned long long)new_sector, 3535 (unsigned long long)new_sector,
2905 (unsigned long long)logical_sector); 3536 (unsigned long long)logical_sector);
2906 3537
@@ -3273,7 +3904,7 @@ static void raid5d (mddev_t *mddev)
3273 raid5_conf_t *conf = mddev_to_conf(mddev); 3904 raid5_conf_t *conf = mddev_to_conf(mddev);
3274 int handled; 3905 int handled;
3275 3906
3276 PRINTK("+++ raid5d active\n"); 3907 pr_debug("+++ raid5d active\n");
3277 3908
3278 md_check_recovery(mddev); 3909 md_check_recovery(mddev);
3279 3910
@@ -3308,8 +3939,10 @@ static void raid5d (mddev_t *mddev)
3308 handled++; 3939 handled++;
3309 } 3940 }
3310 3941
3311 if (list_empty(&conf->handle_list)) 3942 if (list_empty(&conf->handle_list)) {
3943 async_tx_issue_pending_all();
3312 break; 3944 break;
3945 }
3313 3946
3314 first = conf->handle_list.next; 3947 first = conf->handle_list.next;
3315 sh = list_entry(first, struct stripe_head, lru); 3948 sh = list_entry(first, struct stripe_head, lru);
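
raid5d now calls async_tx_issue_pending_all() once its handle list is empty, so asynchronous copy/xor descriptors queued while handling stripes are flushed in one batch rather than per operation. A rough sketch of that submit-then-flush idiom; example_copy() is a hypothetical caller and the async_memcpy() signature is recalled from the async_tx API of this era, so treat both as assumptions:

#include <linux/async_tx.h>

/* Illustrative only: descriptors are merely queued on a channel at
 * submit time; they are started when pending work is issued, which
 * raid5d above now does once it runs out of stripes to handle. */
static void example_copy(struct page *dest, struct page *src, size_t len)
{
	struct dma_async_tx_descriptor *tx;

	/* tx could be passed as the dependency of a follow-on operation */
	tx = async_memcpy(dest, src, 0, 0, len, ASYNC_TX_ACK,
			  NULL, NULL, NULL);
	/* ... more stripes' operations would be queued here ... */

	async_tx_issue_pending_all();	/* flush every channel's queue */
}
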
@@ -3325,13 +3958,13 @@ static void raid5d (mddev_t *mddev)
3325 3958
3326 spin_lock_irq(&conf->device_lock); 3959 spin_lock_irq(&conf->device_lock);
3327 } 3960 }
3328 PRINTK("%d stripes handled\n", handled); 3961 pr_debug("%d stripes handled\n", handled);
3329 3962
3330 spin_unlock_irq(&conf->device_lock); 3963 spin_unlock_irq(&conf->device_lock);
3331 3964
3332 unplug_slaves(mddev); 3965 unplug_slaves(mddev);
3333 3966
3334 PRINTK("--- raid5d inactive\n"); 3967 pr_debug("--- raid5d inactive\n");
3335} 3968}
3336 3969
3337static ssize_t 3970static ssize_t
@@ -3507,7 +4140,7 @@ static int run(mddev_t *mddev)
3507 atomic_set(&conf->preread_active_stripes, 0); 4140 atomic_set(&conf->preread_active_stripes, 0);
3508 atomic_set(&conf->active_aligned_reads, 0); 4141 atomic_set(&conf->active_aligned_reads, 0);
3509 4142
3510 PRINTK("raid5: run(%s) called.\n", mdname(mddev)); 4143 pr_debug("raid5: run(%s) called.\n", mdname(mddev));
3511 4144
3512 ITERATE_RDEV(mddev,rdev,tmp) { 4145 ITERATE_RDEV(mddev,rdev,tmp) {
3513 raid_disk = rdev->raid_disk; 4146 raid_disk = rdev->raid_disk;
@@ -3690,7 +4323,7 @@ static int stop(mddev_t *mddev)
3690 return 0; 4323 return 0;
3691} 4324}
3692 4325
3693#if RAID5_DEBUG 4326#ifdef DEBUG
3694static void print_sh (struct seq_file *seq, struct stripe_head *sh) 4327static void print_sh (struct seq_file *seq, struct stripe_head *sh)
3695{ 4328{
3696 int i; 4329 int i;
@@ -3737,7 +4370,7 @@ static void status (struct seq_file *seq, mddev_t *mddev)
3737 conf->disks[i].rdev && 4370 conf->disks[i].rdev &&
3738 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 4371 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
3739 seq_printf (seq, "]"); 4372 seq_printf (seq, "]");
3740#if RAID5_DEBUG 4373#ifdef DEBUG
3741 seq_printf (seq, "\n"); 4374 seq_printf (seq, "\n");
3742 printall(seq, conf); 4375 printall(seq, conf);
3743#endif 4376#endif
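
The old RAID5_DEBUG/PRINTK pair is dropped in favour of the stock DEBUG/pr_debug() convention used elsewhere in the kernel, which is why the debug dumpers above are now guarded by #ifdef DEBUG. A minimal illustration of the convention (standard kernel.h behaviour of this era, not code from this patch):

#define DEBUG			/* must be defined before the includes for pr_debug() to emit output */
#include <linux/kernel.h>
#include <linux/types.h>

static void show(sector_t sector)
{
	/* with DEBUG defined this is a printk(KERN_DEBUG ...);
	 * without it the call compiles away to nothing */
	pr_debug("raid5: handling sector %llu\n",
		 (unsigned long long)sector);
}
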
diff --git a/drivers/md/xor.c b/drivers/md/xor.c
deleted file mode 100644
index 324897c4be4e..000000000000
--- a/drivers/md/xor.c
+++ /dev/null
@@ -1,154 +0,0 @@
1/*
2 * xor.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 1996, 1997, 1998, 1999, 2000,
5 * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
6 *
7 * Dispatch optimized RAID-5 checksumming functions.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2, or (at your option)
12 * any later version.
13 *
14 * You should have received a copy of the GNU General Public License
15 * (for example /usr/src/linux/COPYING); if not, write to the Free
16 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#define BH_TRACE 0
20#include <linux/module.h>
21#include <linux/raid/md.h>
22#include <linux/raid/xor.h>
23#include <asm/xor.h>
24
25/* The xor routines to use. */
26static struct xor_block_template *active_template;
27
28void
29xor_block(unsigned int count, unsigned int bytes, void **ptr)
30{
31 unsigned long *p0, *p1, *p2, *p3, *p4;
32
33 p0 = (unsigned long *) ptr[0];
34 p1 = (unsigned long *) ptr[1];
35 if (count == 2) {
36 active_template->do_2(bytes, p0, p1);
37 return;
38 }
39
40 p2 = (unsigned long *) ptr[2];
41 if (count == 3) {
42 active_template->do_3(bytes, p0, p1, p2);
43 return;
44 }
45
46 p3 = (unsigned long *) ptr[3];
47 if (count == 4) {
48 active_template->do_4(bytes, p0, p1, p2, p3);
49 return;
50 }
51
52 p4 = (unsigned long *) ptr[4];
53 active_template->do_5(bytes, p0, p1, p2, p3, p4);
54}
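
xor_block() is a plain dispatcher: the first pointer is the destination and up to four further pointers are sources of the same length. An illustrative caller, mirroring how the pre-async raid5 parity path fed it (example_xor() and its arguments are hypothetical stand-ins for stripe cache pages):

#include <linux/raid/xor.h>

/* Illustrative only: destination first, sources behind it. */
static void example_xor(void *parity, void *d0, void *d1, unsigned int bytes)
{
	void *ptrs[5];
	int count = 0;

	ptrs[count++] = parity;		/* destination, xor-ed in place */
	ptrs[count++] = d0;		/* first source block  */
	ptrs[count++] = d1;		/* second source block */

	xor_block(count, bytes, ptrs);	/* parity ^= d0 ^ d1 */
}
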
55
56/* Set of all registered templates. */
57static struct xor_block_template *template_list;
58
59#define BENCH_SIZE (PAGE_SIZE)
60
61static void
62do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
63{
64 int speed;
65 unsigned long now;
66 int i, count, max;
67
68 tmpl->next = template_list;
69 template_list = tmpl;
70
71 /*
72 * Count the number of XORs done during a whole jiffy, and use
73 * this to calculate the speed of checksumming. We use a 2-page
74 * allocation to have guaranteed color L1-cache layout.
75 */
76 max = 0;
77 for (i = 0; i < 5; i++) {
78 now = jiffies;
79 count = 0;
80 while (jiffies == now) {
81 mb();
82 tmpl->do_2(BENCH_SIZE, b1, b2);
83 mb();
84 count++;
85 mb();
86 }
87 if (count > max)
88 max = count;
89 }
90
91 speed = max * (HZ * BENCH_SIZE / 1024);
92 tmpl->speed = speed;
93
94 printk(" %-10s: %5d.%03d MB/sec\n", tmpl->name,
95 speed / 1000, speed % 1000);
96}
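
The reported figure is just iterations per jiffy scaled to KB/s and then printed with a thousands split so it reads as MB/sec. For concreteness, with HZ = 1000 and BENCH_SIZE = PAGE_SIZE = 4096 (illustrative values, not a measurement), each iteration per jiffy is worth 1000 * 4096 / 1024 = 4000 KB/s:

	/* illustrative numbers only */
	int speed = 1500 * (1000 * 4096 / 1024);  /* best trial of 1500 XORs -> 6000000 KB/s */
	/* "%5d.%03d MB/sec" then prints " 6000.000 MB/sec" */
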
97
98static int
99calibrate_xor_block(void)
100{
101 void *b1, *b2;
102 struct xor_block_template *f, *fastest;
103
104 b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
105 if (! b1) {
106 printk("raid5: Yikes! No memory available.\n");
107 return -ENOMEM;
108 }
109 b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
110
111 /*
112 * If this arch/cpu has a short-circuited selection, don't loop through all
113 * the possible functions, just test the best one
114 */
115
116 fastest = NULL;
117
118#ifdef XOR_SELECT_TEMPLATE
119 fastest = XOR_SELECT_TEMPLATE(fastest);
120#endif
121
122#define xor_speed(templ) do_xor_speed((templ), b1, b2)
123
124 if (fastest) {
125 printk(KERN_INFO "raid5: automatically using best checksumming function: %s\n",
126 fastest->name);
127 xor_speed(fastest);
128 } else {
129 printk(KERN_INFO "raid5: measuring checksumming speed\n");
130 XOR_TRY_TEMPLATES;
131 fastest = template_list;
132 for (f = fastest; f; f = f->next)
133 if (f->speed > fastest->speed)
134 fastest = f;
135 }
136
137 printk("raid5: using function: %s (%d.%03d MB/sec)\n",
138 fastest->name, fastest->speed / 1000, fastest->speed % 1000);
139
140#undef xor_speed
141
142 free_pages((unsigned long)b1, 2);
143
144 active_template = fastest;
145 return 0;
146}
147
148static __exit void xor_exit(void) { }
149
150EXPORT_SYMBOL(xor_block);
151MODULE_LICENSE("GPL");
152
153module_init(calibrate_xor_block);
154module_exit(xor_exit);
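
With this file removed, md no longer carries its own CPU xor dispatch; in this series the stripe operations go through the async_tx helpers instead, which pick a DMA offload engine when one is available and otherwise fall back to a synchronous CPU xor. A rough sketch of the replacement call; example_parity() is hypothetical and the async_xor() signature is recalled from the async_tx API of this era, so treat it as an assumption rather than a reference:

#include <linux/async_tx.h>

/* Rough stand-in for the old xor_block() pattern: xor two stripe
 * cache pages into the parity page.  ASYNC_TX_XOR_ZERO_DST asks for
 * the destination to be cleared first, so the result is purely the
 * xor of the sources. */
static struct dma_async_tx_descriptor *
example_parity(struct page *parity, struct page **data, size_t len)
{
	return async_xor(parity, data, 0, 2, len,
			 ASYNC_TX_XOR_ZERO_DST, NULL, NULL, NULL);
}
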