Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--  drivers/md/raid5.c | 826
1 file changed, 691 insertions(+), 135 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cd2f96b2c572..77dfd720aaa0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -54,6 +54,7 @@
54#include <linux/slab.h> 54#include <linux/slab.h>
55#include <linux/ratelimit.h> 55#include <linux/ratelimit.h>
56#include <linux/nodemask.h> 56#include <linux/nodemask.h>
57#include <linux/flex_array.h>
57#include <trace/events/block.h> 58#include <trace/events/block.h>
58 59
59#include "md.h" 60#include "md.h"
@@ -496,7 +497,7 @@ static void shrink_buffers(struct stripe_head *sh)
496 } 497 }
497} 498}
498 499
499static int grow_buffers(struct stripe_head *sh) 500static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
500{ 501{
501 int i; 502 int i;
502 int num = sh->raid_conf->pool_size; 503 int num = sh->raid_conf->pool_size;
@@ -504,7 +505,7 @@ static int grow_buffers(struct stripe_head *sh)
504 for (i = 0; i < num; i++) { 505 for (i = 0; i < num; i++) {
505 struct page *page; 506 struct page *page;
506 507
507 if (!(page = alloc_page(GFP_KERNEL))) { 508 if (!(page = alloc_page(gfp))) {
508 return 1; 509 return 1;
509 } 510 }
510 sh->dev[i].page = page; 511 sh->dev[i].page = page;
@@ -525,6 +526,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
525 BUG_ON(atomic_read(&sh->count) != 0); 526 BUG_ON(atomic_read(&sh->count) != 0);
526 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 527 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
527 BUG_ON(stripe_operations_active(sh)); 528 BUG_ON(stripe_operations_active(sh));
529 BUG_ON(sh->batch_head);
528 530
529 pr_debug("init_stripe called, stripe %llu\n", 531 pr_debug("init_stripe called, stripe %llu\n",
530 (unsigned long long)sector); 532 (unsigned long long)sector);
@@ -552,8 +554,10 @@ retry:
552 } 554 }
553 if (read_seqcount_retry(&conf->gen_lock, seq)) 555 if (read_seqcount_retry(&conf->gen_lock, seq))
554 goto retry; 556 goto retry;
557 sh->overwrite_disks = 0;
555 insert_hash(conf, sh); 558 insert_hash(conf, sh);
556 sh->cpu = smp_processor_id(); 559 sh->cpu = smp_processor_id();
560 set_bit(STRIPE_BATCH_READY, &sh->state);
557} 561}
558 562
559static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 563static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -668,20 +672,28 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
668 *(conf->hash_locks + hash)); 672 *(conf->hash_locks + hash));
669 sh = __find_stripe(conf, sector, conf->generation - previous); 673 sh = __find_stripe(conf, sector, conf->generation - previous);
670 if (!sh) { 674 if (!sh) {
671 if (!conf->inactive_blocked) 675 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
672 sh = get_free_stripe(conf, hash); 676 sh = get_free_stripe(conf, hash);
677 if (!sh && llist_empty(&conf->released_stripes) &&
678 !test_bit(R5_DID_ALLOC, &conf->cache_state))
679 set_bit(R5_ALLOC_MORE,
680 &conf->cache_state);
681 }
673 if (noblock && sh == NULL) 682 if (noblock && sh == NULL)
674 break; 683 break;
675 if (!sh) { 684 if (!sh) {
676 conf->inactive_blocked = 1; 685 set_bit(R5_INACTIVE_BLOCKED,
686 &conf->cache_state);
677 wait_event_lock_irq( 687 wait_event_lock_irq(
678 conf->wait_for_stripe, 688 conf->wait_for_stripe,
679 !list_empty(conf->inactive_list + hash) && 689 !list_empty(conf->inactive_list + hash) &&
680 (atomic_read(&conf->active_stripes) 690 (atomic_read(&conf->active_stripes)
681 < (conf->max_nr_stripes * 3 / 4) 691 < (conf->max_nr_stripes * 3 / 4)
682 || !conf->inactive_blocked), 692 || !test_bit(R5_INACTIVE_BLOCKED,
693 &conf->cache_state)),
683 *(conf->hash_locks + hash)); 694 *(conf->hash_locks + hash));
684 conf->inactive_blocked = 0; 695 clear_bit(R5_INACTIVE_BLOCKED,
696 &conf->cache_state);
685 } else { 697 } else {
686 init_stripe(sh, sector, previous); 698 init_stripe(sh, sector, previous);
687 atomic_inc(&sh->count); 699 atomic_inc(&sh->count);
@@ -708,6 +720,130 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
708 return sh; 720 return sh;
709} 721}
710 722
723static bool is_full_stripe_write(struct stripe_head *sh)
724{
725 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
726 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
727}
728
729static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
730{
731 local_irq_disable();
732 if (sh1 > sh2) {
733 spin_lock(&sh2->stripe_lock);
734 spin_lock_nested(&sh1->stripe_lock, 1);
735 } else {
736 spin_lock(&sh1->stripe_lock);
737 spin_lock_nested(&sh2->stripe_lock, 1);
738 }
739}
740
741static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
742{
743 spin_unlock(&sh1->stripe_lock);
744 spin_unlock(&sh2->stripe_lock);
745 local_irq_enable();
746}
747
748/* Only freshly new full stripe normal write stripe can be added to a batch list */
749static bool stripe_can_batch(struct stripe_head *sh)
750{
751 return test_bit(STRIPE_BATCH_READY, &sh->state) &&
752 is_full_stripe_write(sh);
753}
754
755/* we only do back search */
756static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
757{
758 struct stripe_head *head;
759 sector_t head_sector, tmp_sec;
760 int hash;
761 int dd_idx;
762
763 if (!stripe_can_batch(sh))
764 return;
765 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
766 tmp_sec = sh->sector;
767 if (!sector_div(tmp_sec, conf->chunk_sectors))
768 return;
769 head_sector = sh->sector - STRIPE_SECTORS;
770
771 hash = stripe_hash_locks_hash(head_sector);
772 spin_lock_irq(conf->hash_locks + hash);
773 head = __find_stripe(conf, head_sector, conf->generation);
774 if (head && !atomic_inc_not_zero(&head->count)) {
775 spin_lock(&conf->device_lock);
776 if (!atomic_read(&head->count)) {
777 if (!test_bit(STRIPE_HANDLE, &head->state))
778 atomic_inc(&conf->active_stripes);
779 BUG_ON(list_empty(&head->lru) &&
780 !test_bit(STRIPE_EXPANDING, &head->state));
781 list_del_init(&head->lru);
782 if (head->group) {
783 head->group->stripes_cnt--;
784 head->group = NULL;
785 }
786 }
787 atomic_inc(&head->count);
788 spin_unlock(&conf->device_lock);
789 }
790 spin_unlock_irq(conf->hash_locks + hash);
791
792 if (!head)
793 return;
794 if (!stripe_can_batch(head))
795 goto out;
796
797 lock_two_stripes(head, sh);
798 /* clear_batch_ready clear the flag */
799 if (!stripe_can_batch(head) || !stripe_can_batch(sh))
800 goto unlock_out;
801
802 if (sh->batch_head)
803 goto unlock_out;
804
805 dd_idx = 0;
806 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
807 dd_idx++;
808 if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw)
809 goto unlock_out;
810
811 if (head->batch_head) {
812 spin_lock(&head->batch_head->batch_lock);
813 /* This batch list is already running */
814 if (!stripe_can_batch(head)) {
815 spin_unlock(&head->batch_head->batch_lock);
816 goto unlock_out;
817 }
818
819 /*
820 * at this point, head's BATCH_READY could be cleared, but we
821 * can still add the stripe to batch list
822 */
823 list_add(&sh->batch_list, &head->batch_list);
824 spin_unlock(&head->batch_head->batch_lock);
825
826 sh->batch_head = head->batch_head;
827 } else {
828 head->batch_head = head;
829 sh->batch_head = head->batch_head;
830 spin_lock(&head->batch_lock);
831 list_add_tail(&sh->batch_list, &head->batch_list);
832 spin_unlock(&head->batch_lock);
833 }
834
835 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
836 if (atomic_dec_return(&conf->preread_active_stripes)
837 < IO_THRESHOLD)
838 md_wakeup_thread(conf->mddev->thread);
839
840 atomic_inc(&sh->count);
841unlock_out:
842 unlock_two_stripes(head, sh);
843out:
844 release_stripe(head);
845}
846
711/* Determine if 'data_offset' or 'new_data_offset' should be used 847/* Determine if 'data_offset' or 'new_data_offset' should be used
712 * in this stripe_head. 848 * in this stripe_head.
713 */ 849 */
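
The lock_two_stripes()/unlock_two_stripes() helpers added above take both stripe_lock spinlocks in a fixed order (lower address first, the second acquisition annotated with spin_lock_nested() for lockdep), so two CPUs batching the same pair of stripes from opposite ends cannot deadlock. A rough userspace sketch of that address-ordering idiom, using POSIX mutexes and made-up names rather than the kernel primitives (build with -pthread):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct stripe {                          /* stand-in for struct stripe_head */
        pthread_mutex_t lock;
};

/* Always lock the lower-addressed object first, so that lock_pair(a, b) and
 * lock_pair(b, a) running concurrently agree on the acquisition order. */
static void lock_pair(struct stripe *a, struct stripe *b)
{
        if ((uintptr_t)a > (uintptr_t)b) {
                pthread_mutex_lock(&b->lock);
                pthread_mutex_lock(&a->lock);
        } else {
                pthread_mutex_lock(&a->lock);
                pthread_mutex_lock(&b->lock);
        }
}

static void unlock_pair(struct stripe *a, struct stripe *b)
{
        pthread_mutex_unlock(&a->lock);
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
        struct stripe s1 = { .lock = PTHREAD_MUTEX_INITIALIZER };
        struct stripe s2 = { .lock = PTHREAD_MUTEX_INITIALIZER };

        lock_pair(&s1, &s2);             /* same order as lock_pair(&s2, &s1) */
        unlock_pair(&s1, &s2);
        printf("locked and unlocked in address order\n");
        return 0;
}
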
@@ -738,6 +874,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
738{ 874{
739 struct r5conf *conf = sh->raid_conf; 875 struct r5conf *conf = sh->raid_conf;
740 int i, disks = sh->disks; 876 int i, disks = sh->disks;
877 struct stripe_head *head_sh = sh;
741 878
742 might_sleep(); 879 might_sleep();
743 880
@@ -746,6 +883,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
746 int replace_only = 0; 883 int replace_only = 0;
747 struct bio *bi, *rbi; 884 struct bio *bi, *rbi;
748 struct md_rdev *rdev, *rrdev = NULL; 885 struct md_rdev *rdev, *rrdev = NULL;
886
887 sh = head_sh;
749 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 888 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
750 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 889 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
751 rw = WRITE_FUA; 890 rw = WRITE_FUA;
@@ -764,6 +903,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
764 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 903 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
765 rw |= REQ_SYNC; 904 rw |= REQ_SYNC;
766 905
906again:
767 bi = &sh->dev[i].req; 907 bi = &sh->dev[i].req;
768 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 908 rbi = &sh->dev[i].rreq; /* For writing to replacement */
769 909
@@ -782,7 +922,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
782 /* We raced and saw duplicates */ 922 /* We raced and saw duplicates */
783 rrdev = NULL; 923 rrdev = NULL;
784 } else { 924 } else {
785 if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) 925 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
786 rdev = rrdev; 926 rdev = rrdev;
787 rrdev = NULL; 927 rrdev = NULL;
788 } 928 }
@@ -853,13 +993,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
853 __func__, (unsigned long long)sh->sector, 993 __func__, (unsigned long long)sh->sector,
854 bi->bi_rw, i); 994 bi->bi_rw, i);
855 atomic_inc(&sh->count); 995 atomic_inc(&sh->count);
996 if (sh != head_sh)
997 atomic_inc(&head_sh->count);
856 if (use_new_offset(conf, sh)) 998 if (use_new_offset(conf, sh))
857 bi->bi_iter.bi_sector = (sh->sector 999 bi->bi_iter.bi_sector = (sh->sector
858 + rdev->new_data_offset); 1000 + rdev->new_data_offset);
859 else 1001 else
860 bi->bi_iter.bi_sector = (sh->sector 1002 bi->bi_iter.bi_sector = (sh->sector
861 + rdev->data_offset); 1003 + rdev->data_offset);
862 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 1004 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
863 bi->bi_rw |= REQ_NOMERGE; 1005 bi->bi_rw |= REQ_NOMERGE;
864 1006
865 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1007 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
@@ -903,6 +1045,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
903 __func__, (unsigned long long)sh->sector, 1045 __func__, (unsigned long long)sh->sector,
904 rbi->bi_rw, i); 1046 rbi->bi_rw, i);
905 atomic_inc(&sh->count); 1047 atomic_inc(&sh->count);
1048 if (sh != head_sh)
1049 atomic_inc(&head_sh->count);
906 if (use_new_offset(conf, sh)) 1050 if (use_new_offset(conf, sh))
907 rbi->bi_iter.bi_sector = (sh->sector 1051 rbi->bi_iter.bi_sector = (sh->sector
908 + rrdev->new_data_offset); 1052 + rrdev->new_data_offset);
@@ -934,8 +1078,18 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
934 pr_debug("skip op %ld on disc %d for sector %llu\n", 1078 pr_debug("skip op %ld on disc %d for sector %llu\n",
935 bi->bi_rw, i, (unsigned long long)sh->sector); 1079 bi->bi_rw, i, (unsigned long long)sh->sector);
936 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1080 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1081 if (sh->batch_head)
1082 set_bit(STRIPE_BATCH_ERR,
1083 &sh->batch_head->state);
937 set_bit(STRIPE_HANDLE, &sh->state); 1084 set_bit(STRIPE_HANDLE, &sh->state);
938 } 1085 }
1086
1087 if (!head_sh->batch_head)
1088 continue;
1089 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1090 batch_list);
1091 if (sh != head_sh)
1092 goto again;
939 } 1093 }
940} 1094}
941 1095
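
The ops_run_io() changes above issue I/O for the head stripe first and then follow batch_list around the ring, stopping once the walk comes back to head_sh. A rough userspace sketch of that ring-walk idiom, with a throwaway list implementation and illustrative names (not the kernel's list.h):

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }
static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev; n->next = h;
        h->prev->next = n; h->prev = n;
}
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct stripe { int id; struct list_head batch_list; };

int main(void)
{
        struct stripe head = { .id = 0 }, a = { .id = 1 }, b = { .id = 2 };
        list_init(&head.batch_list);
        list_add_tail(&a.batch_list, &head.batch_list);
        list_add_tail(&b.batch_list, &head.batch_list);

        struct stripe *sh = &head;
        do {    /* mirrors the "again:" loop: process, then step to the next entry */
                printf("processing stripe %d\n", sh->id);
                sh = container_of(sh->batch_list.next, struct stripe, batch_list);
        } while (sh != &head);           /* the walk terminates back at the head */
        return 0;
}
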
@@ -1051,6 +1205,7 @@ static void ops_run_biofill(struct stripe_head *sh)
1051 struct async_submit_ctl submit; 1205 struct async_submit_ctl submit;
1052 int i; 1206 int i;
1053 1207
1208 BUG_ON(sh->batch_head);
1054 pr_debug("%s: stripe %llu\n", __func__, 1209 pr_debug("%s: stripe %llu\n", __func__,
1055 (unsigned long long)sh->sector); 1210 (unsigned long long)sh->sector);
1056 1211
@@ -1109,16 +1264,28 @@ static void ops_complete_compute(void *stripe_head_ref)
1109 1264
1110/* return a pointer to the address conversion region of the scribble buffer */ 1265/* return a pointer to the address conversion region of the scribble buffer */
1111static addr_conv_t *to_addr_conv(struct stripe_head *sh, 1266static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1112 struct raid5_percpu *percpu) 1267 struct raid5_percpu *percpu, int i)
1113{ 1268{
1114 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); 1269 void *addr;
1270
1271 addr = flex_array_get(percpu->scribble, i);
1272 return addr + sizeof(struct page *) * (sh->disks + 2);
1273}
1274
1275/* return a pointer to the address conversion region of the scribble buffer */
1276static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1277{
1278 void *addr;
1279
1280 addr = flex_array_get(percpu->scribble, i);
1281 return addr;
1115} 1282}
1116 1283
1117static struct dma_async_tx_descriptor * 1284static struct dma_async_tx_descriptor *
1118ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1285ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1119{ 1286{
1120 int disks = sh->disks; 1287 int disks = sh->disks;
1121 struct page **xor_srcs = percpu->scribble; 1288 struct page **xor_srcs = to_addr_page(percpu, 0);
1122 int target = sh->ops.target; 1289 int target = sh->ops.target;
1123 struct r5dev *tgt = &sh->dev[target]; 1290 struct r5dev *tgt = &sh->dev[target];
1124 struct page *xor_dest = tgt->page; 1291 struct page *xor_dest = tgt->page;
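
to_addr_page() and to_addr_conv() now pick element i of the per-CPU scribble flex_array and split it into two regions: (disks + 2) page pointers for the async_tx source list, followed by (disks + 2) address-conversion slots. A minimal userspace sketch of that layout arithmetic, using malloc and stand-in types instead of flex_array and struct page:

#include <stdio.h>
#include <stdlib.h>

typedef void *page_ptr_t;   /* stand-in for struct page * */
typedef void *addr_conv_t;  /* stand-in; only the slot count matters here */

/* One scribble element: (disks + 2) page pointers for the xor/syndrome
 * source list, then (disks + 2) address-conversion slots right behind it. */
static size_t scribble_element_len(int disks)
{
        return sizeof(page_ptr_t) * (disks + 2) +
               sizeof(addr_conv_t) * (disks + 2);
}

int main(void)
{
        int disks = 8;
        void *element = calloc(1, scribble_element_len(disks));

        if (!element)
                return 1;
        page_ptr_t *pages = element;                       /* what to_addr_page() hands out */
        addr_conv_t *conv = (addr_conv_t *)(pages + disks + 2); /* what to_addr_conv() hands out */

        printf("element = %zu bytes, page list at %p, conv area at %p\n",
               scribble_element_len(disks), (void *)pages, (void *)conv);
        free(element);
        return 0;
}
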
@@ -1127,6 +1294,8 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1127 struct async_submit_ctl submit; 1294 struct async_submit_ctl submit;
1128 int i; 1295 int i;
1129 1296
1297 BUG_ON(sh->batch_head);
1298
1130 pr_debug("%s: stripe %llu block: %d\n", 1299 pr_debug("%s: stripe %llu block: %d\n",
1131 __func__, (unsigned long long)sh->sector, target); 1300 __func__, (unsigned long long)sh->sector, target);
1132 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1301 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
@@ -1138,7 +1307,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1138 atomic_inc(&sh->count); 1307 atomic_inc(&sh->count);
1139 1308
1140 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1309 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1141 ops_complete_compute, sh, to_addr_conv(sh, percpu)); 1310 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1142 if (unlikely(count == 1)) 1311 if (unlikely(count == 1))
1143 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1312 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1144 else 1313 else
@@ -1156,7 +1325,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1156 * destination buffer is recorded in srcs[count] and the Q destination 1325 * destination buffer is recorded in srcs[count] and the Q destination
1157 * is recorded in srcs[count+1]]. 1326 * is recorded in srcs[count+1]].
1158 */ 1327 */
1159static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) 1328static int set_syndrome_sources(struct page **srcs,
1329 struct stripe_head *sh,
1330 int srctype)
1160{ 1331{
1161 int disks = sh->disks; 1332 int disks = sh->disks;
1162 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 1333 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
@@ -1171,8 +1342,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
1171 i = d0_idx; 1342 i = d0_idx;
1172 do { 1343 do {
1173 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1344 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1345 struct r5dev *dev = &sh->dev[i];
1174 1346
1175 srcs[slot] = sh->dev[i].page; 1347 if (i == sh->qd_idx || i == sh->pd_idx ||
1348 (srctype == SYNDROME_SRC_ALL) ||
1349 (srctype == SYNDROME_SRC_WANT_DRAIN &&
1350 test_bit(R5_Wantdrain, &dev->flags)) ||
1351 (srctype == SYNDROME_SRC_WRITTEN &&
1352 dev->written))
1353 srcs[slot] = sh->dev[i].page;
1176 i = raid6_next_disk(i, disks); 1354 i = raid6_next_disk(i, disks);
1177 } while (i != d0_idx); 1355 } while (i != d0_idx);
1178 1356
@@ -1183,7 +1361,7 @@ static struct dma_async_tx_descriptor *
1183ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1361ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1184{ 1362{
1185 int disks = sh->disks; 1363 int disks = sh->disks;
1186 struct page **blocks = percpu->scribble; 1364 struct page **blocks = to_addr_page(percpu, 0);
1187 int target; 1365 int target;
1188 int qd_idx = sh->qd_idx; 1366 int qd_idx = sh->qd_idx;
1189 struct dma_async_tx_descriptor *tx; 1367 struct dma_async_tx_descriptor *tx;
@@ -1193,6 +1371,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1193 int i; 1371 int i;
1194 int count; 1372 int count;
1195 1373
1374 BUG_ON(sh->batch_head);
1196 if (sh->ops.target < 0) 1375 if (sh->ops.target < 0)
1197 target = sh->ops.target2; 1376 target = sh->ops.target2;
1198 else if (sh->ops.target2 < 0) 1377 else if (sh->ops.target2 < 0)
@@ -1211,12 +1390,12 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1211 atomic_inc(&sh->count); 1390 atomic_inc(&sh->count);
1212 1391
1213 if (target == qd_idx) { 1392 if (target == qd_idx) {
1214 count = set_syndrome_sources(blocks, sh); 1393 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1215 blocks[count] = NULL; /* regenerating p is not necessary */ 1394 blocks[count] = NULL; /* regenerating p is not necessary */
1216 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1395 BUG_ON(blocks[count+1] != dest); /* q should already be set */
1217 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1396 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1218 ops_complete_compute, sh, 1397 ops_complete_compute, sh,
1219 to_addr_conv(sh, percpu)); 1398 to_addr_conv(sh, percpu, 0));
1220 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1399 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1221 } else { 1400 } else {
1222 /* Compute any data- or p-drive using XOR */ 1401 /* Compute any data- or p-drive using XOR */
@@ -1229,7 +1408,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1229 1408
1230 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1409 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1231 NULL, ops_complete_compute, sh, 1410 NULL, ops_complete_compute, sh,
1232 to_addr_conv(sh, percpu)); 1411 to_addr_conv(sh, percpu, 0));
1233 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1412 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1234 } 1413 }
1235 1414
@@ -1248,9 +1427,10 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1248 struct r5dev *tgt = &sh->dev[target]; 1427 struct r5dev *tgt = &sh->dev[target];
1249 struct r5dev *tgt2 = &sh->dev[target2]; 1428 struct r5dev *tgt2 = &sh->dev[target2];
1250 struct dma_async_tx_descriptor *tx; 1429 struct dma_async_tx_descriptor *tx;
1251 struct page **blocks = percpu->scribble; 1430 struct page **blocks = to_addr_page(percpu, 0);
1252 struct async_submit_ctl submit; 1431 struct async_submit_ctl submit;
1253 1432
1433 BUG_ON(sh->batch_head);
1254 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1434 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1255 __func__, (unsigned long long)sh->sector, target, target2); 1435 __func__, (unsigned long long)sh->sector, target, target2);
1256 BUG_ON(target < 0 || target2 < 0); 1436 BUG_ON(target < 0 || target2 < 0);
@@ -1290,7 +1470,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1290 /* Missing P+Q, just recompute */ 1470 /* Missing P+Q, just recompute */
1291 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1471 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1292 ops_complete_compute, sh, 1472 ops_complete_compute, sh,
1293 to_addr_conv(sh, percpu)); 1473 to_addr_conv(sh, percpu, 0));
1294 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1474 return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1295 STRIPE_SIZE, &submit); 1475 STRIPE_SIZE, &submit);
1296 } else { 1476 } else {
@@ -1314,21 +1494,21 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1314 init_async_submit(&submit, 1494 init_async_submit(&submit,
1315 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1495 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1316 NULL, NULL, NULL, 1496 NULL, NULL, NULL,
1317 to_addr_conv(sh, percpu)); 1497 to_addr_conv(sh, percpu, 0));
1318 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1498 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1319 &submit); 1499 &submit);
1320 1500
1321 count = set_syndrome_sources(blocks, sh); 1501 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1322 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1502 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1323 ops_complete_compute, sh, 1503 ops_complete_compute, sh,
1324 to_addr_conv(sh, percpu)); 1504 to_addr_conv(sh, percpu, 0));
1325 return async_gen_syndrome(blocks, 0, count+2, 1505 return async_gen_syndrome(blocks, 0, count+2,
1326 STRIPE_SIZE, &submit); 1506 STRIPE_SIZE, &submit);
1327 } 1507 }
1328 } else { 1508 } else {
1329 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1509 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1330 ops_complete_compute, sh, 1510 ops_complete_compute, sh,
1331 to_addr_conv(sh, percpu)); 1511 to_addr_conv(sh, percpu, 0));
1332 if (failb == syndrome_disks) { 1512 if (failb == syndrome_disks) {
1333 /* We're missing D+P. */ 1513 /* We're missing D+P. */
1334 return async_raid6_datap_recov(syndrome_disks+2, 1514 return async_raid6_datap_recov(syndrome_disks+2,
@@ -1352,17 +1532,18 @@ static void ops_complete_prexor(void *stripe_head_ref)
1352} 1532}
1353 1533
1354static struct dma_async_tx_descriptor * 1534static struct dma_async_tx_descriptor *
1355ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, 1535ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1356 struct dma_async_tx_descriptor *tx) 1536 struct dma_async_tx_descriptor *tx)
1357{ 1537{
1358 int disks = sh->disks; 1538 int disks = sh->disks;
1359 struct page **xor_srcs = percpu->scribble; 1539 struct page **xor_srcs = to_addr_page(percpu, 0);
1360 int count = 0, pd_idx = sh->pd_idx, i; 1540 int count = 0, pd_idx = sh->pd_idx, i;
1361 struct async_submit_ctl submit; 1541 struct async_submit_ctl submit;
1362 1542
1363 /* existing parity data subtracted */ 1543 /* existing parity data subtracted */
1364 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1544 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1365 1545
1546 BUG_ON(sh->batch_head);
1366 pr_debug("%s: stripe %llu\n", __func__, 1547 pr_debug("%s: stripe %llu\n", __func__,
1367 (unsigned long long)sh->sector); 1548 (unsigned long long)sh->sector);
1368 1549
@@ -1374,31 +1555,56 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
1374 } 1555 }
1375 1556
1376 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1557 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1377 ops_complete_prexor, sh, to_addr_conv(sh, percpu)); 1558 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1378 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1559 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1379 1560
1380 return tx; 1561 return tx;
1381} 1562}
1382 1563
1383static struct dma_async_tx_descriptor * 1564static struct dma_async_tx_descriptor *
1565ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1566 struct dma_async_tx_descriptor *tx)
1567{
1568 struct page **blocks = to_addr_page(percpu, 0);
1569 int count;
1570 struct async_submit_ctl submit;
1571
1572 pr_debug("%s: stripe %llu\n", __func__,
1573 (unsigned long long)sh->sector);
1574
1575 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
1576
1577 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1578 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1579 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1580
1581 return tx;
1582}
1583
1584static struct dma_async_tx_descriptor *
1384ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1585ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1385{ 1586{
1386 int disks = sh->disks; 1587 int disks = sh->disks;
1387 int i; 1588 int i;
1589 struct stripe_head *head_sh = sh;
1388 1590
1389 pr_debug("%s: stripe %llu\n", __func__, 1591 pr_debug("%s: stripe %llu\n", __func__,
1390 (unsigned long long)sh->sector); 1592 (unsigned long long)sh->sector);
1391 1593
1392 for (i = disks; i--; ) { 1594 for (i = disks; i--; ) {
1393 struct r5dev *dev = &sh->dev[i]; 1595 struct r5dev *dev;
1394 struct bio *chosen; 1596 struct bio *chosen;
1395 1597
1396 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1598 sh = head_sh;
1599 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1397 struct bio *wbi; 1600 struct bio *wbi;
1398 1601
1602again:
1603 dev = &sh->dev[i];
1399 spin_lock_irq(&sh->stripe_lock); 1604 spin_lock_irq(&sh->stripe_lock);
1400 chosen = dev->towrite; 1605 chosen = dev->towrite;
1401 dev->towrite = NULL; 1606 dev->towrite = NULL;
1607 sh->overwrite_disks = 0;
1402 BUG_ON(dev->written); 1608 BUG_ON(dev->written);
1403 wbi = dev->written = chosen; 1609 wbi = dev->written = chosen;
1404 spin_unlock_irq(&sh->stripe_lock); 1610 spin_unlock_irq(&sh->stripe_lock);
@@ -1423,6 +1629,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1423 } 1629 }
1424 wbi = r5_next_bio(wbi, dev->sector); 1630 wbi = r5_next_bio(wbi, dev->sector);
1425 } 1631 }
1632
1633 if (head_sh->batch_head) {
1634 sh = list_first_entry(&sh->batch_list,
1635 struct stripe_head,
1636 batch_list);
1637 if (sh == head_sh)
1638 continue;
1639 goto again;
1640 }
1426 } 1641 }
1427 } 1642 }
1428 1643
@@ -1478,12 +1693,15 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1478 struct dma_async_tx_descriptor *tx) 1693 struct dma_async_tx_descriptor *tx)
1479{ 1694{
1480 int disks = sh->disks; 1695 int disks = sh->disks;
1481 struct page **xor_srcs = percpu->scribble; 1696 struct page **xor_srcs;
1482 struct async_submit_ctl submit; 1697 struct async_submit_ctl submit;
1483 int count = 0, pd_idx = sh->pd_idx, i; 1698 int count, pd_idx = sh->pd_idx, i;
1484 struct page *xor_dest; 1699 struct page *xor_dest;
1485 int prexor = 0; 1700 int prexor = 0;
1486 unsigned long flags; 1701 unsigned long flags;
1702 int j = 0;
1703 struct stripe_head *head_sh = sh;
1704 int last_stripe;
1487 1705
1488 pr_debug("%s: stripe %llu\n", __func__, 1706 pr_debug("%s: stripe %llu\n", __func__,
1489 (unsigned long long)sh->sector); 1707 (unsigned long long)sh->sector);
@@ -1500,15 +1718,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1500 ops_complete_reconstruct(sh); 1718 ops_complete_reconstruct(sh);
1501 return; 1719 return;
1502 } 1720 }
1721again:
1722 count = 0;
1723 xor_srcs = to_addr_page(percpu, j);
1503 /* check if prexor is active which means only process blocks 1724 /* check if prexor is active which means only process blocks
1504 * that are part of a read-modify-write (written) 1725 * that are part of a read-modify-write (written)
1505 */ 1726 */
1506 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1727 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1507 prexor = 1; 1728 prexor = 1;
1508 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1729 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1509 for (i = disks; i--; ) { 1730 for (i = disks; i--; ) {
1510 struct r5dev *dev = &sh->dev[i]; 1731 struct r5dev *dev = &sh->dev[i];
1511 if (dev->written) 1732 if (head_sh->dev[i].written)
1512 xor_srcs[count++] = dev->page; 1733 xor_srcs[count++] = dev->page;
1513 } 1734 }
1514 } else { 1735 } else {
@@ -1525,17 +1746,32 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1525 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1746 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
1526 * for the synchronous xor case 1747 * for the synchronous xor case
1527 */ 1748 */
1528 flags = ASYNC_TX_ACK | 1749 last_stripe = !head_sh->batch_head ||
1529 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1750 list_first_entry(&sh->batch_list,
1530 1751 struct stripe_head, batch_list) == head_sh;
1531 atomic_inc(&sh->count); 1752 if (last_stripe) {
1753 flags = ASYNC_TX_ACK |
1754 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1755
1756 atomic_inc(&head_sh->count);
1757 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
1758 to_addr_conv(sh, percpu, j));
1759 } else {
1760 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
1761 init_async_submit(&submit, flags, tx, NULL, NULL,
1762 to_addr_conv(sh, percpu, j));
1763 }
1532 1764
1533 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
1534 to_addr_conv(sh, percpu));
1535 if (unlikely(count == 1)) 1765 if (unlikely(count == 1))
1536 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1766 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1537 else 1767 else
1538 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1768 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1769 if (!last_stripe) {
1770 j++;
1771 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1772 batch_list);
1773 goto again;
1774 }
1539} 1775}
1540 1776
1541static void 1777static void
@@ -1543,8 +1779,12 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1543 struct dma_async_tx_descriptor *tx) 1779 struct dma_async_tx_descriptor *tx)
1544{ 1780{
1545 struct async_submit_ctl submit; 1781 struct async_submit_ctl submit;
1546 struct page **blocks = percpu->scribble; 1782 struct page **blocks;
1547 int count, i; 1783 int count, i, j = 0;
1784 struct stripe_head *head_sh = sh;
1785 int last_stripe;
1786 int synflags;
1787 unsigned long txflags;
1548 1788
1549 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1789 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1550 1790
@@ -1562,13 +1802,36 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1562 return; 1802 return;
1563 } 1803 }
1564 1804
1565 count = set_syndrome_sources(blocks, sh); 1805again:
1806 blocks = to_addr_page(percpu, j);
1566 1807
1567 atomic_inc(&sh->count); 1808 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1809 synflags = SYNDROME_SRC_WRITTEN;
1810 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
1811 } else {
1812 synflags = SYNDROME_SRC_ALL;
1813 txflags = ASYNC_TX_ACK;
1814 }
1815
1816 count = set_syndrome_sources(blocks, sh, synflags);
1817 last_stripe = !head_sh->batch_head ||
1818 list_first_entry(&sh->batch_list,
1819 struct stripe_head, batch_list) == head_sh;
1568 1820
1569 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1821 if (last_stripe) {
1570 sh, to_addr_conv(sh, percpu)); 1822 atomic_inc(&head_sh->count);
1823 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
1824 head_sh, to_addr_conv(sh, percpu, j));
1825 } else
1826 init_async_submit(&submit, 0, tx, NULL, NULL,
1827 to_addr_conv(sh, percpu, j));
1571 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1828 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1829 if (!last_stripe) {
1830 j++;
1831 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1832 batch_list);
1833 goto again;
1834 }
1572} 1835}
1573 1836
1574static void ops_complete_check(void *stripe_head_ref) 1837static void ops_complete_check(void *stripe_head_ref)
@@ -1589,7 +1852,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1589 int pd_idx = sh->pd_idx; 1852 int pd_idx = sh->pd_idx;
1590 int qd_idx = sh->qd_idx; 1853 int qd_idx = sh->qd_idx;
1591 struct page *xor_dest; 1854 struct page *xor_dest;
1592 struct page **xor_srcs = percpu->scribble; 1855 struct page **xor_srcs = to_addr_page(percpu, 0);
1593 struct dma_async_tx_descriptor *tx; 1856 struct dma_async_tx_descriptor *tx;
1594 struct async_submit_ctl submit; 1857 struct async_submit_ctl submit;
1595 int count; 1858 int count;
@@ -1598,6 +1861,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1598 pr_debug("%s: stripe %llu\n", __func__, 1861 pr_debug("%s: stripe %llu\n", __func__,
1599 (unsigned long long)sh->sector); 1862 (unsigned long long)sh->sector);
1600 1863
1864 BUG_ON(sh->batch_head);
1601 count = 0; 1865 count = 0;
1602 xor_dest = sh->dev[pd_idx].page; 1866 xor_dest = sh->dev[pd_idx].page;
1603 xor_srcs[count++] = xor_dest; 1867 xor_srcs[count++] = xor_dest;
@@ -1608,7 +1872,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1608 } 1872 }
1609 1873
1610 init_async_submit(&submit, 0, NULL, NULL, NULL, 1874 init_async_submit(&submit, 0, NULL, NULL, NULL,
1611 to_addr_conv(sh, percpu)); 1875 to_addr_conv(sh, percpu, 0));
1612 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1876 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1613 &sh->ops.zero_sum_result, &submit); 1877 &sh->ops.zero_sum_result, &submit);
1614 1878
@@ -1619,20 +1883,21 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1619 1883
1620static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1884static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1621{ 1885{
1622 struct page **srcs = percpu->scribble; 1886 struct page **srcs = to_addr_page(percpu, 0);
1623 struct async_submit_ctl submit; 1887 struct async_submit_ctl submit;
1624 int count; 1888 int count;
1625 1889
1626 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1890 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1627 (unsigned long long)sh->sector, checkp); 1891 (unsigned long long)sh->sector, checkp);
1628 1892
1629 count = set_syndrome_sources(srcs, sh); 1893 BUG_ON(sh->batch_head);
1894 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
1630 if (!checkp) 1895 if (!checkp)
1631 srcs[count] = NULL; 1896 srcs[count] = NULL;
1632 1897
1633 atomic_inc(&sh->count); 1898 atomic_inc(&sh->count);
1634 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1899 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1635 sh, to_addr_conv(sh, percpu)); 1900 sh, to_addr_conv(sh, percpu, 0));
1636 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1901 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1637 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1902 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1638} 1903}
@@ -1667,8 +1932,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1667 async_tx_ack(tx); 1932 async_tx_ack(tx);
1668 } 1933 }
1669 1934
1670 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1935 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
1671 tx = ops_run_prexor(sh, percpu, tx); 1936 if (level < 6)
1937 tx = ops_run_prexor5(sh, percpu, tx);
1938 else
1939 tx = ops_run_prexor6(sh, percpu, tx);
1940 }
1672 1941
1673 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1942 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1674 tx = ops_run_biodrain(sh, tx); 1943 tx = ops_run_biodrain(sh, tx);
@@ -1693,7 +1962,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1693 BUG(); 1962 BUG();
1694 } 1963 }
1695 1964
1696 if (overlap_clear) 1965 if (overlap_clear && !sh->batch_head)
1697 for (i = disks; i--; ) { 1966 for (i = disks; i--; ) {
1698 struct r5dev *dev = &sh->dev[i]; 1967 struct r5dev *dev = &sh->dev[i];
1699 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1968 if (test_and_clear_bit(R5_Overlap, &dev->flags))
@@ -1702,10 +1971,10 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1702 put_cpu(); 1971 put_cpu();
1703} 1972}
1704 1973
1705static int grow_one_stripe(struct r5conf *conf, int hash) 1974static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
1706{ 1975{
1707 struct stripe_head *sh; 1976 struct stripe_head *sh;
1708 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1977 sh = kmem_cache_zalloc(conf->slab_cache, gfp);
1709 if (!sh) 1978 if (!sh)
1710 return 0; 1979 return 0;
1711 1980
@@ -1713,17 +1982,23 @@ static int grow_one_stripe(struct r5conf *conf, int hash)
1713 1982
1714 spin_lock_init(&sh->stripe_lock); 1983 spin_lock_init(&sh->stripe_lock);
1715 1984
1716 if (grow_buffers(sh)) { 1985 if (grow_buffers(sh, gfp)) {
1717 shrink_buffers(sh); 1986 shrink_buffers(sh);
1718 kmem_cache_free(conf->slab_cache, sh); 1987 kmem_cache_free(conf->slab_cache, sh);
1719 return 0; 1988 return 0;
1720 } 1989 }
1721 sh->hash_lock_index = hash; 1990 sh->hash_lock_index =
1991 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
1722 /* we just created an active stripe so... */ 1992 /* we just created an active stripe so... */
1723 atomic_set(&sh->count, 1); 1993 atomic_set(&sh->count, 1);
1724 atomic_inc(&conf->active_stripes); 1994 atomic_inc(&conf->active_stripes);
1725 INIT_LIST_HEAD(&sh->lru); 1995 INIT_LIST_HEAD(&sh->lru);
1996
1997 spin_lock_init(&sh->batch_lock);
1998 INIT_LIST_HEAD(&sh->batch_list);
1999 sh->batch_head = NULL;
1726 release_stripe(sh); 2000 release_stripe(sh);
2001 conf->max_nr_stripes++;
1727 return 1; 2002 return 1;
1728} 2003}
1729 2004
@@ -1731,7 +2006,6 @@ static int grow_stripes(struct r5conf *conf, int num)
1731{ 2006{
1732 struct kmem_cache *sc; 2007 struct kmem_cache *sc;
1733 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2008 int devs = max(conf->raid_disks, conf->previous_raid_disks);
1734 int hash;
1735 2009
1736 if (conf->mddev->gendisk) 2010 if (conf->mddev->gendisk)
1737 sprintf(conf->cache_name[0], 2011 sprintf(conf->cache_name[0],
@@ -1749,13 +2023,10 @@ static int grow_stripes(struct r5conf *conf, int num)
1749 return 1; 2023 return 1;
1750 conf->slab_cache = sc; 2024 conf->slab_cache = sc;
1751 conf->pool_size = devs; 2025 conf->pool_size = devs;
1752 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2026 while (num--)
1753 while (num--) { 2027 if (!grow_one_stripe(conf, GFP_KERNEL))
1754 if (!grow_one_stripe(conf, hash))
1755 return 1; 2028 return 1;
1756 conf->max_nr_stripes++; 2029
1757 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
1758 }
1759 return 0; 2030 return 0;
1760} 2031}
1761 2032
@@ -1772,13 +2043,21 @@ static int grow_stripes(struct r5conf *conf, int num)
1772 * calculate over all devices (not just the data blocks), using zeros in place 2043 * calculate over all devices (not just the data blocks), using zeros in place
1773 * of the P and Q blocks. 2044 * of the P and Q blocks.
1774 */ 2045 */
1775static size_t scribble_len(int num) 2046static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
1776{ 2047{
2048 struct flex_array *ret;
1777 size_t len; 2049 size_t len;
1778 2050
1779 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 2051 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1780 2052 ret = flex_array_alloc(len, cnt, flags);
1781 return len; 2053 if (!ret)
2054 return NULL;
2055 /* always prealloc all elements, so no locking is required */
2056 if (flex_array_prealloc(ret, 0, cnt, flags)) {
2057 flex_array_free(ret);
2058 return NULL;
2059 }
2060 return ret;
1782} 2061}
1783 2062
1784static int resize_stripes(struct r5conf *conf, int newsize) 2063static int resize_stripes(struct r5conf *conf, int newsize)
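
scribble_alloc() preallocates cnt elements per CPU; the callers pass cnt = chunk_sectors / STRIPE_SECTORS, which appears to be sized so that there is one element for every stripe that can join a single batch (stripe_add_to_batch_list() never crosses a chunk boundary, and the reconstruct paths index element j for the j-th batch member). A back-of-the-envelope sizing sketch with example numbers and userspace stand-in sizes:

#include <stdio.h>

#define STRIPE_SECTORS 8   /* one 4KiB page per device, 512-byte sectors */

int main(void)
{
        int devs = 8;                    /* example array width */
        int chunk_sectors = 1024;        /* example 512KiB chunk */
        size_t elem_len = 2 * sizeof(void *) * (devs + 2);  /* page ptrs + conv slots */
        int cnt = chunk_sectors / STRIPE_SECTORS;           /* stripes per chunk */

        printf("%d elements of %zu bytes = %zu bytes of scribble per CPU\n",
               cnt, elem_len, (size_t)cnt * elem_len);
        return 0;
}

With these example numbers that is 128 elements of 160 bytes, roughly 20KiB of scratch space per CPU on a 64-bit build.
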
@@ -1896,16 +2175,16 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1896 err = -ENOMEM; 2175 err = -ENOMEM;
1897 2176
1898 get_online_cpus(); 2177 get_online_cpus();
1899 conf->scribble_len = scribble_len(newsize);
1900 for_each_present_cpu(cpu) { 2178 for_each_present_cpu(cpu) {
1901 struct raid5_percpu *percpu; 2179 struct raid5_percpu *percpu;
1902 void *scribble; 2180 struct flex_array *scribble;
1903 2181
1904 percpu = per_cpu_ptr(conf->percpu, cpu); 2182 percpu = per_cpu_ptr(conf->percpu, cpu);
1905 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 2183 scribble = scribble_alloc(newsize, conf->chunk_sectors /
2184 STRIPE_SECTORS, GFP_NOIO);
1906 2185
1907 if (scribble) { 2186 if (scribble) {
1908 kfree(percpu->scribble); 2187 flex_array_free(percpu->scribble);
1909 percpu->scribble = scribble; 2188 percpu->scribble = scribble;
1910 } else { 2189 } else {
1911 err = -ENOMEM; 2190 err = -ENOMEM;
@@ -1937,9 +2216,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1937 return err; 2216 return err;
1938} 2217}
1939 2218
1940static int drop_one_stripe(struct r5conf *conf, int hash) 2219static int drop_one_stripe(struct r5conf *conf)
1941{ 2220{
1942 struct stripe_head *sh; 2221 struct stripe_head *sh;
2222 int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
1943 2223
1944 spin_lock_irq(conf->hash_locks + hash); 2224 spin_lock_irq(conf->hash_locks + hash);
1945 sh = get_free_stripe(conf, hash); 2225 sh = get_free_stripe(conf, hash);
@@ -1950,15 +2230,15 @@ static int drop_one_stripe(struct r5conf *conf, int hash)
1950 shrink_buffers(sh); 2230 shrink_buffers(sh);
1951 kmem_cache_free(conf->slab_cache, sh); 2231 kmem_cache_free(conf->slab_cache, sh);
1952 atomic_dec(&conf->active_stripes); 2232 atomic_dec(&conf->active_stripes);
2233 conf->max_nr_stripes--;
1953 return 1; 2234 return 1;
1954} 2235}
1955 2236
1956static void shrink_stripes(struct r5conf *conf) 2237static void shrink_stripes(struct r5conf *conf)
1957{ 2238{
1958 int hash; 2239 while (conf->max_nr_stripes &&
1959 for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) 2240 drop_one_stripe(conf))
1960 while (drop_one_stripe(conf, hash)) 2241 ;
1961 ;
1962 2242
1963 if (conf->slab_cache) 2243 if (conf->slab_cache)
1964 kmem_cache_destroy(conf->slab_cache); 2244 kmem_cache_destroy(conf->slab_cache);
@@ -2154,10 +2434,16 @@ static void raid5_end_write_request(struct bio *bi, int error)
2154 } 2434 }
2155 rdev_dec_pending(rdev, conf->mddev); 2435 rdev_dec_pending(rdev, conf->mddev);
2156 2436
2437 if (sh->batch_head && !uptodate)
2438 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2439
2157 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2440 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2158 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2441 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2159 set_bit(STRIPE_HANDLE, &sh->state); 2442 set_bit(STRIPE_HANDLE, &sh->state);
2160 release_stripe(sh); 2443 release_stripe(sh);
2444
2445 if (sh->batch_head && sh != sh->batch_head)
2446 release_stripe(sh->batch_head);
2161} 2447}
2162 2448
2163static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 2449static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
@@ -2535,7 +2821,7 @@ static void
2535schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2821schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2536 int rcw, int expand) 2822 int rcw, int expand)
2537{ 2823{
2538 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2824 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
2539 struct r5conf *conf = sh->raid_conf; 2825 struct r5conf *conf = sh->raid_conf;
2540 int level = conf->level; 2826 int level = conf->level;
2541 2827
@@ -2571,13 +2857,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2571 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2857 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2572 atomic_inc(&conf->pending_full_writes); 2858 atomic_inc(&conf->pending_full_writes);
2573 } else { 2859 } else {
2574 BUG_ON(level == 6);
2575 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2860 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
2576 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2861 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
2862 BUG_ON(level == 6 &&
2863 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
2864 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
2577 2865
2578 for (i = disks; i--; ) { 2866 for (i = disks; i--; ) {
2579 struct r5dev *dev = &sh->dev[i]; 2867 struct r5dev *dev = &sh->dev[i];
2580 if (i == pd_idx) 2868 if (i == pd_idx || i == qd_idx)
2581 continue; 2869 continue;
2582 2870
2583 if (dev->towrite && 2871 if (dev->towrite &&
@@ -2624,7 +2912,8 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2624 * toread/towrite point to the first in a chain. 2912 * toread/towrite point to the first in a chain.
2625 * The bi_next chain must be in order. 2913 * The bi_next chain must be in order.
2626 */ 2914 */
2627static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2915static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
2916 int forwrite, int previous)
2628{ 2917{
2629 struct bio **bip; 2918 struct bio **bip;
2630 struct r5conf *conf = sh->raid_conf; 2919 struct r5conf *conf = sh->raid_conf;
@@ -2643,6 +2932,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2643 * protect it. 2932 * protect it.
2644 */ 2933 */
2645 spin_lock_irq(&sh->stripe_lock); 2934 spin_lock_irq(&sh->stripe_lock);
2935 /* Don't allow new IO added to stripes in batch list */
2936 if (sh->batch_head)
2937 goto overlap;
2646 if (forwrite) { 2938 if (forwrite) {
2647 bip = &sh->dev[dd_idx].towrite; 2939 bip = &sh->dev[dd_idx].towrite;
2648 if (*bip == NULL) 2940 if (*bip == NULL)
@@ -2657,6 +2949,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2657 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 2949 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
2658 goto overlap; 2950 goto overlap;
2659 2951
2952 if (!forwrite || previous)
2953 clear_bit(STRIPE_BATCH_READY, &sh->state);
2954
2660 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2955 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
2661 if (*bip) 2956 if (*bip)
2662 bi->bi_next = *bip; 2957 bi->bi_next = *bip;
@@ -2674,7 +2969,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2674 sector = bio_end_sector(bi); 2969 sector = bio_end_sector(bi);
2675 } 2970 }
2676 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2971 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2677 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2972 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
2973 sh->overwrite_disks++;
2678 } 2974 }
2679 2975
2680 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2976 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
@@ -2688,6 +2984,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2688 sh->bm_seq = conf->seq_flush+1; 2984 sh->bm_seq = conf->seq_flush+1;
2689 set_bit(STRIPE_BIT_DELAY, &sh->state); 2985 set_bit(STRIPE_BIT_DELAY, &sh->state);
2690 } 2986 }
2987
2988 if (stripe_can_batch(sh))
2989 stripe_add_to_batch_list(conf, sh);
2691 return 1; 2990 return 1;
2692 2991
2693 overlap: 2992 overlap:
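
add_stripe_bio() keeps overwrite_disks in step with the R5_OVERWRITE bits (the test_and_set_bit() guard counts each device at most once), and batching is only attempted once every data device is fully overwritten. A toy model of that precondition, with illustrative values rather than real stripe state:

#include <stdbool.h>
#include <stdio.h>

/* overwrite_disks counts data devices whose whole block is covered by queued
 * writes; the stripe is a full stripe write only when every data device
 * (disks minus the max_degraded parity devices) is fully overwritten.
 * Field names mirror the patch, everything else is illustrative. */
struct toy_stripe {
        int disks;
        int max_degraded;
        int overwrite_disks;
        bool overwrite[16];              /* stands in for R5_OVERWRITE per device */
};

static void record_full_overwrite(struct toy_stripe *sh, int dd_idx)
{
        if (!sh->overwrite[dd_idx]) {    /* test_and_set_bit() equivalent */
                sh->overwrite[dd_idx] = true;
                sh->overwrite_disks++;
        }
}

static bool is_full_stripe_write(const struct toy_stripe *sh)
{
        return sh->overwrite_disks == sh->disks - sh->max_degraded;
}

int main(void)
{
        struct toy_stripe sh = { .disks = 6, .max_degraded = 2 };  /* RAID6 */

        for (int d = 0; d < 4; d++)      /* fully overwrite the 4 data devices */
                record_full_overwrite(&sh, d);
        printf("full stripe write: %s\n",
               is_full_stripe_write(&sh) ? "yes, batchable" : "no");
        return 0;
}
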
@@ -2720,6 +3019,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2720 struct bio **return_bi) 3019 struct bio **return_bi)
2721{ 3020{
2722 int i; 3021 int i;
3022 BUG_ON(sh->batch_head);
2723 for (i = disks; i--; ) { 3023 for (i = disks; i--; ) {
2724 struct bio *bi; 3024 struct bio *bi;
2725 int bitmap_end = 0; 3025 int bitmap_end = 0;
@@ -2746,6 +3046,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2746 /* fail all writes first */ 3046 /* fail all writes first */
2747 bi = sh->dev[i].towrite; 3047 bi = sh->dev[i].towrite;
2748 sh->dev[i].towrite = NULL; 3048 sh->dev[i].towrite = NULL;
3049 sh->overwrite_disks = 0;
2749 spin_unlock_irq(&sh->stripe_lock); 3050 spin_unlock_irq(&sh->stripe_lock);
2750 if (bi) 3051 if (bi)
2751 bitmap_end = 1; 3052 bitmap_end = 1;
@@ -2834,6 +3135,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2834 int abort = 0; 3135 int abort = 0;
2835 int i; 3136 int i;
2836 3137
3138 BUG_ON(sh->batch_head);
2837 clear_bit(STRIPE_SYNCING, &sh->state); 3139 clear_bit(STRIPE_SYNCING, &sh->state);
2838 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3140 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
2839 wake_up(&conf->wait_for_overlap); 3141 wake_up(&conf->wait_for_overlap);
@@ -3064,6 +3366,7 @@ static void handle_stripe_fill(struct stripe_head *sh,
3064{ 3366{
3065 int i; 3367 int i;
3066 3368
3369 BUG_ON(sh->batch_head);
3067 /* look for blocks to read/compute, skip this if a compute 3370 /* look for blocks to read/compute, skip this if a compute
3068 * is already in flight, or if the stripe contents are in the 3371 * is already in flight, or if the stripe contents are in the
3069 * midst of changing due to a write 3372 * midst of changing due to a write
@@ -3087,6 +3390,9 @@ static void handle_stripe_clean_event(struct r5conf *conf,
3087 int i; 3390 int i;
3088 struct r5dev *dev; 3391 struct r5dev *dev;
3089 int discard_pending = 0; 3392 int discard_pending = 0;
3393 struct stripe_head *head_sh = sh;
3394 bool do_endio = false;
3395 int wakeup_nr = 0;
3090 3396
3091 for (i = disks; i--; ) 3397 for (i = disks; i--; )
3092 if (sh->dev[i].written) { 3398 if (sh->dev[i].written) {
@@ -3102,8 +3408,11 @@ static void handle_stripe_clean_event(struct r5conf *conf,
3102 clear_bit(R5_UPTODATE, &dev->flags); 3408 clear_bit(R5_UPTODATE, &dev->flags);
3103 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3409 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3104 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3410 WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
3105 dev->page = dev->orig_page;
3106 } 3411 }
3412 do_endio = true;
3413
3414returnbi:
3415 dev->page = dev->orig_page;
3107 wbi = dev->written; 3416 wbi = dev->written;
3108 dev->written = NULL; 3417 dev->written = NULL;
3109 while (wbi && wbi->bi_iter.bi_sector < 3418 while (wbi && wbi->bi_iter.bi_sector <
@@ -3120,6 +3429,17 @@ static void handle_stripe_clean_event(struct r5conf *conf,
3120 STRIPE_SECTORS, 3429 STRIPE_SECTORS,
3121 !test_bit(STRIPE_DEGRADED, &sh->state), 3430 !test_bit(STRIPE_DEGRADED, &sh->state),
3122 0); 3431 0);
3432 if (head_sh->batch_head) {
3433 sh = list_first_entry(&sh->batch_list,
3434 struct stripe_head,
3435 batch_list);
3436 if (sh != head_sh) {
3437 dev = &sh->dev[i];
3438 goto returnbi;
3439 }
3440 }
3441 sh = head_sh;
3442 dev = &sh->dev[i];
3123 } else if (test_bit(R5_Discard, &dev->flags)) 3443 } else if (test_bit(R5_Discard, &dev->flags))
3124 discard_pending = 1; 3444 discard_pending = 1;
3125 WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); 3445 WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
@@ -3141,8 +3461,17 @@ static void handle_stripe_clean_event(struct r5conf *conf,
3141 * will be reinitialized 3461 * will be reinitialized
3142 */ 3462 */
3143 spin_lock_irq(&conf->device_lock); 3463 spin_lock_irq(&conf->device_lock);
3464unhash:
3144 remove_hash(sh); 3465 remove_hash(sh);
3466 if (head_sh->batch_head) {
3467 sh = list_first_entry(&sh->batch_list,
3468 struct stripe_head, batch_list);
3469 if (sh != head_sh)
3470 goto unhash;
3471 }
3145 spin_unlock_irq(&conf->device_lock); 3472 spin_unlock_irq(&conf->device_lock);
3473 sh = head_sh;
3474
3146 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 3475 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
3147 set_bit(STRIPE_HANDLE, &sh->state); 3476 set_bit(STRIPE_HANDLE, &sh->state);
3148 3477
@@ -3151,6 +3480,45 @@ static void handle_stripe_clean_event(struct r5conf *conf,
3151 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3480 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3152 if (atomic_dec_and_test(&conf->pending_full_writes)) 3481 if (atomic_dec_and_test(&conf->pending_full_writes))
3153 md_wakeup_thread(conf->mddev->thread); 3482 md_wakeup_thread(conf->mddev->thread);
3483
3484 if (!head_sh->batch_head || !do_endio)
3485 return;
3486 for (i = 0; i < head_sh->disks; i++) {
3487 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
3488 wakeup_nr++;
3489 }
3490 while (!list_empty(&head_sh->batch_list)) {
3491 int i;
3492 sh = list_first_entry(&head_sh->batch_list,
3493 struct stripe_head, batch_list);
3494 list_del_init(&sh->batch_list);
3495
3496 set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
3497 head_sh->state & ~((1 << STRIPE_ACTIVE) |
3498 (1 << STRIPE_PREREAD_ACTIVE) |
3499 STRIPE_EXPAND_SYNC_FLAG));
3500 sh->check_state = head_sh->check_state;
3501 sh->reconstruct_state = head_sh->reconstruct_state;
3502 for (i = 0; i < sh->disks; i++) {
3503 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3504 wakeup_nr++;
3505 sh->dev[i].flags = head_sh->dev[i].flags;
3506 }
3507
3508 spin_lock_irq(&sh->stripe_lock);
3509 sh->batch_head = NULL;
3510 spin_unlock_irq(&sh->stripe_lock);
3511 if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
3512 set_bit(STRIPE_HANDLE, &sh->state);
3513 release_stripe(sh);
3514 }
3515
3516 spin_lock_irq(&head_sh->stripe_lock);
3517 head_sh->batch_head = NULL;
3518 spin_unlock_irq(&head_sh->stripe_lock);
3519 wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
3520 if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
3521 set_bit(STRIPE_HANDLE, &head_sh->state);
3154} 3522}
3155 3523
3156static void handle_stripe_dirtying(struct r5conf *conf, 3524static void handle_stripe_dirtying(struct r5conf *conf,
@@ -3161,28 +3529,27 @@ static void handle_stripe_dirtying(struct r5conf *conf,
3161 int rmw = 0, rcw = 0, i; 3529 int rmw = 0, rcw = 0, i;
3162 sector_t recovery_cp = conf->mddev->recovery_cp; 3530 sector_t recovery_cp = conf->mddev->recovery_cp;
3163 3531
3164 /* RAID6 requires 'rcw' in current implementation. 3532 /* Check whether resync is now happening or should start.
3165 * Otherwise, check whether resync is now happening or should start.
3166 * If yes, then the array is dirty (after unclean shutdown or 3533 * If yes, then the array is dirty (after unclean shutdown or
3167 * initial creation), so parity in some stripes might be inconsistent. 3534 * initial creation), so parity in some stripes might be inconsistent.
3168 * In this case, we need to always do reconstruct-write, to ensure 3535 * In this case, we need to always do reconstruct-write, to ensure
3169 * that in case of drive failure or read-error correction, we 3536 * that in case of drive failure or read-error correction, we
3170 * generate correct data from the parity. 3537 * generate correct data from the parity.
3171 */ 3538 */
3172 if (conf->max_degraded == 2 || 3539 if (conf->rmw_level == PARITY_DISABLE_RMW ||
3173 (recovery_cp < MaxSector && sh->sector >= recovery_cp && 3540 (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
3174 s->failed == 0)) { 3541 s->failed == 0)) {
3175 /* Calculate the real rcw later - for now make it 3542 /* Calculate the real rcw later - for now make it
3176 * look like rcw is cheaper 3543 * look like rcw is cheaper
3177 */ 3544 */
3178 rcw = 1; rmw = 2; 3545 rcw = 1; rmw = 2;
3179 pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", 3546 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
3180 conf->max_degraded, (unsigned long long)recovery_cp, 3547 conf->rmw_level, (unsigned long long)recovery_cp,
3181 (unsigned long long)sh->sector); 3548 (unsigned long long)sh->sector);
3182 } else for (i = disks; i--; ) { 3549 } else for (i = disks; i--; ) {
3183 /* would I have to read this buffer for read_modify_write */ 3550 /* would I have to read this buffer for read_modify_write */
3184 struct r5dev *dev = &sh->dev[i]; 3551 struct r5dev *dev = &sh->dev[i];
3185 if ((dev->towrite || i == sh->pd_idx) && 3552 if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
3186 !test_bit(R5_LOCKED, &dev->flags) && 3553 !test_bit(R5_LOCKED, &dev->flags) &&
3187 !(test_bit(R5_UPTODATE, &dev->flags) || 3554 !(test_bit(R5_UPTODATE, &dev->flags) ||
3188 test_bit(R5_Wantcompute, &dev->flags))) { 3555 test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3192,7 +3559,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
3192 rmw += 2*disks; /* cannot read it */ 3559 rmw += 2*disks; /* cannot read it */
3193 } 3560 }
3194 /* Would I have to read this buffer for reconstruct_write */ 3561 /* Would I have to read this buffer for reconstruct_write */
3195 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 3562 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3563 i != sh->pd_idx && i != sh->qd_idx &&
3196 !test_bit(R5_LOCKED, &dev->flags) && 3564 !test_bit(R5_LOCKED, &dev->flags) &&
3197 !(test_bit(R5_UPTODATE, &dev->flags) || 3565 !(test_bit(R5_UPTODATE, &dev->flags) ||
3198 test_bit(R5_Wantcompute, &dev->flags))) { 3566 test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3205,7 +3573,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
3205 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 3573 pr_debug("for sector %llu, rmw=%d rcw=%d\n",
3206 (unsigned long long)sh->sector, rmw, rcw); 3574 (unsigned long long)sh->sector, rmw, rcw);
3207 set_bit(STRIPE_HANDLE, &sh->state); 3575 set_bit(STRIPE_HANDLE, &sh->state);
3208 if (rmw < rcw && rmw > 0) { 3576 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
3209 /* prefer read-modify-write, but need to get some data */ 3577 /* prefer read-modify-write, but need to get some data */
3210 if (conf->mddev->queue) 3578 if (conf->mddev->queue)
3211 blk_add_trace_msg(conf->mddev->queue, 3579 blk_add_trace_msg(conf->mddev->queue,
@@ -3213,7 +3581,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
3213 (unsigned long long)sh->sector, rmw); 3581 (unsigned long long)sh->sector, rmw);
3214 for (i = disks; i--; ) { 3582 for (i = disks; i--; ) {
3215 struct r5dev *dev = &sh->dev[i]; 3583 struct r5dev *dev = &sh->dev[i];
3216 if ((dev->towrite || i == sh->pd_idx) && 3584 if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
3217 !test_bit(R5_LOCKED, &dev->flags) && 3585 !test_bit(R5_LOCKED, &dev->flags) &&
3218 !(test_bit(R5_UPTODATE, &dev->flags) || 3586 !(test_bit(R5_UPTODATE, &dev->flags) ||
3219 test_bit(R5_Wantcompute, &dev->flags)) && 3587 test_bit(R5_Wantcompute, &dev->flags)) &&
@@ -3232,7 +3600,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
3232 } 3600 }
3233 } 3601 }
3234 } 3602 }
3235 if (rcw <= rmw && rcw > 0) { 3603 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
3236 /* want reconstruct write, but need to get some data */ 3604 /* want reconstruct write, but need to get some data */
3237 int qread =0; 3605 int qread =0;
3238 rcw = 0; 3606 rcw = 0;
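
The rmw/rcw tallies above drive the write-method choice: read-modify-write must read every to-be-written block plus the parity block(s) not already cached, while reconstruct-write must read every data block the request does not fully overwrite; the cheaper plan wins, with conf->rmw_level breaking ties. A standalone sketch of that comparison (simplified counting; struct dev_state is a placeholder and the PARITY_*_RMW values here are illustrative, not taken from raid5.h):

#include <stdbool.h>
#include <stdio.h>

enum rmw_level { PARITY_DISABLE_RMW, PARITY_ENABLE_RMW, PARITY_PREFER_RMW };

struct dev_state {
	bool towrite;    /* new data queued for this block */
	bool uptodate;   /* old contents already in the stripe cache */
	bool overwrite;  /* the pending write covers the whole block */
};

/* Count the reads each plan would need and pick one, roughly mirroring
 * the rmw/rcw accounting in handle_stripe_dirtying(). */
static const char *choose_plan(const struct dev_state *dev, int disks,
			       int pd_idx, int qd_idx, enum rmw_level level)
{
	int rmw = 0, rcw = 0;

	for (int i = 0; i < disks; i++) {
		/* read-modify-write reads old data/parity for written blocks */
		if ((dev[i].towrite || i == pd_idx || i == qd_idx) &&
		    !dev[i].uptodate)
			rmw++;
		/* reconstruct-write reads every block it does not overwrite */
		if (!dev[i].overwrite && i != pd_idx && i != qd_idx &&
		    !dev[i].uptodate)
			rcw++;
	}
	if (level == PARITY_DISABLE_RMW)
		return "rcw";
	if (rmw < rcw || (rmw == rcw && level == PARITY_ENABLE_RMW))
		return "rmw";
	return "rcw";
}

int main(void)
{
	/* 6-device RAID6 stripe: data in 0-3, P at 4, Q at 5, one block written */
	struct dev_state dev[6] = { [0] = { .towrite = true } };

	printf("plan = %s\n", choose_plan(dev, 6, 4, 5, PARITY_ENABLE_RMW));
	return 0;
}

The real handler additionally treats blocks it cannot read (failed or locked devices) as prohibitively expensive, which the sketch omits.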
@@ -3290,6 +3658,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
3290{ 3658{
3291 struct r5dev *dev = NULL; 3659 struct r5dev *dev = NULL;
3292 3660
3661 BUG_ON(sh->batch_head);
3293 set_bit(STRIPE_HANDLE, &sh->state); 3662 set_bit(STRIPE_HANDLE, &sh->state);
3294 3663
3295 switch (sh->check_state) { 3664 switch (sh->check_state) {
@@ -3380,6 +3749,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
3380 int qd_idx = sh->qd_idx; 3749 int qd_idx = sh->qd_idx;
3381 struct r5dev *dev; 3750 struct r5dev *dev;
3382 3751
3752 BUG_ON(sh->batch_head);
3383 set_bit(STRIPE_HANDLE, &sh->state); 3753 set_bit(STRIPE_HANDLE, &sh->state);
3384 3754
3385 BUG_ON(s->failed > 2); 3755 BUG_ON(s->failed > 2);
@@ -3543,6 +3913,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
3543 * copy some of them into a target stripe for expand. 3913 * copy some of them into a target stripe for expand.
3544 */ 3914 */
3545 struct dma_async_tx_descriptor *tx = NULL; 3915 struct dma_async_tx_descriptor *tx = NULL;
3916 BUG_ON(sh->batch_head);
3546 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3917 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3547 for (i = 0; i < sh->disks; i++) 3918 for (i = 0; i < sh->disks; i++)
3548 if (i != sh->pd_idx && i != sh->qd_idx) { 3919 if (i != sh->pd_idx && i != sh->qd_idx) {
@@ -3615,8 +3986,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3615 3986
3616 memset(s, 0, sizeof(*s)); 3987 memset(s, 0, sizeof(*s));
3617 3988
3618 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3989 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
3619 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3990 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
3620 s->failed_num[0] = -1; 3991 s->failed_num[0] = -1;
3621 s->failed_num[1] = -1; 3992 s->failed_num[1] = -1;
3622 3993
@@ -3786,6 +4157,80 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3786 rcu_read_unlock(); 4157 rcu_read_unlock();
3787} 4158}
3788 4159
4160static int clear_batch_ready(struct stripe_head *sh)
4161{
4162 struct stripe_head *tmp;
4163 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4164 return 0;
4165 spin_lock(&sh->stripe_lock);
4166 if (!sh->batch_head) {
4167 spin_unlock(&sh->stripe_lock);
4168 return 0;
4169 }
4170
4171 /*
4172	 * this stripe could have been added to a batch list before we
4173	 * checked BATCH_READY, so skip it
4174 */
4175 if (sh->batch_head != sh) {
4176 spin_unlock(&sh->stripe_lock);
4177 return 1;
4178 }
4179 spin_lock(&sh->batch_lock);
4180 list_for_each_entry(tmp, &sh->batch_list, batch_list)
4181 clear_bit(STRIPE_BATCH_READY, &tmp->state);
4182 spin_unlock(&sh->batch_lock);
4183 spin_unlock(&sh->stripe_lock);
4184
4185 /*
4186	 * BATCH_READY is cleared, so no new stripes can be added, and
4187	 * batch_list can be accessed without the lock
4188 */
4189 return 0;
4190}
4191
4192static void check_break_stripe_batch_list(struct stripe_head *sh)
4193{
4194 struct stripe_head *head_sh, *next;
4195 int i;
4196
4197 if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
4198 return;
4199
4200 head_sh = sh;
4201 do {
4202 sh = list_first_entry(&sh->batch_list,
4203 struct stripe_head, batch_list);
4204 BUG_ON(sh == head_sh);
4205 } while (!test_bit(STRIPE_DEGRADED, &sh->state));
4206
4207 while (sh != head_sh) {
4208 next = list_first_entry(&sh->batch_list,
4209 struct stripe_head, batch_list);
4210 list_del_init(&sh->batch_list);
4211
4212 set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
4213 head_sh->state & ~((1 << STRIPE_ACTIVE) |
4214 (1 << STRIPE_PREREAD_ACTIVE) |
4215 (1 << STRIPE_DEGRADED) |
4216 STRIPE_EXPAND_SYNC_FLAG));
4217 sh->check_state = head_sh->check_state;
4218 sh->reconstruct_state = head_sh->reconstruct_state;
4219 for (i = 0; i < sh->disks; i++)
4220 sh->dev[i].flags = head_sh->dev[i].flags &
4221 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
4222
4223 spin_lock_irq(&sh->stripe_lock);
4224 sh->batch_head = NULL;
4225 spin_unlock_irq(&sh->stripe_lock);
4226
4227 set_bit(STRIPE_HANDLE, &sh->state);
4228 release_stripe(sh);
4229
4230 sh = next;
4231 }
4232}
4233
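
Both break-up paths above use the same bit trick: set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, bits) keeps only the stripe's own expand/sync flags and ORs in the batch head's state with a few strictly per-stripe bits stripped. A rough, non-atomic illustration of that arithmetic, using made-up mask values rather than the real md flag definitions (the kernel macro performs the same (old & ~mask) | bits update atomically):

#include <stdio.h>

/* illustrative flag bits, not the kernel's actual values */
#define T_STRIPE_ACTIVE          (1UL << 0)
#define T_STRIPE_PREREAD_ACTIVE  (1UL << 1)
#define T_STRIPE_DEGRADED        (1UL << 2)
#define T_STRIPE_EXPAND_SYNC     (1UL << 3)  /* stands in for STRIPE_EXPAND_SYNC_FLAG */

/* new = (old & ~mask) | bits: clear everything covered by mask, OR in bits */
static unsigned long mask_and_copy(unsigned long old, unsigned long mask,
				   unsigned long bits)
{
	return (old & ~mask) | bits;
}

int main(void)
{
	unsigned long head_state  = T_STRIPE_ACTIVE | T_STRIPE_DEGRADED;
	unsigned long batch_state = T_STRIPE_EXPAND_SYNC;

	/* inherit the head's state minus the strictly per-stripe bits,
	 * while keeping only this stripe's own expand/sync flags */
	unsigned long inherited = head_state &
		~(T_STRIPE_ACTIVE | T_STRIPE_PREREAD_ACTIVE | T_STRIPE_DEGRADED);
	unsigned long new_state = mask_and_copy(batch_state,
						~T_STRIPE_EXPAND_SYNC, inherited);

	printf("new state = %#lx\n", new_state);  /* only the expand/sync bit remains */
	return 0;
}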
3789static void handle_stripe(struct stripe_head *sh) 4234static void handle_stripe(struct stripe_head *sh)
3790{ 4235{
3791 struct stripe_head_state s; 4236 struct stripe_head_state s;
@@ -3803,7 +4248,14 @@ static void handle_stripe(struct stripe_head *sh)
3803 return; 4248 return;
3804 } 4249 }
3805 4250
3806 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 4251 if (clear_batch_ready(sh)) {
4252 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
4253 return;
4254 }
4255
4256 check_break_stripe_batch_list(sh);
4257
4258 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
3807 spin_lock(&sh->stripe_lock); 4259 spin_lock(&sh->stripe_lock);
3808 /* Cannot process 'sync' concurrently with 'discard' */ 4260 /* Cannot process 'sync' concurrently with 'discard' */
3809 if (!test_bit(STRIPE_DISCARD, &sh->state) && 4261 if (!test_bit(STRIPE_DISCARD, &sh->state) &&
@@ -4158,7 +4610,7 @@ static int raid5_congested(struct mddev *mddev, int bits)
4158 * how busy the stripe_cache is 4610 * how busy the stripe_cache is
4159 */ 4611 */
4160 4612
4161 if (conf->inactive_blocked) 4613 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
4162 return 1; 4614 return 1;
4163 if (conf->quiesce) 4615 if (conf->quiesce)
4164 return 1; 4616 return 1;
@@ -4180,8 +4632,12 @@ static int raid5_mergeable_bvec(struct mddev *mddev,
4180 unsigned int chunk_sectors = mddev->chunk_sectors; 4632 unsigned int chunk_sectors = mddev->chunk_sectors;
4181 unsigned int bio_sectors = bvm->bi_size >> 9; 4633 unsigned int bio_sectors = bvm->bi_size >> 9;
4182 4634
4183 if ((bvm->bi_rw & 1) == WRITE) 4635 /*
4184 return biovec->bv_len; /* always allow writes to be mergeable */ 4636	 * always allow writes to be mergeable; reads as well if the array
4637	 * is degraded, as we'll go through the stripe cache anyway.
4638 */
4639 if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
4640 return biovec->bv_len;
4185 4641
4186 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 4642 if (mddev->new_chunk_sectors < mddev->chunk_sectors)
4187 chunk_sectors = mddev->new_chunk_sectors; 4643 chunk_sectors = mddev->new_chunk_sectors;
@@ -4603,12 +5059,14 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
4603 } 5059 }
4604 set_bit(STRIPE_DISCARD, &sh->state); 5060 set_bit(STRIPE_DISCARD, &sh->state);
4605 finish_wait(&conf->wait_for_overlap, &w); 5061 finish_wait(&conf->wait_for_overlap, &w);
5062 sh->overwrite_disks = 0;
4606 for (d = 0; d < conf->raid_disks; d++) { 5063 for (d = 0; d < conf->raid_disks; d++) {
4607 if (d == sh->pd_idx || d == sh->qd_idx) 5064 if (d == sh->pd_idx || d == sh->qd_idx)
4608 continue; 5065 continue;
4609 sh->dev[d].towrite = bi; 5066 sh->dev[d].towrite = bi;
4610 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5067 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
4611 raid5_inc_bi_active_stripes(bi); 5068 raid5_inc_bi_active_stripes(bi);
5069 sh->overwrite_disks++;
4612 } 5070 }
4613 spin_unlock_irq(&sh->stripe_lock); 5071 spin_unlock_irq(&sh->stripe_lock);
4614 if (conf->mddev->bitmap) { 5072 if (conf->mddev->bitmap) {
@@ -4656,7 +5114,12 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4656 5114
4657 md_write_start(mddev, bi); 5115 md_write_start(mddev, bi);
4658 5116
4659 if (rw == READ && 5117 /*
5118	 * If the array is degraded, better not do a chunk-aligned read, because
5119	 * later we might have to read it again in order to reconstruct
5120 * data on failed drives.
5121 */
5122 if (rw == READ && mddev->degraded == 0 &&
4660 mddev->reshape_position == MaxSector && 5123 mddev->reshape_position == MaxSector &&
4661 chunk_aligned_read(mddev,bi)) 5124 chunk_aligned_read(mddev,bi))
4662 return; 5125 return;
@@ -4772,7 +5235,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4772 } 5235 }
4773 5236
4774 if (test_bit(STRIPE_EXPANDING, &sh->state) || 5237 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
4775 !add_stripe_bio(sh, bi, dd_idx, rw)) { 5238 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
4776 /* Stripe is busy expanding or 5239 /* Stripe is busy expanding or
4777 * add failed due to overlap. Flush everything 5240 * add failed due to overlap. Flush everything
4778 * and wait a while 5241 * and wait a while
@@ -4785,7 +5248,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4785 } 5248 }
4786 set_bit(STRIPE_HANDLE, &sh->state); 5249 set_bit(STRIPE_HANDLE, &sh->state);
4787 clear_bit(STRIPE_DELAYED, &sh->state); 5250 clear_bit(STRIPE_DELAYED, &sh->state);
4788 if ((bi->bi_rw & REQ_SYNC) && 5251 if ((!sh->batch_head || sh == sh->batch_head) &&
5252 (bi->bi_rw & REQ_SYNC) &&
4789 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5253 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4790 atomic_inc(&conf->preread_active_stripes); 5254 atomic_inc(&conf->preread_active_stripes);
4791 release_stripe_plug(mddev, sh); 5255 release_stripe_plug(mddev, sh);
@@ -5050,8 +5514,7 @@ ret:
5050 return reshape_sectors; 5514 return reshape_sectors;
5051} 5515}
5052 5516
5053/* FIXME go_faster isn't used */ 5517static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
5054static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
5055{ 5518{
5056 struct r5conf *conf = mddev->private; 5519 struct r5conf *conf = mddev->private;
5057 struct stripe_head *sh; 5520 struct stripe_head *sh;
@@ -5186,7 +5649,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
5186 return handled; 5649 return handled;
5187 } 5650 }
5188 5651
5189 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 5652 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
5190 release_stripe(sh); 5653 release_stripe(sh);
5191 raid5_set_bi_processed_stripes(raid_bio, scnt); 5654 raid5_set_bi_processed_stripes(raid_bio, scnt);
5192 conf->retry_read_aligned = raid_bio; 5655 conf->retry_read_aligned = raid_bio;
@@ -5312,6 +5775,8 @@ static void raid5d(struct md_thread *thread)
5312 int batch_size, released; 5775 int batch_size, released;
5313 5776
5314 released = release_stripe_list(conf, conf->temp_inactive_list); 5777 released = release_stripe_list(conf, conf->temp_inactive_list);
5778 if (released)
5779 clear_bit(R5_DID_ALLOC, &conf->cache_state);
5315 5780
5316 if ( 5781 if (
5317 !list_empty(&conf->bitmap_list)) { 5782 !list_empty(&conf->bitmap_list)) {
@@ -5350,6 +5815,13 @@ static void raid5d(struct md_thread *thread)
5350 pr_debug("%d stripes handled\n", handled); 5815 pr_debug("%d stripes handled\n", handled);
5351 5816
5352 spin_unlock_irq(&conf->device_lock); 5817 spin_unlock_irq(&conf->device_lock);
5818 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
5819 grow_one_stripe(conf, __GFP_NOWARN);
5820 /* Set flag even if allocation failed. This helps
5821		 * slow down allocation requests when memory is short
5822 */
5823 set_bit(R5_DID_ALLOC, &conf->cache_state);
5824 }
5353 5825
5354 async_tx_issue_pending_all(); 5826 async_tx_issue_pending_all();
5355 blk_finish_plug(&plug); 5827 blk_finish_plug(&plug);
@@ -5365,7 +5837,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
5365 spin_lock(&mddev->lock); 5837 spin_lock(&mddev->lock);
5366 conf = mddev->private; 5838 conf = mddev->private;
5367 if (conf) 5839 if (conf)
5368 ret = sprintf(page, "%d\n", conf->max_nr_stripes); 5840 ret = sprintf(page, "%d\n", conf->min_nr_stripes);
5369 spin_unlock(&mddev->lock); 5841 spin_unlock(&mddev->lock);
5370 return ret; 5842 return ret;
5371} 5843}
@@ -5375,30 +5847,24 @@ raid5_set_cache_size(struct mddev *mddev, int size)
5375{ 5847{
5376 struct r5conf *conf = mddev->private; 5848 struct r5conf *conf = mddev->private;
5377 int err; 5849 int err;
5378 int hash;
5379 5850
5380 if (size <= 16 || size > 32768) 5851 if (size <= 16 || size > 32768)
5381 return -EINVAL; 5852 return -EINVAL;
5382 hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; 5853
5383 while (size < conf->max_nr_stripes) { 5854 conf->min_nr_stripes = size;
5384 if (drop_one_stripe(conf, hash)) 5855 while (size < conf->max_nr_stripes &&
5385 conf->max_nr_stripes--; 5856 drop_one_stripe(conf))
5386 else 5857 ;
5387 break; 5858
5388 hash--; 5859
5389 if (hash < 0)
5390 hash = NR_STRIPE_HASH_LOCKS - 1;
5391 }
5392 err = md_allow_write(mddev); 5860 err = md_allow_write(mddev);
5393 if (err) 5861 if (err)
5394 return err; 5862 return err;
5395 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 5863
5396 while (size > conf->max_nr_stripes) { 5864 while (size > conf->max_nr_stripes)
5397 if (grow_one_stripe(conf, hash)) 5865 if (!grow_one_stripe(conf, GFP_KERNEL))
5398 conf->max_nr_stripes++; 5866 break;
5399 else break; 5867
5400 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
5401 }
5402 return 0; 5868 return 0;
5403} 5869}
5404EXPORT_SYMBOL(raid5_set_cache_size); 5870EXPORT_SYMBOL(raid5_set_cache_size);
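
With the per-hash-bucket round-robin gone, the sizing model above is simpler: min_nr_stripes records the administrator's floor, max_nr_stripes tracks how many stripes currently exist, and resizing just converges the current count toward the request. A toy sketch of that shape (struct toy_cache, grow_one() and drop_one() are placeholders, not the md helpers, and the real code also calls md_allow_write() between shrinking and growing):

#include <stdbool.h>
#include <stdio.h>

struct toy_cache {
	int min_nr;  /* user-requested floor (min_nr_stripes) */
	int cur_nr;  /* objects currently allocated (max_nr_stripes) */
};

static bool grow_one(struct toy_cache *c) { c->cur_nr++; return true; }

static bool drop_one(struct toy_cache *c)
{
	if (c->cur_nr == 0)
		return false;
	c->cur_nr--;
	return true;
}

/* mirrors the shape of the new raid5_set_cache_size() */
static int set_cache_size(struct toy_cache *c, int size)
{
	if (size <= 16 || size > 32768)
		return -1;
	c->min_nr = size;
	while (size < c->cur_nr && drop_one(c))
		;
	while (size > c->cur_nr)
		if (!grow_one(c))
			break;
	return 0;
}

int main(void)
{
	struct toy_cache c = { .min_nr = 256, .cur_nr = 256 };

	set_cache_size(&c, 1024);
	printf("floor=%d current=%d\n", c.min_nr, c.cur_nr);
	return 0;
}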
@@ -5433,6 +5899,49 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
5433 raid5_store_stripe_cache_size); 5899 raid5_store_stripe_cache_size);
5434 5900
5435static ssize_t 5901static ssize_t
5902raid5_show_rmw_level(struct mddev *mddev, char *page)
5903{
5904 struct r5conf *conf = mddev->private;
5905 if (conf)
5906 return sprintf(page, "%d\n", conf->rmw_level);
5907 else
5908 return 0;
5909}
5910
5911static ssize_t
5912raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len)
5913{
5914 struct r5conf *conf = mddev->private;
5915 unsigned long new;
5916
5917 if (!conf)
5918 return -ENODEV;
5919
5920 if (len >= PAGE_SIZE)
5921 return -EINVAL;
5922
5923 if (kstrtoul(page, 10, &new))
5924 return -EINVAL;
5925
5926 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
5927 return -EINVAL;
5928
5929 if (new != PARITY_DISABLE_RMW &&
5930 new != PARITY_ENABLE_RMW &&
5931 new != PARITY_PREFER_RMW)
5932 return -EINVAL;
5933
5934 conf->rmw_level = new;
5935 return len;
5936}
5937
5938static struct md_sysfs_entry
5939raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
5940 raid5_show_rmw_level,
5941 raid5_store_rmw_level);
5942
5943
5944static ssize_t
5436raid5_show_preread_threshold(struct mddev *mddev, char *page) 5945raid5_show_preread_threshold(struct mddev *mddev, char *page)
5437{ 5946{
5438 struct r5conf *conf; 5947 struct r5conf *conf;
@@ -5463,7 +5972,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
5463 conf = mddev->private; 5972 conf = mddev->private;
5464 if (!conf) 5973 if (!conf)
5465 err = -ENODEV; 5974 err = -ENODEV;
5466 else if (new > conf->max_nr_stripes) 5975 else if (new > conf->min_nr_stripes)
5467 err = -EINVAL; 5976 err = -EINVAL;
5468 else 5977 else
5469 conf->bypass_threshold = new; 5978 conf->bypass_threshold = new;
@@ -5618,6 +6127,7 @@ static struct attribute *raid5_attrs[] = {
5618 &raid5_preread_bypass_threshold.attr, 6127 &raid5_preread_bypass_threshold.attr,
5619 &raid5_group_thread_cnt.attr, 6128 &raid5_group_thread_cnt.attr,
5620 &raid5_skip_copy.attr, 6129 &raid5_skip_copy.attr,
6130 &raid5_rmw_level.attr,
5621 NULL, 6131 NULL,
5622}; 6132};
5623static struct attribute_group raid5_attrs_group = { 6133static struct attribute_group raid5_attrs_group = {
@@ -5699,7 +6209,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
5699static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6209static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
5700{ 6210{
5701 safe_put_page(percpu->spare_page); 6211 safe_put_page(percpu->spare_page);
5702 kfree(percpu->scribble); 6212 if (percpu->scribble)
6213 flex_array_free(percpu->scribble);
5703 percpu->spare_page = NULL; 6214 percpu->spare_page = NULL;
5704 percpu->scribble = NULL; 6215 percpu->scribble = NULL;
5705} 6216}
@@ -5709,7 +6220,9 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
5709 if (conf->level == 6 && !percpu->spare_page) 6220 if (conf->level == 6 && !percpu->spare_page)
5710 percpu->spare_page = alloc_page(GFP_KERNEL); 6221 percpu->spare_page = alloc_page(GFP_KERNEL);
5711 if (!percpu->scribble) 6222 if (!percpu->scribble)
5712 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 6223 percpu->scribble = scribble_alloc(max(conf->raid_disks,
6224 conf->previous_raid_disks), conf->chunk_sectors /
6225 STRIPE_SECTORS, GFP_KERNEL);
5713 6226
5714 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { 6227 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
5715 free_scratch_buffer(conf, percpu); 6228 free_scratch_buffer(conf, percpu);
@@ -5740,6 +6253,8 @@ static void raid5_free_percpu(struct r5conf *conf)
5740 6253
5741static void free_conf(struct r5conf *conf) 6254static void free_conf(struct r5conf *conf)
5742{ 6255{
6256 if (conf->shrinker.seeks)
6257 unregister_shrinker(&conf->shrinker);
5743 free_thread_groups(conf); 6258 free_thread_groups(conf);
5744 shrink_stripes(conf); 6259 shrink_stripes(conf);
5745 raid5_free_percpu(conf); 6260 raid5_free_percpu(conf);
@@ -5807,6 +6322,30 @@ static int raid5_alloc_percpu(struct r5conf *conf)
5807 return err; 6322 return err;
5808} 6323}
5809 6324
6325static unsigned long raid5_cache_scan(struct shrinker *shrink,
6326 struct shrink_control *sc)
6327{
6328 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6329 int ret = 0;
6330 while (ret < sc->nr_to_scan) {
6331 if (drop_one_stripe(conf) == 0)
6332 return SHRINK_STOP;
6333 ret++;
6334 }
6335 return ret;
6336}
6337
6338static unsigned long raid5_cache_count(struct shrinker *shrink,
6339 struct shrink_control *sc)
6340{
6341 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6342
6343 if (conf->max_nr_stripes < conf->min_nr_stripes)
6344 /* unlikely, but not impossible */
6345 return 0;
6346 return conf->max_nr_stripes - conf->min_nr_stripes;
6347}
6348
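
For readers unfamiliar with the shrinker interface being wired up here: count_objects() reports how many objects could be reclaimed (stripes above the configured floor) and scan_objects() frees up to sc->nr_to_scan of them, returning the number freed or SHRINK_STOP once no progress is possible. A userspace-style sketch of that contract (struct toy_cache and the SHRINK_STOP value are simplified stand-ins for the kernel definitions):

#include <stdio.h>

#define SHRINK_STOP (~0UL)  /* stand-in for the kernel's sentinel */

struct toy_cache {
	unsigned long cur;  /* objects currently allocated */
	unsigned long min;  /* configured floor, never report below it */
};

/* "count_objects": how many objects could be given back under pressure */
static unsigned long cache_count(const struct toy_cache *c)
{
	if (c->cur < c->min)  /* unlikely, but avoid a huge unsigned result */
		return 0;
	return c->cur - c->min;
}

/* "scan_objects": free up to nr_to_scan objects and report progress */
static unsigned long cache_scan(struct toy_cache *c, unsigned long nr_to_scan)
{
	unsigned long freed = 0;

	while (freed < nr_to_scan) {
		if (c->cur == 0)  /* nothing left to drop */
			return SHRINK_STOP;
		c->cur--;
		freed++;
	}
	return freed;
}

int main(void)
{
	struct toy_cache c = { .cur = 300, .min = 256 };
	unsigned long reclaimable = cache_count(&c);
	unsigned long freed = cache_scan(&c, 16);

	printf("reclaimable=%lu freed=%lu\n", reclaimable, freed);
	return 0;
}

Together with the R5_ALLOC_MORE/R5_DID_ALLOC throttling added to raid5d earlier in this patch, demand grows the cache one stripe at a time while memory pressure can pull it back down toward min_nr_stripes.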
5810static struct r5conf *setup_conf(struct mddev *mddev) 6349static struct r5conf *setup_conf(struct mddev *mddev)
5811{ 6350{
5812 struct r5conf *conf; 6351 struct r5conf *conf;
@@ -5879,7 +6418,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5879 else 6418 else
5880 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 6419 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
5881 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 6420 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
5882 conf->scribble_len = scribble_len(max_disks);
5883 6421
5884 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 6422 conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
5885 GFP_KERNEL); 6423 GFP_KERNEL);
@@ -5907,6 +6445,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5907 INIT_LIST_HEAD(conf->temp_inactive_list + i); 6445 INIT_LIST_HEAD(conf->temp_inactive_list + i);
5908 6446
5909 conf->level = mddev->new_level; 6447 conf->level = mddev->new_level;
6448 conf->chunk_sectors = mddev->new_chunk_sectors;
5910 if (raid5_alloc_percpu(conf) != 0) 6449 if (raid5_alloc_percpu(conf) != 0)
5911 goto abort; 6450 goto abort;
5912 6451
@@ -5939,12 +6478,17 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5939 conf->fullsync = 1; 6478 conf->fullsync = 1;
5940 } 6479 }
5941 6480
5942 conf->chunk_sectors = mddev->new_chunk_sectors;
5943 conf->level = mddev->new_level; 6481 conf->level = mddev->new_level;
5944 if (conf->level == 6) 6482 if (conf->level == 6) {
5945 conf->max_degraded = 2; 6483 conf->max_degraded = 2;
5946 else 6484 if (raid6_call.xor_syndrome)
6485 conf->rmw_level = PARITY_ENABLE_RMW;
6486 else
6487 conf->rmw_level = PARITY_DISABLE_RMW;
6488 } else {
5947 conf->max_degraded = 1; 6489 conf->max_degraded = 1;
6490 conf->rmw_level = PARITY_ENABLE_RMW;
6491 }
5948 conf->algorithm = mddev->new_layout; 6492 conf->algorithm = mddev->new_layout;
5949 conf->reshape_progress = mddev->reshape_position; 6493 conf->reshape_progress = mddev->reshape_position;
5950 if (conf->reshape_progress != MaxSector) { 6494 if (conf->reshape_progress != MaxSector) {
@@ -5952,10 +6496,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5952 conf->prev_algo = mddev->layout; 6496 conf->prev_algo = mddev->layout;
5953 } 6497 }
5954 6498
5955 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 6499 conf->min_nr_stripes = NR_STRIPES;
6500 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
5956 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 6501 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
5957 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 6502 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
5958 if (grow_stripes(conf, NR_STRIPES)) { 6503 if (grow_stripes(conf, conf->min_nr_stripes)) {
5959 printk(KERN_ERR 6504 printk(KERN_ERR
5960 "md/raid:%s: couldn't allocate %dkB for buffers\n", 6505 "md/raid:%s: couldn't allocate %dkB for buffers\n",
5961 mdname(mddev), memory); 6506 mdname(mddev), memory);
@@ -5963,6 +6508,17 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5963 } else 6508 } else
5964 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 6509 printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
5965 mdname(mddev), memory); 6510 mdname(mddev), memory);
6511 /*
6512	 * Losing a stripe head costs more than the time to refill it;
6513	 * it reduces the queue depth and so can hurt throughput.
6514	 * So set it rather large, scaled by the number of devices.
6515 */
6516 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
6517 conf->shrinker.scan_objects = raid5_cache_scan;
6518 conf->shrinker.count_objects = raid5_cache_count;
6519 conf->shrinker.batch = 128;
6520 conf->shrinker.flags = 0;
6521 register_shrinker(&conf->shrinker);
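
To put a number on that scaling: assuming the shrinker core's usual DEFAULT_SEEKS of 2, an 8-device array gets shrinker.seeks = 2 * 8 * 4 = 64, so reclaim scans this cache roughly 32 times less aggressively than a default-cost shrinker, matching the comment's point that refilling stripe_heads is comparatively expensive.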
5966 6522
5967 sprintf(pers_name, "raid%d", mddev->new_level); 6523 sprintf(pers_name, "raid%d", mddev->new_level);
5968 conf->thread = md_register_thread(raid5d, mddev, pers_name); 6524 conf->thread = md_register_thread(raid5d, mddev, pers_name);
@@ -6604,9 +7160,9 @@ static int check_stripe_cache(struct mddev *mddev)
6604 */ 7160 */
6605 struct r5conf *conf = mddev->private; 7161 struct r5conf *conf = mddev->private;
6606 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 7162 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
6607 > conf->max_nr_stripes || 7163 > conf->min_nr_stripes ||
6608 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 7164 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
6609 > conf->max_nr_stripes) { 7165 > conf->min_nr_stripes) {
6610 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 7166 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n",
6611 mdname(mddev), 7167 mdname(mddev),
6612 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 7168 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)