author		Markus Stockhausen <stockhausen@collogia.de>	2014-12-14 20:57:05 -0500
committer	NeilBrown <neilb@suse.de>			2015-04-21 18:00:42 -0400
commit		584acdd49cd2472ca0f5a06adbe979db82d0b4af (patch)
tree		94abdc5ca0208e47275bc2a8ad82c2d25cefddfd
parent		a582564b24bec0443b5c5ff43ee6d1258f8bd658 (diff)
md/raid5: activate raid6 rmw feature
Glue it all together. The raid6 rmw path should work the same as the
already existing raid5 logic, so emulate the prexor handling/flags and
split the functions as needed.
1) Enable xor_syndrome() in the async layer.
2) Split ops_run_prexor() into RAID4/5 and RAID6 logic. XOR the syndrome
at the start of an rmw run, as was already done for the single parity
(the parity math behind this is sketched after this list).
3) Take care of an rmw run in ops_run_reconstruct6(). Again, process only
the changed pages to get the syndrome back into sync.
4) Enhance set_syndrome_sources() to fill in NULL pages if we are in an rmw
run. The lower layer will calculate the start & stop pages from that and
call xor_syndrome() accordingly.
5) Adapt the several places that ignored Q handling up to now.
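
The parity math behind an rmw run, sketched in plain byte-wise C (an
illustration only, not kernel code; it assumes the usual GF(2^8) arithmetic
with generator g = 2 and the 0x1d reduction used by lib/raid6, and a
hypothetical slot numbering in which data slot 'slot' contributes with
coefficient g^slot to Q). The implementation does this in two passes -
prexor with the old data, then reconstruct with the written data - each
XOR-ing a partial syndrome over only the changed slots, which amounts to:

#include <stddef.h>
#include <stdint.h>

/* multiply by the RAID6 generator g = 2 in GF(2^8) (0x1d reduction) */
static uint8_t gf_mul2(uint8_t v)
{
	return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

/* fold one changed data block into the existing P and Q (rmw update) */
static void rmw_update_pq(uint8_t *p, uint8_t *q, const uint8_t *old,
			  const uint8_t *new, int slot, size_t len)
{
	size_t i;
	int k;

	for (i = 0; i < len; i++) {
		uint8_t delta = old[i] ^ new[i];	/* change in the data */
		uint8_t qdelta = delta;

		for (k = 0; k < slot; k++)		/* qdelta = g^slot * delta */
			qdelta = gf_mul2(qdelta);

		p[i] ^= delta;		/* P' = P ^ (Dold ^ Dnew)          */
		q[i] ^= qdelta;		/* Q' = Q ^ g^slot * (Dold ^ Dnew) */
	}
}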
Performance numbers for a single E5630 system with a mix of 10 7200rpm
desktop/server disks: 300 seconds of random writes with 8 threads onto a
3.2TB (10*400GB) RAID6 array with a 64K chunk and no spare (group_thread_cnt=4):
bsize    rmw_level=1    rmw_level=0    rmw_level=1    rmw_level=0
         skip_copy=1    skip_copy=1    skip_copy=0    skip_copy=0
  4K       115 KB/s       141 KB/s       165 KB/s       140 KB/s
  8K       225 KB/s       275 KB/s       324 KB/s       274 KB/s
 16K       434 KB/s       536 KB/s       640 KB/s       534 KB/s
 32K       751 KB/s     1,051 KB/s     1,234 KB/s     1,045 KB/s
 64K     1,339 KB/s     1,958 KB/s     2,282 KB/s     1,962 KB/s
128K     2,673 KB/s     3,862 KB/s     4,113 KB/s     3,898 KB/s
256K     7,685 KB/s     7,539 KB/s     7,557 KB/s     7,638 KB/s
512K    19,556 KB/s    19,558 KB/s    19,652 KB/s    19,688 KB/s
Signed-off-by: Markus Stockhausen <stockhausen@collogia.de>
Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r--	crypto/async_tx/async_pq.c	 19
-rw-r--r--	drivers/md/raid5.c		104
-rw-r--r--	drivers/md/raid5.h		 19
-rw-r--r--	include/linux/async_tx.h	  3
4 files changed, 115 insertions(+), 30 deletions(-)
diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index d05327caf69d..5d355e0c2633 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -124,6 +124,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 {
 	void **srcs;
 	int i;
+	int start = -1, stop = disks - 3;
 
 	if (submit->scribble)
 		srcs = submit->scribble;
@@ -134,10 +135,21 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 		if (blocks[i] == NULL) {
 			BUG_ON(i > disks - 3); /* P or Q can't be zero */
 			srcs[i] = (void*)raid6_empty_zero_page;
-		} else
+		} else {
 			srcs[i] = page_address(blocks[i]) + offset;
+			if (i < disks - 2) {
+				stop = i;
+				if (start == -1)
+					start = i;
+			}
+		}
 	}
-	raid6_call.gen_syndrome(disks, len, srcs);
+	if (submit->flags & ASYNC_TX_PQ_XOR_DST) {
+		BUG_ON(!raid6_call.xor_syndrome);
+		if (start >= 0)
+			raid6_call.xor_syndrome(disks, start, stop, len, srcs);
+	} else
+		raid6_call.gen_syndrome(disks, len, srcs);
 	async_tx_sync_epilog(submit);
 }
 
@@ -178,7 +190,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 	if (device)
 		unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO);
 
-	if (unmap &&
+	/* XORing P/Q is only implemented in software */
+	if (unmap && !(submit->flags & ASYNC_TX_PQ_XOR_DST) &&
 	    (src_cnt <= dma_maxpq(device, 0) ||
 	     dma_maxpq(device, DMA_PREP_CONTINUE) > 0) &&
 	    is_dma_pq_aligned(device, offset, 0, len)) {
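
A minimal caller sketch of the contract introduced above (hypothetical
helper, not part of this patch; rmw_prexor_sketch and its page arguments
are illustrative): untouched data slots are passed as NULL, the current P
and Q pages sit in the last two slots, and ASYNC_TX_PQ_XOR_DST makes
do_sync_gen_syndrome() derive the start/stop range from the non-NULL data
slots and XOR the partial syndrome into P/Q via raid6_call.xor_syndrome()
instead of regenerating them. ops_run_prexor6() in the raid5.c changes
below follows the same pattern.

#include <linux/async_tx.h>

static struct dma_async_tx_descriptor *
rmw_prexor_sketch(struct page **blocks, int disks, int changed_slot,
		  struct page *old_data, struct page *p, struct page *q,
		  addr_conv_t *scribble,
		  struct dma_async_tx_descriptor *depend_tx)
{
	struct async_submit_ctl submit;
	int i;

	for (i = 0; i < disks - 2; i++)
		blocks[i] = NULL;		/* untouched data slots are skipped */
	blocks[changed_slot] = old_data;	/* data that is about to change */
	blocks[disks - 2] = p;			/* current P, XOR-updated in place */
	blocks[disks - 1] = q;			/* current Q, XOR-updated in place */

	init_async_submit(&submit, ASYNC_TX_FENCE | ASYNC_TX_PQ_XOR_DST,
			  depend_tx, NULL, NULL, scribble);
	return async_gen_syndrome(blocks, 0, disks, PAGE_SIZE, &submit);
}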
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3ae097d50b51..c82ce1fd8723 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1317,7 +1317,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
  * destination buffer is recorded in srcs[count] and the Q destination
  * is recorded in srcs[count+1]].
  */
-static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
+static int set_syndrome_sources(struct page **srcs,
+				struct stripe_head *sh,
+				int srctype)
 {
 	int disks = sh->disks;
 	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
@@ -1332,8 +1334,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
 	i = d0_idx;
 	do {
 		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+		struct r5dev *dev = &sh->dev[i];
 
-		srcs[slot] = sh->dev[i].page;
+		if (i == sh->qd_idx || i == sh->pd_idx ||
+		    (srctype == SYNDROME_SRC_ALL) ||
+		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
+		     test_bit(R5_Wantdrain, &dev->flags)) ||
+		    (srctype == SYNDROME_SRC_WRITTEN &&
+		     dev->written))
+			srcs[slot] = sh->dev[i].page;
 		i = raid6_next_disk(i, disks);
 	} while (i != d0_idx);
 
@@ -1373,7 +1382,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 	atomic_inc(&sh->count);
 
 	if (target == qd_idx) {
-		count = set_syndrome_sources(blocks, sh);
+		count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
 		blocks[count] = NULL; /* regenerating p is not necessary */
 		BUG_ON(blocks[count+1] != dest); /* q should already be set */
 		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
@@ -1481,7 +1490,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
 				       &submit);
 
-			count = set_syndrome_sources(blocks, sh);
+			count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
 			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
 					  ops_complete_compute, sh,
 					  to_addr_conv(sh, percpu, 0));
@@ -1515,8 +1524,8 @@ static void ops_complete_prexor(void *stripe_head_ref)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
-	       struct dma_async_tx_descriptor *tx)
+ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
+		struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
 	struct page **xor_srcs = to_addr_page(percpu, 0);
@@ -1545,6 +1554,26 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 }
 
 static struct dma_async_tx_descriptor *
+ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
+		struct dma_async_tx_descriptor *tx)
+{
+	struct page **blocks = to_addr_page(percpu, 0);
+	int count;
+	struct async_submit_ctl submit;
+
+	pr_debug("%s: stripe %llu\n", __func__,
+		(unsigned long long)sh->sector);
+
+	count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
+
+	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
+			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
+	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+
+	return tx;
+}
+
+static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
@@ -1746,6 +1775,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 	int count, i, j = 0;
 	struct stripe_head *head_sh = sh;
 	int last_stripe;
+	int synflags;
+	unsigned long txflags;
 
 	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
@@ -1765,14 +1796,23 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 
 again:
 	blocks = to_addr_page(percpu, j);
-	count = set_syndrome_sources(blocks, sh);
+
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+		synflags = SYNDROME_SRC_WRITTEN;
+		txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
+	} else {
+		synflags = SYNDROME_SRC_ALL;
+		txflags = ASYNC_TX_ACK;
+	}
+
+	count = set_syndrome_sources(blocks, sh, synflags);
 	last_stripe = !head_sh->batch_head ||
 		list_first_entry(&sh->batch_list,
 				 struct stripe_head, batch_list) == head_sh;
 
 	if (last_stripe) {
 		atomic_inc(&head_sh->count);
-		init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+		init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
 				  head_sh, to_addr_conv(sh, percpu, j));
 	} else
 		init_async_submit(&submit, 0, tx, NULL, NULL,
@@ -1843,7 +1883,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 		(unsigned long long)sh->sector, checkp);
 
 	BUG_ON(sh->batch_head);
-	count = set_syndrome_sources(srcs, sh);
+	count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
 	if (!checkp)
 		srcs[count] = NULL;
 
@@ -1884,8 +1924,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			async_tx_ack(tx);
 	}
 
-	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
-		tx = ops_run_prexor(sh, percpu, tx);
+	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
+		if (level < 6)
+			tx = ops_run_prexor5(sh, percpu, tx);
+		else
+			tx = ops_run_prexor6(sh, percpu, tx);
+	}
 
 	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
 		tx = ops_run_biodrain(sh, tx);
@@ -2770,7 +2814,7 @@ static void
 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 			int rcw, int expand)
 {
-	int i, pd_idx = sh->pd_idx, disks = sh->disks;
+	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
 	struct r5conf *conf = sh->raid_conf;
 	int level = conf->level;
 
@@ -2806,13 +2850,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
 			atomic_inc(&conf->pending_full_writes);
 	} else {
-		BUG_ON(level == 6);
 		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
 			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
+		BUG_ON(level == 6 &&
+			(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
+			   test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (i == pd_idx)
+			if (i == pd_idx || i == qd_idx)
 				continue;
 
 			if (dev->towrite &&
@@ -3476,28 +3522,27 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	int rmw = 0, rcw = 0, i;
 	sector_t recovery_cp = conf->mddev->recovery_cp;
 
-	/* RAID6 requires 'rcw' in current implementation.
-	 * Otherwise, check whether resync is now happening or should start.
+	/* Check whether resync is now happening or should start.
 	 * If yes, then the array is dirty (after unclean shutdown or
 	 * initial creation), so parity in some stripes might be inconsistent.
 	 * In this case, we need to always do reconstruct-write, to ensure
 	 * that in case of drive failure or read-error correction, we
 	 * generate correct data from the parity.
 	 */
-	if (conf->max_degraded == 2 ||
+	if (conf->rmw_level == PARITY_DISABLE_RMW ||
 	    (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
 	     s->failed == 0)) {
 		/* Calculate the real rcw later - for now make it
 		 * look like rcw is cheaper
 		 */
 		rcw = 1; rmw = 2;
-		pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
-			 conf->max_degraded, (unsigned long long)recovery_cp,
+		pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
+			 conf->rmw_level, (unsigned long long)recovery_cp,
 			 (unsigned long long)sh->sector);
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx) &&
+		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3507,7 +3552,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			rmw += 2*disks; /* cannot read it */
 		}
 		/* Would I have to read this buffer for reconstruct_write */
-		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+		if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+		    i != sh->pd_idx && i != sh->qd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3520,7 +3566,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
 		 (unsigned long long)sh->sector, rmw, rcw);
 	set_bit(STRIPE_HANDLE, &sh->state);
-	if (rmw < rcw && rmw > 0) {
+	if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
 		/* prefer read-modify-write, but need to get some data */
 		if (conf->mddev->queue)
 			blk_add_trace_msg(conf->mddev->queue,
@@ -3528,7 +3574,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 					  (unsigned long long)sh->sector, rmw);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx) &&
+			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
 			     test_bit(R5_Wantcompute, &dev->flags)) &&
@@ -3547,7 +3593,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			}
 		}
 	}
-	if (rcw <= rmw && rcw > 0) {
+	if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
 		/* want reconstruct write, but need to get some data */
 		int qread =0;
 		rcw = 0;
@@ -6344,10 +6390,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	}
 
 	conf->level = mddev->new_level;
-	if (conf->level == 6)
+	if (conf->level == 6) {
 		conf->max_degraded = 2;
-	else
+		if (raid6_call.xor_syndrome)
+			conf->rmw_level = PARITY_ENABLE_RMW;
+		else
+			conf->rmw_level = PARITY_DISABLE_RMW;
+	} else {
 		conf->max_degraded = 1;
+		conf->rmw_level = PARITY_ENABLE_RMW;
+	}
 	conf->algorithm = mddev->new_layout;
 	conf->reshape_progress = mddev->reshape_position;
 	if (conf->reshape_progress != MaxSector) {
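
A rough worked example of the cost comparison that handle_stripe_dirtying()
now applies to RAID6 (assuming an uncached stripe, purely illustrative): on a
10-disk array (8 data + P + Q), a write that fully overwrites 2 data blocks
counts about rmw = 2 old data blocks + P + Q = 4 reads, against
rcw = 6 untouched data blocks = 6 reads, so with rmw_level allowing it the
read-modify-write path is chosen; the narrower the write relative to the
stripe width, the larger the saving.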
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ee65ed844d3f..57fef9ba36fa 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -355,6 +355,23 @@ enum {
 	STRIPE_OP_RECONSTRUCT,
 	STRIPE_OP_CHECK,
 };
+
+/*
+ * RAID parity calculation preferences
+ */
+enum {
+	PARITY_DISABLE_RMW = 0,
+	PARITY_ENABLE_RMW,
+};
+
+/*
+ * Pages requested from set_syndrome_sources()
+ */
+enum {
+	SYNDROME_SRC_ALL,
+	SYNDROME_SRC_WANT_DRAIN,
+	SYNDROME_SRC_WRITTEN,
+};
 /*
  * Plugging:
  *
@@ -411,7 +428,7 @@ struct r5conf {
 	spinlock_t		hash_locks[NR_STRIPE_HASH_LOCKS];
 	struct mddev		*mddev;
 	int			chunk_sectors;
-	int			level, algorithm;
+	int			level, algorithm, rmw_level;
 	int			max_degraded;
 	int			raid_disks;
 	int			max_nr_stripes;
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 179b38ffd351..388574ea38ed 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -60,12 +60,15 @@ struct dma_chan_ref {
  *	dependency chain
  * @ASYNC_TX_FENCE: specify that the next operation in the dependency
  *	chain uses this operation's result as an input
+ * @ASYNC_TX_PQ_XOR_DST: do not overwrite the syndrome but XOR it with the
+ *	input data. Required for rmw case.
  */
 enum async_tx_flags {
 	ASYNC_TX_XOR_ZERO_DST	 = (1 << 0),
 	ASYNC_TX_XOR_DROP_DST	 = (1 << 1),
 	ASYNC_TX_ACK		 = (1 << 2),
 	ASYNC_TX_FENCE		 = (1 << 3),
+	ASYNC_TX_PQ_XOR_DST	 = (1 << 4),
 };
 
 /**