diff options
author | Dan Williams <dan.j.williams@intel.com> | 2009-09-08 20:42:29 -0400 |
---|---|---|
committer | Dan Williams <dan.j.williams@intel.com> | 2009-09-08 20:42:29 -0400 |
commit | f9dd2134374c8de6b911e2b8652c6c9622eaa658 (patch) | |
tree | c1b8f8d622941606b9e7247ab31d811ba4295011 /drivers | |
parent | 4b652f0db3be891c7b76b109c3b55003b920fc96 (diff) | |
parent | 07a3b417dc3d00802bd7b4874c3e811f0b015a7d (diff) |
Merge branch 'md-raid6-accel' into ioat3.2
Conflicts:
include/linux/dmaengine.h
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/dma/Kconfig | 2 | ||||
-rw-r--r-- | drivers/dma/dmaengine.c | 53 | ||||
-rw-r--r-- | drivers/dma/dmatest.c | 26 | ||||
-rw-r--r-- | drivers/dma/iop-adma.c | 40 | ||||
-rw-r--r-- | drivers/md/Kconfig | 26 | ||||
-rw-r--r-- | drivers/md/raid5.c | 1486 | ||||
-rw-r--r-- | drivers/md/raid5.h | 28 |
7 files changed, 1021 insertions, 640 deletions
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 3b3c01b6f1ee..912a51b5cbd3 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig | |||
@@ -4,7 +4,7 @@ | |||
4 | 4 | ||
5 | menuconfig DMADEVICES | 5 | menuconfig DMADEVICES |
6 | bool "DMA Engine support" | 6 | bool "DMA Engine support" |
7 | depends on !HIGHMEM64G && HAS_DMA | 7 | depends on HAS_DMA |
8 | help | 8 | help |
9 | DMA engines can do asynchronous data transfers without | 9 | DMA engines can do asynchronous data transfers without |
10 | involving the host CPU. Currently, this framework can be | 10 | involving the host CPU. Currently, this framework can be |
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 5a87384ea4ff..96598479eece 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c | |||
@@ -644,8 +644,12 @@ int dma_async_device_register(struct dma_device *device) | |||
644 | !device->device_prep_dma_memcpy); | 644 | !device->device_prep_dma_memcpy); |
645 | BUG_ON(dma_has_cap(DMA_XOR, device->cap_mask) && | 645 | BUG_ON(dma_has_cap(DMA_XOR, device->cap_mask) && |
646 | !device->device_prep_dma_xor); | 646 | !device->device_prep_dma_xor); |
647 | BUG_ON(dma_has_cap(DMA_ZERO_SUM, device->cap_mask) && | 647 | BUG_ON(dma_has_cap(DMA_XOR_VAL, device->cap_mask) && |
648 | !device->device_prep_dma_zero_sum); | 648 | !device->device_prep_dma_xor_val); |
649 | BUG_ON(dma_has_cap(DMA_PQ, device->cap_mask) && | ||
650 | !device->device_prep_dma_pq); | ||
651 | BUG_ON(dma_has_cap(DMA_PQ_VAL, device->cap_mask) && | ||
652 | !device->device_prep_dma_pq_val); | ||
649 | BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) && | 653 | BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) && |
650 | !device->device_prep_dma_memset); | 654 | !device->device_prep_dma_memset); |
651 | BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) && | 655 | BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) && |
@@ -939,49 +943,24 @@ EXPORT_SYMBOL(dma_async_tx_descriptor_init); | |||
939 | 943 | ||
940 | /* dma_wait_for_async_tx - spin wait for a transaction to complete | 944 | /* dma_wait_for_async_tx - spin wait for a transaction to complete |
941 | * @tx: in-flight transaction to wait on | 945 | * @tx: in-flight transaction to wait on |
942 | * | ||
943 | * This routine assumes that tx was obtained from a call to async_memcpy, | ||
944 | * async_xor, async_memset, etc which ensures that tx is "in-flight" (prepped | ||
945 | * and submitted). Walking the parent chain is only meant to cover for DMA | ||
946 | * drivers that do not implement the DMA_INTERRUPT capability and may race with | ||
947 | * the driver's descriptor cleanup routine. | ||
948 | */ | 946 | */ |
949 | enum dma_status | 947 | enum dma_status |
950 | dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) | 948 | dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) |
951 | { | 949 | { |
952 | enum dma_status status; | 950 | unsigned long dma_sync_wait_timeout = jiffies + msecs_to_jiffies(5000); |
953 | struct dma_async_tx_descriptor *iter; | ||
954 | struct dma_async_tx_descriptor *parent; | ||
955 | 951 | ||
956 | if (!tx) | 952 | if (!tx) |
957 | return DMA_SUCCESS; | 953 | return DMA_SUCCESS; |
958 | 954 | ||
959 | WARN_ONCE(tx->parent, "%s: speculatively walking dependency chain for" | 955 | while (tx->cookie == -EBUSY) { |
960 | " %s\n", __func__, dma_chan_name(tx->chan)); | 956 | if (time_after_eq(jiffies, dma_sync_wait_timeout)) { |
961 | 957 | pr_err("%s timeout waiting for descriptor submission\n", | |
962 | /* poll through the dependency chain, return when tx is complete */ | 958 | __func__); |
963 | do { | 959 | return DMA_ERROR; |
964 | iter = tx; | 960 | } |
965 | 961 | cpu_relax(); | |
966 | /* find the root of the unsubmitted dependency chain */ | 962 | } |
967 | do { | 963 | return dma_sync_wait(tx->chan, tx->cookie); |
968 | parent = iter->parent; | ||
969 | if (!parent) | ||
970 | break; | ||
971 | else | ||
972 | iter = parent; | ||
973 | } while (parent); | ||
974 | |||
975 | /* there is a small window for ->parent == NULL and | ||
976 | * ->cookie == -EBUSY | ||
977 | */ | ||
978 | while (iter->cookie == -EBUSY) | ||
979 | cpu_relax(); | ||
980 | |||
981 | status = dma_sync_wait(iter->chan, iter->cookie); | ||
982 | } while (status == DMA_IN_PROGRESS || (iter != tx)); | ||
983 | |||
984 | return status; | ||
985 | } | 964 | } |
986 | EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); | 965 | EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); |
987 | 966 | ||
diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index fb7da5141e96..58e49e41c7a3 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c | |||
@@ -43,6 +43,11 @@ module_param(xor_sources, uint, S_IRUGO); | |||
43 | MODULE_PARM_DESC(xor_sources, | 43 | MODULE_PARM_DESC(xor_sources, |
44 | "Number of xor source buffers (default: 3)"); | 44 | "Number of xor source buffers (default: 3)"); |
45 | 45 | ||
46 | static unsigned int pq_sources = 3; | ||
47 | module_param(pq_sources, uint, S_IRUGO); | ||
48 | MODULE_PARM_DESC(pq_sources, | ||
49 | "Number of p+q source buffers (default: 3)"); | ||
50 | |||
46 | /* | 51 | /* |
47 | * Initialization patterns. All bytes in the source buffer has bit 7 | 52 | * Initialization patterns. All bytes in the source buffer has bit 7 |
48 | * set, all bytes in the destination buffer has bit 7 cleared. | 53 | * set, all bytes in the destination buffer has bit 7 cleared. |
@@ -227,6 +232,7 @@ static int dmatest_func(void *data) | |||
227 | dma_cookie_t cookie; | 232 | dma_cookie_t cookie; |
228 | enum dma_status status; | 233 | enum dma_status status; |
229 | enum dma_ctrl_flags flags; | 234 | enum dma_ctrl_flags flags; |
235 | u8 pq_coefs[pq_sources]; | ||
230 | int ret; | 236 | int ret; |
231 | int src_cnt; | 237 | int src_cnt; |
232 | int dst_cnt; | 238 | int dst_cnt; |
@@ -243,6 +249,11 @@ static int dmatest_func(void *data) | |||
243 | else if (thread->type == DMA_XOR) { | 249 | else if (thread->type == DMA_XOR) { |
244 | src_cnt = xor_sources | 1; /* force odd to ensure dst = src */ | 250 | src_cnt = xor_sources | 1; /* force odd to ensure dst = src */ |
245 | dst_cnt = 1; | 251 | dst_cnt = 1; |
252 | } else if (thread->type == DMA_PQ) { | ||
253 | src_cnt = pq_sources | 1; /* force odd to ensure dst = src */ | ||
254 | dst_cnt = 2; | ||
255 | for (i = 0; i < pq_sources; i++) | ||
256 | pq_coefs[i] = 1; | ||
246 | } else | 257 | } else |
247 | goto err_srcs; | 258 | goto err_srcs; |
248 | 259 | ||
@@ -310,6 +321,15 @@ static int dmatest_func(void *data) | |||
310 | dma_dsts[0] + dst_off, | 321 | dma_dsts[0] + dst_off, |
311 | dma_srcs, xor_sources, | 322 | dma_srcs, xor_sources, |
312 | len, flags); | 323 | len, flags); |
324 | else if (thread->type == DMA_PQ) { | ||
325 | dma_addr_t dma_pq[dst_cnt]; | ||
326 | |||
327 | for (i = 0; i < dst_cnt; i++) | ||
328 | dma_pq[i] = dma_dsts[i] + dst_off; | ||
329 | tx = dev->device_prep_dma_pq(chan, dma_pq, dma_srcs, | ||
330 | pq_sources, pq_coefs, | ||
331 | len, flags); | ||
332 | } | ||
313 | 333 | ||
314 | if (!tx) { | 334 | if (!tx) { |
315 | for (i = 0; i < src_cnt; i++) | 335 | for (i = 0; i < src_cnt; i++) |
@@ -446,6 +466,8 @@ static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_ty | |||
446 | op = "copy"; | 466 | op = "copy"; |
447 | else if (type == DMA_XOR) | 467 | else if (type == DMA_XOR) |
448 | op = "xor"; | 468 | op = "xor"; |
469 | else if (type == DMA_PQ) | ||
470 | op = "pq"; | ||
449 | else | 471 | else |
450 | return -EINVAL; | 472 | return -EINVAL; |
451 | 473 | ||
@@ -501,6 +523,10 @@ static int dmatest_add_channel(struct dma_chan *chan) | |||
501 | cnt = dmatest_add_threads(dtc, DMA_XOR); | 523 | cnt = dmatest_add_threads(dtc, DMA_XOR); |
502 | thread_count += cnt > 0 ?: 0; | 524 | thread_count += cnt > 0 ?: 0; |
503 | } | 525 | } |
526 | if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) { | ||
527 | cnt = dmatest_add_threads(dtc, DMA_PQ); | ||
528 | thread_count += cnt > 0 ?: 0; | ||
529 | } | ||
504 | 530 | ||
505 | pr_info("dmatest: Started %u threads using %s\n", | 531 | pr_info("dmatest: Started %u threads using %s\n", |
506 | thread_count, dma_chan_name(chan)); | 532 | thread_count, dma_chan_name(chan)); |
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index 2f052265122f..4496bc606662 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c | |||
@@ -660,9 +660,9 @@ iop_adma_prep_dma_xor(struct dma_chan *chan, dma_addr_t dma_dest, | |||
660 | } | 660 | } |
661 | 661 | ||
662 | static struct dma_async_tx_descriptor * | 662 | static struct dma_async_tx_descriptor * |
663 | iop_adma_prep_dma_zero_sum(struct dma_chan *chan, dma_addr_t *dma_src, | 663 | iop_adma_prep_dma_xor_val(struct dma_chan *chan, dma_addr_t *dma_src, |
664 | unsigned int src_cnt, size_t len, u32 *result, | 664 | unsigned int src_cnt, size_t len, u32 *result, |
665 | unsigned long flags) | 665 | unsigned long flags) |
666 | { | 666 | { |
667 | struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); | 667 | struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); |
668 | struct iop_adma_desc_slot *sw_desc, *grp_start; | 668 | struct iop_adma_desc_slot *sw_desc, *grp_start; |
@@ -906,7 +906,7 @@ out: | |||
906 | 906 | ||
907 | #define IOP_ADMA_NUM_SRC_TEST 4 /* must be <= 15 */ | 907 | #define IOP_ADMA_NUM_SRC_TEST 4 /* must be <= 15 */ |
908 | static int __devinit | 908 | static int __devinit |
909 | iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) | 909 | iop_adma_xor_val_self_test(struct iop_adma_device *device) |
910 | { | 910 | { |
911 | int i, src_idx; | 911 | int i, src_idx; |
912 | struct page *dest; | 912 | struct page *dest; |
@@ -1002,7 +1002,7 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) | |||
1002 | PAGE_SIZE, DMA_TO_DEVICE); | 1002 | PAGE_SIZE, DMA_TO_DEVICE); |
1003 | 1003 | ||
1004 | /* skip zero sum if the capability is not present */ | 1004 | /* skip zero sum if the capability is not present */ |
1005 | if (!dma_has_cap(DMA_ZERO_SUM, dma_chan->device->cap_mask)) | 1005 | if (!dma_has_cap(DMA_XOR_VAL, dma_chan->device->cap_mask)) |
1006 | goto free_resources; | 1006 | goto free_resources; |
1007 | 1007 | ||
1008 | /* zero sum the sources with the destintation page */ | 1008 | /* zero sum the sources with the destintation page */ |
@@ -1016,10 +1016,10 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) | |||
1016 | dma_srcs[i] = dma_map_page(dma_chan->device->dev, | 1016 | dma_srcs[i] = dma_map_page(dma_chan->device->dev, |
1017 | zero_sum_srcs[i], 0, PAGE_SIZE, | 1017 | zero_sum_srcs[i], 0, PAGE_SIZE, |
1018 | DMA_TO_DEVICE); | 1018 | DMA_TO_DEVICE); |
1019 | tx = iop_adma_prep_dma_zero_sum(dma_chan, dma_srcs, | 1019 | tx = iop_adma_prep_dma_xor_val(dma_chan, dma_srcs, |
1020 | IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, | 1020 | IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, |
1021 | &zero_sum_result, | 1021 | &zero_sum_result, |
1022 | DMA_PREP_INTERRUPT | DMA_CTRL_ACK); | 1022 | DMA_PREP_INTERRUPT | DMA_CTRL_ACK); |
1023 | 1023 | ||
1024 | cookie = iop_adma_tx_submit(tx); | 1024 | cookie = iop_adma_tx_submit(tx); |
1025 | iop_adma_issue_pending(dma_chan); | 1025 | iop_adma_issue_pending(dma_chan); |
@@ -1072,10 +1072,10 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) | |||
1072 | dma_srcs[i] = dma_map_page(dma_chan->device->dev, | 1072 | dma_srcs[i] = dma_map_page(dma_chan->device->dev, |
1073 | zero_sum_srcs[i], 0, PAGE_SIZE, | 1073 | zero_sum_srcs[i], 0, PAGE_SIZE, |
1074 | DMA_TO_DEVICE); | 1074 | DMA_TO_DEVICE); |
1075 | tx = iop_adma_prep_dma_zero_sum(dma_chan, dma_srcs, | 1075 | tx = iop_adma_prep_dma_xor_val(dma_chan, dma_srcs, |
1076 | IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, | 1076 | IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, |
1077 | &zero_sum_result, | 1077 | &zero_sum_result, |
1078 | DMA_PREP_INTERRUPT | DMA_CTRL_ACK); | 1078 | DMA_PREP_INTERRUPT | DMA_CTRL_ACK); |
1079 | 1079 | ||
1080 | cookie = iop_adma_tx_submit(tx); | 1080 | cookie = iop_adma_tx_submit(tx); |
1081 | iop_adma_issue_pending(dma_chan); | 1081 | iop_adma_issue_pending(dma_chan); |
@@ -1192,9 +1192,9 @@ static int __devinit iop_adma_probe(struct platform_device *pdev) | |||
1192 | dma_dev->max_xor = iop_adma_get_max_xor(); | 1192 | dma_dev->max_xor = iop_adma_get_max_xor(); |
1193 | dma_dev->device_prep_dma_xor = iop_adma_prep_dma_xor; | 1193 | dma_dev->device_prep_dma_xor = iop_adma_prep_dma_xor; |
1194 | } | 1194 | } |
1195 | if (dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask)) | 1195 | if (dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask)) |
1196 | dma_dev->device_prep_dma_zero_sum = | 1196 | dma_dev->device_prep_dma_xor_val = |
1197 | iop_adma_prep_dma_zero_sum; | 1197 | iop_adma_prep_dma_xor_val; |
1198 | if (dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask)) | 1198 | if (dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask)) |
1199 | dma_dev->device_prep_dma_interrupt = | 1199 | dma_dev->device_prep_dma_interrupt = |
1200 | iop_adma_prep_dma_interrupt; | 1200 | iop_adma_prep_dma_interrupt; |
@@ -1249,7 +1249,7 @@ static int __devinit iop_adma_probe(struct platform_device *pdev) | |||
1249 | 1249 | ||
1250 | if (dma_has_cap(DMA_XOR, dma_dev->cap_mask) || | 1250 | if (dma_has_cap(DMA_XOR, dma_dev->cap_mask) || |
1251 | dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) { | 1251 | dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) { |
1252 | ret = iop_adma_xor_zero_sum_self_test(adev); | 1252 | ret = iop_adma_xor_val_self_test(adev); |
1253 | dev_dbg(&pdev->dev, "xor self test returned %d\n", ret); | 1253 | dev_dbg(&pdev->dev, "xor self test returned %d\n", ret); |
1254 | if (ret) | 1254 | if (ret) |
1255 | goto err_free_iop_chan; | 1255 | goto err_free_iop_chan; |
@@ -1257,12 +1257,12 @@ static int __devinit iop_adma_probe(struct platform_device *pdev) | |||
1257 | 1257 | ||
1258 | dev_printk(KERN_INFO, &pdev->dev, "Intel(R) IOP: " | 1258 | dev_printk(KERN_INFO, &pdev->dev, "Intel(R) IOP: " |
1259 | "( %s%s%s%s%s%s%s%s%s%s)\n", | 1259 | "( %s%s%s%s%s%s%s%s%s%s)\n", |
1260 | dma_has_cap(DMA_PQ_XOR, dma_dev->cap_mask) ? "pq_xor " : "", | 1260 | dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "", |
1261 | dma_has_cap(DMA_PQ_UPDATE, dma_dev->cap_mask) ? "pq_update " : "", | 1261 | dma_has_cap(DMA_PQ_UPDATE, dma_dev->cap_mask) ? "pq_update " : "", |
1262 | dma_has_cap(DMA_PQ_ZERO_SUM, dma_dev->cap_mask) ? "pq_zero_sum " : "", | 1262 | dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask) ? "pq_val " : "", |
1263 | dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "", | 1263 | dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "", |
1264 | dma_has_cap(DMA_DUAL_XOR, dma_dev->cap_mask) ? "dual_xor " : "", | 1264 | dma_has_cap(DMA_DUAL_XOR, dma_dev->cap_mask) ? "dual_xor " : "", |
1265 | dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask) ? "xor_zero_sum " : "", | 1265 | dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask) ? "xor_val " : "", |
1266 | dma_has_cap(DMA_MEMSET, dma_dev->cap_mask) ? "fill " : "", | 1266 | dma_has_cap(DMA_MEMSET, dma_dev->cap_mask) ? "fill " : "", |
1267 | dma_has_cap(DMA_MEMCPY_CRC32C, dma_dev->cap_mask) ? "cpy+crc " : "", | 1267 | dma_has_cap(DMA_MEMCPY_CRC32C, dma_dev->cap_mask) ? "cpy+crc " : "", |
1268 | dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "", | 1268 | dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "", |
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 36e0675be9f7..09c0c6e49ab5 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -124,6 +124,8 @@ config MD_RAID456 | |||
124 | select MD_RAID6_PQ | 124 | select MD_RAID6_PQ |
125 | select ASYNC_MEMCPY | 125 | select ASYNC_MEMCPY |
126 | select ASYNC_XOR | 126 | select ASYNC_XOR |
127 | select ASYNC_PQ | ||
128 | select ASYNC_RAID6_RECOV | ||
127 | ---help--- | 129 | ---help--- |
128 | A RAID-5 set of N drives with a capacity of C MB per drive provides | 130 | A RAID-5 set of N drives with a capacity of C MB per drive provides |
129 | the capacity of C * (N - 1) MB, and protects against a failure | 131 | the capacity of C * (N - 1) MB, and protects against a failure |
@@ -152,9 +154,33 @@ config MD_RAID456 | |||
152 | 154 | ||
153 | If unsure, say Y. | 155 | If unsure, say Y. |
154 | 156 | ||
157 | config MULTICORE_RAID456 | ||
158 | bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)" | ||
159 | depends on MD_RAID456 | ||
160 | depends on SMP | ||
161 | depends on EXPERIMENTAL | ||
162 | ---help--- | ||
163 | Enable the raid456 module to dispatch per-stripe raid operations to a | ||
164 | thread pool. | ||
165 | |||
166 | If unsure, say N. | ||
167 | |||
155 | config MD_RAID6_PQ | 168 | config MD_RAID6_PQ |
156 | tristate | 169 | tristate |
157 | 170 | ||
171 | config ASYNC_RAID6_TEST | ||
172 | tristate "Self test for hardware accelerated raid6 recovery" | ||
173 | depends on MD_RAID6_PQ | ||
174 | select ASYNC_RAID6_RECOV | ||
175 | ---help--- | ||
176 | This is a one-shot self test that permutes through the | ||
177 | recovery of all the possible two disk failure scenarios for a | ||
178 | N-disk array. Recovery is performed with the asynchronous | ||
179 | raid6 recovery routines, and will optionally use an offload | ||
180 | engine if one is available. | ||
181 | |||
182 | If unsure, say N. | ||
183 | |||
158 | config MD_MULTIPATH | 184 | config MD_MULTIPATH |
159 | tristate "Multipath I/O support" | 185 | tristate "Multipath I/O support" |
160 | depends on BLK_DEV_MD | 186 | depends on BLK_DEV_MD |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index bb37fb1b2d82..0a5cf2171214 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -47,7 +47,9 @@ | |||
47 | #include <linux/kthread.h> | 47 | #include <linux/kthread.h> |
48 | #include <linux/raid/pq.h> | 48 | #include <linux/raid/pq.h> |
49 | #include <linux/async_tx.h> | 49 | #include <linux/async_tx.h> |
50 | #include <linux/async.h> | ||
50 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
52 | #include <linux/cpu.h> | ||
51 | #include "md.h" | 53 | #include "md.h" |
52 | #include "raid5.h" | 54 | #include "raid5.h" |
53 | #include "bitmap.h" | 55 | #include "bitmap.h" |
@@ -499,11 +501,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
499 | struct page *bio_page; | 501 | struct page *bio_page; |
500 | int i; | 502 | int i; |
501 | int page_offset; | 503 | int page_offset; |
504 | struct async_submit_ctl submit; | ||
502 | 505 | ||
503 | if (bio->bi_sector >= sector) | 506 | if (bio->bi_sector >= sector) |
504 | page_offset = (signed)(bio->bi_sector - sector) * 512; | 507 | page_offset = (signed)(bio->bi_sector - sector) * 512; |
505 | else | 508 | else |
506 | page_offset = (signed)(sector - bio->bi_sector) * -512; | 509 | page_offset = (signed)(sector - bio->bi_sector) * -512; |
510 | |||
511 | init_async_submit(&submit, 0, tx, NULL, NULL, NULL); | ||
507 | bio_for_each_segment(bvl, bio, i) { | 512 | bio_for_each_segment(bvl, bio, i) { |
508 | int len = bio_iovec_idx(bio, i)->bv_len; | 513 | int len = bio_iovec_idx(bio, i)->bv_len; |
509 | int clen; | 514 | int clen; |
@@ -525,15 +530,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
525 | bio_page = bio_iovec_idx(bio, i)->bv_page; | 530 | bio_page = bio_iovec_idx(bio, i)->bv_page; |
526 | if (frombio) | 531 | if (frombio) |
527 | tx = async_memcpy(page, bio_page, page_offset, | 532 | tx = async_memcpy(page, bio_page, page_offset, |
528 | b_offset, clen, | 533 | b_offset, clen, &submit); |
529 | ASYNC_TX_DEP_ACK, | ||
530 | tx, NULL, NULL); | ||
531 | else | 534 | else |
532 | tx = async_memcpy(bio_page, page, b_offset, | 535 | tx = async_memcpy(bio_page, page, b_offset, |
533 | page_offset, clen, | 536 | page_offset, clen, &submit); |
534 | ASYNC_TX_DEP_ACK, | ||
535 | tx, NULL, NULL); | ||
536 | } | 537 | } |
538 | /* chain the operations */ | ||
539 | submit.depend_tx = tx; | ||
540 | |||
537 | if (clen < len) /* hit end of page */ | 541 | if (clen < len) /* hit end of page */ |
538 | break; | 542 | break; |
539 | page_offset += len; | 543 | page_offset += len; |
@@ -592,6 +596,7 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
592 | { | 596 | { |
593 | struct dma_async_tx_descriptor *tx = NULL; | 597 | struct dma_async_tx_descriptor *tx = NULL; |
594 | raid5_conf_t *conf = sh->raid_conf; | 598 | raid5_conf_t *conf = sh->raid_conf; |
599 | struct async_submit_ctl submit; | ||
595 | int i; | 600 | int i; |
596 | 601 | ||
597 | pr_debug("%s: stripe %llu\n", __func__, | 602 | pr_debug("%s: stripe %llu\n", __func__, |
@@ -615,22 +620,34 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
615 | } | 620 | } |
616 | 621 | ||
617 | atomic_inc(&sh->count); | 622 | atomic_inc(&sh->count); |
618 | async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, | 623 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); |
619 | ops_complete_biofill, sh); | 624 | async_trigger_callback(&submit); |
620 | } | 625 | } |
621 | 626 | ||
622 | static void ops_complete_compute5(void *stripe_head_ref) | 627 | static void mark_target_uptodate(struct stripe_head *sh, int target) |
623 | { | 628 | { |
624 | struct stripe_head *sh = stripe_head_ref; | 629 | struct r5dev *tgt; |
625 | int target = sh->ops.target; | ||
626 | struct r5dev *tgt = &sh->dev[target]; | ||
627 | 630 | ||
628 | pr_debug("%s: stripe %llu\n", __func__, | 631 | if (target < 0) |
629 | (unsigned long long)sh->sector); | 632 | return; |
630 | 633 | ||
634 | tgt = &sh->dev[target]; | ||
631 | set_bit(R5_UPTODATE, &tgt->flags); | 635 | set_bit(R5_UPTODATE, &tgt->flags); |
632 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | 636 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); |
633 | clear_bit(R5_Wantcompute, &tgt->flags); | 637 | clear_bit(R5_Wantcompute, &tgt->flags); |
638 | } | ||
639 | |||
640 | static void ops_complete_compute(void *stripe_head_ref) | ||
641 | { | ||
642 | struct stripe_head *sh = stripe_head_ref; | ||
643 | |||
644 | pr_debug("%s: stripe %llu\n", __func__, | ||
645 | (unsigned long long)sh->sector); | ||
646 | |||
647 | /* mark the computed target(s) as uptodate */ | ||
648 | mark_target_uptodate(sh, sh->ops.target); | ||
649 | mark_target_uptodate(sh, sh->ops.target2); | ||
650 | |||
634 | clear_bit(STRIPE_COMPUTE_RUN, &sh->state); | 651 | clear_bit(STRIPE_COMPUTE_RUN, &sh->state); |
635 | if (sh->check_state == check_state_compute_run) | 652 | if (sh->check_state == check_state_compute_run) |
636 | sh->check_state = check_state_compute_result; | 653 | sh->check_state = check_state_compute_result; |
@@ -638,16 +655,24 @@ static void ops_complete_compute5(void *stripe_head_ref) | |||
638 | release_stripe(sh); | 655 | release_stripe(sh); |
639 | } | 656 | } |
640 | 657 | ||
641 | static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) | 658 | /* return a pointer to the address conversion region of the scribble buffer */ |
659 | static addr_conv_t *to_addr_conv(struct stripe_head *sh, | ||
660 | struct raid5_percpu *percpu) | ||
661 | { | ||
662 | return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); | ||
663 | } | ||
664 | |||
665 | static struct dma_async_tx_descriptor * | ||
666 | ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) | ||
642 | { | 667 | { |
643 | /* kernel stack size limits the total number of disks */ | ||
644 | int disks = sh->disks; | 668 | int disks = sh->disks; |
645 | struct page *xor_srcs[disks]; | 669 | struct page **xor_srcs = percpu->scribble; |
646 | int target = sh->ops.target; | 670 | int target = sh->ops.target; |
647 | struct r5dev *tgt = &sh->dev[target]; | 671 | struct r5dev *tgt = &sh->dev[target]; |
648 | struct page *xor_dest = tgt->page; | 672 | struct page *xor_dest = tgt->page; |
649 | int count = 0; | 673 | int count = 0; |
650 | struct dma_async_tx_descriptor *tx; | 674 | struct dma_async_tx_descriptor *tx; |
675 | struct async_submit_ctl submit; | ||
651 | int i; | 676 | int i; |
652 | 677 | ||
653 | pr_debug("%s: stripe %llu block: %d\n", | 678 | pr_debug("%s: stripe %llu block: %d\n", |
@@ -660,17 +685,207 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) | |||
660 | 685 | ||
661 | atomic_inc(&sh->count); | 686 | atomic_inc(&sh->count); |
662 | 687 | ||
688 | init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, | ||
689 | ops_complete_compute, sh, to_addr_conv(sh, percpu)); | ||
663 | if (unlikely(count == 1)) | 690 | if (unlikely(count == 1)) |
664 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, | 691 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); |
665 | 0, NULL, ops_complete_compute5, sh); | ||
666 | else | 692 | else |
667 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 693 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
668 | ASYNC_TX_XOR_ZERO_DST, NULL, | ||
669 | ops_complete_compute5, sh); | ||
670 | 694 | ||
671 | return tx; | 695 | return tx; |
672 | } | 696 | } |
673 | 697 | ||
698 | /* set_syndrome_sources - populate source buffers for gen_syndrome | ||
699 | * @srcs - (struct page *) array of size sh->disks | ||
700 | * @sh - stripe_head to parse | ||
701 | * | ||
702 | * Populates srcs in proper layout order for the stripe and returns the | ||
703 | * 'count' of sources to be used in a call to async_gen_syndrome. The P | ||
704 | * destination buffer is recorded in srcs[count] and the Q destination | ||
705 | * is recorded in srcs[count+1]]. | ||
706 | */ | ||
707 | static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) | ||
708 | { | ||
709 | int disks = sh->disks; | ||
710 | int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); | ||
711 | int d0_idx = raid6_d0(sh); | ||
712 | int count; | ||
713 | int i; | ||
714 | |||
715 | for (i = 0; i < disks; i++) | ||
716 | srcs[i] = (void *)raid6_empty_zero_page; | ||
717 | |||
718 | count = 0; | ||
719 | i = d0_idx; | ||
720 | do { | ||
721 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
722 | |||
723 | srcs[slot] = sh->dev[i].page; | ||
724 | i = raid6_next_disk(i, disks); | ||
725 | } while (i != d0_idx); | ||
726 | BUG_ON(count != syndrome_disks); | ||
727 | |||
728 | return count; | ||
729 | } | ||
730 | |||
731 | static struct dma_async_tx_descriptor * | ||
732 | ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) | ||
733 | { | ||
734 | int disks = sh->disks; | ||
735 | struct page **blocks = percpu->scribble; | ||
736 | int target; | ||
737 | int qd_idx = sh->qd_idx; | ||
738 | struct dma_async_tx_descriptor *tx; | ||
739 | struct async_submit_ctl submit; | ||
740 | struct r5dev *tgt; | ||
741 | struct page *dest; | ||
742 | int i; | ||
743 | int count; | ||
744 | |||
745 | if (sh->ops.target < 0) | ||
746 | target = sh->ops.target2; | ||
747 | else if (sh->ops.target2 < 0) | ||
748 | target = sh->ops.target; | ||
749 | else | ||
750 | /* we should only have one valid target */ | ||
751 | BUG(); | ||
752 | BUG_ON(target < 0); | ||
753 | pr_debug("%s: stripe %llu block: %d\n", | ||
754 | __func__, (unsigned long long)sh->sector, target); | ||
755 | |||
756 | tgt = &sh->dev[target]; | ||
757 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | ||
758 | dest = tgt->page; | ||
759 | |||
760 | atomic_inc(&sh->count); | ||
761 | |||
762 | if (target == qd_idx) { | ||
763 | count = set_syndrome_sources(blocks, sh); | ||
764 | blocks[count] = NULL; /* regenerating p is not necessary */ | ||
765 | BUG_ON(blocks[count+1] != dest); /* q should already be set */ | ||
766 | init_async_submit(&submit, 0, NULL, ops_complete_compute, sh, | ||
767 | to_addr_conv(sh, percpu)); | ||
768 | tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | ||
769 | } else { | ||
770 | /* Compute any data- or p-drive using XOR */ | ||
771 | count = 0; | ||
772 | for (i = disks; i-- ; ) { | ||
773 | if (i == target || i == qd_idx) | ||
774 | continue; | ||
775 | blocks[count++] = sh->dev[i].page; | ||
776 | } | ||
777 | |||
778 | init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, | ||
779 | ops_complete_compute, sh, | ||
780 | to_addr_conv(sh, percpu)); | ||
781 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); | ||
782 | } | ||
783 | |||
784 | return tx; | ||
785 | } | ||
786 | |||
787 | static struct dma_async_tx_descriptor * | ||
788 | ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) | ||
789 | { | ||
790 | int i, count, disks = sh->disks; | ||
791 | int syndrome_disks = sh->ddf_layout ? disks : disks-2; | ||
792 | int d0_idx = raid6_d0(sh); | ||
793 | int faila = -1, failb = -1; | ||
794 | int target = sh->ops.target; | ||
795 | int target2 = sh->ops.target2; | ||
796 | struct r5dev *tgt = &sh->dev[target]; | ||
797 | struct r5dev *tgt2 = &sh->dev[target2]; | ||
798 | struct dma_async_tx_descriptor *tx; | ||
799 | struct page **blocks = percpu->scribble; | ||
800 | struct async_submit_ctl submit; | ||
801 | |||
802 | pr_debug("%s: stripe %llu block1: %d block2: %d\n", | ||
803 | __func__, (unsigned long long)sh->sector, target, target2); | ||
804 | BUG_ON(target < 0 || target2 < 0); | ||
805 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | ||
806 | BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); | ||
807 | |||
808 | /* we need to open-code set_syndrome_sources to handle to the | ||
809 | * slot number conversion for 'faila' and 'failb' | ||
810 | */ | ||
811 | for (i = 0; i < disks ; i++) | ||
812 | blocks[i] = (void *)raid6_empty_zero_page; | ||
813 | count = 0; | ||
814 | i = d0_idx; | ||
815 | do { | ||
816 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
817 | |||
818 | blocks[slot] = sh->dev[i].page; | ||
819 | |||
820 | if (i == target) | ||
821 | faila = slot; | ||
822 | if (i == target2) | ||
823 | failb = slot; | ||
824 | i = raid6_next_disk(i, disks); | ||
825 | } while (i != d0_idx); | ||
826 | BUG_ON(count != syndrome_disks); | ||
827 | |||
828 | BUG_ON(faila == failb); | ||
829 | if (failb < faila) | ||
830 | swap(faila, failb); | ||
831 | pr_debug("%s: stripe: %llu faila: %d failb: %d\n", | ||
832 | __func__, (unsigned long long)sh->sector, faila, failb); | ||
833 | |||
834 | atomic_inc(&sh->count); | ||
835 | |||
836 | if (failb == syndrome_disks+1) { | ||
837 | /* Q disk is one of the missing disks */ | ||
838 | if (faila == syndrome_disks) { | ||
839 | /* Missing P+Q, just recompute */ | ||
840 | init_async_submit(&submit, 0, NULL, ops_complete_compute, | ||
841 | sh, to_addr_conv(sh, percpu)); | ||
842 | return async_gen_syndrome(blocks, 0, count+2, | ||
843 | STRIPE_SIZE, &submit); | ||
844 | } else { | ||
845 | struct page *dest; | ||
846 | int data_target; | ||
847 | int qd_idx = sh->qd_idx; | ||
848 | |||
849 | /* Missing D+Q: recompute D from P, then recompute Q */ | ||
850 | if (target == qd_idx) | ||
851 | data_target = target2; | ||
852 | else | ||
853 | data_target = target; | ||
854 | |||
855 | count = 0; | ||
856 | for (i = disks; i-- ; ) { | ||
857 | if (i == data_target || i == qd_idx) | ||
858 | continue; | ||
859 | blocks[count++] = sh->dev[i].page; | ||
860 | } | ||
861 | dest = sh->dev[data_target].page; | ||
862 | init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, | ||
863 | NULL, NULL, to_addr_conv(sh, percpu)); | ||
864 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, | ||
865 | &submit); | ||
866 | |||
867 | count = set_syndrome_sources(blocks, sh); | ||
868 | init_async_submit(&submit, 0, tx, ops_complete_compute, | ||
869 | sh, to_addr_conv(sh, percpu)); | ||
870 | return async_gen_syndrome(blocks, 0, count+2, | ||
871 | STRIPE_SIZE, &submit); | ||
872 | } | ||
873 | } | ||
874 | |||
875 | init_async_submit(&submit, 0, NULL, ops_complete_compute, sh, | ||
876 | to_addr_conv(sh, percpu)); | ||
877 | if (failb == syndrome_disks) { | ||
878 | /* We're missing D+P. */ | ||
879 | return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, | ||
880 | faila, blocks, &submit); | ||
881 | } else { | ||
882 | /* We're missing D+D. */ | ||
883 | return async_raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, | ||
884 | faila, failb, blocks, &submit); | ||
885 | } | ||
886 | } | ||
887 | |||
888 | |||
674 | static void ops_complete_prexor(void *stripe_head_ref) | 889 | static void ops_complete_prexor(void *stripe_head_ref) |
675 | { | 890 | { |
676 | struct stripe_head *sh = stripe_head_ref; | 891 | struct stripe_head *sh = stripe_head_ref; |
@@ -680,12 +895,13 @@ static void ops_complete_prexor(void *stripe_head_ref) | |||
680 | } | 895 | } |
681 | 896 | ||
682 | static struct dma_async_tx_descriptor * | 897 | static struct dma_async_tx_descriptor * |
683 | ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | 898 | ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, |
899 | struct dma_async_tx_descriptor *tx) | ||
684 | { | 900 | { |
685 | /* kernel stack size limits the total number of disks */ | ||
686 | int disks = sh->disks; | 901 | int disks = sh->disks; |
687 | struct page *xor_srcs[disks]; | 902 | struct page **xor_srcs = percpu->scribble; |
688 | int count = 0, pd_idx = sh->pd_idx, i; | 903 | int count = 0, pd_idx = sh->pd_idx, i; |
904 | struct async_submit_ctl submit; | ||
689 | 905 | ||
690 | /* existing parity data subtracted */ | 906 | /* existing parity data subtracted */ |
691 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 907 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; |
@@ -700,9 +916,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
700 | xor_srcs[count++] = dev->page; | 916 | xor_srcs[count++] = dev->page; |
701 | } | 917 | } |
702 | 918 | ||
703 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 919 | init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx, |
704 | ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, | 920 | ops_complete_prexor, sh, to_addr_conv(sh, percpu)); |
705 | ops_complete_prexor, sh); | 921 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
706 | 922 | ||
707 | return tx; | 923 | return tx; |
708 | } | 924 | } |
@@ -742,17 +958,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
742 | return tx; | 958 | return tx; |
743 | } | 959 | } |
744 | 960 | ||
745 | static void ops_complete_postxor(void *stripe_head_ref) | 961 | static void ops_complete_reconstruct(void *stripe_head_ref) |
746 | { | 962 | { |
747 | struct stripe_head *sh = stripe_head_ref; | 963 | struct stripe_head *sh = stripe_head_ref; |
748 | int disks = sh->disks, i, pd_idx = sh->pd_idx; | 964 | int disks = sh->disks; |
965 | int pd_idx = sh->pd_idx; | ||
966 | int qd_idx = sh->qd_idx; | ||
967 | int i; | ||
749 | 968 | ||
750 | pr_debug("%s: stripe %llu\n", __func__, | 969 | pr_debug("%s: stripe %llu\n", __func__, |
751 | (unsigned long long)sh->sector); | 970 | (unsigned long long)sh->sector); |
752 | 971 | ||
753 | for (i = disks; i--; ) { | 972 | for (i = disks; i--; ) { |
754 | struct r5dev *dev = &sh->dev[i]; | 973 | struct r5dev *dev = &sh->dev[i]; |
755 | if (dev->written || i == pd_idx) | 974 | |
975 | if (dev->written || i == pd_idx || i == qd_idx) | ||
756 | set_bit(R5_UPTODATE, &dev->flags); | 976 | set_bit(R5_UPTODATE, &dev->flags); |
757 | } | 977 | } |
758 | 978 | ||
@@ -770,12 +990,12 @@ static void ops_complete_postxor(void *stripe_head_ref) | |||
770 | } | 990 | } |
771 | 991 | ||
772 | static void | 992 | static void |
773 | ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | 993 | ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, |
994 | struct dma_async_tx_descriptor *tx) | ||
774 | { | 995 | { |
775 | /* kernel stack size limits the total number of disks */ | ||
776 | int disks = sh->disks; | 996 | int disks = sh->disks; |
777 | struct page *xor_srcs[disks]; | 997 | struct page **xor_srcs = percpu->scribble; |
778 | 998 | struct async_submit_ctl submit; | |
779 | int count = 0, pd_idx = sh->pd_idx, i; | 999 | int count = 0, pd_idx = sh->pd_idx, i; |
780 | struct page *xor_dest; | 1000 | struct page *xor_dest; |
781 | int prexor = 0; | 1001 | int prexor = 0; |
@@ -809,18 +1029,36 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
809 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST | 1029 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST |
810 | * for the synchronous xor case | 1030 | * for the synchronous xor case |
811 | */ | 1031 | */ |
812 | flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | | 1032 | flags = ASYNC_TX_ACK | |
813 | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); | 1033 | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); |
814 | 1034 | ||
815 | atomic_inc(&sh->count); | 1035 | atomic_inc(&sh->count); |
816 | 1036 | ||
817 | if (unlikely(count == 1)) { | 1037 | init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, |
818 | flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); | 1038 | to_addr_conv(sh, percpu)); |
819 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, | 1039 | if (unlikely(count == 1)) |
820 | flags, tx, ops_complete_postxor, sh); | 1040 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); |
821 | } else | 1041 | else |
822 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 1042 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
823 | flags, tx, ops_complete_postxor, sh); | 1043 | } |
1044 | |||
1045 | static void | ||
1046 | ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, | ||
1047 | struct dma_async_tx_descriptor *tx) | ||
1048 | { | ||
1049 | struct async_submit_ctl submit; | ||
1050 | struct page **blocks = percpu->scribble; | ||
1051 | int count; | ||
1052 | |||
1053 | pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); | ||
1054 | |||
1055 | count = set_syndrome_sources(blocks, sh); | ||
1056 | |||
1057 | atomic_inc(&sh->count); | ||
1058 | |||
1059 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, | ||
1060 | sh, to_addr_conv(sh, percpu)); | ||
1061 | async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | ||
824 | } | 1062 | } |
825 | 1063 | ||
826 | static void ops_complete_check(void *stripe_head_ref) | 1064 | static void ops_complete_check(void *stripe_head_ref) |
@@ -835,63 +1073,115 @@ static void ops_complete_check(void *stripe_head_ref) | |||
835 | release_stripe(sh); | 1073 | release_stripe(sh); |
836 | } | 1074 | } |
837 | 1075 | ||
838 | static void ops_run_check(struct stripe_head *sh) | 1076 | static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) |
839 | { | 1077 | { |
840 | /* kernel stack size limits the total number of disks */ | ||
841 | int disks = sh->disks; | 1078 | int disks = sh->disks; |
842 | struct page *xor_srcs[disks]; | 1079 | int pd_idx = sh->pd_idx; |
1080 | int qd_idx = sh->qd_idx; | ||
1081 | struct page *xor_dest; | ||
1082 | struct page **xor_srcs = percpu->scribble; | ||
843 | struct dma_async_tx_descriptor *tx; | 1083 | struct dma_async_tx_descriptor *tx; |
844 | 1084 | struct async_submit_ctl submit; | |
845 | int count = 0, pd_idx = sh->pd_idx, i; | 1085 | int count; |
846 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 1086 | int i; |
847 | 1087 | ||
848 | pr_debug("%s: stripe %llu\n", __func__, | 1088 | pr_debug("%s: stripe %llu\n", __func__, |
849 | (unsigned long long)sh->sector); | 1089 | (unsigned long long)sh->sector); |
850 | 1090 | ||
1091 | count = 0; | ||
1092 | xor_dest = sh->dev[pd_idx].page; | ||
1093 | xor_srcs[count++] = xor_dest; | ||
851 | for (i = disks; i--; ) { | 1094 | for (i = disks; i--; ) { |
852 | struct r5dev *dev = &sh->dev[i]; | 1095 | if (i == pd_idx || i == qd_idx) |
853 | if (i != pd_idx) | 1096 | continue; |
854 | xor_srcs[count++] = dev->page; | 1097 | xor_srcs[count++] = sh->dev[i].page; |
855 | } | 1098 | } |
856 | 1099 | ||
857 | tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 1100 | init_async_submit(&submit, 0, NULL, NULL, NULL, |
858 | &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); | 1101 | to_addr_conv(sh, percpu)); |
1102 | tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | ||
1103 | &sh->ops.zero_sum_result, &submit); | ||
1104 | |||
1105 | atomic_inc(&sh->count); | ||
1106 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); | ||
1107 | tx = async_trigger_callback(&submit); | ||
1108 | } | ||
1109 | |||
1110 | static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) | ||
1111 | { | ||
1112 | struct page **srcs = percpu->scribble; | ||
1113 | struct async_submit_ctl submit; | ||
1114 | int count; | ||
1115 | |||
1116 | pr_debug("%s: stripe %llu checkp: %d\n", __func__, | ||
1117 | (unsigned long long)sh->sector, checkp); | ||
1118 | |||
1119 | count = set_syndrome_sources(srcs, sh); | ||
1120 | if (!checkp) | ||
1121 | srcs[count] = NULL; | ||
859 | 1122 | ||
860 | atomic_inc(&sh->count); | 1123 | atomic_inc(&sh->count); |
861 | tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, | 1124 | init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, |
862 | ops_complete_check, sh); | 1125 | sh, to_addr_conv(sh, percpu)); |
1126 | async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, | ||
1127 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); | ||
863 | } | 1128 | } |
864 | 1129 | ||
865 | static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) | 1130 | static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) |
866 | { | 1131 | { |
867 | int overlap_clear = 0, i, disks = sh->disks; | 1132 | int overlap_clear = 0, i, disks = sh->disks; |
868 | struct dma_async_tx_descriptor *tx = NULL; | 1133 | struct dma_async_tx_descriptor *tx = NULL; |
1134 | raid5_conf_t *conf = sh->raid_conf; | ||
1135 | int level = conf->level; | ||
1136 | struct raid5_percpu *percpu; | ||
1137 | unsigned long cpu; | ||
869 | 1138 | ||
1139 | cpu = get_cpu(); | ||
1140 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
870 | if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { | 1141 | if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { |
871 | ops_run_biofill(sh); | 1142 | ops_run_biofill(sh); |
872 | overlap_clear++; | 1143 | overlap_clear++; |
873 | } | 1144 | } |
874 | 1145 | ||
875 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { | 1146 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { |
876 | tx = ops_run_compute5(sh); | 1147 | if (level < 6) |
877 | /* terminate the chain if postxor is not set to be run */ | 1148 | tx = ops_run_compute5(sh, percpu); |
878 | if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) | 1149 | else { |
1150 | if (sh->ops.target2 < 0 || sh->ops.target < 0) | ||
1151 | tx = ops_run_compute6_1(sh, percpu); | ||
1152 | else | ||
1153 | tx = ops_run_compute6_2(sh, percpu); | ||
1154 | } | ||
1155 | /* terminate the chain if reconstruct is not set to be run */ | ||
1156 | if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) | ||
879 | async_tx_ack(tx); | 1157 | async_tx_ack(tx); |
880 | } | 1158 | } |
881 | 1159 | ||
882 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) | 1160 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) |
883 | tx = ops_run_prexor(sh, tx); | 1161 | tx = ops_run_prexor(sh, percpu, tx); |
884 | 1162 | ||
885 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { | 1163 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { |
886 | tx = ops_run_biodrain(sh, tx); | 1164 | tx = ops_run_biodrain(sh, tx); |
887 | overlap_clear++; | 1165 | overlap_clear++; |
888 | } | 1166 | } |
889 | 1167 | ||
890 | if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) | 1168 | if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { |
891 | ops_run_postxor(sh, tx); | 1169 | if (level < 6) |
1170 | ops_run_reconstruct5(sh, percpu, tx); | ||
1171 | else | ||
1172 | ops_run_reconstruct6(sh, percpu, tx); | ||
1173 | } | ||
892 | 1174 | ||
893 | if (test_bit(STRIPE_OP_CHECK, &ops_request)) | 1175 | if (test_bit(STRIPE_OP_CHECK, &ops_request)) { |
894 | ops_run_check(sh); | 1176 | if (sh->check_state == check_state_run) |
1177 | ops_run_check_p(sh, percpu); | ||
1178 | else if (sh->check_state == check_state_run_q) | ||
1179 | ops_run_check_pq(sh, percpu, 0); | ||
1180 | else if (sh->check_state == check_state_run_pq) | ||
1181 | ops_run_check_pq(sh, percpu, 1); | ||
1182 | else | ||
1183 | BUG(); | ||
1184 | } | ||
895 | 1185 | ||
896 | if (overlap_clear) | 1186 | if (overlap_clear) |
897 | for (i = disks; i--; ) { | 1187 | for (i = disks; i--; ) { |
@@ -899,6 +1189,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
899 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | 1189 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) |
900 | wake_up(&sh->raid_conf->wait_for_overlap); | 1190 | wake_up(&sh->raid_conf->wait_for_overlap); |
901 | } | 1191 | } |
1192 | put_cpu(); | ||
902 | } | 1193 | } |
903 | 1194 | ||
904 | static int grow_one_stripe(raid5_conf_t *conf) | 1195 | static int grow_one_stripe(raid5_conf_t *conf) |
@@ -948,6 +1239,28 @@ static int grow_stripes(raid5_conf_t *conf, int num) | |||
948 | return 0; | 1239 | return 0; |
949 | } | 1240 | } |
950 | 1241 | ||
1242 | /** | ||
1243 | * scribble_len - return the required size of the scribble region | ||
1244 | * @num - total number of disks in the array | ||
1245 | * | ||
1246 | * The size must be enough to contain: | ||
1247 | * 1/ a struct page pointer for each device in the array +2 | ||
1248 | * 2/ room to convert each entry in (1) to its corresponding dma | ||
1249 | * (dma_map_page()) or page (page_address()) address. | ||
1250 | * | ||
1251 | * Note: the +2 is for the destination buffers of the ddf/raid6 case where we | ||
1252 | * calculate over all devices (not just the data blocks), using zeros in place | ||
1253 | * of the P and Q blocks. | ||
1254 | */ | ||
1255 | static size_t scribble_len(int num) | ||
1256 | { | ||
1257 | size_t len; | ||
1258 | |||
1259 | len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); | ||
1260 | |||
1261 | return len; | ||
1262 | } | ||
1263 | |||
951 | static int resize_stripes(raid5_conf_t *conf, int newsize) | 1264 | static int resize_stripes(raid5_conf_t *conf, int newsize) |
952 | { | 1265 | { |
953 | /* Make all the stripes able to hold 'newsize' devices. | 1266 | /* Make all the stripes able to hold 'newsize' devices. |
@@ -976,6 +1289,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
976 | struct stripe_head *osh, *nsh; | 1289 | struct stripe_head *osh, *nsh; |
977 | LIST_HEAD(newstripes); | 1290 | LIST_HEAD(newstripes); |
978 | struct disk_info *ndisks; | 1291 | struct disk_info *ndisks; |
1292 | unsigned long cpu; | ||
979 | int err; | 1293 | int err; |
980 | struct kmem_cache *sc; | 1294 | struct kmem_cache *sc; |
981 | int i; | 1295 | int i; |
@@ -1041,7 +1355,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1041 | /* Step 3. | 1355 | /* Step 3. |
1042 | * At this point, we are holding all the stripes so the array | 1356 | * At this point, we are holding all the stripes so the array |
1043 | * is completely stalled, so now is a good time to resize | 1357 | * is completely stalled, so now is a good time to resize |
1044 | * conf->disks. | 1358 | * conf->disks and the scribble region |
1045 | */ | 1359 | */ |
1046 | ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); | 1360 | ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); |
1047 | if (ndisks) { | 1361 | if (ndisks) { |
@@ -1052,10 +1366,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1052 | } else | 1366 | } else |
1053 | err = -ENOMEM; | 1367 | err = -ENOMEM; |
1054 | 1368 | ||
1369 | get_online_cpus(); | ||
1370 | conf->scribble_len = scribble_len(newsize); | ||
1371 | for_each_present_cpu(cpu) { | ||
1372 | struct raid5_percpu *percpu; | ||
1373 | void *scribble; | ||
1374 | |||
1375 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
1376 | scribble = kmalloc(conf->scribble_len, GFP_NOIO); | ||
1377 | |||
1378 | if (scribble) { | ||
1379 | kfree(percpu->scribble); | ||
1380 | percpu->scribble = scribble; | ||
1381 | } else { | ||
1382 | err = -ENOMEM; | ||
1383 | break; | ||
1384 | } | ||
1385 | } | ||
1386 | put_online_cpus(); | ||
1387 | |||
1055 | /* Step 4, return new stripes to service */ | 1388 | /* Step 4, return new stripes to service */ |
1056 | while(!list_empty(&newstripes)) { | 1389 | while(!list_empty(&newstripes)) { |
1057 | nsh = list_entry(newstripes.next, struct stripe_head, lru); | 1390 | nsh = list_entry(newstripes.next, struct stripe_head, lru); |
1058 | list_del_init(&nsh->lru); | 1391 | list_del_init(&nsh->lru); |
1392 | |||
1059 | for (i=conf->raid_disks; i < newsize; i++) | 1393 | for (i=conf->raid_disks; i < newsize; i++) |
1060 | if (nsh->dev[i].page == NULL) { | 1394 | if (nsh->dev[i].page == NULL) { |
1061 | struct page *p = alloc_page(GFP_NOIO); | 1395 | struct page *p = alloc_page(GFP_NOIO); |
@@ -1594,258 +1928,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
1594 | } | 1928 | } |
1595 | 1929 | ||
1596 | 1930 | ||
1597 | |||
1598 | /* | ||
1599 | * Copy data between a page in the stripe cache, and one or more bion | ||
1600 | * The page could align with the middle of the bio, or there could be | ||
1601 | * several bion, each with several bio_vecs, which cover part of the page | ||
1602 | * Multiple bion are linked together on bi_next. There may be extras | ||
1603 | * at the end of this list. We ignore them. | ||
1604 | */ | ||
1605 | static void copy_data(int frombio, struct bio *bio, | ||
1606 | struct page *page, | ||
1607 | sector_t sector) | ||
1608 | { | ||
1609 | char *pa = page_address(page); | ||
1610 | struct bio_vec *bvl; | ||
1611 | int i; | ||
1612 | int page_offset; | ||
1613 | |||
1614 | if (bio->bi_sector >= sector) | ||
1615 | page_offset = (signed)(bio->bi_sector - sector) * 512; | ||
1616 | else | ||
1617 | page_offset = (signed)(sector - bio->bi_sector) * -512; | ||
1618 | bio_for_each_segment(bvl, bio, i) { | ||
1619 | int len = bio_iovec_idx(bio,i)->bv_len; | ||
1620 | int clen; | ||
1621 | int b_offset = 0; | ||
1622 | |||
1623 | if (page_offset < 0) { | ||
1624 | b_offset = -page_offset; | ||
1625 | page_offset += b_offset; | ||
1626 | len -= b_offset; | ||
1627 | } | ||
1628 | |||
1629 | if (len > 0 && page_offset + len > STRIPE_SIZE) | ||
1630 | clen = STRIPE_SIZE - page_offset; | ||
1631 | else clen = len; | ||
1632 | |||
1633 | if (clen > 0) { | ||
1634 | char *ba = __bio_kmap_atomic(bio, i, KM_USER0); | ||
1635 | if (frombio) | ||
1636 | memcpy(pa+page_offset, ba+b_offset, clen); | ||
1637 | else | ||
1638 | memcpy(ba+b_offset, pa+page_offset, clen); | ||
1639 | __bio_kunmap_atomic(ba, KM_USER0); | ||
1640 | } | ||
1641 | if (clen < len) /* hit end of page */ | ||
1642 | break; | ||
1643 | page_offset += len; | ||
1644 | } | ||
1645 | } | ||
1646 | |||
1647 | #define check_xor() do { \ | ||
1648 | if (count == MAX_XOR_BLOCKS) { \ | ||
1649 | xor_blocks(count, STRIPE_SIZE, dest, ptr);\ | ||
1650 | count = 0; \ | ||
1651 | } \ | ||
1652 | } while(0) | ||
1653 | |||
1654 | static void compute_parity6(struct stripe_head *sh, int method) | ||
1655 | { | ||
1656 | raid5_conf_t *conf = sh->raid_conf; | ||
1657 | int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; | ||
1658 | int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); | ||
1659 | struct bio *chosen; | ||
1660 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | ||
1661 | void *ptrs[syndrome_disks+2]; | ||
1662 | |||
1663 | pd_idx = sh->pd_idx; | ||
1664 | qd_idx = sh->qd_idx; | ||
1665 | d0_idx = raid6_d0(sh); | ||
1666 | |||
1667 | pr_debug("compute_parity, stripe %llu, method %d\n", | ||
1668 | (unsigned long long)sh->sector, method); | ||
1669 | |||
1670 | switch(method) { | ||
1671 | case READ_MODIFY_WRITE: | ||
1672 | BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ | ||
1673 | case RECONSTRUCT_WRITE: | ||
1674 | for (i= disks; i-- ;) | ||
1675 | if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { | ||
1676 | chosen = sh->dev[i].towrite; | ||
1677 | sh->dev[i].towrite = NULL; | ||
1678 | |||
1679 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1680 | wake_up(&conf->wait_for_overlap); | ||
1681 | |||
1682 | BUG_ON(sh->dev[i].written); | ||
1683 | sh->dev[i].written = chosen; | ||
1684 | } | ||
1685 | break; | ||
1686 | case CHECK_PARITY: | ||
1687 | BUG(); /* Not implemented yet */ | ||
1688 | } | ||
1689 | |||
1690 | for (i = disks; i--;) | ||
1691 | if (sh->dev[i].written) { | ||
1692 | sector_t sector = sh->dev[i].sector; | ||
1693 | struct bio *wbi = sh->dev[i].written; | ||
1694 | while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { | ||
1695 | copy_data(1, wbi, sh->dev[i].page, sector); | ||
1696 | wbi = r5_next_bio(wbi, sector); | ||
1697 | } | ||
1698 | |||
1699 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
1700 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
1701 | } | ||
1702 | |||
1703 | /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ | ||
1704 | |||
1705 | for (i = 0; i < disks; i++) | ||
1706 | ptrs[i] = (void *)raid6_empty_zero_page; | ||
1707 | |||
1708 | count = 0; | ||
1709 | i = d0_idx; | ||
1710 | do { | ||
1711 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
1712 | |||
1713 | ptrs[slot] = page_address(sh->dev[i].page); | ||
1714 | if (slot < syndrome_disks && | ||
1715 | !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { | ||
1716 | printk(KERN_ERR "block %d/%d not uptodate " | ||
1717 | "on parity calc\n", i, count); | ||
1718 | BUG(); | ||
1719 | } | ||
1720 | |||
1721 | i = raid6_next_disk(i, disks); | ||
1722 | } while (i != d0_idx); | ||
1723 | BUG_ON(count != syndrome_disks); | ||
1724 | |||
1725 | raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); | ||
1726 | |||
1727 | switch(method) { | ||
1728 | case RECONSTRUCT_WRITE: | ||
1729 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
1730 | set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); | ||
1731 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | ||
1732 | set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); | ||
1733 | break; | ||
1734 | case UPDATE_PARITY: | ||
1735 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
1736 | set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); | ||
1737 | break; | ||
1738 | } | ||
1739 | } | ||
1740 | |||
1741 | |||
1742 | /* Compute one missing block */ | ||
1743 | static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) | ||
1744 | { | ||
1745 | int i, count, disks = sh->disks; | ||
1746 | void *ptr[MAX_XOR_BLOCKS], *dest, *p; | ||
1747 | int qd_idx = sh->qd_idx; | ||
1748 | |||
1749 | pr_debug("compute_block_1, stripe %llu, idx %d\n", | ||
1750 | (unsigned long long)sh->sector, dd_idx); | ||
1751 | |||
1752 | if ( dd_idx == qd_idx ) { | ||
1753 | /* We're actually computing the Q drive */ | ||
1754 | compute_parity6(sh, UPDATE_PARITY); | ||
1755 | } else { | ||
1756 | dest = page_address(sh->dev[dd_idx].page); | ||
1757 | if (!nozero) memset(dest, 0, STRIPE_SIZE); | ||
1758 | count = 0; | ||
1759 | for (i = disks ; i--; ) { | ||
1760 | if (i == dd_idx || i == qd_idx) | ||
1761 | continue; | ||
1762 | p = page_address(sh->dev[i].page); | ||
1763 | if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) | ||
1764 | ptr[count++] = p; | ||
1765 | else | ||
1766 | printk("compute_block() %d, stripe %llu, %d" | ||
1767 | " not present\n", dd_idx, | ||
1768 | (unsigned long long)sh->sector, i); | ||
1769 | |||
1770 | check_xor(); | ||
1771 | } | ||
1772 | if (count) | ||
1773 | xor_blocks(count, STRIPE_SIZE, dest, ptr); | ||
1774 | if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
1775 | else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
1776 | } | ||
1777 | } | ||
1778 | |||
1779 | /* Compute two missing blocks */ | ||
1780 | static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) | ||
1781 | { | ||
1782 | int i, count, disks = sh->disks; | ||
1783 | int syndrome_disks = sh->ddf_layout ? disks : disks-2; | ||
1784 | int d0_idx = raid6_d0(sh); | ||
1785 | int faila = -1, failb = -1; | ||
1786 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | ||
1787 | void *ptrs[syndrome_disks+2]; | ||
1788 | |||
1789 | for (i = 0; i < disks ; i++) | ||
1790 | ptrs[i] = (void *)raid6_empty_zero_page; | ||
1791 | count = 0; | ||
1792 | i = d0_idx; | ||
1793 | do { | ||
1794 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
1795 | |||
1796 | ptrs[slot] = page_address(sh->dev[i].page); | ||
1797 | |||
1798 | if (i == dd_idx1) | ||
1799 | faila = slot; | ||
1800 | if (i == dd_idx2) | ||
1801 | failb = slot; | ||
1802 | i = raid6_next_disk(i, disks); | ||
1803 | } while (i != d0_idx); | ||
1804 | BUG_ON(count != syndrome_disks); | ||
1805 | |||
1806 | BUG_ON(faila == failb); | ||
1807 | if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } | ||
1808 | |||
1809 | pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", | ||
1810 | (unsigned long long)sh->sector, dd_idx1, dd_idx2, | ||
1811 | faila, failb); | ||
1812 | |||
1813 | if (failb == syndrome_disks+1) { | ||
1814 | /* Q disk is one of the missing disks */ | ||
1815 | if (faila == syndrome_disks) { | ||
1816 | /* Missing P+Q, just recompute */ | ||
1817 | compute_parity6(sh, UPDATE_PARITY); | ||
1818 | return; | ||
1819 | } else { | ||
1820 | /* We're missing D+Q; recompute D from P */ | ||
1821 | compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ? | ||
1822 | dd_idx2 : dd_idx1), | ||
1823 | 0); | ||
1824 | compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ | ||
1825 | return; | ||
1826 | } | ||
1827 | } | ||
1828 | |||
1829 | /* We're missing D+P or D+D; */ | ||
1830 | if (failb == syndrome_disks) { | ||
1831 | /* We're missing D+P. */ | ||
1832 | raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); | ||
1833 | } else { | ||
1834 | /* We're missing D+D. */ | ||
1835 | raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, | ||
1836 | ptrs); | ||
1837 | } | ||
1838 | |||
1839 | /* Both the above update both missing blocks */ | ||
1840 | set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); | ||
1841 | set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); | ||
1842 | } | ||
1843 | |||
1844 | static void | 1931 | static void |
1845 | schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | 1932 | schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, |
1846 | int rcw, int expand) | 1933 | int rcw, int expand) |
1847 | { | 1934 | { |
1848 | int i, pd_idx = sh->pd_idx, disks = sh->disks; | 1935 | int i, pd_idx = sh->pd_idx, disks = sh->disks; |
1936 | raid5_conf_t *conf = sh->raid_conf; | ||
1937 | int level = conf->level; | ||
1849 | 1938 | ||
1850 | if (rcw) { | 1939 | if (rcw) { |
1851 | /* if we are not expanding this is a proper write request, and | 1940 | /* if we are not expanding this is a proper write request, and |
@@ -1858,7 +1947,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | |||
1858 | } else | 1947 | } else |
1859 | sh->reconstruct_state = reconstruct_state_run; | 1948 | sh->reconstruct_state = reconstruct_state_run; |
1860 | 1949 | ||
1861 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); | 1950 | set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); |
1862 | 1951 | ||
1863 | for (i = disks; i--; ) { | 1952 | for (i = disks; i--; ) { |
1864 | struct r5dev *dev = &sh->dev[i]; | 1953 | struct r5dev *dev = &sh->dev[i]; |
@@ -1871,17 +1960,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | |||
1871 | s->locked++; | 1960 | s->locked++; |
1872 | } | 1961 | } |
1873 | } | 1962 | } |
1874 | if (s->locked + 1 == disks) | 1963 | if (s->locked + conf->max_degraded == disks) |
1875 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | 1964 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) |
1876 | atomic_inc(&sh->raid_conf->pending_full_writes); | 1965 | atomic_inc(&conf->pending_full_writes); |
1877 | } else { | 1966 | } else { |
1967 | BUG_ON(level == 6); | ||
1878 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || | 1968 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || |
1879 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); | 1969 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); |
1880 | 1970 | ||
1881 | sh->reconstruct_state = reconstruct_state_prexor_drain_run; | 1971 | sh->reconstruct_state = reconstruct_state_prexor_drain_run; |
1882 | set_bit(STRIPE_OP_PREXOR, &s->ops_request); | 1972 | set_bit(STRIPE_OP_PREXOR, &s->ops_request); |
1883 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); | 1973 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); |
1884 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); | 1974 | set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); |
1885 | 1975 | ||
1886 | for (i = disks; i--; ) { | 1976 | for (i = disks; i--; ) { |
1887 | struct r5dev *dev = &sh->dev[i]; | 1977 | struct r5dev *dev = &sh->dev[i]; |
@@ -1899,13 +1989,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | |||
1899 | } | 1989 | } |
1900 | } | 1990 | } |
1901 | 1991 | ||
1902 | /* keep the parity disk locked while asynchronous operations | 1992 | /* keep the parity disk(s) locked while asynchronous operations |
1903 | * are in flight | 1993 | * are in flight |
1904 | */ | 1994 | */ |
1905 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | 1995 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); |
1906 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | 1996 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); |
1907 | s->locked++; | 1997 | s->locked++; |
1908 | 1998 | ||
1999 | if (level == 6) { | ||
2000 | int qd_idx = sh->qd_idx; | ||
2001 | struct r5dev *dev = &sh->dev[qd_idx]; | ||
2002 | |||
2003 | set_bit(R5_LOCKED, &dev->flags); | ||
2004 | clear_bit(R5_UPTODATE, &dev->flags); | ||
2005 | s->locked++; | ||
2006 | } | ||
2007 | |||
1909 | pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", | 2008 | pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", |
1910 | __func__, (unsigned long long)sh->sector, | 2009 | __func__, (unsigned long long)sh->sector, |
1911 | s->locked, s->ops_request); | 2010 | s->locked, s->ops_request); |
@@ -1986,13 +2085,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
1986 | 2085 | ||
1987 | static void end_reshape(raid5_conf_t *conf); | 2086 | static void end_reshape(raid5_conf_t *conf); |
1988 | 2087 | ||
1989 | static int page_is_zero(struct page *p) | ||
1990 | { | ||
1991 | char *a = page_address(p); | ||
1992 | return ((*(u32*)a) == 0 && | ||
1993 | memcmp(a, a+4, STRIPE_SIZE-4)==0); | ||
1994 | } | ||
1995 | |||
1996 | static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, | 2088 | static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, |
1997 | struct stripe_head *sh) | 2089 | struct stripe_head *sh) |
1998 | { | 2090 | { |
@@ -2133,9 +2225,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, | |||
2133 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | 2225 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); |
2134 | set_bit(R5_Wantcompute, &dev->flags); | 2226 | set_bit(R5_Wantcompute, &dev->flags); |
2135 | sh->ops.target = disk_idx; | 2227 | sh->ops.target = disk_idx; |
2228 | sh->ops.target2 = -1; | ||
2136 | s->req_compute = 1; | 2229 | s->req_compute = 1; |
2137 | /* Careful: from this point on 'uptodate' is in the eye | 2230 | /* Careful: from this point on 'uptodate' is in the eye |
2138 | * of raid5_run_ops which services 'compute' operations | 2231 | * of raid_run_ops which services 'compute' operations |
2139 | * before writes. R5_Wantcompute flags a block that will | 2232 | * before writes. R5_Wantcompute flags a block that will |
2140 | * be R5_UPTODATE by the time it is needed for a | 2233 | * be R5_UPTODATE by the time it is needed for a |
2141 | * subsequent operation. | 2234 | * subsequent operation. |
@@ -2174,61 +2267,104 @@ static void handle_stripe_fill5(struct stripe_head *sh, | |||
2174 | set_bit(STRIPE_HANDLE, &sh->state); | 2267 | set_bit(STRIPE_HANDLE, &sh->state); |
2175 | } | 2268 | } |
2176 | 2269 | ||
2177 | static void handle_stripe_fill6(struct stripe_head *sh, | 2270 | /* fetch_block6 - checks the given member device to see if its data needs |
2178 | struct stripe_head_state *s, struct r6_state *r6s, | 2271 | * to be read or computed to satisfy a request. |
2179 | int disks) | 2272 | * |
2273 | * Returns 1 when no more member devices need to be checked, otherwise returns | ||
2274 | * 0 to tell the loop in handle_stripe_fill6 to continue | ||
2275 | */ | ||
2276 | static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | ||
2277 | struct r6_state *r6s, int disk_idx, int disks) | ||
2180 | { | 2278 | { |
2181 | int i; | 2279 | struct r5dev *dev = &sh->dev[disk_idx]; |
2182 | for (i = disks; i--; ) { | 2280 | struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], |
2183 | struct r5dev *dev = &sh->dev[i]; | 2281 | &sh->dev[r6s->failed_num[1]] }; |
2184 | if (!test_bit(R5_LOCKED, &dev->flags) && | 2282 | |
2185 | !test_bit(R5_UPTODATE, &dev->flags) && | 2283 | if (!test_bit(R5_LOCKED, &dev->flags) && |
2186 | (dev->toread || (dev->towrite && | 2284 | !test_bit(R5_UPTODATE, &dev->flags) && |
2187 | !test_bit(R5_OVERWRITE, &dev->flags)) || | 2285 | (dev->toread || |
2188 | s->syncing || s->expanding || | 2286 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
2189 | (s->failed >= 1 && | 2287 | s->syncing || s->expanding || |
2190 | (sh->dev[r6s->failed_num[0]].toread || | 2288 | (s->failed >= 1 && |
2191 | s->to_write)) || | 2289 | (fdev[0]->toread || s->to_write)) || |
2192 | (s->failed >= 2 && | 2290 | (s->failed >= 2 && |
2193 | (sh->dev[r6s->failed_num[1]].toread || | 2291 | (fdev[1]->toread || s->to_write)))) { |
2194 | s->to_write)))) { | 2292 | /* we would like to get this block, possibly by computing it, |
2195 | /* we would like to get this block, possibly | 2293 | * otherwise read it if the backing disk is insync |
2196 | * by computing it, but we might not be able to | 2294 | */ |
2295 | BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); | ||
2296 | BUG_ON(test_bit(R5_Wantread, &dev->flags)); | ||
2297 | if ((s->uptodate == disks - 1) && | ||
2298 | (s->failed && (disk_idx == r6s->failed_num[0] || | ||
2299 | disk_idx == r6s->failed_num[1]))) { | ||
2300 | /* have disk failed, and we're requested to fetch it; | ||
2301 | * do compute it | ||
2197 | */ | 2302 | */ |
2198 | if ((s->uptodate == disks - 1) && | 2303 | pr_debug("Computing stripe %llu block %d\n", |
2199 | (s->failed && (i == r6s->failed_num[0] || | 2304 | (unsigned long long)sh->sector, disk_idx); |
2200 | i == r6s->failed_num[1]))) { | 2305 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); |
2201 | pr_debug("Computing stripe %llu block %d\n", | 2306 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); |
2202 | (unsigned long long)sh->sector, i); | 2307 | set_bit(R5_Wantcompute, &dev->flags); |
2203 | compute_block_1(sh, i, 0); | 2308 | sh->ops.target = disk_idx; |
2204 | s->uptodate++; | 2309 | sh->ops.target2 = -1; /* no 2nd target */ |
2205 | } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { | 2310 | s->req_compute = 1; |
2206 | /* Computing 2-failure is *very* expensive; only | 2311 | s->uptodate++; |
2207 | * do it if failed >= 2 | 2312 | return 1; |
2208 | */ | 2313 | } else if (s->uptodate == disks-2 && s->failed >= 2) { |
2209 | int other; | 2314 | /* Computing 2-failure is *very* expensive; only |
2210 | for (other = disks; other--; ) { | 2315 | * do it if failed >= 2 |
2211 | if (other == i) | 2316 | */ |
2212 | continue; | 2317 | int other; |
2213 | if (!test_bit(R5_UPTODATE, | 2318 | for (other = disks; other--; ) { |
2214 | &sh->dev[other].flags)) | 2319 | if (other == disk_idx) |
2215 | break; | 2320 | continue; |
2216 | } | 2321 | if (!test_bit(R5_UPTODATE, |
2217 | BUG_ON(other < 0); | 2322 | &sh->dev[other].flags)) |
2218 | pr_debug("Computing stripe %llu blocks %d,%d\n", | 2323 | break; |
2219 | (unsigned long long)sh->sector, | ||
2220 | i, other); | ||
2221 | compute_block_2(sh, i, other); | ||
2222 | s->uptodate += 2; | ||
2223 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
2224 | set_bit(R5_LOCKED, &dev->flags); | ||
2225 | set_bit(R5_Wantread, &dev->flags); | ||
2226 | s->locked++; | ||
2227 | pr_debug("Reading block %d (sync=%d)\n", | ||
2228 | i, s->syncing); | ||
2229 | } | 2324 | } |
2325 | BUG_ON(other < 0); | ||
2326 | pr_debug("Computing stripe %llu blocks %d,%d\n", | ||
2327 | (unsigned long long)sh->sector, | ||
2328 | disk_idx, other); | ||
2329 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
2330 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
2331 | set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); | ||
2332 | set_bit(R5_Wantcompute, &sh->dev[other].flags); | ||
2333 | sh->ops.target = disk_idx; | ||
2334 | sh->ops.target2 = other; | ||
2335 | s->uptodate += 2; | ||
2336 | s->req_compute = 1; | ||
2337 | return 1; | ||
2338 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
2339 | set_bit(R5_LOCKED, &dev->flags); | ||
2340 | set_bit(R5_Wantread, &dev->flags); | ||
2341 | s->locked++; | ||
2342 | pr_debug("Reading block %d (sync=%d)\n", | ||
2343 | disk_idx, s->syncing); | ||
2230 | } | 2344 | } |
2231 | } | 2345 | } |
2346 | |||
2347 | return 0; | ||
2348 | } | ||
2349 | |||
2350 | /** | ||
2351 | * handle_stripe_fill6 - read or compute data to satisfy pending requests. | ||
2352 | */ | ||
2353 | static void handle_stripe_fill6(struct stripe_head *sh, | ||
2354 | struct stripe_head_state *s, struct r6_state *r6s, | ||
2355 | int disks) | ||
2356 | { | ||
2357 | int i; | ||
2358 | |||
2359 | /* look for blocks to read/compute, skip this if a compute | ||
2360 | * is already in flight, or if the stripe contents are in the | ||
2361 | * midst of changing due to a write | ||
2362 | */ | ||
2363 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && | ||
2364 | !sh->reconstruct_state) | ||
2365 | for (i = disks; i--; ) | ||
2366 | if (fetch_block6(sh, s, r6s, i, disks)) | ||
2367 | break; | ||
2232 | set_bit(STRIPE_HANDLE, &sh->state); | 2368 | set_bit(STRIPE_HANDLE, &sh->state); |
2233 | } | 2369 | } |
2234 | 2370 | ||
@@ -2362,114 +2498,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
2362 | */ | 2498 | */ |
2363 | /* since handle_stripe can be called at any time we need to handle the | 2499 | /* since handle_stripe can be called at any time we need to handle the |
2364 | * case where a compute block operation has been submitted and then a | 2500 | * case where a compute block operation has been submitted and then a |
2365 | * subsequent call wants to start a write request. raid5_run_ops only | 2501 | * subsequent call wants to start a write request. raid_run_ops only |
2366 | * handles the case where compute block and postxor are requested | 2502 | * handles the case where compute block and reconstruct are requested |
2367 | * simultaneously. If this is not the case then new writes need to be | 2503 | * simultaneously. If this is not the case then new writes need to be |
2368 | * held off until the compute completes. | 2504 | * held off until the compute completes. |
2369 | */ | 2505 | */ |
2370 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && | 2506 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && |
2371 | (s->locked == 0 && (rcw == 0 || rmw == 0) && | 2507 | (s->locked == 0 && (rcw == 0 || rmw == 0) && |
2372 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) | 2508 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) |
2373 | schedule_reconstruction5(sh, s, rcw == 0, 0); | 2509 | schedule_reconstruction(sh, s, rcw == 0, 0); |
2374 | } | 2510 | } |
2375 | 2511 | ||
2376 | static void handle_stripe_dirtying6(raid5_conf_t *conf, | 2512 | static void handle_stripe_dirtying6(raid5_conf_t *conf, |
2377 | struct stripe_head *sh, struct stripe_head_state *s, | 2513 | struct stripe_head *sh, struct stripe_head_state *s, |
2378 | struct r6_state *r6s, int disks) | 2514 | struct r6_state *r6s, int disks) |
2379 | { | 2515 | { |
2380 | int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; | 2516 | int rcw = 0, pd_idx = sh->pd_idx, i; |
2381 | int qd_idx = sh->qd_idx; | 2517 | int qd_idx = sh->qd_idx; |
2518 | |||
2519 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2382 | for (i = disks; i--; ) { | 2520 | for (i = disks; i--; ) { |
2383 | struct r5dev *dev = &sh->dev[i]; | 2521 | struct r5dev *dev = &sh->dev[i]; |
2384 | /* Would I have to read this buffer for reconstruct_write */ | 2522 | /* check if we haven't enough data */ |
2385 | if (!test_bit(R5_OVERWRITE, &dev->flags) | 2523 | if (!test_bit(R5_OVERWRITE, &dev->flags) && |
2386 | && i != pd_idx && i != qd_idx | 2524 | i != pd_idx && i != qd_idx && |
2387 | && (!test_bit(R5_LOCKED, &dev->flags) | 2525 | !test_bit(R5_LOCKED, &dev->flags) && |
2388 | ) && | 2526 | !(test_bit(R5_UPTODATE, &dev->flags) || |
2389 | !test_bit(R5_UPTODATE, &dev->flags)) { | 2527 | test_bit(R5_Wantcompute, &dev->flags))) { |
2390 | if (test_bit(R5_Insync, &dev->flags)) rcw++; | 2528 | rcw++; |
2391 | else { | 2529 | if (!test_bit(R5_Insync, &dev->flags)) |
2392 | pr_debug("raid6: must_compute: " | 2530 | continue; /* it's a failed drive */ |
2393 | "disk %d flags=%#lx\n", i, dev->flags); | 2531 | |
2394 | must_compute++; | 2532 | if ( |
2533 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2534 | pr_debug("Read_old stripe %llu " | ||
2535 | "block %d for Reconstruct\n", | ||
2536 | (unsigned long long)sh->sector, i); | ||
2537 | set_bit(R5_LOCKED, &dev->flags); | ||
2538 | set_bit(R5_Wantread, &dev->flags); | ||
2539 | s->locked++; | ||
2540 | } else { | ||
2541 | pr_debug("Request delayed stripe %llu " | ||
2542 | "block %d for Reconstruct\n", | ||
2543 | (unsigned long long)sh->sector, i); | ||
2544 | set_bit(STRIPE_DELAYED, &sh->state); | ||
2545 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2395 | } | 2546 | } |
2396 | } | 2547 | } |
2397 | } | 2548 | } |
2398 | pr_debug("for sector %llu, rcw=%d, must_compute=%d\n", | ||
2399 | (unsigned long long)sh->sector, rcw, must_compute); | ||
2400 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2401 | |||
2402 | if (rcw > 0) | ||
2403 | /* want reconstruct write, but need to get some data */ | ||
2404 | for (i = disks; i--; ) { | ||
2405 | struct r5dev *dev = &sh->dev[i]; | ||
2406 | if (!test_bit(R5_OVERWRITE, &dev->flags) | ||
2407 | && !(s->failed == 0 && (i == pd_idx || i == qd_idx)) | ||
2408 | && !test_bit(R5_LOCKED, &dev->flags) && | ||
2409 | !test_bit(R5_UPTODATE, &dev->flags) && | ||
2410 | test_bit(R5_Insync, &dev->flags)) { | ||
2411 | if ( | ||
2412 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2413 | pr_debug("Read_old stripe %llu " | ||
2414 | "block %d for Reconstruct\n", | ||
2415 | (unsigned long long)sh->sector, i); | ||
2416 | set_bit(R5_LOCKED, &dev->flags); | ||
2417 | set_bit(R5_Wantread, &dev->flags); | ||
2418 | s->locked++; | ||
2419 | } else { | ||
2420 | pr_debug("Request delayed stripe %llu " | ||
2421 | "block %d for Reconstruct\n", | ||
2422 | (unsigned long long)sh->sector, i); | ||
2423 | set_bit(STRIPE_DELAYED, &sh->state); | ||
2424 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2425 | } | ||
2426 | } | ||
2427 | } | ||
2428 | /* now if nothing is locked, and if we have enough data, we can start a | 2549 | /* now if nothing is locked, and if we have enough data, we can start a |
2429 | * write request | 2550 | * write request |
2430 | */ | 2551 | */ |
2431 | if (s->locked == 0 && rcw == 0 && | 2552 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && |
2553 | s->locked == 0 && rcw == 0 && | ||
2432 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | 2554 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { |
2433 | if (must_compute > 0) { | 2555 | schedule_reconstruction(sh, s, 1, 0); |
2434 | /* We have failed blocks and need to compute them */ | ||
2435 | switch (s->failed) { | ||
2436 | case 0: | ||
2437 | BUG(); | ||
2438 | case 1: | ||
2439 | compute_block_1(sh, r6s->failed_num[0], 0); | ||
2440 | break; | ||
2441 | case 2: | ||
2442 | compute_block_2(sh, r6s->failed_num[0], | ||
2443 | r6s->failed_num[1]); | ||
2444 | break; | ||
2445 | default: /* This request should have been failed? */ | ||
2446 | BUG(); | ||
2447 | } | ||
2448 | } | ||
2449 | |||
2450 | pr_debug("Computing parity for stripe %llu\n", | ||
2451 | (unsigned long long)sh->sector); | ||
2452 | compute_parity6(sh, RECONSTRUCT_WRITE); | ||
2453 | /* now every locked buffer is ready to be written */ | ||
2454 | for (i = disks; i--; ) | ||
2455 | if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { | ||
2456 | pr_debug("Writing stripe %llu block %d\n", | ||
2457 | (unsigned long long)sh->sector, i); | ||
2458 | s->locked++; | ||
2459 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
2460 | } | ||
2461 | if (s->locked == disks) | ||
2462 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | ||
2463 | atomic_inc(&conf->pending_full_writes); | ||
2464 | /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ | ||
2465 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2466 | |||
2467 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2468 | atomic_dec(&conf->preread_active_stripes); | ||
2469 | if (atomic_read(&conf->preread_active_stripes) < | ||
2470 | IO_THRESHOLD) | ||
2471 | md_wakeup_thread(conf->mddev->thread); | ||
2472 | } | ||
2473 | } | 2556 | } |
2474 | } | 2557 | } |
2475 | 2558 | ||
@@ -2528,7 +2611,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2528 | * we are done. Otherwise update the mismatch count and repair | 2611 | * we are done. Otherwise update the mismatch count and repair |
2529 | * parity if !MD_RECOVERY_CHECK | 2612 | * parity if !MD_RECOVERY_CHECK |
2530 | */ | 2613 | */ |
2531 | if (sh->ops.zero_sum_result == 0) | 2614 | if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) |
2532 | /* parity is correct (on disc, | 2615 | /* parity is correct (on disc, |
2533 | * not in buffer any more) | 2616 | * not in buffer any more) |
2534 | */ | 2617 | */ |
@@ -2545,6 +2628,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2545 | set_bit(R5_Wantcompute, | 2628 | set_bit(R5_Wantcompute, |
2546 | &sh->dev[sh->pd_idx].flags); | 2629 | &sh->dev[sh->pd_idx].flags); |
2547 | sh->ops.target = sh->pd_idx; | 2630 | sh->ops.target = sh->pd_idx; |
2631 | sh->ops.target2 = -1; | ||
2548 | s->uptodate++; | 2632 | s->uptodate++; |
2549 | } | 2633 | } |
2550 | } | 2634 | } |
@@ -2561,67 +2645,74 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2561 | 2645 | ||
2562 | 2646 | ||
2563 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | 2647 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, |
2564 | struct stripe_head_state *s, | 2648 | struct stripe_head_state *s, |
2565 | struct r6_state *r6s, struct page *tmp_page, | 2649 | struct r6_state *r6s, int disks) |
2566 | int disks) | ||
2567 | { | 2650 | { |
2568 | int update_p = 0, update_q = 0; | ||
2569 | struct r5dev *dev; | ||
2570 | int pd_idx = sh->pd_idx; | 2651 | int pd_idx = sh->pd_idx; |
2571 | int qd_idx = sh->qd_idx; | 2652 | int qd_idx = sh->qd_idx; |
2653 | struct r5dev *dev; | ||
2572 | 2654 | ||
2573 | set_bit(STRIPE_HANDLE, &sh->state); | 2655 | set_bit(STRIPE_HANDLE, &sh->state); |
2574 | 2656 | ||
2575 | BUG_ON(s->failed > 2); | 2657 | BUG_ON(s->failed > 2); |
2576 | BUG_ON(s->uptodate < disks); | 2658 | |
2577 | /* Want to check and possibly repair P and Q. | 2659 | /* Want to check and possibly repair P and Q. |
2578 | * However there could be one 'failed' device, in which | 2660 | * However there could be one 'failed' device, in which |
2579 | * case we can only check one of them, possibly using the | 2661 | * case we can only check one of them, possibly using the |
2580 | * other to generate missing data | 2662 | * other to generate missing data |
2581 | */ | 2663 | */ |
2582 | 2664 | ||
2583 | /* If !tmp_page, we cannot do the calculations, | 2665 | switch (sh->check_state) { |
2584 | * but as we have set STRIPE_HANDLE, we will soon be called | 2666 | case check_state_idle: |
2585 | * by stripe_handle with a tmp_page - just wait until then. | 2667 | /* start a new check operation if there are < 2 failures */ |
2586 | */ | ||
2587 | if (tmp_page) { | ||
2588 | if (s->failed == r6s->q_failed) { | 2668 | if (s->failed == r6s->q_failed) { |
2589 | /* The only possible failed device holds 'Q', so it | 2669 | /* The only possible failed device holds Q, so it |
2590 | * makes sense to check P (If anything else were failed, | 2670 | * makes sense to check P (If anything else were failed, |
2591 | * we would have used P to recreate it). | 2671 | * we would have used P to recreate it). |
2592 | */ | 2672 | */ |
2593 | compute_block_1(sh, pd_idx, 1); | 2673 | sh->check_state = check_state_run; |
2594 | if (!page_is_zero(sh->dev[pd_idx].page)) { | ||
2595 | compute_block_1(sh, pd_idx, 0); | ||
2596 | update_p = 1; | ||
2597 | } | ||
2598 | } | 2674 | } |
2599 | if (!r6s->q_failed && s->failed < 2) { | 2675 | if (!r6s->q_failed && s->failed < 2) { |
2600 | /* q is not failed, and we didn't use it to generate | 2676 | /* Q is not failed, and we didn't use it to generate |
2601 | * anything, so it makes sense to check it | 2677 | * anything, so it makes sense to check it |
2602 | */ | 2678 | */ |
2603 | memcpy(page_address(tmp_page), | 2679 | if (sh->check_state == check_state_run) |
2604 | page_address(sh->dev[qd_idx].page), | 2680 | sh->check_state = check_state_run_pq; |
2605 | STRIPE_SIZE); | 2681 | else |
2606 | compute_parity6(sh, UPDATE_PARITY); | 2682 | sh->check_state = check_state_run_q; |
2607 | if (memcmp(page_address(tmp_page), | ||
2608 | page_address(sh->dev[qd_idx].page), | ||
2609 | STRIPE_SIZE) != 0) { | ||
2610 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
2611 | update_q = 1; | ||
2612 | } | ||
2613 | } | 2683 | } |
2614 | if (update_p || update_q) { | 2684 | |
2615 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | 2685 | /* discard potentially stale zero_sum_result */ |
2616 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | 2686 | sh->ops.zero_sum_result = 0; |
2617 | /* don't try to repair!! */ | 2687 | |
2618 | update_p = update_q = 0; | 2688 | if (sh->check_state == check_state_run) { |
2689 | /* async_xor_zero_sum destroys the contents of P */ | ||
2690 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
2691 | s->uptodate--; | ||
2619 | } | 2692 | } |
2693 | if (sh->check_state >= check_state_run && | ||
2694 | sh->check_state <= check_state_run_pq) { | ||
2695 | /* async_syndrome_zero_sum preserves P and Q, so | ||
2696 | * no need to mark them !uptodate here | ||
2697 | */ | ||
2698 | set_bit(STRIPE_OP_CHECK, &s->ops_request); | ||
2699 | break; | ||
2700 | } | ||
2701 | |||
2702 | /* we have 2-disk failure */ | ||
2703 | BUG_ON(s->failed != 2); | ||
2704 | /* fall through */ | ||
2705 | case check_state_compute_result: | ||
2706 | sh->check_state = check_state_idle; | ||
2707 | |||
2708 | /* check that a write has not made the stripe insync */ | ||
2709 | if (test_bit(STRIPE_INSYNC, &sh->state)) | ||
2710 | break; | ||
2620 | 2711 | ||
2621 | /* now write out any block on a failed drive, | 2712 | /* now write out any block on a failed drive, |
2622 | * or P or Q if they need it | 2713 | * or P or Q if they were recomputed |
2623 | */ | 2714 | */ |
2624 | 2715 | BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ | |
2625 | if (s->failed == 2) { | 2716 | if (s->failed == 2) { |
2626 | dev = &sh->dev[r6s->failed_num[1]]; | 2717 | dev = &sh->dev[r6s->failed_num[1]]; |
2627 | s->locked++; | 2718 | s->locked++; |
@@ -2634,14 +2725,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2634 | set_bit(R5_LOCKED, &dev->flags); | 2725 | set_bit(R5_LOCKED, &dev->flags); |
2635 | set_bit(R5_Wantwrite, &dev->flags); | 2726 | set_bit(R5_Wantwrite, &dev->flags); |
2636 | } | 2727 | } |
2637 | 2728 | if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { | |
2638 | if (update_p) { | ||
2639 | dev = &sh->dev[pd_idx]; | 2729 | dev = &sh->dev[pd_idx]; |
2640 | s->locked++; | 2730 | s->locked++; |
2641 | set_bit(R5_LOCKED, &dev->flags); | 2731 | set_bit(R5_LOCKED, &dev->flags); |
2642 | set_bit(R5_Wantwrite, &dev->flags); | 2732 | set_bit(R5_Wantwrite, &dev->flags); |
2643 | } | 2733 | } |
2644 | if (update_q) { | 2734 | if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { |
2645 | dev = &sh->dev[qd_idx]; | 2735 | dev = &sh->dev[qd_idx]; |
2646 | s->locked++; | 2736 | s->locked++; |
2647 | set_bit(R5_LOCKED, &dev->flags); | 2737 | set_bit(R5_LOCKED, &dev->flags); |
@@ -2650,6 +2740,70 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2650 | clear_bit(STRIPE_DEGRADED, &sh->state); | 2740 | clear_bit(STRIPE_DEGRADED, &sh->state); |
2651 | 2741 | ||
2652 | set_bit(STRIPE_INSYNC, &sh->state); | 2742 | set_bit(STRIPE_INSYNC, &sh->state); |
2743 | break; | ||
2744 | case check_state_run: | ||
2745 | case check_state_run_q: | ||
2746 | case check_state_run_pq: | ||
2747 | break; /* we will be called again upon completion */ | ||
2748 | case check_state_check_result: | ||
2749 | sh->check_state = check_state_idle; | ||
2750 | |||
2751 | /* handle a successful check operation, if parity is correct | ||
2752 | * we are done. Otherwise update the mismatch count and repair | ||
2753 | * parity if !MD_RECOVERY_CHECK | ||
2754 | */ | ||
2755 | if (sh->ops.zero_sum_result == 0) { | ||
2756 | /* both parities are correct */ | ||
2757 | if (!s->failed) | ||
2758 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2759 | else { | ||
2760 | /* in contrast to the raid5 case we can validate | ||
2761 | * parity, but still have a failure to write | ||
2762 | * back | ||
2763 | */ | ||
2764 | sh->check_state = check_state_compute_result; | ||
2765 | /* Returning at this point means that we may go | ||
2766 | * off and bring p and/or q uptodate again so | ||
2767 | * we make sure to check zero_sum_result again | ||
2768 | * to verify if p or q need writeback | ||
2769 | */ | ||
2770 | } | ||
2771 | } else { | ||
2772 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
2773 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
2774 | /* don't try to repair!! */ | ||
2775 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2776 | else { | ||
2777 | int *target = &sh->ops.target; | ||
2778 | |||
2779 | sh->ops.target = -1; | ||
2780 | sh->ops.target2 = -1; | ||
2781 | sh->check_state = check_state_compute_run; | ||
2782 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
2783 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
2784 | if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { | ||
2785 | set_bit(R5_Wantcompute, | ||
2786 | &sh->dev[pd_idx].flags); | ||
2787 | *target = pd_idx; | ||
2788 | target = &sh->ops.target2; | ||
2789 | s->uptodate++; | ||
2790 | } | ||
2791 | if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { | ||
2792 | set_bit(R5_Wantcompute, | ||
2793 | &sh->dev[qd_idx].flags); | ||
2794 | *target = qd_idx; | ||
2795 | s->uptodate++; | ||
2796 | } | ||
2797 | } | ||
2798 | } | ||
2799 | break; | ||
2800 | case check_state_compute_run: | ||
2801 | break; | ||
2802 | default: | ||
2803 | printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", | ||
2804 | __func__, sh->check_state, | ||
2805 | (unsigned long long) sh->sector); | ||
2806 | BUG(); | ||
2653 | } | 2807 | } |
2654 | } | 2808 | } |
2655 | 2809 | ||
@@ -2667,6 +2821,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2667 | if (i != sh->pd_idx && i != sh->qd_idx) { | 2821 | if (i != sh->pd_idx && i != sh->qd_idx) { |
2668 | int dd_idx, j; | 2822 | int dd_idx, j; |
2669 | struct stripe_head *sh2; | 2823 | struct stripe_head *sh2; |
2824 | struct async_submit_ctl submit; | ||
2670 | 2825 | ||
2671 | sector_t bn = compute_blocknr(sh, i, 1); | 2826 | sector_t bn = compute_blocknr(sh, i, 1); |
2672 | sector_t s = raid5_compute_sector(conf, bn, 0, | 2827 | sector_t s = raid5_compute_sector(conf, bn, 0, |
@@ -2686,9 +2841,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2686 | } | 2841 | } |
2687 | 2842 | ||
2688 | /* place all the copies on one channel */ | 2843 | /* place all the copies on one channel */ |
2844 | init_async_submit(&submit, 0, tx, NULL, NULL, NULL); | ||
2689 | tx = async_memcpy(sh2->dev[dd_idx].page, | 2845 | tx = async_memcpy(sh2->dev[dd_idx].page, |
2690 | sh->dev[i].page, 0, 0, STRIPE_SIZE, | 2846 | sh->dev[i].page, 0, 0, STRIPE_SIZE, |
2691 | ASYNC_TX_DEP_ACK, tx, NULL, NULL); | 2847 | &submit); |
2692 | 2848 | ||
2693 | set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); | 2849 | set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); |
2694 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); | 2850 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); |
@@ -2974,7 +3130,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2974 | /* Need to write out all blocks after computing parity */ | 3130 | /* Need to write out all blocks after computing parity */ |
2975 | sh->disks = conf->raid_disks; | 3131 | sh->disks = conf->raid_disks; |
2976 | stripe_set_idx(sh->sector, conf, 0, sh); | 3132 | stripe_set_idx(sh->sector, conf, 0, sh); |
2977 | schedule_reconstruction5(sh, &s, 1, 1); | 3133 | schedule_reconstruction(sh, &s, 1, 1); |
2978 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { | 3134 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { |
2979 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 3135 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
2980 | atomic_dec(&conf->reshape_stripes); | 3136 | atomic_dec(&conf->reshape_stripes); |
@@ -2994,7 +3150,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2994 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 3150 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
2995 | 3151 | ||
2996 | if (s.ops_request) | 3152 | if (s.ops_request) |
2997 | raid5_run_ops(sh, s.ops_request); | 3153 | raid_run_ops(sh, s.ops_request); |
2998 | 3154 | ||
2999 | ops_run_io(sh, &s); | 3155 | ops_run_io(sh, &s); |
3000 | 3156 | ||
@@ -3003,7 +3159,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
3003 | return blocked_rdev == NULL; | 3159 | return blocked_rdev == NULL; |
3004 | } | 3160 | } |
3005 | 3161 | ||
3006 | static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | 3162 | static bool handle_stripe6(struct stripe_head *sh) |
3007 | { | 3163 | { |
3008 | raid5_conf_t *conf = sh->raid_conf; | 3164 | raid5_conf_t *conf = sh->raid_conf; |
3009 | int disks = sh->disks; | 3165 | int disks = sh->disks; |
@@ -3015,9 +3171,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3015 | mdk_rdev_t *blocked_rdev = NULL; | 3171 | mdk_rdev_t *blocked_rdev = NULL; |
3016 | 3172 | ||
3017 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " | 3173 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " |
3018 | "pd_idx=%d, qd_idx=%d\n", | 3174 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", |
3019 | (unsigned long long)sh->sector, sh->state, | 3175 | (unsigned long long)sh->sector, sh->state, |
3020 | atomic_read(&sh->count), pd_idx, qd_idx); | 3176 | atomic_read(&sh->count), pd_idx, qd_idx, |
3177 | sh->check_state, sh->reconstruct_state); | ||
3021 | memset(&s, 0, sizeof(s)); | 3178 | memset(&s, 0, sizeof(s)); |
3022 | 3179 | ||
3023 | spin_lock(&sh->lock); | 3180 | spin_lock(&sh->lock); |
@@ -3037,35 +3194,24 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3037 | 3194 | ||
3038 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3195 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
3039 | i, dev->flags, dev->toread, dev->towrite, dev->written); | 3196 | i, dev->flags, dev->toread, dev->towrite, dev->written); |
3040 | /* maybe we can reply to a read */ | 3197 | /* maybe we can reply to a read |
3041 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { | 3198 | * |
3042 | struct bio *rbi, *rbi2; | 3199 | * new wantfill requests are only permitted while |
3043 | pr_debug("Return read for disc %d\n", i); | 3200 | * ops_complete_biofill is guaranteed to be inactive |
3044 | spin_lock_irq(&conf->device_lock); | 3201 | */ |
3045 | rbi = dev->toread; | 3202 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && |
3046 | dev->toread = NULL; | 3203 | !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) |
3047 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | 3204 | set_bit(R5_Wantfill, &dev->flags); |
3048 | wake_up(&conf->wait_for_overlap); | ||
3049 | spin_unlock_irq(&conf->device_lock); | ||
3050 | while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
3051 | copy_data(0, rbi, dev->page, dev->sector); | ||
3052 | rbi2 = r5_next_bio(rbi, dev->sector); | ||
3053 | spin_lock_irq(&conf->device_lock); | ||
3054 | if (!raid5_dec_bi_phys_segments(rbi)) { | ||
3055 | rbi->bi_next = return_bi; | ||
3056 | return_bi = rbi; | ||
3057 | } | ||
3058 | spin_unlock_irq(&conf->device_lock); | ||
3059 | rbi = rbi2; | ||
3060 | } | ||
3061 | } | ||
3062 | 3205 | ||
3063 | /* now count some things */ | 3206 | /* now count some things */ |
3064 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; | 3207 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; |
3065 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; | 3208 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; |
3209 | if (test_bit(R5_Wantcompute, &dev->flags)) | ||
3210 | BUG_ON(++s.compute > 2); | ||
3066 | 3211 | ||
3067 | 3212 | if (test_bit(R5_Wantfill, &dev->flags)) { | |
3068 | if (dev->toread) | 3213 | s.to_fill++; |
3214 | } else if (dev->toread) | ||
3069 | s.to_read++; | 3215 | s.to_read++; |
3070 | if (dev->towrite) { | 3216 | if (dev->towrite) { |
3071 | s.to_write++; | 3217 | s.to_write++; |
@@ -3106,6 +3252,11 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3106 | blocked_rdev = NULL; | 3252 | blocked_rdev = NULL; |
3107 | } | 3253 | } |
3108 | 3254 | ||
3255 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { | ||
3256 | set_bit(STRIPE_OP_BIOFILL, &s.ops_request); | ||
3257 | set_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
3258 | } | ||
3259 | |||
3109 | pr_debug("locked=%d uptodate=%d to_read=%d" | 3260 | pr_debug("locked=%d uptodate=%d to_read=%d" |
3110 | " to_write=%d failed=%d failed_num=%d,%d\n", | 3261 | " to_write=%d failed=%d failed_num=%d,%d\n", |
3111 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, | 3262 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, |
@@ -3146,19 +3297,62 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3146 | * or to load a block that is being partially written. | 3297 | * or to load a block that is being partially written. |
3147 | */ | 3298 | */ |
3148 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || | 3299 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || |
3149 | (s.syncing && (s.uptodate < disks)) || s.expanding) | 3300 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) |
3150 | handle_stripe_fill6(sh, &s, &r6s, disks); | 3301 | handle_stripe_fill6(sh, &s, &r6s, disks); |
3151 | 3302 | ||
3152 | /* now to consider writing and what else, if anything should be read */ | 3303 | /* Now we check to see if any write operations have recently |
3153 | if (s.to_write) | 3304 | * completed |
3305 | */ | ||
3306 | if (sh->reconstruct_state == reconstruct_state_drain_result) { | ||
3307 | int qd_idx = sh->qd_idx; | ||
3308 | |||
3309 | sh->reconstruct_state = reconstruct_state_idle; | ||
3310 | /* All the 'written' buffers and the parity blocks are ready to | ||
3311 | * be written back to disk | ||
3312 | */ | ||
3313 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); | ||
3314 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); | ||
3315 | for (i = disks; i--; ) { | ||
3316 | dev = &sh->dev[i]; | ||
3317 | if (test_bit(R5_LOCKED, &dev->flags) && | ||
3318 | (i == sh->pd_idx || i == qd_idx || | ||
3319 | dev->written)) { | ||
3320 | pr_debug("Writing block %d\n", i); | ||
3321 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | ||
3322 | set_bit(R5_Wantwrite, &dev->flags); | ||
3323 | if (!test_bit(R5_Insync, &dev->flags) || | ||
3324 | ((i == sh->pd_idx || i == qd_idx) && | ||
3325 | s.failed == 0)) | ||
3326 | set_bit(STRIPE_INSYNC, &sh->state); | ||
3327 | } | ||
3328 | } | ||
3329 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
3330 | atomic_dec(&conf->preread_active_stripes); | ||
3331 | if (atomic_read(&conf->preread_active_stripes) < | ||
3332 | IO_THRESHOLD) | ||
3333 | md_wakeup_thread(conf->mddev->thread); | ||
3334 | } | ||
3335 | } | ||
3336 | |||
3337 | /* Now to consider new write requests and what else, if anything | ||
3338 | * should be read. We do not handle new writes when: | ||
3339 | * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. | ||
3340 | * 2/ A 'check' operation is in flight, as it may clobber the parity | ||
3341 | * block. | ||
3342 | */ | ||
3343 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | ||
3154 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); | 3344 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); |
3155 | 3345 | ||
3156 | /* maybe we need to check and possibly fix the parity for this stripe | 3346 | /* maybe we need to check and possibly fix the parity for this stripe |
3157 | * Any reads will already have been scheduled, so we just see if enough | 3347 | * Any reads will already have been scheduled, so we just see if enough |
3158 | * data is available | 3348 | * data is available. The parity check is held off while parity |
3349 | * dependent operations are in flight. | ||
3159 | */ | 3350 | */ |
3160 | if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) | 3351 | if (sh->check_state || |
3161 | handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); | 3352 | (s.syncing && s.locked == 0 && |
3353 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && | ||
3354 | !test_bit(STRIPE_INSYNC, &sh->state))) | ||
3355 | handle_parity_checks6(conf, sh, &s, &r6s, disks); | ||
3162 | 3356 | ||
3163 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 3357 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
3164 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | 3358 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); |
@@ -3179,15 +3373,29 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3179 | set_bit(R5_Wantwrite, &dev->flags); | 3373 | set_bit(R5_Wantwrite, &dev->flags); |
3180 | set_bit(R5_ReWrite, &dev->flags); | 3374 | set_bit(R5_ReWrite, &dev->flags); |
3181 | set_bit(R5_LOCKED, &dev->flags); | 3375 | set_bit(R5_LOCKED, &dev->flags); |
3376 | s.locked++; | ||
3182 | } else { | 3377 | } else { |
3183 | /* let's read it back */ | 3378 | /* let's read it back */ |
3184 | set_bit(R5_Wantread, &dev->flags); | 3379 | set_bit(R5_Wantread, &dev->flags); |
3185 | set_bit(R5_LOCKED, &dev->flags); | 3380 | set_bit(R5_LOCKED, &dev->flags); |
3381 | s.locked++; | ||
3186 | } | 3382 | } |
3187 | } | 3383 | } |
3188 | } | 3384 | } |
3189 | 3385 | ||
3190 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { | 3386 | /* Finish reconstruct operations initiated by the expansion process */ |
3387 | if (sh->reconstruct_state == reconstruct_state_result) { | ||
3388 | sh->reconstruct_state = reconstruct_state_idle; | ||
3389 | clear_bit(STRIPE_EXPANDING, &sh->state); | ||
3390 | for (i = conf->raid_disks; i--; ) { | ||
3391 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
3392 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3393 | s.locked++; | ||
3394 | } | ||
3395 | } | ||
3396 | |||
3397 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | ||
3398 | !sh->reconstruct_state) { | ||
3191 | struct stripe_head *sh2 | 3399 | struct stripe_head *sh2 |
3192 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | 3400 | = get_active_stripe(conf, sh->sector, 1, 1, 1); |
3193 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | 3401 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { |
@@ -3208,14 +3416,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3208 | /* Need to write out all blocks after computing P&Q */ | 3416 | /* Need to write out all blocks after computing P&Q */ |
3209 | sh->disks = conf->raid_disks; | 3417 | sh->disks = conf->raid_disks; |
3210 | stripe_set_idx(sh->sector, conf, 0, sh); | 3418 | stripe_set_idx(sh->sector, conf, 0, sh); |
3211 | compute_parity6(sh, RECONSTRUCT_WRITE); | 3419 | schedule_reconstruction(sh, &s, 1, 1); |
3212 | for (i = conf->raid_disks ; i-- ; ) { | 3420 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { |
3213 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3214 | s.locked++; | ||
3215 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
3216 | } | ||
3217 | clear_bit(STRIPE_EXPANDING, &sh->state); | ||
3218 | } else if (s.expanded) { | ||
3219 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 3421 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
3220 | atomic_dec(&conf->reshape_stripes); | 3422 | atomic_dec(&conf->reshape_stripes); |
3221 | wake_up(&conf->wait_for_overlap); | 3423 | wake_up(&conf->wait_for_overlap); |
@@ -3233,6 +3435,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3233 | if (unlikely(blocked_rdev)) | 3435 | if (unlikely(blocked_rdev)) |
3234 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 3436 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
3235 | 3437 | ||
3438 | if (s.ops_request) | ||
3439 | raid_run_ops(sh, s.ops_request); | ||
3440 | |||
3236 | ops_run_io(sh, &s); | 3441 | ops_run_io(sh, &s); |
3237 | 3442 | ||
3238 | return_io(return_bi); | 3443 | return_io(return_bi); |
@@ -3241,16 +3446,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3241 | } | 3446 | } |
3242 | 3447 | ||
3243 | /* returns true if the stripe was handled */ | 3448 | /* returns true if the stripe was handled */ |
3244 | static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) | 3449 | static bool handle_stripe(struct stripe_head *sh) |
3245 | { | 3450 | { |
3246 | if (sh->raid_conf->level == 6) | 3451 | if (sh->raid_conf->level == 6) |
3247 | return handle_stripe6(sh, tmp_page); | 3452 | return handle_stripe6(sh); |
3248 | else | 3453 | else |
3249 | return handle_stripe5(sh); | 3454 | return handle_stripe5(sh); |
3250 | } | 3455 | } |
3251 | 3456 | ||
3252 | |||
3253 | |||
3254 | static void raid5_activate_delayed(raid5_conf_t *conf) | 3457 | static void raid5_activate_delayed(raid5_conf_t *conf) |
3255 | { | 3458 | { |
3256 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { | 3459 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { |
@@ -4046,7 +4249,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
4046 | spin_unlock(&sh->lock); | 4249 | spin_unlock(&sh->lock); |
4047 | 4250 | ||
4048 | /* wait for any blocked device to be handled */ | 4251 | /* wait for any blocked device to be handled */ |
4049 | while(unlikely(!handle_stripe(sh, NULL))) | 4252 | while (unlikely(!handle_stripe(sh))) |
4050 | ; | 4253 | ; |
4051 | release_stripe(sh); | 4254 | release_stripe(sh); |
4052 | 4255 | ||
@@ -4103,7 +4306,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
4103 | return handled; | 4306 | return handled; |
4104 | } | 4307 | } |
4105 | 4308 | ||
4106 | handle_stripe(sh, NULL); | 4309 | handle_stripe(sh); |
4107 | release_stripe(sh); | 4310 | release_stripe(sh); |
4108 | handled++; | 4311 | handled++; |
4109 | } | 4312 | } |
@@ -4117,6 +4320,36 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
4117 | return handled; | 4320 | return handled; |
4118 | } | 4321 | } |
4119 | 4322 | ||
4323 | #ifdef CONFIG_MULTICORE_RAID456 | ||
4324 | static void __process_stripe(void *param, async_cookie_t cookie) | ||
4325 | { | ||
4326 | struct stripe_head *sh = param; | ||
4327 | |||
4328 | handle_stripe(sh); | ||
4329 | release_stripe(sh); | ||
4330 | } | ||
4331 | |||
4332 | static void process_stripe(struct stripe_head *sh, struct list_head *domain) | ||
4333 | { | ||
4334 | async_schedule_domain(__process_stripe, sh, domain); | ||
4335 | } | ||
4336 | |||
4337 | static void synchronize_stripe_processing(struct list_head *domain) | ||
4338 | { | ||
4339 | async_synchronize_full_domain(domain); | ||
4340 | } | ||
4341 | #else | ||
4342 | static void process_stripe(struct stripe_head *sh, struct list_head *domain) | ||
4343 | { | ||
4344 | handle_stripe(sh); | ||
4345 | release_stripe(sh); | ||
4346 | cond_resched(); | ||
4347 | } | ||
4348 | |||
4349 | static void synchronize_stripe_processing(struct list_head *domain) | ||
4350 | { | ||
4351 | } | ||
4352 | #endif | ||
4120 | 4353 | ||
4121 | 4354 | ||
4122 | /* | 4355 | /* |
@@ -4131,6 +4364,7 @@ static void raid5d(mddev_t *mddev) | |||
4131 | struct stripe_head *sh; | 4364 | struct stripe_head *sh; |
4132 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4365 | raid5_conf_t *conf = mddev_to_conf(mddev); |
4133 | int handled; | 4366 | int handled; |
4367 | LIST_HEAD(raid_domain); | ||
4134 | 4368 | ||
4135 | pr_debug("+++ raid5d active\n"); | 4369 | pr_debug("+++ raid5d active\n"); |
4136 | 4370 | ||
@@ -4167,8 +4401,7 @@ static void raid5d(mddev_t *mddev) | |||
4167 | spin_unlock_irq(&conf->device_lock); | 4401 | spin_unlock_irq(&conf->device_lock); |
4168 | 4402 | ||
4169 | handled++; | 4403 | handled++; |
4170 | handle_stripe(sh, conf->spare_page); | 4404 | process_stripe(sh, &raid_domain); |
4171 | release_stripe(sh); | ||
4172 | 4405 | ||
4173 | spin_lock_irq(&conf->device_lock); | 4406 | spin_lock_irq(&conf->device_lock); |
4174 | } | 4407 | } |
@@ -4176,6 +4409,7 @@ static void raid5d(mddev_t *mddev) | |||
4176 | 4409 | ||
4177 | spin_unlock_irq(&conf->device_lock); | 4410 | spin_unlock_irq(&conf->device_lock); |
4178 | 4411 | ||
4412 | synchronize_stripe_processing(&raid_domain); | ||
4179 | async_tx_issue_pending_all(); | 4413 | async_tx_issue_pending_all(); |
4180 | unplug_slaves(mddev); | 4414 | unplug_slaves(mddev); |
4181 | 4415 | ||
@@ -4308,6 +4542,118 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
4308 | return sectors * (raid_disks - conf->max_degraded); | 4542 | return sectors * (raid_disks - conf->max_degraded); |
4309 | } | 4543 | } |
4310 | 4544 | ||
4545 | static void raid5_free_percpu(raid5_conf_t *conf) | ||
4546 | { | ||
4547 | struct raid5_percpu *percpu; | ||
4548 | unsigned long cpu; | ||
4549 | |||
4550 | if (!conf->percpu) | ||
4551 | return; | ||
4552 | |||
4553 | get_online_cpus(); | ||
4554 | for_each_possible_cpu(cpu) { | ||
4555 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
4556 | safe_put_page(percpu->spare_page); | ||
4557 | kfree(percpu->scribble); | ||
4558 | } | ||
4559 | #ifdef CONFIG_HOTPLUG_CPU | ||
4560 | unregister_cpu_notifier(&conf->cpu_notify); | ||
4561 | #endif | ||
4562 | put_online_cpus(); | ||
4563 | |||
4564 | free_percpu(conf->percpu); | ||
4565 | } | ||
4566 | |||
4567 | static void free_conf(raid5_conf_t *conf) | ||
4568 | { | ||
4569 | shrink_stripes(conf); | ||
4570 | raid5_free_percpu(conf); | ||
4571 | kfree(conf->disks); | ||
4572 | kfree(conf->stripe_hashtbl); | ||
4573 | kfree(conf); | ||
4574 | } | ||
4575 | |||
4576 | #ifdef CONFIG_HOTPLUG_CPU | ||
4577 | static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, | ||
4578 | void *hcpu) | ||
4579 | { | ||
4580 | raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify); | ||
4581 | long cpu = (long)hcpu; | ||
4582 | struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); | ||
4583 | |||
4584 | switch (action) { | ||
4585 | case CPU_UP_PREPARE: | ||
4586 | case CPU_UP_PREPARE_FROZEN: | ||
4587 | if (conf->level == 6 && !percpu->spare_page) | ||
4588 | percpu->spare_page = alloc_page(GFP_KERNEL); | ||
4589 | if (!percpu->scribble) | ||
4590 | percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); | ||
4591 | |||
4592 | if (!percpu->scribble || | ||
4593 | (conf->level == 6 && !percpu->spare_page)) { | ||
4594 | safe_put_page(percpu->spare_page); | ||
4595 | kfree(percpu->scribble); | ||
4596 | pr_err("%s: failed memory allocation for cpu%ld\n", | ||
4597 | __func__, cpu); | ||
4598 | return NOTIFY_BAD; | ||
4599 | } | ||
4600 | break; | ||
4601 | case CPU_DEAD: | ||
4602 | case CPU_DEAD_FROZEN: | ||
4603 | safe_put_page(percpu->spare_page); | ||
4604 | kfree(percpu->scribble); | ||
4605 | percpu->spare_page = NULL; | ||
4606 | percpu->scribble = NULL; | ||
4607 | break; | ||
4608 | default: | ||
4609 | break; | ||
4610 | } | ||
4611 | return NOTIFY_OK; | ||
4612 | } | ||
4613 | #endif | ||
4614 | |||
4615 | static int raid5_alloc_percpu(raid5_conf_t *conf) | ||
4616 | { | ||
4617 | unsigned long cpu; | ||
4618 | struct page *spare_page; | ||
4619 | struct raid5_percpu *allcpus; | ||
4620 | void *scribble; | ||
4621 | int err; | ||
4622 | |||
4623 | allcpus = alloc_percpu(struct raid5_percpu); | ||
4624 | if (!allcpus) | ||
4625 | return -ENOMEM; | ||
4626 | conf->percpu = allcpus; | ||
4627 | |||
4628 | get_online_cpus(); | ||
4629 | err = 0; | ||
4630 | for_each_present_cpu(cpu) { | ||
4631 | if (conf->level == 6) { | ||
4632 | spare_page = alloc_page(GFP_KERNEL); | ||
4633 | if (!spare_page) { | ||
4634 | err = -ENOMEM; | ||
4635 | break; | ||
4636 | } | ||
4637 | per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; | ||
4638 | } | ||
4639 | scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL); | ||
4640 | if (!scribble) { | ||
4641 | err = -ENOMEM; | ||
4642 | break; | ||
4643 | } | ||
4644 | per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; | ||
4645 | } | ||
4646 | #ifdef CONFIG_HOTPLUG_CPU | ||
4647 | conf->cpu_notify.notifier_call = raid456_cpu_notify; | ||
4648 | conf->cpu_notify.priority = 0; | ||
4649 | if (err == 0) | ||
4650 | err = register_cpu_notifier(&conf->cpu_notify); | ||
4651 | #endif | ||
4652 | put_online_cpus(); | ||
4653 | |||
4654 | return err; | ||
4655 | } | ||
4656 | |||
4311 | static raid5_conf_t *setup_conf(mddev_t *mddev) | 4657 | static raid5_conf_t *setup_conf(mddev_t *mddev) |
4312 | { | 4658 | { |
4313 | raid5_conf_t *conf; | 4659 | raid5_conf_t *conf; |
@@ -4347,6 +4693,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4347 | goto abort; | 4693 | goto abort; |
4348 | 4694 | ||
4349 | conf->raid_disks = mddev->raid_disks; | 4695 | conf->raid_disks = mddev->raid_disks; |
4696 | conf->scribble_len = scribble_len(conf->raid_disks); | ||
4350 | if (mddev->reshape_position == MaxSector) | 4697 | if (mddev->reshape_position == MaxSector) |
4351 | conf->previous_raid_disks = mddev->raid_disks; | 4698 | conf->previous_raid_disks = mddev->raid_disks; |
4352 | else | 4699 | else |
@@ -4362,11 +4709,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4362 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) | 4709 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) |
4363 | goto abort; | 4710 | goto abort; |
4364 | 4711 | ||
4365 | if (mddev->new_level == 6) { | 4712 | conf->level = mddev->new_level; |
4366 | conf->spare_page = alloc_page(GFP_KERNEL); | 4713 | if (raid5_alloc_percpu(conf) != 0) |
4367 | if (!conf->spare_page) | 4714 | goto abort; |
4368 | goto abort; | 4715 | |
4369 | } | ||
4370 | spin_lock_init(&conf->device_lock); | 4716 | spin_lock_init(&conf->device_lock); |
4371 | init_waitqueue_head(&conf->wait_for_stripe); | 4717 | init_waitqueue_head(&conf->wait_for_stripe); |
4372 | init_waitqueue_head(&conf->wait_for_overlap); | 4718 | init_waitqueue_head(&conf->wait_for_overlap); |
@@ -4402,7 +4748,6 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4402 | } | 4748 | } |
4403 | 4749 | ||
4404 | conf->chunk_size = mddev->new_chunk; | 4750 | conf->chunk_size = mddev->new_chunk; |
4405 | conf->level = mddev->new_level; | ||
4406 | if (conf->level == 6) | 4751 | if (conf->level == 6) |
4407 | conf->max_degraded = 2; | 4752 | conf->max_degraded = 2; |
4408 | else | 4753 | else |
@@ -4437,11 +4782,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4437 | 4782 | ||
4438 | abort: | 4783 | abort: |
4439 | if (conf) { | 4784 | if (conf) { |
4440 | shrink_stripes(conf); | 4785 | free_conf(conf); |
4441 | safe_put_page(conf->spare_page); | ||
4442 | kfree(conf->disks); | ||
4443 | kfree(conf->stripe_hashtbl); | ||
4444 | kfree(conf); | ||
4445 | return ERR_PTR(-EIO); | 4786 | return ERR_PTR(-EIO); |
4446 | } else | 4787 | } else |
4447 | return ERR_PTR(-ENOMEM); | 4788 | return ERR_PTR(-ENOMEM); |
@@ -4607,12 +4948,8 @@ abort: | |||
4607 | md_unregister_thread(mddev->thread); | 4948 | md_unregister_thread(mddev->thread); |
4608 | mddev->thread = NULL; | 4949 | mddev->thread = NULL; |
4609 | if (conf) { | 4950 | if (conf) { |
4610 | shrink_stripes(conf); | ||
4611 | print_raid5_conf(conf); | 4951 | print_raid5_conf(conf); |
4612 | safe_put_page(conf->spare_page); | 4952 | free_conf(conf); |
4613 | kfree(conf->disks); | ||
4614 | kfree(conf->stripe_hashtbl); | ||
4615 | kfree(conf); | ||
4616 | } | 4953 | } |
4617 | mddev->private = NULL; | 4954 | mddev->private = NULL; |
4618 | printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); | 4955 | printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); |
@@ -4627,13 +4964,10 @@ static int stop(mddev_t *mddev) | |||
4627 | 4964 | ||
4628 | md_unregister_thread(mddev->thread); | 4965 | md_unregister_thread(mddev->thread); |
4629 | mddev->thread = NULL; | 4966 | mddev->thread = NULL; |
4630 | shrink_stripes(conf); | ||
4631 | kfree(conf->stripe_hashtbl); | ||
4632 | mddev->queue->backing_dev_info.congested_fn = NULL; | 4967 | mddev->queue->backing_dev_info.congested_fn = NULL; |
4633 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 4968 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
4634 | sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); | 4969 | sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); |
4635 | kfree(conf->disks); | 4970 | free_conf(conf); |
4636 | kfree(conf); | ||
4637 | mddev->private = NULL; | 4971 | mddev->private = NULL; |
4638 | return 0; | 4972 | return 0; |
4639 | } | 4973 | } |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 52ba99954dec..116d0b44b2a9 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define _RAID5_H | 2 | #define _RAID5_H |
3 | 3 | ||
4 | #include <linux/raid/xor.h> | 4 | #include <linux/raid/xor.h> |
5 | #include <linux/dmaengine.h> | ||
5 | 6 | ||
6 | /* | 7 | /* |
7 | * | 8 | * |
@@ -175,7 +176,9 @@ | |||
175 | */ | 176 | */ |
176 | enum check_states { | 177 | enum check_states { |
177 | check_state_idle = 0, | 178 | check_state_idle = 0, |
178 | check_state_run, /* parity check */ | 179 | check_state_run, /* xor parity check */ |
180 | check_state_run_q, /* q-parity check */ | ||
181 | check_state_run_pq, /* pq dual parity check */ | ||
179 | check_state_check_result, | 182 | check_state_check_result, |
180 | check_state_compute_run, /* parity repair */ | 183 | check_state_compute_run, /* parity repair */ |
181 | check_state_compute_result, | 184 | check_state_compute_result, |
@@ -215,8 +218,8 @@ struct stripe_head { | |||
215 | * @target - STRIPE_OP_COMPUTE_BLK target | 218 | * @target - STRIPE_OP_COMPUTE_BLK target |
216 | */ | 219 | */ |
217 | struct stripe_operations { | 220 | struct stripe_operations { |
218 | int target; | 221 | int target, target2; |
219 | u32 zero_sum_result; | 222 | enum sum_check_flags zero_sum_result; |
220 | } ops; | 223 | } ops; |
221 | struct r5dev { | 224 | struct r5dev { |
222 | struct bio req; | 225 | struct bio req; |
@@ -298,7 +301,7 @@ struct r6_state { | |||
298 | #define STRIPE_OP_COMPUTE_BLK 1 | 301 | #define STRIPE_OP_COMPUTE_BLK 1 |
299 | #define STRIPE_OP_PREXOR 2 | 302 | #define STRIPE_OP_PREXOR 2 |
300 | #define STRIPE_OP_BIODRAIN 3 | 303 | #define STRIPE_OP_BIODRAIN 3 |
301 | #define STRIPE_OP_POSTXOR 4 | 304 | #define STRIPE_OP_RECONSTRUCT 4 |
302 | #define STRIPE_OP_CHECK 5 | 305 | #define STRIPE_OP_CHECK 5 |
303 | 306 | ||
304 | /* | 307 | /* |
@@ -383,8 +386,21 @@ struct raid5_private_data { | |||
383 | * (fresh device added). | 386 | * (fresh device added). |
384 | * Cleared when a sync completes. | 387 | * Cleared when a sync completes. |
385 | */ | 388 | */ |
386 | 389 | /* per cpu variables */ | |
387 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | 390 | struct raid5_percpu { |
391 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | ||
392 | void *scribble; /* space for constructing buffer | ||
393 | * lists and performing address | ||
394 | * conversions | ||
395 | */ | ||
396 | } *percpu; | ||
397 | size_t scribble_len; /* size of scribble region must be | ||
398 | * associated with conf to handle | ||
399 | * cpu hotplug while reshaping | ||
400 | */ | ||
401 | #ifdef CONFIG_HOTPLUG_CPU | ||
402 | struct notifier_block cpu_notify; | ||
403 | #endif | ||
388 | 404 | ||
389 | /* | 405 | /* |
390 | * Free stripes pool | 406 | * Free stripes pool |