diff options
author | Artur Paszkiewicz <artur.paszkiewicz@intel.com> | 2017-03-09 03:59:59 -0500 |
---|---|---|
committer | Shaohua Li <shli@fb.com> | 2017-03-16 19:55:54 -0400 |
commit | 3418d036c81dcb604b7c7c71b209d5890a8418aa (patch) | |
tree | d02a31103e09f82858bf149ebcb511e12ed6065a /drivers/md/raid5-ppl.c | |
parent | ff875738edd44e3bc892d378deacc50bccc9d70c (diff) |
raid5-ppl: Partial Parity Log write logging implementation
Implement the calculation of partial parity for a stripe and PPL write
logging functionality. The description of PPL is added to the
documentation. More details can be found in the comments in raid5-ppl.c.
Attach a page for holding the partial parity data to stripe_head.
Allocate it only if mddev has the MD_HAS_PPL flag set.
Partial parity is the xor of not modified data chunks of a stripe and is
calculated as follows:
- reconstruct-write case:
xor data from all not updated disks in a stripe
- read-modify-write case:
xor old data and parity from all updated disks in a stripe
Implement it using the async_tx API and integrate into raid_run_ops().
It must be called when we still have access to old data, so do it when
STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is
stored into sh->ppl_page.
Partial parity is not meaningful for full stripe write and is not stored
in the log or used for recovery, so don't attempt to calculate it when
stripe has STRIPE_FULL_WRITE.
Put the PPL metadata structures to md_p.h because userspace tools
(mdadm) will also need to read/write PPL.
Warn about using PPL with enabled disk volatile write-back cache for
now. It can be removed once disk cache flushing before writing PPL is
implemented.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
Diffstat (limited to 'drivers/md/raid5-ppl.c')
-rw-r--r-- | drivers/md/raid5-ppl.c | 703 |
1 files changed, 703 insertions, 0 deletions
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c new file mode 100644 index 000000000000..db5b72b11594 --- /dev/null +++ b/drivers/md/raid5-ppl.c | |||
@@ -0,0 +1,703 @@ | |||
1 | /* | ||
2 | * Partial Parity Log for closing the RAID5 write hole | ||
3 | * Copyright (c) 2017, Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/blkdev.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <linux/crc32c.h> | ||
19 | #include <linux/flex_array.h> | ||
20 | #include <linux/async_tx.h> | ||
21 | #include <linux/raid/md_p.h> | ||
22 | #include "md.h" | ||
23 | #include "raid5.h" | ||
24 | |||
25 | /* | ||
26 | * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for | ||
27 | * partial parity data. The header contains an array of entries | ||
28 | * (struct ppl_header_entry) which describe the logged write requests. | ||
29 | * Partial parity for the entries comes after the header, written in the same | ||
30 | * sequence as the entries: | ||
31 | * | ||
32 | * Header | ||
33 | * entry0 | ||
34 | * ... | ||
35 | * entryN | ||
36 | * PP data | ||
37 | * PP for entry0 | ||
38 | * ... | ||
39 | * PP for entryN | ||
40 | * | ||
41 | * An entry describes one or more consecutive stripe_heads, up to a full | ||
42 | * stripe. The modified raid data chunks form an m-by-n matrix, where m is the | ||
43 | * number of stripe_heads in the entry and n is the number of modified data | ||
44 | * disks. Every stripe_head in the entry must write to the same data disks. | ||
45 | * An example of a valid case described by a single entry (writes to the first | ||
46 | * stripe of a 4 disk array, 16k chunk size): | ||
47 | * | ||
48 | * sh->sector dd0 dd1 dd2 ppl | ||
49 | * +-----+-----+-----+ | ||
50 | * 0 | --- | --- | --- | +----+ | ||
51 | * 8 | -W- | -W- | --- | | pp | data_sector = 8 | ||
52 | * 16 | -W- | -W- | --- | | pp | data_size = 3 * 2 * 4k | ||
53 | * 24 | -W- | -W- | --- | | pp | pp_size = 3 * 4k | ||
54 | * +-----+-----+-----+ +----+ | ||
55 | * | ||
56 | * data_sector is the first raid sector of the modified data, data_size is the | ||
57 | * total size of modified data and pp_size is the size of partial parity for | ||
58 | * this entry. Entries for full stripe writes contain no partial parity | ||
59 | * (pp_size = 0), they only mark the stripes for which parity should be | ||
60 | * recalculated after an unclean shutdown. Every entry holds a checksum of its | ||
61 | * partial parity, the header also has a checksum of the header itself. | ||
62 | * | ||
63 | * A write request is always logged to the PPL instance stored on the parity | ||
64 | * disk of the corresponding stripe. For each member disk there is one ppl_log | ||
65 | * used to handle logging for this disk, independently from others. They are | ||
66 | * grouped in child_logs array in struct ppl_conf, which is assigned to | ||
67 | * r5conf->log_private. | ||
68 | * | ||
69 | * ppl_io_unit represents a full PPL write, header_page contains the ppl_header. | ||
70 | * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head | ||
71 | * can be appended to the last entry if it meets the conditions for a valid | ||
72 | * entry described above, otherwise a new entry is added. Checksums of entries | ||
73 | * are calculated incrementally as stripes containing partial parity are being | ||
74 | * added. ppl_submit_iounit() calculates the checksum of the header and submits | ||
75 | * a bio containing the header page and partial parity pages (sh->ppl_page) for | ||
76 | * all stripes of the io_unit. When the PPL write completes, the stripes | ||
77 | * associated with the io_unit are released and raid5d starts writing their data | ||
78 | * and parity. When all stripes are written, the io_unit is freed and the next | ||
79 | * can be submitted. | ||
80 | * | ||
81 | * An io_unit is used to gather stripes until it is submitted or becomes full | ||
82 | * (if the maximum number of entries or size of PPL is reached). Another io_unit | ||
83 | * can't be submitted until the previous has completed (PPL and stripe | ||
84 | * data+parity is written). The log->io_list tracks all io_units of a log | ||
85 | * (for a single member disk). New io_units are added to the end of the list | ||
86 | * and the first io_unit is submitted, if it is not submitted already. | ||
87 | * The current io_unit accepting new stripes is always at the end of the list. | ||
88 | */ | ||
89 | |||
90 | struct ppl_conf { | ||
91 | struct mddev *mddev; | ||
92 | |||
93 | /* array of child logs, one for each raid disk */ | ||
94 | struct ppl_log *child_logs; | ||
95 | int count; | ||
96 | |||
97 | int block_size; /* the logical block size used for data_sector | ||
98 | * in ppl_header_entry */ | ||
99 | u32 signature; /* raid array identifier */ | ||
100 | atomic64_t seq; /* current log write sequence number */ | ||
101 | |||
102 | struct kmem_cache *io_kc; | ||
103 | mempool_t *io_pool; | ||
104 | struct bio_set *bs; | ||
105 | mempool_t *meta_pool; | ||
106 | }; | ||
107 | |||
108 | struct ppl_log { | ||
109 | struct ppl_conf *ppl_conf; /* shared between all log instances */ | ||
110 | |||
111 | struct md_rdev *rdev; /* array member disk associated with | ||
112 | * this log instance */ | ||
113 | struct mutex io_mutex; | ||
114 | struct ppl_io_unit *current_io; /* current io_unit accepting new data | ||
115 | * always at the end of io_list */ | ||
116 | spinlock_t io_list_lock; | ||
117 | struct list_head io_list; /* all io_units of this log */ | ||
118 | struct list_head no_mem_stripes;/* stripes to retry if failed to | ||
119 | * allocate io_unit */ | ||
120 | }; | ||
121 | |||
122 | #define PPL_IO_INLINE_BVECS 32 | ||
123 | |||
124 | struct ppl_io_unit { | ||
125 | struct ppl_log *log; | ||
126 | |||
127 | struct page *header_page; /* for ppl_header */ | ||
128 | |||
129 | unsigned int entries_count; /* number of entries in ppl_header */ | ||
130 | unsigned int pp_size; /* total size current of partial parity */ | ||
131 | |||
132 | u64 seq; /* sequence number of this log write */ | ||
133 | struct list_head log_sibling; /* log->io_list */ | ||
134 | |||
135 | struct list_head stripe_list; /* stripes added to the io_unit */ | ||
136 | atomic_t pending_stripes; /* how many stripes not written to raid */ | ||
137 | |||
138 | bool submitted; /* true if write to log started */ | ||
139 | |||
140 | /* inline bio and its biovec for submitting the iounit */ | ||
141 | struct bio bio; | ||
142 | struct bio_vec biovec[PPL_IO_INLINE_BVECS]; | ||
143 | }; | ||
144 | |||
145 | struct dma_async_tx_descriptor * | ||
146 | ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, | ||
147 | struct dma_async_tx_descriptor *tx) | ||
148 | { | ||
149 | int disks = sh->disks; | ||
150 | struct page **xor_srcs = flex_array_get(percpu->scribble, 0); | ||
151 | int count = 0, pd_idx = sh->pd_idx, i; | ||
152 | struct async_submit_ctl submit; | ||
153 | |||
154 | pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); | ||
155 | |||
156 | /* | ||
157 | * Partial parity is the XOR of stripe data chunks that are not changed | ||
158 | * during the write request. Depending on available data | ||
159 | * (read-modify-write vs. reconstruct-write case) we calculate it | ||
160 | * differently. | ||
161 | */ | ||
162 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { | ||
163 | /* rmw: xor old data and parity from updated disks */ | ||
164 | for (i = disks; i--;) { | ||
165 | struct r5dev *dev = &sh->dev[i]; | ||
166 | if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx) | ||
167 | xor_srcs[count++] = dev->page; | ||
168 | } | ||
169 | } else if (sh->reconstruct_state == reconstruct_state_drain_run) { | ||
170 | /* rcw: xor data from all not updated disks */ | ||
171 | for (i = disks; i--;) { | ||
172 | struct r5dev *dev = &sh->dev[i]; | ||
173 | if (test_bit(R5_UPTODATE, &dev->flags)) | ||
174 | xor_srcs[count++] = dev->page; | ||
175 | } | ||
176 | } else { | ||
177 | return tx; | ||
178 | } | ||
179 | |||
180 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx, | ||
181 | NULL, sh, flex_array_get(percpu->scribble, 0) | ||
182 | + sizeof(struct page *) * (sh->disks + 2)); | ||
183 | |||
184 | if (count == 1) | ||
185 | tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE, | ||
186 | &submit); | ||
187 | else | ||
188 | tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE, | ||
189 | &submit); | ||
190 | |||
191 | return tx; | ||
192 | } | ||
193 | |||
194 | static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, | ||
195 | struct stripe_head *sh) | ||
196 | { | ||
197 | struct ppl_conf *ppl_conf = log->ppl_conf; | ||
198 | struct ppl_io_unit *io; | ||
199 | struct ppl_header *pplhdr; | ||
200 | |||
201 | io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC); | ||
202 | if (!io) | ||
203 | return NULL; | ||
204 | |||
205 | memset(io, 0, sizeof(*io)); | ||
206 | io->log = log; | ||
207 | INIT_LIST_HEAD(&io->log_sibling); | ||
208 | INIT_LIST_HEAD(&io->stripe_list); | ||
209 | atomic_set(&io->pending_stripes, 0); | ||
210 | bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS); | ||
211 | |||
212 | io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO); | ||
213 | pplhdr = page_address(io->header_page); | ||
214 | clear_page(pplhdr); | ||
215 | memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); | ||
216 | pplhdr->signature = cpu_to_le32(ppl_conf->signature); | ||
217 | |||
218 | io->seq = atomic64_add_return(1, &ppl_conf->seq); | ||
219 | pplhdr->generation = cpu_to_le64(io->seq); | ||
220 | |||
221 | return io; | ||
222 | } | ||
223 | |||
224 | static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) | ||
225 | { | ||
226 | struct ppl_io_unit *io = log->current_io; | ||
227 | struct ppl_header_entry *e = NULL; | ||
228 | struct ppl_header *pplhdr; | ||
229 | int i; | ||
230 | sector_t data_sector = 0; | ||
231 | int data_disks = 0; | ||
232 | unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE; | ||
233 | struct r5conf *conf = sh->raid_conf; | ||
234 | |||
235 | pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector); | ||
236 | |||
237 | /* check if current io_unit is full */ | ||
238 | if (io && (io->pp_size == entry_space || | ||
239 | io->entries_count == PPL_HDR_MAX_ENTRIES)) { | ||
240 | pr_debug("%s: add io_unit blocked by seq: %llu\n", | ||
241 | __func__, io->seq); | ||
242 | io = NULL; | ||
243 | } | ||
244 | |||
245 | /* add a new unit if there is none or the current is full */ | ||
246 | if (!io) { | ||
247 | io = ppl_new_iounit(log, sh); | ||
248 | if (!io) | ||
249 | return -ENOMEM; | ||
250 | spin_lock_irq(&log->io_list_lock); | ||
251 | list_add_tail(&io->log_sibling, &log->io_list); | ||
252 | spin_unlock_irq(&log->io_list_lock); | ||
253 | |||
254 | log->current_io = io; | ||
255 | } | ||
256 | |||
257 | for (i = 0; i < sh->disks; i++) { | ||
258 | struct r5dev *dev = &sh->dev[i]; | ||
259 | |||
260 | if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) { | ||
261 | if (!data_disks || dev->sector < data_sector) | ||
262 | data_sector = dev->sector; | ||
263 | data_disks++; | ||
264 | } | ||
265 | } | ||
266 | BUG_ON(!data_disks); | ||
267 | |||
268 | pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__, | ||
269 | io->seq, (unsigned long long)data_sector, data_disks); | ||
270 | |||
271 | pplhdr = page_address(io->header_page); | ||
272 | |||
273 | if (io->entries_count > 0) { | ||
274 | struct ppl_header_entry *last = | ||
275 | &pplhdr->entries[io->entries_count - 1]; | ||
276 | struct stripe_head *sh_last = list_last_entry( | ||
277 | &io->stripe_list, struct stripe_head, log_list); | ||
278 | u64 data_sector_last = le64_to_cpu(last->data_sector); | ||
279 | u32 data_size_last = le32_to_cpu(last->data_size); | ||
280 | |||
281 | /* | ||
282 | * Check if we can append the stripe to the last entry. It must | ||
283 | * be just after the last logged stripe and write to the same | ||
284 | * disks. Use bit shift and logarithm to avoid 64-bit division. | ||
285 | */ | ||
286 | if ((sh->sector == sh_last->sector + STRIPE_SECTORS) && | ||
287 | (data_sector >> ilog2(conf->chunk_sectors) == | ||
288 | data_sector_last >> ilog2(conf->chunk_sectors)) && | ||
289 | ((data_sector - data_sector_last) * data_disks == | ||
290 | data_size_last >> 9)) | ||
291 | e = last; | ||
292 | } | ||
293 | |||
294 | if (!e) { | ||
295 | e = &pplhdr->entries[io->entries_count++]; | ||
296 | e->data_sector = cpu_to_le64(data_sector); | ||
297 | e->parity_disk = cpu_to_le32(sh->pd_idx); | ||
298 | e->checksum = cpu_to_le32(~0); | ||
299 | } | ||
300 | |||
301 | le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT); | ||
302 | |||
303 | /* don't write any PP if full stripe write */ | ||
304 | if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) { | ||
305 | le32_add_cpu(&e->pp_size, PAGE_SIZE); | ||
306 | io->pp_size += PAGE_SIZE; | ||
307 | e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum), | ||
308 | page_address(sh->ppl_page), | ||
309 | PAGE_SIZE)); | ||
310 | } | ||
311 | |||
312 | list_add_tail(&sh->log_list, &io->stripe_list); | ||
313 | atomic_inc(&io->pending_stripes); | ||
314 | sh->ppl_io = io; | ||
315 | |||
316 | return 0; | ||
317 | } | ||
318 | |||
319 | int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh) | ||
320 | { | ||
321 | struct ppl_conf *ppl_conf = conf->log_private; | ||
322 | struct ppl_io_unit *io = sh->ppl_io; | ||
323 | struct ppl_log *log; | ||
324 | |||
325 | if (io || test_bit(STRIPE_SYNCING, &sh->state) || | ||
326 | !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || | ||
327 | !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) { | ||
328 | clear_bit(STRIPE_LOG_TRAPPED, &sh->state); | ||
329 | return -EAGAIN; | ||
330 | } | ||
331 | |||
332 | log = &ppl_conf->child_logs[sh->pd_idx]; | ||
333 | |||
334 | mutex_lock(&log->io_mutex); | ||
335 | |||
336 | if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) { | ||
337 | mutex_unlock(&log->io_mutex); | ||
338 | return -EAGAIN; | ||
339 | } | ||
340 | |||
341 | set_bit(STRIPE_LOG_TRAPPED, &sh->state); | ||
342 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
343 | atomic_inc(&sh->count); | ||
344 | |||
345 | if (ppl_log_stripe(log, sh)) { | ||
346 | spin_lock_irq(&log->io_list_lock); | ||
347 | list_add_tail(&sh->log_list, &log->no_mem_stripes); | ||
348 | spin_unlock_irq(&log->io_list_lock); | ||
349 | } | ||
350 | |||
351 | mutex_unlock(&log->io_mutex); | ||
352 | |||
353 | return 0; | ||
354 | } | ||
355 | |||
356 | static void ppl_log_endio(struct bio *bio) | ||
357 | { | ||
358 | struct ppl_io_unit *io = bio->bi_private; | ||
359 | struct ppl_log *log = io->log; | ||
360 | struct ppl_conf *ppl_conf = log->ppl_conf; | ||
361 | struct stripe_head *sh, *next; | ||
362 | |||
363 | pr_debug("%s: seq: %llu\n", __func__, io->seq); | ||
364 | |||
365 | if (bio->bi_error) | ||
366 | md_error(ppl_conf->mddev, log->rdev); | ||
367 | |||
368 | mempool_free(io->header_page, ppl_conf->meta_pool); | ||
369 | |||
370 | list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { | ||
371 | list_del_init(&sh->log_list); | ||
372 | |||
373 | set_bit(STRIPE_HANDLE, &sh->state); | ||
374 | raid5_release_stripe(sh); | ||
375 | } | ||
376 | } | ||
377 | |||
378 | static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio) | ||
379 | { | ||
380 | char b[BDEVNAME_SIZE]; | ||
381 | |||
382 | pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n", | ||
383 | __func__, io->seq, bio->bi_iter.bi_size, | ||
384 | (unsigned long long)bio->bi_iter.bi_sector, | ||
385 | bdevname(bio->bi_bdev, b)); | ||
386 | |||
387 | submit_bio(bio); | ||
388 | } | ||
389 | |||
390 | static void ppl_submit_iounit(struct ppl_io_unit *io) | ||
391 | { | ||
392 | struct ppl_log *log = io->log; | ||
393 | struct ppl_conf *ppl_conf = log->ppl_conf; | ||
394 | struct ppl_header *pplhdr = page_address(io->header_page); | ||
395 | struct bio *bio = &io->bio; | ||
396 | struct stripe_head *sh; | ||
397 | int i; | ||
398 | |||
399 | for (i = 0; i < io->entries_count; i++) { | ||
400 | struct ppl_header_entry *e = &pplhdr->entries[i]; | ||
401 | |||
402 | pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n", | ||
403 | __func__, io->seq, i, le64_to_cpu(e->data_sector), | ||
404 | le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size)); | ||
405 | |||
406 | e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >> | ||
407 | ilog2(ppl_conf->block_size >> 9)); | ||
408 | e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum)); | ||
409 | } | ||
410 | |||
411 | pplhdr->entries_count = cpu_to_le32(io->entries_count); | ||
412 | pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE)); | ||
413 | |||
414 | bio->bi_private = io; | ||
415 | bio->bi_end_io = ppl_log_endio; | ||
416 | bio->bi_opf = REQ_OP_WRITE | REQ_FUA; | ||
417 | bio->bi_bdev = log->rdev->bdev; | ||
418 | bio->bi_iter.bi_sector = log->rdev->ppl.sector; | ||
419 | bio_add_page(bio, io->header_page, PAGE_SIZE, 0); | ||
420 | |||
421 | list_for_each_entry(sh, &io->stripe_list, log_list) { | ||
422 | /* entries for full stripe writes have no partial parity */ | ||
423 | if (test_bit(STRIPE_FULL_WRITE, &sh->state)) | ||
424 | continue; | ||
425 | |||
426 | if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) { | ||
427 | struct bio *prev = bio; | ||
428 | |||
429 | bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, | ||
430 | ppl_conf->bs); | ||
431 | bio->bi_opf = prev->bi_opf; | ||
432 | bio->bi_bdev = prev->bi_bdev; | ||
433 | bio->bi_iter.bi_sector = bio_end_sector(prev); | ||
434 | bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); | ||
435 | |||
436 | bio_chain(bio, prev); | ||
437 | ppl_submit_iounit_bio(io, prev); | ||
438 | } | ||
439 | } | ||
440 | |||
441 | ppl_submit_iounit_bio(io, bio); | ||
442 | } | ||
443 | |||
444 | static void ppl_submit_current_io(struct ppl_log *log) | ||
445 | { | ||
446 | struct ppl_io_unit *io; | ||
447 | |||
448 | spin_lock_irq(&log->io_list_lock); | ||
449 | |||
450 | io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit, | ||
451 | log_sibling); | ||
452 | if (io && io->submitted) | ||
453 | io = NULL; | ||
454 | |||
455 | spin_unlock_irq(&log->io_list_lock); | ||
456 | |||
457 | if (io) { | ||
458 | io->submitted = true; | ||
459 | |||
460 | if (io == log->current_io) | ||
461 | log->current_io = NULL; | ||
462 | |||
463 | ppl_submit_iounit(io); | ||
464 | } | ||
465 | } | ||
466 | |||
467 | void ppl_write_stripe_run(struct r5conf *conf) | ||
468 | { | ||
469 | struct ppl_conf *ppl_conf = conf->log_private; | ||
470 | struct ppl_log *log; | ||
471 | int i; | ||
472 | |||
473 | for (i = 0; i < ppl_conf->count; i++) { | ||
474 | log = &ppl_conf->child_logs[i]; | ||
475 | |||
476 | mutex_lock(&log->io_mutex); | ||
477 | ppl_submit_current_io(log); | ||
478 | mutex_unlock(&log->io_mutex); | ||
479 | } | ||
480 | } | ||
481 | |||
482 | static void ppl_io_unit_finished(struct ppl_io_unit *io) | ||
483 | { | ||
484 | struct ppl_log *log = io->log; | ||
485 | unsigned long flags; | ||
486 | |||
487 | pr_debug("%s: seq: %llu\n", __func__, io->seq); | ||
488 | |||
489 | spin_lock_irqsave(&log->io_list_lock, flags); | ||
490 | |||
491 | list_del(&io->log_sibling); | ||
492 | mempool_free(io, log->ppl_conf->io_pool); | ||
493 | |||
494 | if (!list_empty(&log->no_mem_stripes)) { | ||
495 | struct stripe_head *sh = list_first_entry(&log->no_mem_stripes, | ||
496 | struct stripe_head, | ||
497 | log_list); | ||
498 | list_del_init(&sh->log_list); | ||
499 | set_bit(STRIPE_HANDLE, &sh->state); | ||
500 | raid5_release_stripe(sh); | ||
501 | } | ||
502 | |||
503 | spin_unlock_irqrestore(&log->io_list_lock, flags); | ||
504 | } | ||
505 | |||
506 | void ppl_stripe_write_finished(struct stripe_head *sh) | ||
507 | { | ||
508 | struct ppl_io_unit *io; | ||
509 | |||
510 | io = sh->ppl_io; | ||
511 | sh->ppl_io = NULL; | ||
512 | |||
513 | if (io && atomic_dec_and_test(&io->pending_stripes)) | ||
514 | ppl_io_unit_finished(io); | ||
515 | } | ||
516 | |||
517 | static void __ppl_exit_log(struct ppl_conf *ppl_conf) | ||
518 | { | ||
519 | clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); | ||
520 | |||
521 | kfree(ppl_conf->child_logs); | ||
522 | |||
523 | mempool_destroy(ppl_conf->meta_pool); | ||
524 | if (ppl_conf->bs) | ||
525 | bioset_free(ppl_conf->bs); | ||
526 | mempool_destroy(ppl_conf->io_pool); | ||
527 | kmem_cache_destroy(ppl_conf->io_kc); | ||
528 | |||
529 | kfree(ppl_conf); | ||
530 | } | ||
531 | |||
532 | void ppl_exit_log(struct r5conf *conf) | ||
533 | { | ||
534 | struct ppl_conf *ppl_conf = conf->log_private; | ||
535 | |||
536 | if (ppl_conf) { | ||
537 | __ppl_exit_log(ppl_conf); | ||
538 | conf->log_private = NULL; | ||
539 | } | ||
540 | } | ||
541 | |||
542 | static int ppl_validate_rdev(struct md_rdev *rdev) | ||
543 | { | ||
544 | char b[BDEVNAME_SIZE]; | ||
545 | int ppl_data_sectors; | ||
546 | int ppl_size_new; | ||
547 | |||
548 | /* | ||
549 | * The configured PPL size must be enough to store | ||
550 | * the header and (at the very least) partial parity | ||
551 | * for one stripe. Round it down to ensure the data | ||
552 | * space is cleanly divisible by stripe size. | ||
553 | */ | ||
554 | ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9); | ||
555 | |||
556 | if (ppl_data_sectors > 0) | ||
557 | ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS); | ||
558 | |||
559 | if (ppl_data_sectors <= 0) { | ||
560 | pr_warn("md/raid:%s: PPL space too small on %s\n", | ||
561 | mdname(rdev->mddev), bdevname(rdev->bdev, b)); | ||
562 | return -ENOSPC; | ||
563 | } | ||
564 | |||
565 | ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9); | ||
566 | |||
567 | if ((rdev->ppl.sector < rdev->data_offset && | ||
568 | rdev->ppl.sector + ppl_size_new > rdev->data_offset) || | ||
569 | (rdev->ppl.sector >= rdev->data_offset && | ||
570 | rdev->data_offset + rdev->sectors > rdev->ppl.sector)) { | ||
571 | pr_warn("md/raid:%s: PPL space overlaps with data on %s\n", | ||
572 | mdname(rdev->mddev), bdevname(rdev->bdev, b)); | ||
573 | return -EINVAL; | ||
574 | } | ||
575 | |||
576 | if (!rdev->mddev->external && | ||
577 | ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) || | ||
578 | (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) { | ||
579 | pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n", | ||
580 | mdname(rdev->mddev), bdevname(rdev->bdev, b)); | ||
581 | return -EINVAL; | ||
582 | } | ||
583 | |||
584 | rdev->ppl.size = ppl_size_new; | ||
585 | |||
586 | return 0; | ||
587 | } | ||
588 | |||
589 | int ppl_init_log(struct r5conf *conf) | ||
590 | { | ||
591 | struct ppl_conf *ppl_conf; | ||
592 | struct mddev *mddev = conf->mddev; | ||
593 | int ret = 0; | ||
594 | int i; | ||
595 | bool need_cache_flush; | ||
596 | |||
597 | pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n", | ||
598 | mdname(conf->mddev)); | ||
599 | |||
600 | if (PAGE_SIZE != 4096) | ||
601 | return -EINVAL; | ||
602 | |||
603 | if (mddev->level != 5) { | ||
604 | pr_warn("md/raid:%s PPL is not compatible with raid level %d\n", | ||
605 | mdname(mddev), mddev->level); | ||
606 | return -EINVAL; | ||
607 | } | ||
608 | |||
609 | if (mddev->bitmap_info.file || mddev->bitmap_info.offset) { | ||
610 | pr_warn("md/raid:%s PPL is not compatible with bitmap\n", | ||
611 | mdname(mddev)); | ||
612 | return -EINVAL; | ||
613 | } | ||
614 | |||
615 | if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { | ||
616 | pr_warn("md/raid:%s PPL is not compatible with journal\n", | ||
617 | mdname(mddev)); | ||
618 | return -EINVAL; | ||
619 | } | ||
620 | |||
621 | ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL); | ||
622 | if (!ppl_conf) | ||
623 | return -ENOMEM; | ||
624 | |||
625 | ppl_conf->mddev = mddev; | ||
626 | |||
627 | ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0); | ||
628 | if (!ppl_conf->io_kc) { | ||
629 | ret = -EINVAL; | ||
630 | goto err; | ||
631 | } | ||
632 | |||
633 | ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc); | ||
634 | if (!ppl_conf->io_pool) { | ||
635 | ret = -EINVAL; | ||
636 | goto err; | ||
637 | } | ||
638 | |||
639 | ppl_conf->bs = bioset_create(conf->raid_disks, 0); | ||
640 | if (!ppl_conf->bs) { | ||
641 | ret = -EINVAL; | ||
642 | goto err; | ||
643 | } | ||
644 | |||
645 | ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0); | ||
646 | if (!ppl_conf->meta_pool) { | ||
647 | ret = -EINVAL; | ||
648 | goto err; | ||
649 | } | ||
650 | |||
651 | ppl_conf->count = conf->raid_disks; | ||
652 | ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log), | ||
653 | GFP_KERNEL); | ||
654 | if (!ppl_conf->child_logs) { | ||
655 | ret = -ENOMEM; | ||
656 | goto err; | ||
657 | } | ||
658 | |||
659 | atomic64_set(&ppl_conf->seq, 0); | ||
660 | |||
661 | if (!mddev->external) { | ||
662 | ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); | ||
663 | ppl_conf->block_size = 512; | ||
664 | } else { | ||
665 | ppl_conf->block_size = queue_logical_block_size(mddev->queue); | ||
666 | } | ||
667 | |||
668 | for (i = 0; i < ppl_conf->count; i++) { | ||
669 | struct ppl_log *log = &ppl_conf->child_logs[i]; | ||
670 | struct md_rdev *rdev = conf->disks[i].rdev; | ||
671 | |||
672 | mutex_init(&log->io_mutex); | ||
673 | spin_lock_init(&log->io_list_lock); | ||
674 | INIT_LIST_HEAD(&log->io_list); | ||
675 | INIT_LIST_HEAD(&log->no_mem_stripes); | ||
676 | |||
677 | log->ppl_conf = ppl_conf; | ||
678 | log->rdev = rdev; | ||
679 | |||
680 | if (rdev) { | ||
681 | struct request_queue *q; | ||
682 | |||
683 | ret = ppl_validate_rdev(rdev); | ||
684 | if (ret) | ||
685 | goto err; | ||
686 | |||
687 | q = bdev_get_queue(rdev->bdev); | ||
688 | if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) | ||
689 | need_cache_flush = true; | ||
690 | } | ||
691 | } | ||
692 | |||
693 | if (need_cache_flush) | ||
694 | pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n", | ||
695 | mdname(mddev)); | ||
696 | |||
697 | conf->log_private = ppl_conf; | ||
698 | |||
699 | return 0; | ||
700 | err: | ||
701 | __ppl_exit_log(ppl_conf); | ||
702 | return ret; | ||
703 | } | ||