author     David Woodhouse <David.Woodhouse@intel.com>   2013-01-29 18:40:14 -0500
committer  Chris Mason <chris.mason@fusionio.com>        2013-02-01 14:24:23 -0500
commit     53b381b3abeb86f12787a6c40fee9b2f71edc23b (patch)
tree       c1018ba2157778f0200d2ede0c0df48fe5df8f14 /fs/btrfs/raid56.c
parent     64a167011bcabc1e855658387c8a4464b71f3138 (diff)
Btrfs: RAID5 and RAID6
This builds on David Woodhouse's original Btrfs raid5/6 implementation.
The code has changed quite a bit, blame Chris Mason for any bugs.

Read/modify/write is done after the higher levels of the filesystem have
prepared a given bio. This means the higher layers are not responsible
for building full stripes, and they don't need to query for the topology
of the extents that may get allocated during delayed allocation runs. It
also means different files can easily share the same stripe.

But, it does expose us to incorrect parity if we crash or lose power
while doing a read/modify/write cycle. This will be addressed in a
later commit.

Scrub is unable to repair crc errors on raid5/6 chunks.
Discard does not work on raid5/6 (yet)

The stripe size is fixed at 64KiB per disk. This will be tunable in a
later commit.

Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Diffstat (limited to 'fs/btrfs/raid56.c')
-rw-r--r--  fs/btrfs/raid56.c | 1647
1 file changed, 1647 insertions, 0 deletions
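
The patch computes parity with the kernel's xor_blocks() helper (wrapped in run_xor()) and the raid6 library's gen_syndrome(). As a rough illustration of the raid5 half of that only — this is not code from the patch; the block size, buffer names and 3-data-disk layout are made up for the example — the small userspace sketch below computes a parity block by xor and then rebuilds a "lost" data block from the parity plus the survivors, which is the relationship finish_rmw() and the pstripe recovery path below rely on.

/* Illustration only -- not part of the patch.  Shows the raid5 xor
 * relationship: P = D0 ^ D1 ^ ... ^ Dn-1, so any single Di can be
 * rebuilt by xoring P with the surviving data blocks.
 */
#include <stdio.h>
#include <string.h>

#define BLOCK	4096	/* one page worth of data (assumed) */
#define NR_DATA	3	/* hypothetical 3 data stripes + P */

static void xor_into(unsigned char *dest, const unsigned char *src)
{
	int i;

	for (i = 0; i < BLOCK; i++)
		dest[i] ^= src[i];
}

int main(void)
{
	unsigned char data[NR_DATA][BLOCK], parity[BLOCK], rebuilt[BLOCK];
	int s;

	for (s = 0; s < NR_DATA; s++)
		memset(data[s], 'a' + s, BLOCK);

	/* compute P, as finish_rmw() does with memcpy() + run_xor() */
	memcpy(parity, data[0], BLOCK);
	for (s = 1; s < NR_DATA; s++)
		xor_into(parity, data[s]);

	/* pretend data stripe 1 failed; rebuild it from P + survivors */
	memcpy(rebuilt, parity, BLOCK);
	for (s = 0; s < NR_DATA; s++)
		if (s != 1)
			xor_into(rebuilt, data[s]);

	printf("rebuilt matches: %d\n", !memcmp(rebuilt, data[1], BLOCK));
	return 0;
}
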
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 000000000000..d02510f34936
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,1647 @@
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19#include <linux/sched.h>
20#include <linux/wait.h>
21#include <linux/bio.h>
22#include <linux/slab.h>
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/random.h>
26#include <linux/iocontext.h>
27#include <linux/capability.h>
28#include <linux/ratelimit.h>
29#include <linux/kthread.h>
30#include <linux/raid/pq.h>
31#include <linux/hash.h>
32#include <linux/list_sort.h>
33#include <linux/raid/xor.h>
34#include <asm/div64.h>
35#include "compat.h"
36#include "ctree.h"
37#include "extent_map.h"
38#include "disk-io.h"
39#include "transaction.h"
40#include "print-tree.h"
41#include "volumes.h"
42#include "raid56.h"
43#include "async-thread.h"
44#include "check-integrity.h"
45#include "rcu-string.h"
46
47/* set when additional merges to this rbio are not allowed */
48#define RBIO_RMW_LOCKED_BIT 1
49
50struct btrfs_raid_bio {
51 struct btrfs_fs_info *fs_info;
52 struct btrfs_bio *bbio;
53
54 /*
55 * logical block numbers for the start of each stripe
56 * The last one or two are p/q. These are sorted,
57 * so raid_map[0] is the start of our full stripe
58 */
59 u64 *raid_map;
60
61 /* while we're doing rmw on a stripe
62 * we put it into a hash table so we can
63 * lock the stripe and merge more rbios
64 * into it.
65 */
66 struct list_head hash_list;
67
68 /*
69 * for scheduling work in the helper threads
70 */
71 struct btrfs_work work;
72
73 /*
74 * bio list and bio_list_lock are used
75 * to add more bios into the stripe
76 * in hopes of avoiding the full rmw
77 */
78 struct bio_list bio_list;
79 spinlock_t bio_list_lock;
80
81 /*
82 * also protected by the bio_list_lock, the
83 * stripe locking code uses plug_list to hand off
84 * the stripe lock to the next pending IO
85 */
86 struct list_head plug_list;
87
88 /*
89 * flags that tell us if it is safe to
90 * merge with this bio
91 */
92 unsigned long flags;
93
94 /* size of each individual stripe on disk */
95 int stripe_len;
96
97 /* number of data stripes (no p/q) */
98 int nr_data;
99
100 /*
101 * set if we're doing a parity rebuild
102 * for a read from higher up, which is handled
103 * differently from a parity rebuild as part of
104 * rmw
105 */
106 int read_rebuild;
107
108 /* first bad stripe */
109 int faila;
110
111 /* second bad stripe (for raid6 use) */
112 int failb;
113
114 /*
115 * number of pages needed to represent the full
116 * stripe
117 */
118 int nr_pages;
119
120 /*
121 * size of all the bios in the bio_list. This
122 * helps us decide if the rbio maps to a full
123 * stripe or not
124 */
125 int bio_list_bytes;
126
127 atomic_t refs;
128
129 /*
130 * these are two arrays of pointers. We allocate the
131 * rbio big enough to hold them both and setup their
132 * locations when the rbio is allocated
133 */
134
135 /* pointers to pages that we allocated for
136 * reading/writing stripes directly from the disk (including P/Q)
137 */
138 struct page **stripe_pages;
139
140 /*
141 * pointers to the pages in the bio_list. Stored
142 * here for faster lookup
143 */
144 struct page **bio_pages;
145};
146
147static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
148static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
149static void rmw_work(struct btrfs_work *work);
150static void read_rebuild_work(struct btrfs_work *work);
151static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
152static void async_read_rebuild(struct btrfs_raid_bio *rbio);
153static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
154static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
155static void __free_raid_bio(struct btrfs_raid_bio *rbio);
156static void index_rbio_pages(struct btrfs_raid_bio *rbio);
157static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
158
159/*
160 * the stripe hash table is used for locking, and to collect
161 * bios in hopes of making a full stripe
162 */
163int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
164{
165 struct btrfs_stripe_hash_table *table;
166 struct btrfs_stripe_hash_table *x;
167 struct btrfs_stripe_hash *cur;
168 struct btrfs_stripe_hash *h;
169 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
170 int i;
171
172 if (info->stripe_hash_table)
173 return 0;
174
175 table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
176 if (!table)
177 return -ENOMEM;
178
179 table->table = (void *)(table + 1);
180 h = table->table;
181
182 for (i = 0; i < num_entries; i++) {
183 cur = h + i;
184 INIT_LIST_HEAD(&cur->hash_list);
185 spin_lock_init(&cur->lock);
186 init_waitqueue_head(&cur->wait);
187 }
188
189 x = cmpxchg(&info->stripe_hash_table, NULL, table);
190 if (x)
191 kfree(x);
192 return 0;
193}
194
195/*
196 * we hash on the first logical address of the stripe
197 */
198static int rbio_bucket(struct btrfs_raid_bio *rbio)
199{
200 u64 num = rbio->raid_map[0];
201
202 /*
203 * we shift down quite a bit. We're using byte
204 * addressing, and most of the lower bits are zeros.
205 * This tends to upset hash_64, and it consistently
206 * returns just one or two different values.
207 *
208 * shifting off the lower bits fixes things.
209 */
210 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
211}
212
213/*
214 * merging means we take the bio_list from the victim and
215 * splice it into the destination. The victim should
216 * be discarded afterwards.
217 *
218 * must be called with dest->bio_list_lock held
219 */
220static void merge_rbio(struct btrfs_raid_bio *dest,
221 struct btrfs_raid_bio *victim)
222{
223 bio_list_merge(&dest->bio_list, &victim->bio_list);
224 dest->bio_list_bytes += victim->bio_list_bytes;
225 bio_list_init(&victim->bio_list);
226}
227
228/*
229 * free the hash table, used at unmount
230 */
231void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
232{
233 if (!info->stripe_hash_table)
234 return;
235 kfree(info->stripe_hash_table);
236 info->stripe_hash_table = NULL;
237}
238
239/*
240 * helper function to run the xor_blocks api. It is only
241 * able to do MAX_XOR_BLOCKS at a time, so we need to
242 * loop through.
243 */
244static void run_xor(void **pages, int src_cnt, ssize_t len)
245{
246 int src_off = 0;
247 int xor_src_cnt = 0;
248 void *dest = pages[src_cnt];
249
250 while (src_cnt > 0) {
251 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
252 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
253
254 src_cnt -= xor_src_cnt;
255 src_off += xor_src_cnt;
256 }
257}
258
259/*
260 * returns true if the bio list inside this rbio
261 * covers an entire stripe (no rmw required).
262 * Must be called with the bio list lock held, or
263 * at a time when you know it is impossible to add
264 * new bios into the list
265 */
266static int __rbio_is_full(struct btrfs_raid_bio *rbio)
267{
268 unsigned long size = rbio->bio_list_bytes;
269 int ret = 1;
270
271 if (size != rbio->nr_data * rbio->stripe_len)
272 ret = 0;
273
274 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
275 return ret;
276}
277
278static int rbio_is_full(struct btrfs_raid_bio *rbio)
279{
280 unsigned long flags;
281 int ret;
282
283 spin_lock_irqsave(&rbio->bio_list_lock, flags);
284 ret = __rbio_is_full(rbio);
285 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
286 return ret;
287}
288
289/*
290 * returns 1 if it is safe to merge two rbios together.
291 * The merging is safe if the two rbios correspond to
292 * the same stripe and if they are both going in the same
293 * direction (read vs write), and if neither one is
294 * locked for final IO
295 *
296 * The caller is responsible for locking such that
297 * rmw_locked is safe to test
298 */
299static int rbio_can_merge(struct btrfs_raid_bio *last,
300 struct btrfs_raid_bio *cur)
301{
302 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
303 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
304 return 0;
305
306 if (last->raid_map[0] !=
307 cur->raid_map[0])
308 return 0;
309
310 /* reads can't merge with writes */
311 if (last->read_rebuild !=
312 cur->read_rebuild) {
313 return 0;
314 }
315
316 return 1;
317}
318
319/*
320 * helper to index into the pstripe
321 */
322static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
323{
324 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
325 return rbio->stripe_pages[index];
326}
327
328/*
329 * helper to index into the qstripe, returns null
330 * if there is no qstripe
331 */
332static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
333{
334 if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
335 return NULL;
336
337 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
338 PAGE_CACHE_SHIFT;
339 return rbio->stripe_pages[index];
340}
341
342/*
343 * The first stripe in the table for a logical address
344 * has the lock. rbios are added in one of three ways:
345 *
346 * 1) Nobody has the stripe locked yet. The rbio is given
347 * the lock and 0 is returned. The caller must start the IO
348 * themselves.
349 *
350 * 2) Someone has the stripe locked, but we're able to merge
351 * with the lock owner. The rbio is freed and the IO will
352 * start automatically along with the existing rbio. 1 is returned.
353 *
354 * 3) Someone has the stripe locked, but we're not able to merge.
355 * The rbio is added to the lock owner's plug list, or merged into
356 * an rbio already on the plug list. When the lock owner unlocks,
357 * the next rbio on the list is run and the IO is started automatically.
358 * 1 is returned
359 *
360 * If we return 0, the caller still owns the rbio and must continue with
361 * IO submission. If we return 1, the caller must assume the rbio has
362 * already been freed.
363 */
364static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
365{
366 int bucket = rbio_bucket(rbio);
367 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
368 struct btrfs_raid_bio *cur;
369 struct btrfs_raid_bio *pending;
370 unsigned long flags;
371 DEFINE_WAIT(wait);
372 struct btrfs_raid_bio *freeit = NULL;
373 int ret = 0;
374 int walk = 0;
375
376 spin_lock_irqsave(&h->lock, flags);
377 list_for_each_entry(cur, &h->hash_list, hash_list) {
378 walk++;
379 if (cur->raid_map[0] == rbio->raid_map[0]) {
380 spin_lock(&cur->bio_list_lock);
381
382 /* can we merge into the lock owner? */
383 if (rbio_can_merge(cur, rbio)) {
384 merge_rbio(cur, rbio);
385 spin_unlock(&cur->bio_list_lock);
386 freeit = rbio;
387 ret = 1;
388 goto out;
389 }
390
391 /*
392 * we couldn't merge with the running
393 * rbio, see if we can merge with the
394 * pending ones. We don't have to
395 * check for rmw_locked because there
396 * is no way they are inside finish_rmw
397 * right now
398 */
399 list_for_each_entry(pending, &cur->plug_list,
400 plug_list) {
401 if (rbio_can_merge(pending, rbio)) {
402 merge_rbio(pending, rbio);
403 spin_unlock(&cur->bio_list_lock);
404 freeit = rbio;
405 ret = 1;
406 goto out;
407 }
408 }
409
410 /* no merging, put us on the tail of the plug list,
411 * our rbio will be started when the currently
412 * running rbio unlocks
413 */
414 list_add_tail(&rbio->plug_list, &cur->plug_list);
415 spin_unlock(&cur->bio_list_lock);
416 ret = 1;
417 goto out;
418 }
419 }
420
421 atomic_inc(&rbio->refs);
422 list_add(&rbio->hash_list, &h->hash_list);
423out:
424 spin_unlock_irqrestore(&h->lock, flags);
425 if (freeit)
426 __free_raid_bio(freeit);
427 return ret;
428}
429
430/*
431 * called as rmw or parity rebuild is completed. If the plug list has more
432 * rbios waiting for this stripe, the next one on the list will be started
433 */
434static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
435{
436 int bucket;
437 struct btrfs_stripe_hash *h;
438 unsigned long flags;
439
440 bucket = rbio_bucket(rbio);
441 h = rbio->fs_info->stripe_hash_table->table + bucket;
442
443 spin_lock_irqsave(&h->lock, flags);
444 spin_lock(&rbio->bio_list_lock);
445
446 if (!list_empty(&rbio->hash_list)) {
447
448 list_del_init(&rbio->hash_list);
449 atomic_dec(&rbio->refs);
450
451 /*
452 * we use the plug list to hold all the rbios
453 * waiting for the chance to lock this stripe.
454 * hand the lock over to one of them.
455 */
456 if (!list_empty(&rbio->plug_list)) {
457 struct btrfs_raid_bio *next;
458 struct list_head *head = rbio->plug_list.next;
459
460 next = list_entry(head, struct btrfs_raid_bio,
461 plug_list);
462
463 list_del_init(&rbio->plug_list);
464
465 list_add(&next->hash_list, &h->hash_list);
466 atomic_inc(&next->refs);
467 spin_unlock(&rbio->bio_list_lock);
468 spin_unlock_irqrestore(&h->lock, flags);
469
470 if (next->read_rebuild)
471 async_read_rebuild(next);
472 else
473 async_rmw_stripe(next);
474
475 goto done_nolock;
476
477 } else if (waitqueue_active(&h->wait)) {
478 spin_unlock(&rbio->bio_list_lock);
479 spin_unlock_irqrestore(&h->lock, flags);
480 wake_up(&h->wait);
481 goto done_nolock;
482 }
483 }
484 spin_unlock(&rbio->bio_list_lock);
485 spin_unlock_irqrestore(&h->lock, flags);
486
487done_nolock:
488 return;
489}
490
491static void __free_raid_bio(struct btrfs_raid_bio *rbio)
492{
493 int i;
494
495 WARN_ON(atomic_read(&rbio->refs) < 0);
496 if (!atomic_dec_and_test(&rbio->refs))
497 return;
498
499 WARN_ON(!list_empty(&rbio->hash_list));
500 WARN_ON(!bio_list_empty(&rbio->bio_list));
501
502 for (i = 0; i < rbio->nr_pages; i++) {
503 if (rbio->stripe_pages[i]) {
504 __free_page(rbio->stripe_pages[i]);
505 rbio->stripe_pages[i] = NULL;
506 }
507 }
508 kfree(rbio->raid_map);
509 kfree(rbio->bbio);
510 kfree(rbio);
511}
512
513static void free_raid_bio(struct btrfs_raid_bio *rbio)
514{
515 unlock_stripe(rbio);
516 __free_raid_bio(rbio);
517}
518
519/*
520 * this frees the rbio and runs through all the bios in the
521 * bio_list and calls end_io on them
522 */
523static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
524{
525 struct bio *cur = bio_list_get(&rbio->bio_list);
526 struct bio *next;
527 free_raid_bio(rbio);
528
529 while (cur) {
530 next = cur->bi_next;
531 cur->bi_next = NULL;
532 if (uptodate)
533 set_bit(BIO_UPTODATE, &cur->bi_flags);
534 bio_endio(cur, err);
535 cur = next;
536 }
537}
538
539/*
540 * end io function used by finish_rmw. When we finally
541 * get here, we've written a full stripe
542 */
543static void raid_write_end_io(struct bio *bio, int err)
544{
545 struct btrfs_raid_bio *rbio = bio->bi_private;
546
547 if (err)
548 fail_bio_stripe(rbio, bio);
549
550 bio_put(bio);
551
552 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
553 return;
554
555 err = 0;
556
557 /* OK, we have written all the stripes we need to. */
558 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
559 err = -EIO;
560
561 rbio_orig_end_io(rbio, err, 0);
562 return;
563}
564
565/*
566 * the read/modify/write code wants to use the original bio for
567 * any pages it included, and then use the rbio for everything
568 * else. This function decides if a given index (stripe number)
569 * and page number in that stripe fall inside the original bio
570 * or the rbio.
571 *
572 * if you set bio_list_only, you'll get a NULL back for any ranges
573 * that are outside the bio_list
574 *
575 * This doesn't take any refs on anything, you get a bare page pointer
576 * and the caller must bump refs as required.
577 *
578 * You must call index_rbio_pages once before you can trust
579 * the answers from this function.
580 */
581static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
582 int index, int pagenr, int bio_list_only)
583{
584 int chunk_page;
585 struct page *p = NULL;
586
587 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
588
589 spin_lock_irq(&rbio->bio_list_lock);
590 p = rbio->bio_pages[chunk_page];
591 spin_unlock_irq(&rbio->bio_list_lock);
592
593 if (p || bio_list_only)
594 return p;
595
596 return rbio->stripe_pages[chunk_page];
597}
598
599/*
600 * number of pages we need for the entire stripe across all the
601 * drives
602 */
603static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
604{
605 unsigned long nr = stripe_len * nr_stripes;
606 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
607}
608
609/*
610 * allocation and initial setup for the btrfs_raid_bio.  Note
611 * that this does not allocate any pages for rbio->stripe_pages.
612 */
613static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
614 struct btrfs_bio *bbio, u64 *raid_map,
615 u64 stripe_len)
616{
617 struct btrfs_raid_bio *rbio;
618 int nr_data = 0;
619 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
620 void *p;
621
622 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
623 GFP_NOFS);
624 if (!rbio) {
625 kfree(raid_map);
626 kfree(bbio);
627 return ERR_PTR(-ENOMEM);
628 }
629
630 bio_list_init(&rbio->bio_list);
631 INIT_LIST_HEAD(&rbio->plug_list);
632 spin_lock_init(&rbio->bio_list_lock);
633 INIT_LIST_HEAD(&rbio->hash_list);
634 rbio->bbio = bbio;
635 rbio->raid_map = raid_map;
636 rbio->fs_info = root->fs_info;
637 rbio->stripe_len = stripe_len;
638 rbio->nr_pages = num_pages;
639 rbio->faila = -1;
640 rbio->failb = -1;
641 atomic_set(&rbio->refs, 1);
642
643 /*
644 * the stripe_pages and bio_pages array point to the extra
645 * memory we allocated past the end of the rbio
646 */
647 p = rbio + 1;
648 rbio->stripe_pages = p;
649 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
650
651 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
652 nr_data = bbio->num_stripes - 2;
653 else
654 nr_data = bbio->num_stripes - 1;
655
656 rbio->nr_data = nr_data;
657 return rbio;
658}
659
660/* allocate pages for all the stripes in the bio, including parity */
661static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
662{
663 int i;
664 struct page *page;
665
666 for (i = 0; i < rbio->nr_pages; i++) {
667 if (rbio->stripe_pages[i])
668 continue;
669 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
670 if (!page)
671 return -ENOMEM;
672 rbio->stripe_pages[i] = page;
673 ClearPageUptodate(page);
674 }
675 return 0;
676}
677
678/* allocate pages for just the p/q stripes */
679static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
680{
681 int i;
682 struct page *page;
683
684 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
685
686 for (; i < rbio->nr_pages; i++) {
687 if (rbio->stripe_pages[i])
688 continue;
689 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
690 if (!page)
691 return -ENOMEM;
692 rbio->stripe_pages[i] = page;
693 }
694 return 0;
695}
696
697/*
698 * add a single page from a specific stripe into our list of bios for IO
699 * this will try to merge into existing bios if possible, and returns
700 * zero if all went well.
701 */
702int rbio_add_io_page(struct btrfs_raid_bio *rbio,
703 struct bio_list *bio_list,
704 struct page *page,
705 int stripe_nr,
706 unsigned long page_index,
707 unsigned long bio_max_len)
708{
709 struct bio *last = bio_list->tail;
710 u64 last_end = 0;
711 int ret;
712 struct bio *bio;
713 struct btrfs_bio_stripe *stripe;
714 u64 disk_start;
715
716 stripe = &rbio->bbio->stripes[stripe_nr];
717 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
718
719 /* if the device is missing, just fail this stripe */
720 if (!stripe->dev->bdev)
721 return fail_rbio_index(rbio, stripe_nr);
722
723 /* see if we can add this page onto our existing bio */
724 if (last) {
725 last_end = (u64)last->bi_sector << 9;
726 last_end += last->bi_size;
727
728 /*
729 * we can't merge these if they are from different
730 * devices or if they are not contiguous
731 */
732 if (last_end == disk_start && stripe->dev->bdev &&
733 test_bit(BIO_UPTODATE, &last->bi_flags) &&
734 last->bi_bdev == stripe->dev->bdev) {
735 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
736 if (ret == PAGE_CACHE_SIZE)
737 return 0;
738 }
739 }
740
741 /* put a new bio on the list */
742 bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
743 if (!bio)
744 return -ENOMEM;
745
746 bio->bi_size = 0;
747 bio->bi_bdev = stripe->dev->bdev;
748 bio->bi_sector = disk_start >> 9;
749 set_bit(BIO_UPTODATE, &bio->bi_flags);
750
751 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
752 bio_list_add(bio_list, bio);
753 return 0;
754}
755
756/*
757 * while we're doing the read/modify/write cycle, we could
758 * have errors in reading pages off the disk. This checks
759 * for errors and if we're not able to read the page it'll
760 * trigger parity reconstruction. The rmw will be finished
761 * after we've reconstructed the failed stripes
762 */
763static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
764{
765 if (rbio->faila >= 0 || rbio->failb >= 0) {
766 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
767 __raid56_parity_recover(rbio);
768 } else {
769 finish_rmw(rbio);
770 }
771}
772
773/*
774 * these are just the pages from the rbio array, not from anything
775 * the FS sent down to us
776 */
777static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
778{
779 int index;
780 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
781 index += page;
782 return rbio->stripe_pages[index];
783}
784
785/*
786 * helper function to walk our bio list and populate the bio_pages array with
787 * the result. This seems expensive, but it is faster than constantly
788 * searching through the bio list as we setup the IO in finish_rmw or stripe
789 * reconstruction.
790 *
791 * This must be called before you trust the answers from page_in_rbio
792 */
793static void index_rbio_pages(struct btrfs_raid_bio *rbio)
794{
795 struct bio *bio;
796 u64 start;
797 unsigned long stripe_offset;
798 unsigned long page_index;
799 struct page *p;
800 int i;
801
802 spin_lock_irq(&rbio->bio_list_lock);
803 bio_list_for_each(bio, &rbio->bio_list) {
804 start = (u64)bio->bi_sector << 9;
805 stripe_offset = start - rbio->raid_map[0];
806 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
807
808 for (i = 0; i < bio->bi_vcnt; i++) {
809 p = bio->bi_io_vec[i].bv_page;
810 rbio->bio_pages[page_index + i] = p;
811 }
812 }
813 spin_unlock_irq(&rbio->bio_list_lock);
814}
815
816/*
817 * this is called from one of two situations. We either
818 * have a full stripe from the higher layers, or we've read all
819 * the missing bits off disk.
820 *
821 * This will calculate the parity and then send down any
822 * changed blocks.
823 */
824static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
825{
826 struct btrfs_bio *bbio = rbio->bbio;
827 void *pointers[bbio->num_stripes];
828 int stripe_len = rbio->stripe_len;
829 int nr_data = rbio->nr_data;
830 int stripe;
831 int pagenr;
832 int p_stripe = -1;
833 int q_stripe = -1;
834 struct bio_list bio_list;
835 struct bio *bio;
836 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
837 int ret;
838
839 bio_list_init(&bio_list);
840
841 if (bbio->num_stripes - rbio->nr_data == 1) {
842 p_stripe = bbio->num_stripes - 1;
843 } else if (bbio->num_stripes - rbio->nr_data == 2) {
844 p_stripe = bbio->num_stripes - 2;
845 q_stripe = bbio->num_stripes - 1;
846 } else {
847 BUG();
848 }
849
850 /* at this point we either have a full stripe,
851 * or we've read the full stripe from the drive.
852 * recalculate the parity and write the new results.
853 *
854 * We're not allowed to add any new bios to the
855 * bio list here, anyone else that wants to
856 * change this stripe needs to do their own rmw.
857 */
858 spin_lock_irq(&rbio->bio_list_lock);
859 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
860 spin_unlock_irq(&rbio->bio_list_lock);
861
862 atomic_set(&rbio->bbio->error, 0);
863
864 /*
865 * now that we've set rmw_locked, run through the
866 * bio list one last time and map the page pointers
867 */
868 index_rbio_pages(rbio);
869
870 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
871 struct page *p;
872 /* first collect one page from each data stripe */
873 for (stripe = 0; stripe < nr_data; stripe++) {
874 p = page_in_rbio(rbio, stripe, pagenr, 0);
875 pointers[stripe] = kmap(p);
876 }
877
878 /* then add the parity stripe */
879 p = rbio_pstripe_page(rbio, pagenr);
880 SetPageUptodate(p);
881 pointers[stripe++] = kmap(p);
882
883 if (q_stripe != -1) {
884
885 /*
886 * raid6, add the qstripe and call the
887 * library function to fill in our p/q
888 */
889 p = rbio_qstripe_page(rbio, pagenr);
890 SetPageUptodate(p);
891 pointers[stripe++] = kmap(p);
892
893 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
894 pointers);
895 } else {
896 /* raid5 */
897 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
898 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
899 }
900
901
902 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
903 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
904 }
905
906 /*
907 * time to start writing. Make bios for everything from the
908 * higher layers (the bio_list in our rbio) and our p/q. Ignore
909 * everything else.
910 */
911 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
912 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
913 struct page *page;
914 if (stripe < rbio->nr_data) {
915 page = page_in_rbio(rbio, stripe, pagenr, 1);
916 if (!page)
917 continue;
918 } else {
919 page = rbio_stripe_page(rbio, stripe, pagenr);
920 }
921
922 ret = rbio_add_io_page(rbio, &bio_list,
923 page, stripe, pagenr, rbio->stripe_len);
924 if (ret)
925 goto cleanup;
926 }
927 }
928
929 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
930 BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
931
932 while (1) {
933 bio = bio_list_pop(&bio_list);
934 if (!bio)
935 break;
936
937 bio->bi_private = rbio;
938 bio->bi_end_io = raid_write_end_io;
939 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
940 submit_bio(WRITE, bio);
941 }
942 return;
943
944cleanup:
945 rbio_orig_end_io(rbio, -EIO, 0);
946}
947
948/*
949 * helper to find the stripe number for a given bio. Used to figure out which
950 * stripe has failed. This expects the bio to correspond to a physical disk,
951 * so it looks up based on physical sector numbers.
952 */
953static int find_bio_stripe(struct btrfs_raid_bio *rbio,
954 struct bio *bio)
955{
956 u64 physical = bio->bi_sector;
957 u64 stripe_start;
958 int i;
959 struct btrfs_bio_stripe *stripe;
960
961 physical <<= 9;
962
963 for (i = 0; i < rbio->bbio->num_stripes; i++) {
964 stripe = &rbio->bbio->stripes[i];
965 stripe_start = stripe->physical;
966 if (physical >= stripe_start &&
967 physical < stripe_start + rbio->stripe_len) {
968 return i;
969 }
970 }
971 return -1;
972}
973
974/*
975 * helper to find the stripe number for a given
976 * bio (before mapping). Used to figure out which stripe has
977 * failed. This looks up based on logical block numbers.
978 */
979static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
980 struct bio *bio)
981{
982 u64 logical = bio->bi_sector;
983 u64 stripe_start;
984 int i;
985
986 logical <<= 9;
987
988 for (i = 0; i < rbio->nr_data; i++) {
989 stripe_start = rbio->raid_map[i];
990 if (logical >= stripe_start &&
991 logical < stripe_start + rbio->stripe_len) {
992 return i;
993 }
994 }
995 return -1;
996}
997
998/*
999 * returns -EIO if we had too many failures
1000 */
1001static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1002{
1003 unsigned long flags;
1004 int ret = 0;
1005
1006 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1007
1008 /* we already know this stripe is bad, move on */
1009 if (rbio->faila == failed || rbio->failb == failed)
1010 goto out;
1011
1012 if (rbio->faila == -1) {
1013 /* first failure on this rbio */
1014 rbio->faila = failed;
1015 atomic_inc(&rbio->bbio->error);
1016 } else if (rbio->failb == -1) {
1017 /* second failure on this rbio */
1018 rbio->failb = failed;
1019 atomic_inc(&rbio->bbio->error);
1020 } else {
1021 ret = -EIO;
1022 }
1023out:
1024 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1025
1026 return ret;
1027}
1028
1029/*
1030 * helper to fail a stripe based on a physical disk
1031 * bio.
1032 */
1033static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1034 struct bio *bio)
1035{
1036 int failed = find_bio_stripe(rbio, bio);
1037
1038 if (failed < 0)
1039 return -EIO;
1040
1041 return fail_rbio_index(rbio, failed);
1042}
1043
1044/*
1045 * this sets each page in the bio uptodate. It should only be used on private
1046 * rbio pages, nothing that comes in from the higher layers
1047 */
1048static void set_bio_pages_uptodate(struct bio *bio)
1049{
1050 int i;
1051 struct page *p;
1052
1053 for (i = 0; i < bio->bi_vcnt; i++) {
1054 p = bio->bi_io_vec[i].bv_page;
1055 SetPageUptodate(p);
1056 }
1057}
1058
1059/*
1060 * end io for the read phase of the rmw cycle. All the bios here are physical
1061 * stripe bios we've read from the disk so we can recalculate the parity of the
1062 * stripe.
1063 *
1064 * This will usually kick off finish_rmw once all the bios are read in, but it
1065 * may trigger parity reconstruction if we had any errors along the way
1066 */
1067static void raid_rmw_end_io(struct bio *bio, int err)
1068{
1069 struct btrfs_raid_bio *rbio = bio->bi_private;
1070
1071 if (err)
1072 fail_bio_stripe(rbio, bio);
1073 else
1074 set_bio_pages_uptodate(bio);
1075
1076 bio_put(bio);
1077
1078 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1079 return;
1080
1081 err = 0;
1082 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1083 goto cleanup;
1084
1085 /*
1086 * this will normally call finish_rmw to start our write
1087 * but if there are any failed stripes we'll reconstruct
1088 * from parity first
1089 */
1090 validate_rbio_for_rmw(rbio);
1091 return;
1092
1093cleanup:
1094
1095 rbio_orig_end_io(rbio, -EIO, 0);
1096}
1097
1098static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1099{
1100 rbio->work.flags = 0;
1101 rbio->work.func = rmw_work;
1102
1103 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1104 &rbio->work);
1105}
1106
1107static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1108{
1109 rbio->work.flags = 0;
1110 rbio->work.func = read_rebuild_work;
1111
1112 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1113 &rbio->work);
1114}
1115
1116/*
1117 * the stripe must be locked by the caller. It will
1118 * unlock after all the writes are done
1119 */
1120static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1121{
1122 int bios_to_read = 0;
1123 struct btrfs_bio *bbio = rbio->bbio;
1124 struct bio_list bio_list;
1125 int ret;
1126 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1127 int pagenr;
1128 int stripe;
1129 struct bio *bio;
1130
1131 bio_list_init(&bio_list);
1132
1133 ret = alloc_rbio_pages(rbio);
1134 if (ret)
1135 goto cleanup;
1136
1137 index_rbio_pages(rbio);
1138
1139 atomic_set(&rbio->bbio->error, 0);
1140 /*
1141 * build a list of bios to read all the missing parts of this
1142 * stripe
1143 */
1144 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1145 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1146 struct page *page;
1147 /*
1148 * we want to find all the pages missing from
1149 * the rbio and read them from the disk. If
1150 * page_in_rbio finds a page in the bio list
1151 * we don't need to read it off the stripe.
1152 */
1153 page = page_in_rbio(rbio, stripe, pagenr, 1);
1154 if (page)
1155 continue;
1156
1157 page = rbio_stripe_page(rbio, stripe, pagenr);
1158 ret = rbio_add_io_page(rbio, &bio_list, page,
1159 stripe, pagenr, rbio->stripe_len);
1160 if (ret)
1161 goto cleanup;
1162 }
1163 }
1164
1165 bios_to_read = bio_list_size(&bio_list);
1166 if (!bios_to_read) {
1167 /*
1168 * this can happen if others have merged with
1169 * us, it means there is nothing left to read.
1170 * But if there are missing devices it may not be
1171 * safe to do the full stripe write yet.
1172 */
1173 goto finish;
1174 }
1175
1176 /*
1177 * the bbio may be freed once we submit the last bio. Make sure
1178 * not to touch it after that
1179 */
1180 atomic_set(&bbio->stripes_pending, bios_to_read);
1181 while (1) {
1182 bio = bio_list_pop(&bio_list);
1183 if (!bio)
1184 break;
1185
1186 bio->bi_private = rbio;
1187 bio->bi_end_io = raid_rmw_end_io;
1188
1189 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1190 BTRFS_WQ_ENDIO_RAID56);
1191
1192 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1193 submit_bio(READ, bio);
1194 }
1195 /* the actual write will happen once the reads are done */
1196 return 0;
1197
1198cleanup:
1199 rbio_orig_end_io(rbio, -EIO, 0);
1200 return -EIO;
1201
1202finish:
1203 validate_rbio_for_rmw(rbio);
1204 return 0;
1205}
1206
1207/*
1208 * if the upper layers pass in a full stripe, we thank them by only allocating
1209 * enough pages to hold the parity, and sending it all down quickly.
1210 */
1211static int full_stripe_write(struct btrfs_raid_bio *rbio)
1212{
1213 int ret;
1214
1215 ret = alloc_rbio_parity_pages(rbio);
1216 if (ret)
1217 return ret;
1218
1219 ret = lock_stripe_add(rbio);
1220 if (ret == 0)
1221 finish_rmw(rbio);
1222 return 0;
1223}
1224
1225/*
1226 * partial stripe writes get handed over to async helpers.
1227 * We're really hoping to merge a few more writes into this
1228 * rbio before calculating new parity
1229 */
1230static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1231{
1232 int ret;
1233
1234 ret = lock_stripe_add(rbio);
1235 if (ret == 0)
1236 async_rmw_stripe(rbio);
1237 return 0;
1238}
1239
1240/*
1241 * sometimes while we were reading from the drive to
1242 * recalculate parity, enough new bios come in to create
1243 * a full stripe. So we do a check here to see if we can
1244 * go directly to finish_rmw
1245 */
1246static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1247{
1248 /* head off into rmw land if we don't have a full stripe */
1249 if (!rbio_is_full(rbio))
1250 return partial_stripe_write(rbio);
1251 return full_stripe_write(rbio);
1252}
1253
1254/*
1255 * our main entry point for writes from the rest of the FS.
1256 */
1257int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1258 struct btrfs_bio *bbio, u64 *raid_map,
1259 u64 stripe_len)
1260{
1261 struct btrfs_raid_bio *rbio;
1262
1263 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1264 if (IS_ERR(rbio)) {
1265 kfree(raid_map);
1266 kfree(bbio);
1267 return PTR_ERR(rbio);
1268 }
1269 bio_list_add(&rbio->bio_list, bio);
1270 rbio->bio_list_bytes = bio->bi_size;
1271 return __raid56_parity_write(rbio);
1272}
1273
1274/*
1275 * all parity reconstruction happens here. We've read in everything
1276 * we can find from the drives and this does the heavy lifting of
1277 * sorting the good from the bad.
1278 */
1279static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1280{
1281 int pagenr, stripe;
1282 void **pointers;
1283 int faila = -1, failb = -1;
1284 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1285 struct page *page;
1286 int err;
1287 int i;
1288
1289 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1290 GFP_NOFS);
1291 if (!pointers) {
1292 err = -ENOMEM;
1293 goto cleanup_io;
1294 }
1295
1296 faila = rbio->faila;
1297 failb = rbio->failb;
1298
1299 if (rbio->read_rebuild) {
1300 spin_lock_irq(&rbio->bio_list_lock);
1301 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1302 spin_unlock_irq(&rbio->bio_list_lock);
1303 }
1304
1305 index_rbio_pages(rbio);
1306
1307 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1308 /* setup our array of pointers with pages
1309 * from each stripe
1310 */
1311 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1312 /*
1313 * if we're rebuilding a read, we have to use
1314 * pages from the bio list
1315 */
1316 if (rbio->read_rebuild &&
1317 (stripe == faila || stripe == failb)) {
1318 page = page_in_rbio(rbio, stripe, pagenr, 0);
1319 } else {
1320 page = rbio_stripe_page(rbio, stripe, pagenr);
1321 }
1322 pointers[stripe] = kmap(page);
1323 }
1324
1325 /* all raid6 handling here */
1326 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1327 RAID6_Q_STRIPE) {
1328
1329 /*
1330 * single failure, rebuild from parity raid5
1331 * style
1332 */
1333 if (failb < 0) {
1334 if (faila == rbio->nr_data) {
1335 /*
1336 * Just the P stripe has failed, without
1337 * a bad data or Q stripe.
1338 * TODO, we should redo the xor here.
1339 */
1340 err = -EIO;
1341 goto cleanup;
1342 }
1343 /*
1344 * a single failure in raid6 is rebuilt
1345 * in the pstripe code below
1346 */
1347 goto pstripe;
1348 }
1349
1350 /* make sure our ps and qs are in order */
1351 if (faila > failb) {
1352 int tmp = failb;
1353 failb = faila;
1354 faila = tmp;
1355 }
1356
1357 /* if the q stripe is failed, do a pstripe reconstruction
1358 * from the xors.
1359 * If both the q stripe and the P stripe are failed, we're
1360 * here due to a crc mismatch and we can't give them the
1361 * data they want
1362 */
1363 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1364 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1365 err = -EIO;
1366 goto cleanup;
1367 }
1368 /*
1369 * otherwise we have one bad data stripe and
1370 * a good P stripe. raid5!
1371 */
1372 goto pstripe;
1373 }
1374
1375 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1376 raid6_datap_recov(rbio->bbio->num_stripes,
1377 PAGE_SIZE, faila, pointers);
1378 } else {
1379 raid6_2data_recov(rbio->bbio->num_stripes,
1380 PAGE_SIZE, faila, failb,
1381 pointers);
1382 }
1383 } else {
1384 void *p;
1385
1386 /* rebuild from P stripe here (raid5 or raid6) */
1387 BUG_ON(failb != -1);
1388pstripe:
1389 /* Copy parity block into failed block to start with */
1390 memcpy(pointers[faila],
1391 pointers[rbio->nr_data],
1392 PAGE_CACHE_SIZE);
1393
1394 /* rearrange the pointer array */
1395 p = pointers[faila];
1396 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1397 pointers[stripe] = pointers[stripe + 1];
1398 pointers[rbio->nr_data - 1] = p;
1399
1400 /* xor in the rest */
1401 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1402 }
1403 /* if we're doing this rebuild as part of an rmw, go through
1404 * and set all of our private rbio pages in the
1405 * failed stripes as uptodate. This way finish_rmw will
1406 * know they can be trusted. If this was a read reconstruction,
1407 * other endio functions will fiddle the uptodate bits
1408 */
1409 if (!rbio->read_rebuild) {
1410 for (i = 0; i < nr_pages; i++) {
1411 if (faila != -1) {
1412 page = rbio_stripe_page(rbio, faila, i);
1413 SetPageUptodate(page);
1414 }
1415 if (failb != -1) {
1416 page = rbio_stripe_page(rbio, failb, i);
1417 SetPageUptodate(page);
1418 }
1419 }
1420 }
1421 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1422 /*
1423 * if we're rebuilding a read, we have to use
1424 * pages from the bio list
1425 */
1426 if (rbio->read_rebuild &&
1427 (stripe == faila || stripe == failb)) {
1428 page = page_in_rbio(rbio, stripe, pagenr, 0);
1429 } else {
1430 page = rbio_stripe_page(rbio, stripe, pagenr);
1431 }
1432 kunmap(page);
1433 }
1434 }
1435
1436 err = 0;
1437cleanup:
1438 kfree(pointers);
1439
1440cleanup_io:
1441
1442 if (rbio->read_rebuild) {
1443 rbio_orig_end_io(rbio, err, err == 0);
1444 } else if (err == 0) {
1445 rbio->faila = -1;
1446 rbio->failb = -1;
1447 finish_rmw(rbio);
1448 } else {
1449 rbio_orig_end_io(rbio, err, 0);
1450 }
1451}
1452
1453/*
1454 * This is called only for stripes we've read from disk to
1455 * reconstruct the parity.
1456 */
1457static void raid_recover_end_io(struct bio *bio, int err)
1458{
1459 struct btrfs_raid_bio *rbio = bio->bi_private;
1460
1461 /*
1462 * we only read stripe pages off the disk, set them
1463 * up to date if there were no errors
1464 */
1465 if (err)
1466 fail_bio_stripe(rbio, bio);
1467 else
1468 set_bio_pages_uptodate(bio);
1469 bio_put(bio);
1470
1471 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1472 return;
1473
1474 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1475 rbio_orig_end_io(rbio, -EIO, 0);
1476 else
1477 __raid_recover_end_io(rbio);
1478}
1479
1480/*
1481 * reads everything we need off the disk to reconstruct
1482 * the parity. endio handlers trigger final reconstruction
1483 * when the IO is done.
1484 *
1485 * This is used both for reads from the higher layers and for
1486 * parity construction required to finish a rmw cycle.
1487 */
1488static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1489{
1490 int bios_to_read = 0;
1491 struct btrfs_bio *bbio = rbio->bbio;
1492 struct bio_list bio_list;
1493 int ret;
1494 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1495 int pagenr;
1496 int stripe;
1497 struct bio *bio;
1498
1499 bio_list_init(&bio_list);
1500
1501 ret = alloc_rbio_pages(rbio);
1502 if (ret)
1503 goto cleanup;
1504
1505 atomic_set(&rbio->bbio->error, 0);
1506
1507 /*
1508 * read everything that hasn't failed.
1509 */
1510 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1511 if (rbio->faila == stripe ||
1512 rbio->failb == stripe)
1513 continue;
1514
1515 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1516 struct page *p;
1517
1518 /*
1519 * the rmw code may have already read this
1520 * page in
1521 */
1522 p = rbio_stripe_page(rbio, stripe, pagenr);
1523 if (PageUptodate(p))
1524 continue;
1525
1526 ret = rbio_add_io_page(rbio, &bio_list,
1527 rbio_stripe_page(rbio, stripe, pagenr),
1528 stripe, pagenr, rbio->stripe_len);
1529 if (ret < 0)
1530 goto cleanup;
1531 }
1532 }
1533
1534 bios_to_read = bio_list_size(&bio_list);
1535 if (!bios_to_read) {
1536 /*
1537 * we might have no bios to read just because the pages
1538 * were up to date, or we might have no bios to read because
1539 * the devices were gone.
1540 */
1541 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1542 __raid_recover_end_io(rbio);
1543 goto out;
1544 } else {
1545 goto cleanup;
1546 }
1547 }
1548
1549 /*
1550 * the bbio may be freed once we submit the last bio. Make sure
1551 * not to touch it after that
1552 */
1553 atomic_set(&bbio->stripes_pending, bios_to_read);
1554 while (1) {
1555 bio = bio_list_pop(&bio_list);
1556 if (!bio)
1557 break;
1558
1559 bio->bi_private = rbio;
1560 bio->bi_end_io = raid_recover_end_io;
1561
1562 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1563 BTRFS_WQ_ENDIO_RAID56);
1564
1565 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1566 submit_bio(READ, bio);
1567 }
1568out:
1569 return 0;
1570
1571cleanup:
1572 if (rbio->read_rebuild)
1573 rbio_orig_end_io(rbio, -EIO, 0);
1574 return -EIO;
1575}
1576
1577/*
1578 * the main entry point for reads from the higher layers. This
1579 * is really only called when the normal read path had a failure,
1580 * so we assume the bio they send down corresponds to a failed part
1581 * of the drive.
1582 */
1583int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
1584 struct btrfs_bio *bbio, u64 *raid_map,
1585 u64 stripe_len, int mirror_num)
1586{
1587 struct btrfs_raid_bio *rbio;
1588 int ret;
1589
1590 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1591 if (IS_ERR(rbio)) {
1592 return PTR_ERR(rbio);
1593 }
1594
1595 rbio->read_rebuild = 1;
1596 bio_list_add(&rbio->bio_list, bio);
1597 rbio->bio_list_bytes = bio->bi_size;
1598
1599 rbio->faila = find_logical_bio_stripe(rbio, bio);
1600 if (rbio->faila == -1) {
1601 BUG();
1602 kfree(rbio);
1603 return -EIO;
1604 }
1605
1606 /*
1607 * reconstruct from the q stripe if they are
1608 * asking for mirror 3
1609 */
1610 if (mirror_num == 3)
1611 rbio->failb = bbio->num_stripes - 2;
1612
1613 ret = lock_stripe_add(rbio);
1614
1615 /*
1616 * __raid56_parity_recover will end the bio with
1617 * any errors it hits. We don't want to return
1618 * its error value up the stack because our caller
1619 * will end up calling bio_endio with any nonzero
1620 * return
1621 */
1622 if (ret == 0)
1623 __raid56_parity_recover(rbio);
1624 /*
1625 * our rbio has been added to the list of
1626 * rbios that will be handled after the
1627 * current lock owner is done
1628 */
1629 return 0;
1630
1631}
1632
1633static void rmw_work(struct btrfs_work *work)
1634{
1635 struct btrfs_raid_bio *rbio;
1636
1637 rbio = container_of(work, struct btrfs_raid_bio, work);
1638 raid56_rmw_stripe(rbio);
1639}
1640
1641static void read_rebuild_work(struct btrfs_work *work)
1642{
1643 struct btrfs_raid_bio *rbio;
1644
1645 rbio = container_of(work, struct btrfs_raid_bio, work);
1646 __raid56_parity_recover(rbio);
1647}
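
For a sense of how the flat stripe_pages array is laid out by the helpers in the patch (rbio_nr_pages(), rbio_stripe_page(), rbio_pstripe_page(), rbio_qstripe_page()), here is a small standalone sketch of the index arithmetic. It is not part of the patch; the 64KiB stripe size is the per-disk default the commit message mentions, while the 4KiB page size and 3-data-disk raid6 layout are assumptions for the example.

/* Illustration only -- not part of the patch.  Shows where data, P and
 * Q pages land in the flat stripe_pages[] array for a hypothetical
 * 3-data-disk raid6 layout with 64KiB stripes and 4KiB pages.
 */
#include <stdio.h>

#define STRIPE_LEN	(64 * 1024)	/* per-disk stripe, per the commit message */
#define PAGE_SZ		4096		/* assumed page size */
#define NR_DATA		3		/* hypothetical data stripes */
#define NUM_STRIPES	(NR_DATA + 2)	/* plus P and Q */

int main(void)
{
	int pages_per_stripe = STRIPE_LEN / PAGE_SZ;

	/* rbio_nr_pages(): pages for the full stripe across all drives */
	printf("nr_pages = %d\n", pages_per_stripe * NUM_STRIPES);

	/* rbio_stripe_page(): stripe * pages_per_stripe + page */
	printf("data stripe 2, page 5 -> index %d\n", 2 * pages_per_stripe + 5);

	/* rbio_pstripe_page(): P pages start right after the data pages */
	printf("P page 0 -> index %d\n", NR_DATA * pages_per_stripe);

	/* rbio_qstripe_page(): Q pages start one stripe after P */
	printf("Q page 0 -> index %d\n", (NR_DATA + 1) * pages_per_stripe);
	return 0;
}

With those assumed numbers a full stripe is 80 pages: data occupies indexes 0-47, P pages 48-63 and Q pages 64-79.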