aboutsummaryrefslogtreecommitdiffstats
path: root/fs/exofs/ore_raid.c
diff options
context:
space:
mode:
authorBoaz Harrosh <bharrosh@panasas.com>2011-10-12 12:42:22 -0400
committerBoaz Harrosh <bharrosh@panasas.com>2011-10-24 19:55:36 -0400
commita1fec1dbbc8db974d2582e4040590cebe72171e4 (patch)
tree9dcbe1933b7f40256f40393f3c86dbb16e8fb953 /fs/exofs/ore_raid.c
parent3e335672e018c06e007f85a5d54afd721fb3d6d5 (diff)
ore: RAID5 read
This patch introduces the first stage of RAID5 support, mainly the skip-over-raid-units when reading. For writes it inserts BLANK units where the XOR blocks should be calculated and written. It introduces the new "general raid maths", and the main additional parameters and components needed for raid5. Since at this stage it could corrupt data for future versions that actually do support raid5, the enablement of raid5 mounting and the setting of parity-count > 0 are disabled, so the raid5 code will never be used. Mounting of raid5 is only enabled later, once the basic XOR write is also in. But if the patch "enable RAID5" is applied, this code has been tested to properly read raid5 volumes and is according to standard. It has also been tested that the new maths still properly supports RAID0 and the grouping code just as before. (BTW: I have found more bugs in the pnfs-obj RAID math, fixed here.) The ore.c file is getting too big, so new ore_raid.[hc] files are added that will include the special raid stuff that is not used in striping and mirrors. With future write support these will get bigger. When adding ore_raid.c to the Kbuild file I was forced to rename ore.ko to libore.ko. Is it possible to keep the source file, say ore.c, and the module file ore.ko the same even if there are multiple files inside ore.ko? Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Diffstat (limited to 'fs/exofs/ore_raid.c')
-rw-r--r--fs/exofs/ore_raid.c140
1 files changed, 140 insertions, 0 deletions
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
new file mode 100644
index 000000000000..8d4b93a93c67
--- /dev/null
+++ b/fs/exofs/ore_raid.c
@@ -0,0 +1,140 @@
1/*
2 * Copyright (C) 2011
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of the objects raid engine (ore).
6 *
7 * It is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with "ore". If not, write to the Free Software Foundation, Inc:
13 * "Free Software Foundation <info@fsf.org>"
14 */
15
16#include <linux/gfp.h>
17
18#include "ore_raid.h"
19
20struct page *_raid_page_alloc(void)
21{
22 return alloc_page(GFP_KERNEL);
23}
24
/*
 * Release a single page that was obtained from _raid_page_alloc().
 *
 * @p is handed straight to __free_page(); no NULL check is done here, so
 * callers test the pointer themselves (see _ore_free_raid_stuff()).
 */
void _raid_page_free(struct page *p)
{
	__free_page(p);
}
29
30void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
31 bool not_last)
32{
33 struct osd_sg_entry *sge;
34
35 ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
36 "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
37 per_dev->dev, cur_len, not_last, per_dev->cur_sg,
38 _LLU(per_dev->offset), per_dev->length,
39 per_dev->last_sgs_total);
40
41 if (!per_dev->cur_sg) {
42 sge = per_dev->sglist;
43
44 /* First time we prepare two entries */
45 if (per_dev->length) {
46 ++per_dev->cur_sg;
47 sge->offset = per_dev->offset;
48 sge->len = per_dev->length;
49 } else {
50 /* Here the parity is the first unit of this object.
51 * This happens every time we reach a parity device on
52 * the same stripe as the per_dev->offset. We need to
53 * just skip this unit.
54 */
55 per_dev->offset += cur_len;
56 return;
57 }
58 } else {
59 /* finalize the last one */
60 sge = &per_dev->sglist[per_dev->cur_sg - 1];
61 sge->len = per_dev->length - per_dev->last_sgs_total;
62 }
63
64 if (not_last) {
65 /* Partly prepare the next one */
66 struct osd_sg_entry *next_sge = sge + 1;
67
68 ++per_dev->cur_sg;
69 next_sge->offset = sge->offset + sge->len + cur_len;
70 /* Save cur len so we know how mutch was added next time */
71 per_dev->last_sgs_total = per_dev->length;
72 next_sge->len = 0;
73 } else if (!sge->len) {
74 /* Optimize for when the last unit is a parity */
75 --per_dev->cur_sg;
76 }
77}
78
79/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */
80int _ore_add_parity_unit(struct ore_io_state *ios,
81 struct ore_striping_info *si,
82 struct ore_per_dev_state *per_dev,
83 unsigned cur_len)
84{
85 if (ios->reading) {
86 BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev);
87 _ore_add_sg_seg(per_dev, cur_len, true);
88 } else {
89 struct page **pages = ios->parity_pages + ios->cur_par_page;
90 unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE;
91 unsigned array_start = 0;
92 unsigned i;
93 int ret;
94
95 for (i = 0; i < num_pages; i++) {
96 pages[i] = _raid_page_alloc();
97 if (unlikely(!pages[i]))
98 return -ENOMEM;
99
100 ++(ios->cur_par_page);
101 /* TODO: only read support for now */
102 clear_highpage(pages[i]);
103 }
104
105 ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d",
106 per_dev->dev, num_pages, ios->cur_par_page);
107
108 ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
109 per_dev, num_pages * PAGE_SIZE);
110 if (unlikely(ret))
111 return ret;
112 }
113 return 0;
114}
115
/*
 * Raid-specific setup hook for an IO state.
 *
 * Currently a no-op that always returns 0; @ios is not touched.
 */
int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
{
	/* TODO: Only raid writes have stuff to add here */
	return 0;
}
121
122void _ore_free_raid_stuff(struct ore_io_state *ios)
123{
124 if (ios->parity_pages) { /* writing and raid */
125 unsigned i;
126
127 for (i = 0; i < ios->cur_par_page; i++) {
128 struct page *page = ios->parity_pages[i];
129
130 if (page)
131 _raid_page_free(page);
132 }
133 if (ios->extra_part_alloc)
134 kfree(ios->parity_pages);
135 } else {
136 /* Will only be set if raid reading && sglist is big */
137 if (ios->extra_part_alloc)
138 kfree(ios->per_dev[0].sglist);
139 }
140}