diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-10-26 15:33:50 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-10-26 15:33:50 -0400 |
commit | c28cfd60e4ec3f494b73ef7d6c661f5f491cd84f (patch) | |
tree | 390c23c07b4f484528b6fa5a72bae1b879df35b1 | |
parent | dfa4a423cf80afe8f81a36d8e663961c4acca343 (diff) | |
parent | 44231e686b2ba3b5702db867bb84e6d76b7cf2c7 (diff) |
Merge branch 'for-linus' of git://git.open-osd.org/linux-open-osd
* 'for-linus' of git://git.open-osd.org/linux-open-osd: (21 commits)
ore: Enable RAID5 mounts
exofs: Support for RAID5 read-4-write interface.
ore: RAID5 Write
ore: RAID5 read
fs/Makefile: Always inspect exofs/
ore: Make ore_calc_stripe_info EXPORT_SYMBOL
ore/exofs: Change ore_check_io API
ore/exofs: Define new ore_verify_layout
ore: Support for partial component table
ore: Support for short read/writes
exofs: Support for short read/writes
ore: Remove check for ios->kern_buff in _prepare_for_striping to later
ore: cleanup: Embed an ore_striping_info inside ore_io_state
ore: Only IO one group at a time (API change)
ore/exofs: Change the type of the devices array (API change)
ore: Make ore_striping_info and ore_calc_stripe_info public
exofs: Remove unused data_map member from exofs_sb_info
exofs: Rename struct ore_components comps => oc
exofs/super.c: local functions should be static
exofs/ore.c: local functions should be static
...
-rw-r--r-- | drivers/scsi/osd/Kconfig | 4 | ||||
-rw-r--r-- | fs/Makefile | 2 | ||||
-rw-r--r-- | fs/exofs/Kbuild | 3 | ||||
-rw-r--r-- | fs/exofs/Kconfig | 9 | ||||
-rw-r--r-- | fs/exofs/exofs.h | 26 | ||||
-rw-r--r-- | fs/exofs/inode.c | 233 | ||||
-rw-r--r-- | fs/exofs/ore.c | 656 | ||||
-rw-r--r-- | fs/exofs/ore_raid.c | 660 | ||||
-rw-r--r-- | fs/exofs/ore_raid.h | 79 | ||||
-rw-r--r-- | fs/exofs/super.c | 205 | ||||
-rw-r--r-- | include/scsi/osd_ore.h | 80 |
11 files changed, 1583 insertions, 374 deletions
diff --git a/drivers/scsi/osd/Kconfig b/drivers/scsi/osd/Kconfig index 861b5cebaeae..a0703514eb0f 100644 --- a/drivers/scsi/osd/Kconfig +++ b/drivers/scsi/osd/Kconfig | |||
@@ -11,10 +11,6 @@ | |||
11 | # it under the terms of the GNU General Public version 2 License as | 11 | # it under the terms of the GNU General Public version 2 License as |
12 | # published by the Free Software Foundation | 12 | # published by the Free Software Foundation |
13 | # | 13 | # |
14 | # FIXME: SCSI_OSD_INITIATOR should select CONFIG (HMAC) SHA1 somehow. | ||
15 | # How is it done properly? | ||
16 | # | ||
17 | |||
18 | config SCSI_OSD_INITIATOR | 14 | config SCSI_OSD_INITIATOR |
19 | tristate "OSD-Initiator library" | 15 | tristate "OSD-Initiator library" |
20 | depends on SCSI | 16 | depends on SCSI |
diff --git a/fs/Makefile b/fs/Makefile index afc109691a9b..5c30a13341eb 100644 --- a/fs/Makefile +++ b/fs/Makefile | |||
@@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/ | |||
120 | obj-$(CONFIG_OCFS2_FS) += ocfs2/ | 120 | obj-$(CONFIG_OCFS2_FS) += ocfs2/ |
121 | obj-$(CONFIG_BTRFS_FS) += btrfs/ | 121 | obj-$(CONFIG_BTRFS_FS) += btrfs/ |
122 | obj-$(CONFIG_GFS2_FS) += gfs2/ | 122 | obj-$(CONFIG_GFS2_FS) += gfs2/ |
123 | obj-$(CONFIG_EXOFS_FS) += exofs/ | 123 | obj-$(y) += exofs/ # Multiple mods, used by nfs/objlayout |
124 | obj-$(CONFIG_CEPH_FS) += ceph/ | 124 | obj-$(CONFIG_CEPH_FS) += ceph/ |
125 | obj-$(CONFIG_PSTORE) += pstore/ | 125 | obj-$(CONFIG_PSTORE) += pstore/ |
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index c5a5855a6c44..352ba149d23e 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild | |||
@@ -13,7 +13,8 @@ | |||
13 | # | 13 | # |
14 | 14 | ||
15 | # ore module library | 15 | # ore module library |
16 | obj-$(CONFIG_ORE) += ore.o | 16 | libore-y := ore.o ore_raid.o |
17 | obj-$(CONFIG_ORE) += libore.o | ||
17 | 18 | ||
18 | exofs-y := inode.o file.o symlink.o namei.o dir.o super.o | 19 | exofs-y := inode.o file.o symlink.o namei.o dir.o super.o |
19 | obj-$(CONFIG_EXOFS_FS) += exofs.o | 20 | obj-$(CONFIG_EXOFS_FS) += exofs.o |
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index 70bae4149291..fa9a286c8771 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig | |||
@@ -1,10 +1,17 @@ | |||
1 | # Note ORE needs to "select ASYNC_XOR". So as not to force multiple selects | ||
2 | # for every ORE user we do it like this. Any user should add itself here | ||
3 | # at the "depends on EXOFS_FS || ..." with an ||. The dependencies are | ||
4 | # selected here, and we default to "ON". So in effect it is like being | ||
5 | # selected by any of the users. | ||
1 | config ORE | 6 | config ORE |
2 | tristate | 7 | tristate |
8 | depends on EXOFS_FS | ||
9 | select ASYNC_XOR | ||
10 | default SCSI_OSD_ULD | ||
3 | 11 | ||
4 | config EXOFS_FS | 12 | config EXOFS_FS |
5 | tristate "exofs: OSD based file system support" | 13 | tristate "exofs: OSD based file system support" |
6 | depends on SCSI_OSD_ULD | 14 | depends on SCSI_OSD_ULD |
7 | select ORE | ||
8 | help | 15 | help |
9 | EXOFS is a file system that uses an OSD storage device, | 16 | EXOFS is a file system that uses an OSD storage device, |
10 | as its backing storage. | 17 | as its backing storage. |
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index f4e442ec7445..51f4b4c40f09 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h | |||
@@ -53,6 +53,10 @@ | |||
53 | /* u64 has problems with printk this will cast it to unsigned long long */ | 53 | /* u64 has problems with printk this will cast it to unsigned long long */ |
54 | #define _LLU(x) (unsigned long long)(x) | 54 | #define _LLU(x) (unsigned long long)(x) |
55 | 55 | ||
56 | struct exofs_dev { | ||
57 | struct ore_dev ored; | ||
58 | unsigned did; | ||
59 | }; | ||
56 | /* | 60 | /* |
57 | * our extension to the in-memory superblock | 61 | * our extension to the in-memory superblock |
58 | */ | 62 | */ |
@@ -66,13 +70,9 @@ struct exofs_sb_info { | |||
66 | u32 s_next_generation; /* next gen # to use */ | 70 | u32 s_next_generation; /* next gen # to use */ |
67 | atomic_t s_curr_pending; /* number of pending commands */ | 71 | atomic_t s_curr_pending; /* number of pending commands */ |
68 | 72 | ||
69 | struct pnfs_osd_data_map data_map; /* Default raid to use | ||
70 | * FIXME: Needed ? | ||
71 | */ | ||
72 | struct ore_layout layout; /* Default files layout */ | 73 | struct ore_layout layout; /* Default files layout */ |
73 | struct ore_comp one_comp; /* id & cred of partition id=0*/ | 74 | struct ore_comp one_comp; /* id & cred of partition id=0*/ |
74 | struct ore_components comps; /* comps for the partition */ | 75 | struct ore_components oc; /* comps for the partition */ |
75 | struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ | ||
76 | }; | 76 | }; |
77 | 77 | ||
78 | /* | 78 | /* |
@@ -86,7 +86,7 @@ struct exofs_i_info { | |||
86 | uint32_t i_dir_start_lookup; /* which page to start lookup */ | 86 | uint32_t i_dir_start_lookup; /* which page to start lookup */ |
87 | uint64_t i_commit_size; /* the object's written length */ | 87 | uint64_t i_commit_size; /* the object's written length */ |
88 | struct ore_comp one_comp; /* same component for all devices */ | 88 | struct ore_comp one_comp; /* same component for all devices */ |
89 | struct ore_components comps; /* inode view of the device table */ | 89 | struct ore_components oc; /* inode view of the device table */ |
90 | }; | 90 | }; |
91 | 91 | ||
92 | static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) | 92 | static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) |
@@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations; | |||
207 | * bigger and that the device table repeats twice. | 207 | * bigger and that the device table repeats twice. |
208 | * See: exofs_read_lookup_dev_table() | 208 | * See: exofs_read_lookup_dev_table() |
209 | */ | 209 | */ |
210 | static inline void exofs_init_comps(struct ore_components *comps, | 210 | static inline void exofs_init_comps(struct ore_components *oc, |
211 | struct ore_comp *one_comp, | 211 | struct ore_comp *one_comp, |
212 | struct exofs_sb_info *sbi, osd_id oid) | 212 | struct exofs_sb_info *sbi, osd_id oid) |
213 | { | 213 | { |
@@ -217,13 +217,15 @@ static inline void exofs_init_comps(struct ore_components *comps, | |||
217 | one_comp->obj.id = oid; | 217 | one_comp->obj.id = oid; |
218 | exofs_make_credential(one_comp->cred, &one_comp->obj); | 218 | exofs_make_credential(one_comp->cred, &one_comp->obj); |
219 | 219 | ||
220 | comps->numdevs = sbi->comps.numdevs; | 220 | oc->first_dev = 0; |
221 | comps->single_comp = EC_SINGLE_COMP; | 221 | oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * |
222 | comps->comps = one_comp; | 222 | sbi->layout.group_count; |
223 | oc->single_comp = EC_SINGLE_COMP; | ||
224 | oc->comps = one_comp; | ||
223 | 225 | ||
224 | /* Round robin device view of the table */ | 226 | /* Round robin device view of the table */ |
225 | first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs; | 227 | first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; |
226 | comps->ods = sbi->comps.ods + first_dev; | 228 | oc->ods = &sbi->oc.ods[first_dev]; |
227 | } | 229 | } |
228 | 230 | ||
229 | #endif | 231 | #endif |
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f39a38fc2349..3e5f3a6be90a 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c | |||
@@ -37,11 +37,7 @@ | |||
37 | 37 | ||
38 | #define EXOFS_DBGMSG2(M...) do {} while (0) | 38 | #define EXOFS_DBGMSG2(M...) do {} while (0) |
39 | 39 | ||
40 | enum { BIO_MAX_PAGES_KMALLOC = | 40 | enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), }; |
41 | (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), | ||
42 | MAX_PAGES_KMALLOC = | ||
43 | PAGE_SIZE / sizeof(struct page *), | ||
44 | }; | ||
45 | 41 | ||
46 | unsigned exofs_max_io_pages(struct ore_layout *layout, | 42 | unsigned exofs_max_io_pages(struct ore_layout *layout, |
47 | unsigned expected_pages) | 43 | unsigned expected_pages) |
@@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout, | |||
49 | unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); | 45 | unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); |
50 | 46 | ||
51 | /* TODO: easily support bio chaining */ | 47 | /* TODO: easily support bio chaining */ |
52 | pages = min_t(unsigned, pages, | 48 | pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE); |
53 | layout->group_width * BIO_MAX_PAGES_KMALLOC); | ||
54 | return pages; | 49 | return pages; |
55 | } | 50 | } |
56 | 51 | ||
@@ -68,6 +63,7 @@ struct page_collect { | |||
68 | bool read_4_write; /* This means two things: that the read is sync | 63 | bool read_4_write; /* This means two things: that the read is sync |
69 | * And the pages should not be unlocked. | 64 | * And the pages should not be unlocked. |
70 | */ | 65 | */ |
66 | struct page *that_locked_page; | ||
71 | }; | 67 | }; |
72 | 68 | ||
73 | static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, | 69 | static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, |
@@ -86,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, | |||
86 | pcol->length = 0; | 82 | pcol->length = 0; |
87 | pcol->pg_first = -1; | 83 | pcol->pg_first = -1; |
88 | pcol->read_4_write = false; | 84 | pcol->read_4_write = false; |
85 | pcol->that_locked_page = NULL; | ||
89 | } | 86 | } |
90 | 87 | ||
91 | static void _pcol_reset(struct page_collect *pcol) | 88 | static void _pcol_reset(struct page_collect *pcol) |
@@ -98,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol) | |||
98 | pcol->length = 0; | 95 | pcol->length = 0; |
99 | pcol->pg_first = -1; | 96 | pcol->pg_first = -1; |
100 | pcol->ios = NULL; | 97 | pcol->ios = NULL; |
98 | pcol->that_locked_page = NULL; | ||
101 | 99 | ||
102 | /* this is probably the end of the loop but in writes | 100 | /* this is probably the end of the loop but in writes |
103 | * it might not end here. don't be left with nothing | 101 | * it might not end here. don't be left with nothing |
@@ -149,14 +147,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page, | |||
149 | return 0; | 147 | return 0; |
150 | } | 148 | } |
151 | 149 | ||
150 | enum {PAGE_WAS_NOT_IN_IO = 17}; | ||
152 | static int update_read_page(struct page *page, int ret) | 151 | static int update_read_page(struct page *page, int ret) |
153 | { | 152 | { |
154 | if (ret == 0) { | 153 | switch (ret) { |
154 | case 0: | ||
155 | /* Everything is OK */ | 155 | /* Everything is OK */ |
156 | SetPageUptodate(page); | 156 | SetPageUptodate(page); |
157 | if (PageError(page)) | 157 | if (PageError(page)) |
158 | ClearPageError(page); | 158 | ClearPageError(page); |
159 | } else if (ret == -EFAULT) { | 159 | break; |
160 | case -EFAULT: | ||
160 | /* In this case we were trying to read something that wasn't on | 161 | /* In this case we were trying to read something that wasn't on |
161 | * disk yet - return a page full of zeroes. This should be OK, | 162 | * disk yet - return a page full of zeroes. This should be OK, |
162 | * because the object should be empty (if there was a write | 163 | * because the object should be empty (if there was a write |
@@ -167,16 +168,22 @@ static int update_read_page(struct page *page, int ret) | |||
167 | SetPageUptodate(page); | 168 | SetPageUptodate(page); |
168 | if (PageError(page)) | 169 | if (PageError(page)) |
169 | ClearPageError(page); | 170 | ClearPageError(page); |
170 | ret = 0; /* recovered error */ | ||
171 | EXOFS_DBGMSG("recovered read error\n"); | 171 | EXOFS_DBGMSG("recovered read error\n"); |
172 | } else /* Error */ | 172 | /* fall through */ |
173 | case PAGE_WAS_NOT_IN_IO: | ||
174 | ret = 0; /* recovered error */ | ||
175 | break; | ||
176 | default: | ||
173 | SetPageError(page); | 177 | SetPageError(page); |
174 | 178 | } | |
175 | return ret; | 179 | return ret; |
176 | } | 180 | } |
177 | 181 | ||
178 | static void update_write_page(struct page *page, int ret) | 182 | static void update_write_page(struct page *page, int ret) |
179 | { | 183 | { |
184 | if (unlikely(ret == PAGE_WAS_NOT_IN_IO)) | ||
185 | return; /* don't pass start don't collect $200 */ | ||
186 | |||
180 | if (ret) { | 187 | if (ret) { |
181 | mapping_set_error(page->mapping, ret); | 188 | mapping_set_error(page->mapping, ret); |
182 | SetPageError(page); | 189 | SetPageError(page); |
@@ -190,15 +197,16 @@ static void update_write_page(struct page *page, int ret) | |||
190 | static int __readpages_done(struct page_collect *pcol) | 197 | static int __readpages_done(struct page_collect *pcol) |
191 | { | 198 | { |
192 | int i; | 199 | int i; |
193 | u64 resid; | ||
194 | u64 good_bytes; | 200 | u64 good_bytes; |
195 | u64 length = 0; | 201 | u64 length = 0; |
196 | int ret = ore_check_io(pcol->ios, &resid); | 202 | int ret = ore_check_io(pcol->ios, NULL); |
197 | 203 | ||
198 | if (likely(!ret)) | 204 | if (likely(!ret)) { |
199 | good_bytes = pcol->length; | 205 | good_bytes = pcol->length; |
200 | else | 206 | ret = PAGE_WAS_NOT_IN_IO; |
201 | good_bytes = pcol->length - resid; | 207 | } else { |
208 | good_bytes = 0; | ||
209 | } | ||
202 | 210 | ||
203 | EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" | 211 | EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" |
204 | " length=0x%lx nr_pages=%u\n", | 212 | " length=0x%lx nr_pages=%u\n", |
@@ -259,6 +267,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) | |||
259 | } | 267 | } |
260 | } | 268 | } |
261 | 269 | ||
270 | static int _maybe_not_all_in_one_io(struct ore_io_state *ios, | ||
271 | struct page_collect *pcol_src, struct page_collect *pcol) | ||
272 | { | ||
273 | /* length was wrong or offset was not page aligned */ | ||
274 | BUG_ON(pcol_src->nr_pages < ios->nr_pages); | ||
275 | |||
276 | if (pcol_src->nr_pages > ios->nr_pages) { | ||
277 | struct page **src_page; | ||
278 | unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; | ||
279 | unsigned long len_less = pcol_src->length - ios->length; | ||
280 | unsigned i; | ||
281 | int ret; | ||
282 | |||
283 | /* This IO was trimmed */ | ||
284 | pcol_src->nr_pages = ios->nr_pages; | ||
285 | pcol_src->length = ios->length; | ||
286 | |||
287 | /* Left over pages are passed to the next io */ | ||
288 | pcol->expected_pages += pages_less; | ||
289 | pcol->nr_pages = pages_less; | ||
290 | pcol->length = len_less; | ||
291 | src_page = pcol_src->pages + pcol_src->nr_pages; | ||
292 | pcol->pg_first = (*src_page)->index; | ||
293 | |||
294 | ret = pcol_try_alloc(pcol); | ||
295 | if (unlikely(ret)) | ||
296 | return ret; | ||
297 | |||
298 | for (i = 0; i < pages_less; ++i) | ||
299 | pcol->pages[i] = *src_page++; | ||
300 | |||
301 | EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " | ||
302 | "pages_less=0x%x expected_pages=0x%x " | ||
303 | "next_offset=0x%llx next_len=0x%lx\n", | ||
304 | pcol_src->nr_pages, pages_less, pcol->expected_pages, | ||
305 | pcol->pg_first * PAGE_SIZE, pcol->length); | ||
306 | } | ||
307 | return 0; | ||
308 | } | ||
309 | |||
262 | static int read_exec(struct page_collect *pcol) | 310 | static int read_exec(struct page_collect *pcol) |
263 | { | 311 | { |
264 | struct exofs_i_info *oi = exofs_i(pcol->inode); | 312 | struct exofs_i_info *oi = exofs_i(pcol->inode); |
@@ -270,7 +318,7 @@ static int read_exec(struct page_collect *pcol) | |||
270 | return 0; | 318 | return 0; |
271 | 319 | ||
272 | if (!pcol->ios) { | 320 | if (!pcol->ios) { |
273 | int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true, | 321 | int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, |
274 | pcol->pg_first << PAGE_CACHE_SHIFT, | 322 | pcol->pg_first << PAGE_CACHE_SHIFT, |
275 | pcol->length, &pcol->ios); | 323 | pcol->length, &pcol->ios); |
276 | 324 | ||
@@ -280,7 +328,6 @@ static int read_exec(struct page_collect *pcol) | |||
280 | 328 | ||
281 | ios = pcol->ios; | 329 | ios = pcol->ios; |
282 | ios->pages = pcol->pages; | 330 | ios->pages = pcol->pages; |
283 | ios->nr_pages = pcol->nr_pages; | ||
284 | 331 | ||
285 | if (pcol->read_4_write) { | 332 | if (pcol->read_4_write) { |
286 | ore_read(pcol->ios); | 333 | ore_read(pcol->ios); |
@@ -296,17 +343,23 @@ static int read_exec(struct page_collect *pcol) | |||
296 | *pcol_copy = *pcol; | 343 | *pcol_copy = *pcol; |
297 | ios->done = readpages_done; | 344 | ios->done = readpages_done; |
298 | ios->private = pcol_copy; | 345 | ios->private = pcol_copy; |
346 | |||
347 | /* pages ownership was passed to pcol_copy */ | ||
348 | _pcol_reset(pcol); | ||
349 | |||
350 | ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); | ||
351 | if (unlikely(ret)) | ||
352 | goto err; | ||
353 | |||
354 | EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", | ||
355 | pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); | ||
356 | |||
299 | ret = ore_read(ios); | 357 | ret = ore_read(ios); |
300 | if (unlikely(ret)) | 358 | if (unlikely(ret)) |
301 | goto err; | 359 | goto err; |
302 | 360 | ||
303 | atomic_inc(&pcol->sbi->s_curr_pending); | 361 | atomic_inc(&pcol->sbi->s_curr_pending); |
304 | 362 | ||
305 | EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", | ||
306 | oi->one_comp.obj.id, _LLU(ios->offset), pcol->length); | ||
307 | |||
308 | /* pages ownership was passed to pcol_copy */ | ||
309 | _pcol_reset(pcol); | ||
310 | return 0; | 363 | return 0; |
311 | 364 | ||
312 | err: | 365 | err: |
@@ -341,6 +394,8 @@ static int readpage_strip(void *data, struct page *page) | |||
341 | EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, | 394 | EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, |
342 | page->index); | 395 | page->index); |
343 | 396 | ||
397 | pcol->that_locked_page = page; | ||
398 | |||
344 | if (page->index < end_index) | 399 | if (page->index < end_index) |
345 | len = PAGE_CACHE_SIZE; | 400 | len = PAGE_CACHE_SIZE; |
346 | else if (page->index == end_index) | 401 | else if (page->index == end_index) |
@@ -429,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping, | |||
429 | return ret; | 484 | return ret; |
430 | } | 485 | } |
431 | 486 | ||
487 | ret = read_exec(&pcol); | ||
488 | if (unlikely(ret)) | ||
489 | return ret; | ||
490 | |||
432 | return read_exec(&pcol); | 491 | return read_exec(&pcol); |
433 | } | 492 | } |
434 | 493 | ||
@@ -462,17 +521,18 @@ static void writepages_done(struct ore_io_state *ios, void *p) | |||
462 | { | 521 | { |
463 | struct page_collect *pcol = p; | 522 | struct page_collect *pcol = p; |
464 | int i; | 523 | int i; |
465 | u64 resid; | ||
466 | u64 good_bytes; | 524 | u64 good_bytes; |
467 | u64 length = 0; | 525 | u64 length = 0; |
468 | int ret = ore_check_io(ios, &resid); | 526 | int ret = ore_check_io(ios, NULL); |
469 | 527 | ||
470 | atomic_dec(&pcol->sbi->s_curr_pending); | 528 | atomic_dec(&pcol->sbi->s_curr_pending); |
471 | 529 | ||
472 | if (likely(!ret)) | 530 | if (likely(!ret)) { |
473 | good_bytes = pcol->length; | 531 | good_bytes = pcol->length; |
474 | else | 532 | ret = PAGE_WAS_NOT_IN_IO; |
475 | good_bytes = pcol->length - resid; | 533 | } else { |
534 | good_bytes = 0; | ||
535 | } | ||
476 | 536 | ||
477 | EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" | 537 | EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" |
478 | " length=0x%lx nr_pages=%u\n", | 538 | " length=0x%lx nr_pages=%u\n", |
@@ -505,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p) | |||
505 | EXOFS_DBGMSG2("writepages_done END\n"); | 565 | EXOFS_DBGMSG2("writepages_done END\n"); |
506 | } | 566 | } |
507 | 567 | ||
568 | static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) | ||
569 | { | ||
570 | struct page_collect *pcol = priv; | ||
571 | pgoff_t index = offset / PAGE_SIZE; | ||
572 | |||
573 | if (!pcol->that_locked_page || | ||
574 | (pcol->that_locked_page->index != index)) { | ||
575 | struct page *page = find_get_page(pcol->inode->i_mapping, index); | ||
576 | |||
577 | if (!page) { | ||
578 | page = find_or_create_page(pcol->inode->i_mapping, | ||
579 | index, GFP_NOFS); | ||
580 | if (unlikely(!page)) { | ||
581 | EXOFS_DBGMSG("grab_cache_page Failed " | ||
582 | "index=0x%llx\n", _LLU(index)); | ||
583 | return NULL; | ||
584 | } | ||
585 | unlock_page(page); | ||
586 | } | ||
587 | if (PageDirty(page) || PageWriteback(page)) | ||
588 | *uptodate = true; | ||
589 | else | ||
590 | *uptodate = PageUptodate(page); | ||
591 | EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); | ||
592 | return page; | ||
593 | } else { | ||
594 | EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", | ||
595 | pcol->that_locked_page->index); | ||
596 | *uptodate = true; | ||
597 | return pcol->that_locked_page; | ||
598 | } | ||
599 | } | ||
600 | |||
601 | static void __r4w_put_page(void *priv, struct page *page) | ||
602 | { | ||
603 | struct page_collect *pcol = priv; | ||
604 | |||
605 | if (pcol->that_locked_page != page) { | ||
606 | EXOFS_DBGMSG("index=0x%lx\n", page->index); | ||
607 | page_cache_release(page); | ||
608 | return; | ||
609 | } | ||
610 | EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index); | ||
611 | } | ||
612 | |||
613 | static const struct _ore_r4w_op _r4w_op = { | ||
614 | .get_page = &__r4w_get_page, | ||
615 | .put_page = &__r4w_put_page, | ||
616 | }; | ||
617 | |||
508 | static int write_exec(struct page_collect *pcol) | 618 | static int write_exec(struct page_collect *pcol) |
509 | { | 619 | { |
510 | struct exofs_i_info *oi = exofs_i(pcol->inode); | 620 | struct exofs_i_info *oi = exofs_i(pcol->inode); |
@@ -516,10 +626,9 @@ static int write_exec(struct page_collect *pcol) | |||
516 | return 0; | 626 | return 0; |
517 | 627 | ||
518 | BUG_ON(pcol->ios); | 628 | BUG_ON(pcol->ios); |
519 | ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false, | 629 | ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, |
520 | pcol->pg_first << PAGE_CACHE_SHIFT, | 630 | pcol->pg_first << PAGE_CACHE_SHIFT, |
521 | pcol->length, &pcol->ios); | 631 | pcol->length, &pcol->ios); |
522 | |||
523 | if (unlikely(ret)) | 632 | if (unlikely(ret)) |
524 | goto err; | 633 | goto err; |
525 | 634 | ||
@@ -534,10 +643,20 @@ static int write_exec(struct page_collect *pcol) | |||
534 | 643 | ||
535 | ios = pcol->ios; | 644 | ios = pcol->ios; |
536 | ios->pages = pcol_copy->pages; | 645 | ios->pages = pcol_copy->pages; |
537 | ios->nr_pages = pcol_copy->nr_pages; | ||
538 | ios->done = writepages_done; | 646 | ios->done = writepages_done; |
647 | ios->r4w = &_r4w_op; | ||
539 | ios->private = pcol_copy; | 648 | ios->private = pcol_copy; |
540 | 649 | ||
650 | /* pages ownership was passed to pcol_copy */ | ||
651 | _pcol_reset(pcol); | ||
652 | |||
653 | ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); | ||
654 | if (unlikely(ret)) | ||
655 | goto err; | ||
656 | |||
657 | EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n", | ||
658 | pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); | ||
659 | |||
541 | ret = ore_write(ios); | 660 | ret = ore_write(ios); |
542 | if (unlikely(ret)) { | 661 | if (unlikely(ret)) { |
543 | EXOFS_ERR("write_exec: ore_write() Failed\n"); | 662 | EXOFS_ERR("write_exec: ore_write() Failed\n"); |
@@ -545,11 +664,6 @@ static int write_exec(struct page_collect *pcol) | |||
545 | } | 664 | } |
546 | 665 | ||
547 | atomic_inc(&pcol->sbi->s_curr_pending); | 666 | atomic_inc(&pcol->sbi->s_curr_pending); |
548 | EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", | ||
549 | pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), | ||
550 | pcol->length); | ||
551 | /* pages ownership was passed to pcol_copy */ | ||
552 | _pcol_reset(pcol); | ||
553 | return 0; | 667 | return 0; |
554 | 668 | ||
555 | err: | 669 | err: |
@@ -689,14 +803,33 @@ static int exofs_writepages(struct address_space *mapping, | |||
689 | _pcol_init(&pcol, expected_pages, mapping->host); | 803 | _pcol_init(&pcol, expected_pages, mapping->host); |
690 | 804 | ||
691 | ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); | 805 | ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); |
692 | if (ret) { | 806 | if (unlikely(ret)) { |
693 | EXOFS_ERR("write_cache_pages => %d\n", ret); | 807 | EXOFS_ERR("write_cache_pages => %d\n", ret); |
694 | return ret; | 808 | return ret; |
695 | } | 809 | } |
696 | 810 | ||
697 | return write_exec(&pcol); | 811 | ret = write_exec(&pcol); |
812 | if (unlikely(ret)) | ||
813 | return ret; | ||
814 | |||
815 | if (wbc->sync_mode == WB_SYNC_ALL) { | ||
816 | return write_exec(&pcol); /* pump the last remainder */ | ||
817 | } else if (pcol.nr_pages) { | ||
818 | /* not SYNC let the remainder join the next writeout */ | ||
819 | unsigned i; | ||
820 | |||
821 | for (i = 0; i < pcol.nr_pages; i++) { | ||
822 | struct page *page = pcol.pages[i]; | ||
823 | |||
824 | end_page_writeback(page); | ||
825 | set_page_dirty(page); | ||
826 | unlock_page(page); | ||
827 | } | ||
828 | } | ||
829 | return 0; | ||
698 | } | 830 | } |
699 | 831 | ||
832 | /* | ||
700 | static int exofs_writepage(struct page *page, struct writeback_control *wbc) | 833 | static int exofs_writepage(struct page *page, struct writeback_control *wbc) |
701 | { | 834 | { |
702 | struct page_collect pcol; | 835 | struct page_collect pcol; |
@@ -712,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc) | |||
712 | 845 | ||
713 | return write_exec(&pcol); | 846 | return write_exec(&pcol); |
714 | } | 847 | } |
715 | 848 | */ | |
716 | /* i_mutex held using inode->i_size directly */ | 849 | /* i_mutex held using inode->i_size directly */ |
717 | static void _write_failed(struct inode *inode, loff_t to) | 850 | static void _write_failed(struct inode *inode, loff_t to) |
718 | { | 851 | { |
@@ -818,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset) | |||
818 | const struct address_space_operations exofs_aops = { | 951 | const struct address_space_operations exofs_aops = { |
819 | .readpage = exofs_readpage, | 952 | .readpage = exofs_readpage, |
820 | .readpages = exofs_readpages, | 953 | .readpages = exofs_readpages, |
821 | .writepage = exofs_writepage, | 954 | .writepage = NULL, |
822 | .writepages = exofs_writepages, | 955 | .writepages = exofs_writepages, |
823 | .write_begin = exofs_write_begin_export, | 956 | .write_begin = exofs_write_begin_export, |
824 | .write_end = exofs_write_end, | 957 | .write_end = exofs_write_end, |
@@ -860,7 +993,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) | |||
860 | 993 | ||
861 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 994 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
862 | 995 | ||
863 | ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize); | 996 | ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); |
864 | if (likely(!ret)) | 997 | if (likely(!ret)) |
865 | truncate_setsize(inode, newsize); | 998 | truncate_setsize(inode, newsize); |
866 | 999 | ||
@@ -927,14 +1060,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, | |||
927 | struct exofs_on_disk_inode_layout *layout; | 1060 | struct exofs_on_disk_inode_layout *layout; |
928 | int ret; | 1061 | int ret; |
929 | 1062 | ||
930 | ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); | 1063 | ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); |
931 | if (unlikely(ret)) { | 1064 | if (unlikely(ret)) { |
932 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); | 1065 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
933 | return ret; | 1066 | return ret; |
934 | } | 1067 | } |
935 | 1068 | ||
936 | attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); | 1069 | attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); |
937 | attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); | 1070 | attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); |
938 | 1071 | ||
939 | ios->in_attr = attrs; | 1072 | ios->in_attr = attrs; |
940 | ios->in_attr_len = ARRAY_SIZE(attrs); | 1073 | ios->in_attr_len = ARRAY_SIZE(attrs); |
@@ -1018,7 +1151,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) | |||
1018 | return inode; | 1151 | return inode; |
1019 | oi = exofs_i(inode); | 1152 | oi = exofs_i(inode); |
1020 | __oi_init(oi); | 1153 | __oi_init(oi); |
1021 | exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, | 1154 | exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, |
1022 | exofs_oi_objno(oi)); | 1155 | exofs_oi_objno(oi)); |
1023 | 1156 | ||
1024 | /* read the inode from the osd */ | 1157 | /* read the inode from the osd */ |
@@ -1172,13 +1305,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) | |||
1172 | spin_unlock(&sbi->s_next_gen_lock); | 1305 | spin_unlock(&sbi->s_next_gen_lock); |
1173 | insert_inode_hash(inode); | 1306 | insert_inode_hash(inode); |
1174 | 1307 | ||
1175 | exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, | 1308 | exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, |
1176 | exofs_oi_objno(oi)); | 1309 | exofs_oi_objno(oi)); |
1177 | exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ | 1310 | exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ |
1178 | 1311 | ||
1179 | mark_inode_dirty(inode); | 1312 | mark_inode_dirty(inode); |
1180 | 1313 | ||
1181 | ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); | 1314 | ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); |
1182 | if (unlikely(ret)) { | 1315 | if (unlikely(ret)) { |
1183 | EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); | 1316 | EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); |
1184 | return ERR_PTR(ret); | 1317 | return ERR_PTR(ret); |
@@ -1267,7 +1400,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) | |||
1267 | } else | 1400 | } else |
1268 | memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); | 1401 | memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); |
1269 | 1402 | ||
1270 | ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); | 1403 | ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); |
1271 | if (unlikely(ret)) { | 1404 | if (unlikely(ret)) { |
1272 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); | 1405 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
1273 | goto free_args; | 1406 | goto free_args; |
@@ -1350,7 +1483,7 @@ void exofs_evict_inode(struct inode *inode) | |||
1350 | /* ignore the error, attempt a remove anyway */ | 1483 | /* ignore the error, attempt a remove anyway */ |
1351 | 1484 | ||
1352 | /* Now Remove the OSD objects */ | 1485 | /* Now Remove the OSD objects */ |
1353 | ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); | 1486 | ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); |
1354 | if (unlikely(ret)) { | 1487 | if (unlikely(ret)) { |
1355 | EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); | 1488 | EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); |
1356 | return; | 1489 | return; |
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 25305af88198..fcfa86ae6faf 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c | |||
@@ -24,76 +24,287 @@ | |||
24 | 24 | ||
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <asm/div64.h> | 26 | #include <asm/div64.h> |
27 | #include <linux/lcm.h> | ||
27 | 28 | ||
28 | #include <scsi/osd_ore.h> | 29 | #include "ore_raid.h" |
29 | 30 | ||
30 | #define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) | 31 | MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); |
32 | MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); | ||
33 | MODULE_LICENSE("GPL"); | ||
34 | |||
35 | /* ore_verify_layout does a couple of things: | ||
36 | * 1. Given a minimum number of needed parameters fixes up the rest of the | ||
37 | * members to be operational for the ore. The needed parameters are those | ||
38 | * that are defined by the pnfs-objects layout STD. | ||
39 | * 2. Check to see if the current ore code actually supports these parameters | ||
40 | * for example stripe_unit must be a multiple of the system PAGE_SIZE, | ||
41 | * and etc... | ||
42 | * 3. Cache some heavily used calculations that will be needed by users. | ||
43 | */ | ||
44 | |||
45 | enum { BIO_MAX_PAGES_KMALLOC = | ||
46 | (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; | ||
31 | 47 | ||
32 | #ifdef CONFIG_EXOFS_DEBUG | 48 | int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) |
33 | #define ORE_DBGMSG(fmt, a...) \ | 49 | { |
34 | printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) | 50 | u64 stripe_length; |
35 | #else | 51 | |
36 | #define ORE_DBGMSG(fmt, a...) \ | 52 | switch (layout->raid_algorithm) { |
37 | do { if (0) printk(fmt, ##a); } while (0) | 53 | case PNFS_OSD_RAID_0: |
38 | #endif | 54 | layout->parity = 0; |
55 | break; | ||
56 | case PNFS_OSD_RAID_5: | ||
57 | layout->parity = 1; | ||
58 | break; | ||
59 | case PNFS_OSD_RAID_PQ: | ||
60 | case PNFS_OSD_RAID_4: | ||
61 | default: | ||
62 | ORE_ERR("Only RAID_0/5 for now\n"); | ||
63 | return -EINVAL; | ||
64 | } | ||
65 | if (0 != (layout->stripe_unit & ~PAGE_MASK)) { | ||
66 | ORE_ERR("Stripe Unit(0x%llx)" | ||
67 | " must be Multples of PAGE_SIZE(0x%lx)\n", | ||
68 | _LLU(layout->stripe_unit), PAGE_SIZE); | ||
69 | return -EINVAL; | ||
70 | } | ||
71 | if (layout->group_width) { | ||
72 | if (!layout->group_depth) { | ||
73 | ORE_ERR("group_depth == 0 && group_width != 0\n"); | ||
74 | return -EINVAL; | ||
75 | } | ||
76 | if (total_comps < (layout->group_width * layout->mirrors_p1)) { | ||
77 | ORE_ERR("Data Map wrong, " | ||
78 | "numdevs=%d < group_width=%d * mirrors=%d\n", | ||
79 | total_comps, layout->group_width, | ||
80 | layout->mirrors_p1); | ||
81 | return -EINVAL; | ||
82 | } | ||
83 | layout->group_count = total_comps / layout->mirrors_p1 / | ||
84 | layout->group_width; | ||
85 | } else { | ||
86 | if (layout->group_depth) { | ||
87 | printk(KERN_NOTICE "Warning: group_depth ignored " | ||
88 | "group_width == 0 && group_depth == %lld\n", | ||
89 | _LLU(layout->group_depth)); | ||
90 | } | ||
91 | layout->group_width = total_comps / layout->mirrors_p1; | ||
92 | layout->group_depth = -1; | ||
93 | layout->group_count = 1; | ||
94 | } | ||
39 | 95 | ||
40 | /* u64 has problems with printk this will cast it to unsigned long long */ | 96 | stripe_length = (u64)layout->group_width * layout->stripe_unit; |
41 | #define _LLU(x) (unsigned long long)(x) | 97 | if (stripe_length >= (1ULL << 32)) { |
98 | ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", | ||
99 | _LLU(stripe_length)); | ||
100 | return -EINVAL; | ||
101 | } | ||
42 | 102 | ||
43 | #define ORE_DBGMSG2(M...) do {} while (0) | 103 | layout->max_io_length = |
44 | /* #define ORE_DBGMSG2 ORE_DBGMSG */ | 104 | (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * |
105 | layout->group_width; | ||
106 | if (layout->parity) { | ||
107 | unsigned stripe_length = | ||
108 | (layout->group_width - layout->parity) * | ||
109 | layout->stripe_unit; | ||
45 | 110 | ||
46 | MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); | 111 | layout->max_io_length /= stripe_length; |
47 | MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); | 112 | layout->max_io_length *= stripe_length; |
48 | MODULE_LICENSE("GPL"); | 113 | } |
114 | return 0; | ||
115 | } | ||
116 | EXPORT_SYMBOL(ore_verify_layout); | ||
49 | 117 | ||
50 | static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) | 118 | static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) |
51 | { | 119 | { |
52 | return ios->comps->comps[index & ios->comps->single_comp].cred; | 120 | return ios->oc->comps[index & ios->oc->single_comp].cred; |
53 | } | 121 | } |
54 | 122 | ||
55 | static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) | 123 | static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) |
56 | { | 124 | { |
57 | return &ios->comps->comps[index & ios->comps->single_comp].obj; | 125 | return &ios->oc->comps[index & ios->oc->single_comp].obj; |
58 | } | 126 | } |
59 | 127 | ||
60 | static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) | 128 | static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) |
61 | { | 129 | { |
62 | return ios->comps->ods[index]; | 130 | ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", |
131 | ios->oc->first_dev, ios->oc->numdevs, index, | ||
132 | ios->oc->ods); | ||
133 | |||
134 | return ore_comp_dev(ios->oc, index); | ||
63 | } | 135 | } |
64 | 136 | ||
65 | int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, | 137 | int _ore_get_io_state(struct ore_layout *layout, |
138 | struct ore_components *oc, unsigned numdevs, | ||
139 | unsigned sgs_per_dev, unsigned num_par_pages, | ||
140 | struct ore_io_state **pios) | ||
141 | { | ||
142 | struct ore_io_state *ios; | ||
143 | struct page **pages; | ||
144 | struct osd_sg_entry *sgilist; | ||
145 | struct __alloc_all_io_state { | ||
146 | struct ore_io_state ios; | ||
147 | struct ore_per_dev_state per_dev[numdevs]; | ||
148 | union { | ||
149 | struct osd_sg_entry sglist[sgs_per_dev * numdevs]; | ||
150 | struct page *pages[num_par_pages]; | ||
151 | }; | ||
152 | } *_aios; | ||
153 | |||
154 | if (likely(sizeof(*_aios) <= PAGE_SIZE)) { | ||
155 | _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); | ||
156 | if (unlikely(!_aios)) { | ||
157 | ORE_DBGMSG("Failed kzalloc bytes=%zd\n", | ||
158 | sizeof(*_aios)); | ||
159 | *pios = NULL; | ||
160 | return -ENOMEM; | ||
161 | } | ||
162 | pages = num_par_pages ? _aios->pages : NULL; | ||
163 | sgilist = sgs_per_dev ? _aios->sglist : NULL; | ||
164 | ios = &_aios->ios; | ||
165 | } else { | ||
166 | struct __alloc_small_io_state { | ||
167 | struct ore_io_state ios; | ||
168 | struct ore_per_dev_state per_dev[numdevs]; | ||
169 | } *_aio_small; | ||
170 | union __extra_part { | ||
171 | struct osd_sg_entry sglist[sgs_per_dev * numdevs]; | ||
172 | struct page *pages[num_par_pages]; | ||
173 | } *extra_part; | ||
174 | |||
175 | _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); | ||
176 | if (unlikely(!_aio_small)) { | ||
177 | ORE_DBGMSG("Failed alloc first part bytes=%zd\n", | ||
178 | sizeof(*_aio_small)); | ||
179 | *pios = NULL; | ||
180 | return -ENOMEM; | ||
181 | } | ||
182 | extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); | ||
183 | if (unlikely(!extra_part)) { | ||
184 | ORE_DBGMSG("Failed alloc second part bytes=%zd\n", | ||
185 | sizeof(*extra_part)); | ||
186 | kfree(_aio_small); | ||
187 | *pios = NULL; | ||
188 | return -ENOMEM; | ||
189 | } | ||
190 | |||
191 | pages = num_par_pages ? extra_part->pages : NULL; | ||
192 | sgilist = sgs_per_dev ? extra_part->sglist : NULL; | ||
193 | /* In this case the per_dev[0].sglist holds the pointer to | ||
194 | * be freed | ||
195 | */ | ||
196 | ios = &_aio_small->ios; | ||
197 | ios->extra_part_alloc = true; | ||
198 | } | ||
199 | |||
200 | if (pages) { | ||
201 | ios->parity_pages = pages; | ||
202 | ios->max_par_pages = num_par_pages; | ||
203 | } | ||
204 | if (sgilist) { | ||
205 | unsigned d; | ||
206 | |||
207 | for (d = 0; d < numdevs; ++d) { | ||
208 | ios->per_dev[d].sglist = sgilist; | ||
209 | sgilist += sgs_per_dev; | ||
210 | } | ||
211 | ios->sgs_per_dev = sgs_per_dev; | ||
212 | } | ||
213 | |||
214 | ios->layout = layout; | ||
215 | ios->oc = oc; | ||
216 | *pios = ios; | ||
217 | return 0; | ||
218 | } | ||
219 | |||
220 | /* Allocate an io_state for only a single group of devices | ||
221 | * | ||
222 | * If a user needs to call ore_read/write() this version must be used because it | ||
223 | * allocates extra stuff for striping and raid. | ||
224 | * The ore might decide to only IO less than @length bytes due to alignments | ||
225 | * and constraints as follows: | ||
226 | * - The IO cannot cross group boundary. | ||
227 | * - In raid5/6 The end of the IO must align at end of a stripe eg. | ||
228 | * (@offset + @length) % strip_size == 0. Or the complete range is within a | ||
229 | * single stripe. | ||
230 | * - Memory condition only permitted a shorter IO. (A user can use @length=~0 | ||
231 | * And check the returned ios->length for max_io_size.) | ||
232 | * | ||
233 | * The caller must check returned ios->length (and/or ios->nr_pages) and | ||
234 | * re-issue these pages that fall outside of ios->length | ||
235 | */ | ||
236 | int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, | ||
66 | bool is_reading, u64 offset, u64 length, | 237 | bool is_reading, u64 offset, u64 length, |
67 | struct ore_io_state **pios) | 238 | struct ore_io_state **pios) |
68 | { | 239 | { |
69 | struct ore_io_state *ios; | 240 | struct ore_io_state *ios; |
241 | unsigned numdevs = layout->group_width * layout->mirrors_p1; | ||
242 | unsigned sgs_per_dev = 0, max_par_pages = 0; | ||
243 | int ret; | ||
70 | 244 | ||
71 | /*TODO: Maybe use kmem_cach per sbi of size | 245 | if (layout->parity && length) { |
72 | * exofs_io_state_size(layout->s_numdevs) | 246 | unsigned data_devs = layout->group_width - layout->parity; |
73 | */ | 247 | unsigned stripe_size = layout->stripe_unit * data_devs; |
74 | ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL); | 248 | unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; |
75 | if (unlikely(!ios)) { | 249 | u32 remainder; |
76 | ORE_DBGMSG("Failed kzalloc bytes=%d\n", | 250 | u64 num_stripes; |
77 | ore_io_state_size(comps->numdevs)); | 251 | u64 num_raid_units; |
78 | *pios = NULL; | 252 | |
79 | return -ENOMEM; | 253 | num_stripes = div_u64_rem(length, stripe_size, &remainder); |
254 | if (remainder) | ||
255 | ++num_stripes; | ||
256 | |||
257 | num_raid_units = num_stripes * layout->parity; | ||
258 | |||
259 | if (is_reading) { | ||
260 | /* For reads add per_dev sglist array */ | ||
261 | /* TODO: Raid 6 we need twice more. Actually: | ||
262 | * num_stripes / LCMdP(W,P); | ||
263 | * if (W%P != 0) num_stripes *= parity; | ||
264 | */ | ||
265 | |||
266 | /* first/last seg is split */ | ||
267 | num_raid_units += layout->group_width; | ||
268 | sgs_per_dev = div_u64(num_raid_units, data_devs); | ||
269 | } else { | ||
270 | /* For Writes add parity pages array. */ | ||
271 | max_par_pages = num_raid_units * pages_in_unit * | ||
272 | sizeof(struct page *); | ||
273 | } | ||
80 | } | 274 | } |
81 | 275 | ||
82 | ios->layout = layout; | 276 | ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, |
83 | ios->comps = comps; | 277 | pios); |
84 | ios->offset = offset; | 278 | if (unlikely(ret)) |
85 | ios->length = length; | 279 | return ret; |
280 | |||
281 | ios = *pios; | ||
86 | ios->reading = is_reading; | 282 | ios->reading = is_reading; |
283 | ios->offset = offset; | ||
284 | |||
285 | if (length) { | ||
286 | ore_calc_stripe_info(layout, offset, length, &ios->si); | ||
287 | ios->length = ios->si.length; | ||
288 | ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; | ||
289 | if (layout->parity) | ||
290 | _ore_post_alloc_raid_stuff(ios); | ||
291 | } | ||
87 | 292 | ||
88 | *pios = ios; | ||
89 | return 0; | 293 | return 0; |
90 | } | 294 | } |
91 | EXPORT_SYMBOL(ore_get_rw_state); | 295 | EXPORT_SYMBOL(ore_get_rw_state); |
92 | 296 | ||
93 | int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, | 297 | /* Allocate an io_state for all the devices in the comps array |
94 | struct ore_io_state **ios) | 298 | * |
299 | * This version of io_state allocation is used mostly by create/remove | ||
300 | * and trunc where we currently need all the devices. The only wasteful | ||
301 | * bit is the read/write_attributes with no IO. Those sites should | ||
302 | * be converted to use ore_get_rw_state() with length=0 | ||
303 | */ | ||
304 | int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, | ||
305 | struct ore_io_state **pios) | ||
95 | { | 306 | { |
96 | return ore_get_rw_state(layout, comps, true, 0, 0, ios); | 307 | return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); |
97 | } | 308 | } |
98 | EXPORT_SYMBOL(ore_get_io_state); | 309 | EXPORT_SYMBOL(ore_get_io_state); |
99 | 310 | ||
@@ -111,6 +322,7 @@ void ore_put_io_state(struct ore_io_state *ios) | |||
111 | bio_put(per_dev->bio); | 322 | bio_put(per_dev->bio); |
112 | } | 323 | } |
113 | 324 | ||
325 | _ore_free_raid_stuff(ios); | ||
114 | kfree(ios); | 326 | kfree(ios); |
115 | } | 327 | } |
116 | } | 328 | } |
@@ -138,7 +350,7 @@ static void _done_io(struct osd_request *or, void *p) | |||
138 | kref_put(&ios->kref, _last_io); | 350 | kref_put(&ios->kref, _last_io); |
139 | } | 351 | } |
140 | 352 | ||
141 | static int ore_io_execute(struct ore_io_state *ios) | 353 | int ore_io_execute(struct ore_io_state *ios) |
142 | { | 354 | { |
143 | DECLARE_COMPLETION_ONSTACK(wait); | 355 | DECLARE_COMPLETION_ONSTACK(wait); |
144 | bool sync = (ios->done == NULL); | 356 | bool sync = (ios->done == NULL); |
@@ -198,7 +410,7 @@ static void _clear_bio(struct bio *bio) | |||
198 | } | 410 | } |
199 | } | 411 | } |
200 | 412 | ||
201 | int ore_check_io(struct ore_io_state *ios, u64 *resid) | 413 | int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) |
202 | { | 414 | { |
203 | enum osd_err_priority acumulated_osd_err = 0; | 415 | enum osd_err_priority acumulated_osd_err = 0; |
204 | int acumulated_lin_err = 0; | 416 | int acumulated_lin_err = 0; |
@@ -206,7 +418,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) | |||
206 | 418 | ||
207 | for (i = 0; i < ios->numdevs; i++) { | 419 | for (i = 0; i < ios->numdevs; i++) { |
208 | struct osd_sense_info osi; | 420 | struct osd_sense_info osi; |
209 | struct osd_request *or = ios->per_dev[i].or; | 421 | struct ore_per_dev_state *per_dev = &ios->per_dev[i]; |
422 | struct osd_request *or = per_dev->or; | ||
210 | int ret; | 423 | int ret; |
211 | 424 | ||
212 | if (unlikely(!or)) | 425 | if (unlikely(!or)) |
@@ -218,29 +431,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) | |||
218 | 431 | ||
219 | if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { | 432 | if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { |
220 | /* start read offset past end of file */ | 433 | /* start read offset past end of file */ |
221 | _clear_bio(ios->per_dev[i].bio); | 434 | _clear_bio(per_dev->bio); |
222 | ORE_DBGMSG("start read offset passed end of file " | 435 | ORE_DBGMSG("start read offset passed end of file " |
223 | "offset=0x%llx, length=0x%llx\n", | 436 | "offset=0x%llx, length=0x%llx\n", |
224 | _LLU(ios->per_dev[i].offset), | 437 | _LLU(per_dev->offset), |
225 | _LLU(ios->per_dev[i].length)); | 438 | _LLU(per_dev->length)); |
226 | 439 | ||
227 | continue; /* we recovered */ | 440 | continue; /* we recovered */ |
228 | } | 441 | } |
229 | 442 | ||
443 | if (on_dev_error) { | ||
444 | u64 residual = ios->reading ? | ||
445 | or->in.residual : or->out.residual; | ||
446 | u64 offset = (ios->offset + ios->length) - residual; | ||
447 | struct ore_dev *od = ios->oc->ods[ | ||
448 | per_dev->dev - ios->oc->first_dev]; | ||
449 | |||
450 | on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri, | ||
451 | offset, residual); | ||
452 | } | ||
230 | if (osi.osd_err_pri >= acumulated_osd_err) { | 453 | if (osi.osd_err_pri >= acumulated_osd_err) { |
231 | acumulated_osd_err = osi.osd_err_pri; | 454 | acumulated_osd_err = osi.osd_err_pri; |
232 | acumulated_lin_err = ret; | 455 | acumulated_lin_err = ret; |
233 | } | 456 | } |
234 | } | 457 | } |
235 | 458 | ||
236 | /* TODO: raid specific residual calculations */ | ||
237 | if (resid) { | ||
238 | if (likely(!acumulated_lin_err)) | ||
239 | *resid = 0; | ||
240 | else | ||
241 | *resid = ios->length; | ||
242 | } | ||
243 | |||
244 | return acumulated_lin_err; | 459 | return acumulated_lin_err; |
245 | } | 460 | } |
246 | EXPORT_SYMBOL(ore_check_io); | 461 | EXPORT_SYMBOL(ore_check_io); |
@@ -248,61 +463,65 @@ EXPORT_SYMBOL(ore_check_io); | |||
248 | /* | 463 | /* |
249 | * L - logical offset into the file | 464 | * L - logical offset into the file |
250 | * | 465 | * |
251 | * U - The number of bytes in a stripe within a group | 466 | * D - number of Data devices |
467 | * D = group_width - parity | ||
252 | * | 468 | * |
253 | * U = stripe_unit * group_width | 469 | * U - The number of bytes in a stripe within a group |
470 | * U = stripe_unit * D | ||
254 | * | 471 | * |
255 | * T - The number of bytes striped within a group of component objects | 472 | * T - The number of bytes striped within a group of component objects |
256 | * (before advancing to the next group) | 473 | * (before advancing to the next group) |
257 | * | 474 | * T = U * group_depth |
258 | * T = stripe_unit * group_width * group_depth | ||
259 | * | 475 | * |
260 | * S - The number of bytes striped across all component objects | 476 | * S - The number of bytes striped across all component objects |
261 | * before the pattern repeats | 477 | * before the pattern repeats |
478 | * S = T * group_count | ||
262 | * | 479 | * |
263 | * S = stripe_unit * group_width * group_depth * group_count | 480 | * M - The "major" (i.e., across all components) cycle number |
264 | * | ||
265 | * M - The "major" (i.e., across all components) stripe number | ||
266 | * | ||
267 | * M = L / S | 481 | * M = L / S |
268 | * | 482 | * |
269 | * G - Counts the groups from the beginning of the major stripe | 483 | * G - Counts the groups from the beginning of the major cycle |
270 | * | ||
271 | * G = (L - (M * S)) / T [or (L % S) / T] | 484 | * G = (L - (M * S)) / T [or (L % S) / T] |
272 | * | 485 | * |
273 | * H - The byte offset within the group | 486 | * H - The byte offset within the group |
274 | * | ||
275 | * H = (L - (M * S)) % T [or (L % S) % T] | 487 | * H = (L - (M * S)) % T [or (L % S) % T] |
276 | * | 488 | * |
277 | * N - The "minor" (i.e., across the group) stripe number | 489 | * N - The "minor" (i.e., across the group) stripe number |
278 | * | ||
279 | * N = H / U | 490 | * N = H / U |
280 | * | 491 | * |
281 | * C - The component index corresponding to L | 492 | * C - The component index corresponding to L |
282 | * | 493 | * |
283 | * C = (H - (N * U)) / stripe_unit + G * group_width | 494 | * C = (H - (N * U)) / stripe_unit + G * D |
284 | * [or (L % U) / stripe_unit + G * group_width] | 495 | * [or (L % U) / stripe_unit + G * D] |
285 | * | 496 | * |
286 | * O - The component offset corresponding to L | 497 | * O - The component offset corresponding to L |
287 | * | ||
288 | * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit | 498 | * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit |
499 | * | ||
500 | * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity | ||
501 | * divide by parity | ||
502 | * LCMdP = lcm(group_width, parity) / parity | ||
503 | * | ||
504 | * R - The parity Rotation stripe | ||
505 | * (Note parity cycle always starts at a group's boundary) | ||
506 | * R = N % LCMdP | ||
507 | * | ||
508 | * I = the first parity device index | ||
509 | * I = (group_width + group_width - R*parity - parity) % group_width | ||
510 | * | ||
511 | * Craid - The component index Rotated | ||
512 | * Craid = (group_width + C - R*parity) % group_width | ||
513 | * (We add the group_width to avoid negative numbers modulo math) | ||
289 | */ | 514 | */ |
290 | struct _striping_info { | 515 | void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, |
291 | u64 obj_offset; | 516 | u64 length, struct ore_striping_info *si) |
292 | u64 group_length; | ||
293 | u64 M; /* for truncate */ | ||
294 | unsigned dev; | ||
295 | unsigned unit_off; | ||
296 | }; | ||
297 | |||
298 | static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, | ||
299 | struct _striping_info *si) | ||
300 | { | 517 | { |
301 | u32 stripe_unit = layout->stripe_unit; | 518 | u32 stripe_unit = layout->stripe_unit; |
302 | u32 group_width = layout->group_width; | 519 | u32 group_width = layout->group_width; |
303 | u64 group_depth = layout->group_depth; | 520 | u64 group_depth = layout->group_depth; |
521 | u32 parity = layout->parity; | ||
304 | 522 | ||
305 | u32 U = stripe_unit * group_width; | 523 | u32 D = group_width - parity; |
524 | u32 U = D * stripe_unit; | ||
306 | u64 T = U * group_depth; | 525 | u64 T = U * group_depth; |
307 | u64 S = T * layout->group_count; | 526 | u64 S = T * layout->group_count; |
308 | u64 M = div64_u64(file_offset, S); | 527 | u64 M = div64_u64(file_offset, S); |
@@ -318,39 +537,65 @@ static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, | |||
318 | u32 N = div_u64(H, U); | 537 | u32 N = div_u64(H, U); |
319 | 538 | ||
320 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ | 539 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ |
321 | si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; | 540 | u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; |
322 | si->dev *= layout->mirrors_p1; | ||
323 | 541 | ||
324 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); | 542 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); |
325 | 543 | ||
326 | si->obj_offset = si->unit_off + (N * stripe_unit) + | 544 | si->obj_offset = si->unit_off + (N * stripe_unit) + |
327 | (M * group_depth * stripe_unit); | 545 | (M * group_depth * stripe_unit); |
328 | 546 | ||
329 | si->group_length = T - H; | 547 | if (parity) { |
548 | u32 LCMdP = lcm(group_width, parity) / parity; | ||
549 | /* R = N % LCMdP; */ | ||
550 | u32 RxP = (N % LCMdP) * parity; | ||
551 | u32 first_dev = C - C % group_width; | ||
552 | |||
553 | si->par_dev = (group_width + group_width - parity - RxP) % | ||
554 | group_width + first_dev; | ||
555 | si->dev = (group_width + C - RxP) % group_width + first_dev; | ||
556 | si->bytes_in_stripe = U; | ||
557 | si->first_stripe_start = M * S + G * T + N * U; | ||
558 | } else { | ||
559 | /* Make the math correct see _prepare_one_group */ | ||
560 | si->par_dev = group_width; | ||
561 | si->dev = C; | ||
562 | } | ||
563 | |||
564 | si->dev *= layout->mirrors_p1; | ||
565 | si->par_dev *= layout->mirrors_p1; | ||
566 | si->offset = file_offset; | ||
567 | si->length = T - H; | ||
568 | if (si->length > length) | ||
569 | si->length = length; | ||
330 | si->M = M; | 570 | si->M = M; |
331 | } | 571 | } |
572 | EXPORT_SYMBOL(ore_calc_stripe_info); | ||
332 | 573 | ||
333 | static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | 574 | int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, |
334 | unsigned pgbase, struct ore_per_dev_state *per_dev, | 575 | unsigned pgbase, struct page **pages, |
335 | int cur_len) | 576 | struct ore_per_dev_state *per_dev, int cur_len) |
336 | { | 577 | { |
337 | unsigned pg = *cur_pg; | 578 | unsigned pg = *cur_pg; |
338 | struct request_queue *q = | 579 | struct request_queue *q = |
339 | osd_request_queue(_ios_od(ios, per_dev->dev)); | 580 | osd_request_queue(_ios_od(ios, per_dev->dev)); |
340 | 581 | unsigned len = cur_len; | |
341 | per_dev->length += cur_len; | 582 | int ret; |
342 | 583 | ||
343 | if (per_dev->bio == NULL) { | 584 | if (per_dev->bio == NULL) { |
344 | unsigned pages_in_stripe = ios->layout->group_width * | 585 | unsigned pages_in_stripe = ios->layout->group_width * |
345 | (ios->layout->stripe_unit / PAGE_SIZE); | 586 | (ios->layout->stripe_unit / PAGE_SIZE); |
346 | unsigned bio_size = (ios->nr_pages + pages_in_stripe) / | 587 | unsigned nr_pages = ios->nr_pages * ios->layout->group_width / |
347 | ios->layout->group_width; | 588 | (ios->layout->group_width - |
589 | ios->layout->parity); | ||
590 | unsigned bio_size = (nr_pages + pages_in_stripe) / | ||
591 | ios->layout->group_width; | ||
348 | 592 | ||
349 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); | 593 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); |
350 | if (unlikely(!per_dev->bio)) { | 594 | if (unlikely(!per_dev->bio)) { |
351 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", | 595 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", |
352 | bio_size); | 596 | bio_size); |
353 | return -ENOMEM; | 597 | ret = -ENOMEM; |
598 | goto out; | ||
354 | } | 599 | } |
355 | } | 600 | } |
356 | 601 | ||
@@ -358,64 +603,90 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | |||
358 | unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); | 603 | unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); |
359 | unsigned added_len; | 604 | unsigned added_len; |
360 | 605 | ||
361 | BUG_ON(ios->nr_pages <= pg); | ||
362 | cur_len -= pglen; | 606 | cur_len -= pglen; |
363 | 607 | ||
364 | added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], | 608 | added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], |
365 | pglen, pgbase); | 609 | pglen, pgbase); |
366 | if (unlikely(pglen != added_len)) | 610 | if (unlikely(pglen != added_len)) { |
367 | return -ENOMEM; | 611 | ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", |
612 | per_dev->bio->bi_vcnt); | ||
613 | ret = -ENOMEM; | ||
614 | goto out; | ||
615 | } | ||
616 | _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); | ||
617 | |||
368 | pgbase = 0; | 618 | pgbase = 0; |
369 | ++pg; | 619 | ++pg; |
370 | } | 620 | } |
371 | BUG_ON(cur_len); | 621 | BUG_ON(cur_len); |
372 | 622 | ||
623 | per_dev->length += len; | ||
373 | *cur_pg = pg; | 624 | *cur_pg = pg; |
374 | return 0; | 625 | ret = 0; |
626 | out: /* we fail the complete unit on an error eg don't advance | ||
627 | * per_dev->length and cur_pg. This means that we might have a bigger | ||
628 | * bio than the CDB requested length (per_dev->length). That's fine | ||
629 | * only the opposite is fatal. | ||
630 | */ | ||
631 | return ret; | ||
375 | } | 632 | } |
376 | 633 | ||
377 | static int _prepare_one_group(struct ore_io_state *ios, u64 length, | 634 | static int _prepare_for_striping(struct ore_io_state *ios) |
378 | struct _striping_info *si) | ||
379 | { | 635 | { |
636 | struct ore_striping_info *si = &ios->si; | ||
380 | unsigned stripe_unit = ios->layout->stripe_unit; | 637 | unsigned stripe_unit = ios->layout->stripe_unit; |
381 | unsigned mirrors_p1 = ios->layout->mirrors_p1; | 638 | unsigned mirrors_p1 = ios->layout->mirrors_p1; |
382 | unsigned devs_in_group = ios->layout->group_width * mirrors_p1; | 639 | unsigned group_width = ios->layout->group_width; |
640 | unsigned devs_in_group = group_width * mirrors_p1; | ||
383 | unsigned dev = si->dev; | 641 | unsigned dev = si->dev; |
384 | unsigned first_dev = dev - (dev % devs_in_group); | 642 | unsigned first_dev = dev - (dev % devs_in_group); |
385 | unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; | 643 | unsigned dev_order; |
386 | unsigned cur_pg = ios->pages_consumed; | 644 | unsigned cur_pg = ios->pages_consumed; |
645 | u64 length = ios->length; | ||
387 | int ret = 0; | 646 | int ret = 0; |
388 | 647 | ||
648 | if (!ios->pages) { | ||
649 | ios->numdevs = ios->layout->mirrors_p1; | ||
650 | return 0; | ||
651 | } | ||
652 | |||
653 | BUG_ON(length > si->length); | ||
654 | |||
655 | dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); | ||
656 | si->cur_comp = dev_order; | ||
657 | si->cur_pg = si->unit_off / PAGE_SIZE; | ||
658 | |||
389 | while (length) { | 659 | while (length) { |
390 | struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; | 660 | unsigned comp = dev - first_dev; |
661 | struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; | ||
391 | unsigned cur_len, page_off = 0; | 662 | unsigned cur_len, page_off = 0; |
392 | 663 | ||
393 | if (!per_dev->length) { | 664 | if (!per_dev->length) { |
394 | per_dev->dev = dev; | 665 | per_dev->dev = dev; |
395 | if (dev < si->dev) { | 666 | if (dev == si->dev) { |
396 | per_dev->offset = si->obj_offset + stripe_unit - | 667 | WARN_ON(dev == si->par_dev); |
397 | si->unit_off; | ||
398 | cur_len = stripe_unit; | ||
399 | } else if (dev == si->dev) { | ||
400 | per_dev->offset = si->obj_offset; | 668 | per_dev->offset = si->obj_offset; |
401 | cur_len = stripe_unit - si->unit_off; | 669 | cur_len = stripe_unit - si->unit_off; |
402 | page_off = si->unit_off & ~PAGE_MASK; | 670 | page_off = si->unit_off & ~PAGE_MASK; |
403 | BUG_ON(page_off && (page_off != ios->pgbase)); | 671 | BUG_ON(page_off && (page_off != ios->pgbase)); |
404 | } else { /* dev > si->dev */ | 672 | } else { |
405 | per_dev->offset = si->obj_offset - si->unit_off; | 673 | if (si->cur_comp > dev_order) |
674 | per_dev->offset = | ||
675 | si->obj_offset - si->unit_off; | ||
676 | else /* si->cur_comp < dev_order */ | ||
677 | per_dev->offset = | ||
678 | si->obj_offset + stripe_unit - | ||
679 | si->unit_off; | ||
406 | cur_len = stripe_unit; | 680 | cur_len = stripe_unit; |
407 | } | 681 | } |
408 | |||
409 | if (max_comp < dev) | ||
410 | max_comp = dev; | ||
411 | } else { | 682 | } else { |
412 | cur_len = stripe_unit; | 683 | cur_len = stripe_unit; |
413 | } | 684 | } |
414 | if (cur_len >= length) | 685 | if (cur_len >= length) |
415 | cur_len = length; | 686 | cur_len = length; |
416 | 687 | ||
417 | ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, | 688 | ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, |
418 | cur_len); | 689 | per_dev, cur_len); |
419 | if (unlikely(ret)) | 690 | if (unlikely(ret)) |
420 | goto out; | 691 | goto out; |
421 | 692 | ||
@@ -423,60 +694,60 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, | |||
423 | dev = (dev % devs_in_group) + first_dev; | 694 | dev = (dev % devs_in_group) + first_dev; |
424 | 695 | ||
425 | length -= cur_len; | 696 | length -= cur_len; |
426 | } | ||
427 | out: | ||
428 | ios->numdevs = max_comp + mirrors_p1; | ||
429 | ios->pages_consumed = cur_pg; | ||
430 | return ret; | ||
431 | } | ||
432 | |||
433 | static int _prepare_for_striping(struct ore_io_state *ios) | ||
434 | { | ||
435 | u64 length = ios->length; | ||
436 | u64 offset = ios->offset; | ||
437 | struct _striping_info si; | ||
438 | int ret = 0; | ||
439 | 697 | ||
440 | if (!ios->pages) { | 698 | si->cur_comp = (si->cur_comp + 1) % group_width; |
441 | if (ios->kern_buff) { | 699 | if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { |
442 | struct ore_per_dev_state *per_dev = &ios->per_dev[0]; | 700 | if (!length && ios->sp2d) { |
701 | /* If we are writing and this is the very last | ||
702 | * stripe. then operate on parity dev. | ||
703 | */ | ||
704 | dev = si->par_dev; | ||
705 | } | ||
706 | if (ios->sp2d) | ||
707 | /* In writes cur_len just means if it's the | ||
708 | * last one. See _ore_add_parity_unit. | ||
709 | */ | ||
710 | cur_len = length; | ||
711 | per_dev = &ios->per_dev[dev - first_dev]; | ||
712 | if (!per_dev->length) { | ||
713 | /* Only/always the parity unit of the first | ||
714 | * stripe will be empty. So this is a chance to | ||
715 | * initialize the per_dev info. | ||
716 | */ | ||
717 | per_dev->dev = dev; | ||
718 | per_dev->offset = si->obj_offset - si->unit_off; | ||
719 | } | ||
443 | 720 | ||
444 | _calc_stripe_info(ios->layout, ios->offset, &si); | 721 | ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); |
445 | per_dev->offset = si.obj_offset; | 722 | if (unlikely(ret)) |
446 | per_dev->dev = si.dev; | 723 | goto out; |
447 | 724 | ||
448 | /* no cross device without page array */ | 725 | /* Rotate next par_dev backwards with wraping */ |
449 | BUG_ON((ios->layout->group_width > 1) && | 726 | si->par_dev = (devs_in_group + si->par_dev - |
450 | (si.unit_off + ios->length > | 727 | ios->layout->parity * mirrors_p1) % |
451 | ios->layout->stripe_unit)); | 728 | devs_in_group + first_dev; |
729 | /* Next stripe, start fresh */ | ||
730 | si->cur_comp = 0; | ||
731 | si->cur_pg = 0; | ||
452 | } | 732 | } |
453 | ios->numdevs = ios->layout->mirrors_p1; | ||
454 | return 0; | ||
455 | } | ||
456 | |||
457 | while (length) { | ||
458 | _calc_stripe_info(ios->layout, offset, &si); | ||
459 | |||
460 | if (length < si.group_length) | ||
461 | si.group_length = length; | ||
462 | |||
463 | ret = _prepare_one_group(ios, si.group_length, &si); | ||
464 | if (unlikely(ret)) | ||
465 | goto out; | ||
466 | |||
467 | offset += si.group_length; | ||
468 | length -= si.group_length; | ||
469 | } | 733 | } |
470 | |||
471 | out: | 734 | out: |
472 | return ret; | 735 | ios->numdevs = devs_in_group; |
736 | ios->pages_consumed = cur_pg; | ||
737 | if (unlikely(ret)) { | ||
738 | if (length == ios->length) | ||
739 | return ret; | ||
740 | else | ||
741 | ios->length -= length; | ||
742 | } | ||
743 | return 0; | ||
473 | } | 744 | } |
474 | 745 | ||
475 | int ore_create(struct ore_io_state *ios) | 746 | int ore_create(struct ore_io_state *ios) |
476 | { | 747 | { |
477 | int i, ret; | 748 | int i, ret; |
478 | 749 | ||
479 | for (i = 0; i < ios->comps->numdevs; i++) { | 750 | for (i = 0; i < ios->oc->numdevs; i++) { |
480 | struct osd_request *or; | 751 | struct osd_request *or; |
481 | 752 | ||
482 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); | 753 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); |
@@ -501,7 +772,7 @@ int ore_remove(struct ore_io_state *ios) | |||
501 | { | 772 | { |
502 | int i, ret; | 773 | int i, ret; |
503 | 774 | ||
504 | for (i = 0; i < ios->comps->numdevs; i++) { | 775 | for (i = 0; i < ios->oc->numdevs; i++) { |
505 | struct osd_request *or; | 776 | struct osd_request *or; |
506 | 777 | ||
507 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); | 778 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); |
@@ -543,7 +814,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) | |||
543 | goto out; | 814 | goto out; |
544 | } | 815 | } |
545 | per_dev->or = or; | 816 | per_dev->or = or; |
546 | per_dev->offset = master_dev->offset; | ||
547 | 817 | ||
548 | if (ios->pages) { | 818 | if (ios->pages) { |
549 | struct bio *bio; | 819 | struct bio *bio; |
@@ -562,6 +832,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) | |||
562 | __bio_clone(bio, master_dev->bio); | 832 | __bio_clone(bio, master_dev->bio); |
563 | bio->bi_bdev = NULL; | 833 | bio->bi_bdev = NULL; |
564 | bio->bi_next = NULL; | 834 | bio->bi_next = NULL; |
835 | per_dev->offset = master_dev->offset; | ||
565 | per_dev->length = master_dev->length; | 836 | per_dev->length = master_dev->length; |
566 | per_dev->bio = bio; | 837 | per_dev->bio = bio; |
567 | per_dev->dev = dev; | 838 | per_dev->dev = dev; |
@@ -579,7 +850,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) | |||
579 | _LLU(per_dev->offset), | 850 | _LLU(per_dev->offset), |
580 | _LLU(per_dev->length), dev); | 851 | _LLU(per_dev->length), dev); |
581 | } else if (ios->kern_buff) { | 852 | } else if (ios->kern_buff) { |
582 | ret = osd_req_write_kern(or, _ios_obj(ios, dev), | 853 | per_dev->offset = ios->si.obj_offset; |
854 | per_dev->dev = ios->si.dev + dev; | ||
855 | |||
856 | /* no cross device without page array */ | ||
857 | BUG_ON((ios->layout->group_width > 1) && | ||
858 | (ios->si.unit_off + ios->length > | ||
859 | ios->layout->stripe_unit)); | ||
860 | |||
861 | ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), | ||
583 | per_dev->offset, | 862 | per_dev->offset, |
584 | ios->kern_buff, ios->length); | 863 | ios->kern_buff, ios->length); |
585 | if (unlikely(ret)) | 864 | if (unlikely(ret)) |
@@ -588,7 +867,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) | |||
588 | "length=0x%llx dev=%d\n", | 867 | "length=0x%llx dev=%d\n", |
589 | _LLU(_ios_obj(ios, dev)->id), | 868 | _LLU(_ios_obj(ios, dev)->id), |
590 | _LLU(per_dev->offset), | 869 | _LLU(per_dev->offset), |
591 | _LLU(ios->length), dev); | 870 | _LLU(ios->length), per_dev->dev); |
592 | } else { | 871 | } else { |
593 | osd_req_set_attributes(or, _ios_obj(ios, dev)); | 872 | osd_req_set_attributes(or, _ios_obj(ios, dev)); |
594 | ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", | 873 | ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", |
@@ -614,6 +893,14 @@ int ore_write(struct ore_io_state *ios) | |||
614 | int i; | 893 | int i; |
615 | int ret; | 894 | int ret; |
616 | 895 | ||
896 | if (unlikely(ios->sp2d && !ios->r4w)) { | ||
897 | /* A library is attempting a RAID-write without providing | ||
898 | * a pages lock interface. | ||
899 | */ | ||
900 | WARN_ON_ONCE(1); | ||
901 | return -ENOTSUPP; | ||
902 | } | ||
903 | |||
617 | ret = _prepare_for_striping(ios); | 904 | ret = _prepare_for_striping(ios); |
618 | if (unlikely(ret)) | 905 | if (unlikely(ret)) |
619 | return ret; | 906 | return ret; |
@@ -629,7 +916,7 @@ int ore_write(struct ore_io_state *ios) | |||
629 | } | 916 | } |
630 | EXPORT_SYMBOL(ore_write); | 917 | EXPORT_SYMBOL(ore_write); |
631 | 918 | ||
632 | static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) | 919 | int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) |
633 | { | 920 | { |
634 | struct osd_request *or; | 921 | struct osd_request *or; |
635 | struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; | 922 | struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; |
@@ -648,22 +935,27 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) | |||
648 | per_dev->or = or; | 935 | per_dev->or = or; |
649 | 936 | ||
650 | if (ios->pages) { | 937 | if (ios->pages) { |
651 | osd_req_read(or, obj, per_dev->offset, | 938 | if (per_dev->cur_sg) { |
652 | per_dev->bio, per_dev->length); | 939 | /* finalize the last sg_entry */ |
940 | _ore_add_sg_seg(per_dev, 0, false); | ||
941 | if (unlikely(!per_dev->cur_sg)) | ||
942 | return 0; /* Skip parity only device */ | ||
943 | |||
944 | osd_req_read_sg(or, obj, per_dev->bio, | ||
945 | per_dev->sglist, per_dev->cur_sg); | ||
946 | } else { | ||
947 | /* The no raid case */ | ||
948 | osd_req_read(or, obj, per_dev->offset, | ||
949 | per_dev->bio, per_dev->length); | ||
950 | } | ||
951 | |||
653 | ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" | 952 | ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" |
654 | " dev=%d\n", _LLU(obj->id), | 953 | " dev=%d sg_len=%d\n", _LLU(obj->id), |
655 | _LLU(per_dev->offset), _LLU(per_dev->length), | 954 | _LLU(per_dev->offset), _LLU(per_dev->length), |
656 | first_dev); | 955 | first_dev, per_dev->cur_sg); |
657 | } else if (ios->kern_buff) { | ||
658 | int ret = osd_req_read_kern(or, obj, per_dev->offset, | ||
659 | ios->kern_buff, ios->length); | ||
660 | ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx " | ||
661 | "length=0x%llx dev=%d ret=>%d\n", | ||
662 | _LLU(obj->id), _LLU(per_dev->offset), | ||
663 | _LLU(ios->length), first_dev, ret); | ||
664 | if (unlikely(ret)) | ||
665 | return ret; | ||
666 | } else { | 956 | } else { |
957 | BUG_ON(ios->kern_buff); | ||
958 | |||
667 | osd_req_get_attributes(or, obj); | 959 | osd_req_get_attributes(or, obj); |
668 | ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", | 960 | ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", |
669 | _LLU(obj->id), | 961 | _LLU(obj->id), |
@@ -688,7 +980,7 @@ int ore_read(struct ore_io_state *ios) | |||
688 | return ret; | 980 | return ret; |
689 | 981 | ||
690 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | 982 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { |
691 | ret = _read_mirror(ios, i); | 983 | ret = _ore_read_mirror(ios, i); |
692 | if (unlikely(ret)) | 984 | if (unlikely(ret)) |
693 | return ret; | 985 | return ret; |
694 | } | 986 | } |
@@ -744,31 +1036,29 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, | |||
744 | } | 1036 | } |
745 | 1037 | ||
746 | struct _trunc_info { | 1038 | struct _trunc_info { |
747 | struct _striping_info si; | 1039 | struct ore_striping_info si; |
748 | u64 prev_group_obj_off; | 1040 | u64 prev_group_obj_off; |
749 | u64 next_group_obj_off; | 1041 | u64 next_group_obj_off; |
750 | 1042 | ||
751 | unsigned first_group_dev; | 1043 | unsigned first_group_dev; |
752 | unsigned nex_group_dev; | 1044 | unsigned nex_group_dev; |
753 | unsigned max_devs; | ||
754 | }; | 1045 | }; |
755 | 1046 | ||
756 | void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, | 1047 | static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, |
757 | struct _trunc_info *ti) | 1048 | struct _trunc_info *ti) |
758 | { | 1049 | { |
759 | unsigned stripe_unit = layout->stripe_unit; | 1050 | unsigned stripe_unit = layout->stripe_unit; |
760 | 1051 | ||
761 | _calc_stripe_info(layout, file_offset, &ti->si); | 1052 | ore_calc_stripe_info(layout, file_offset, 0, &ti->si); |
762 | 1053 | ||
763 | ti->prev_group_obj_off = ti->si.M * stripe_unit; | 1054 | ti->prev_group_obj_off = ti->si.M * stripe_unit; |
764 | ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; | 1055 | ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; |
765 | 1056 | ||
766 | ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); | 1057 | ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); |
767 | ti->nex_group_dev = ti->first_group_dev + layout->group_width; | 1058 | ti->nex_group_dev = ti->first_group_dev + layout->group_width; |
768 | ti->max_devs = layout->group_width * layout->group_count; | ||
769 | } | 1059 | } |
770 | 1060 | ||
771 | int ore_truncate(struct ore_layout *layout, struct ore_components *comps, | 1061 | int ore_truncate(struct ore_layout *layout, struct ore_components *oc, |
772 | u64 size) | 1062 | u64 size) |
773 | { | 1063 | { |
774 | struct ore_io_state *ios; | 1064 | struct ore_io_state *ios; |
@@ -779,22 +1069,22 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, | |||
779 | struct _trunc_info ti; | 1069 | struct _trunc_info ti; |
780 | int i, ret; | 1070 | int i, ret; |
781 | 1071 | ||
782 | ret = ore_get_io_state(layout, comps, &ios); | 1072 | ret = ore_get_io_state(layout, oc, &ios); |
783 | if (unlikely(ret)) | 1073 | if (unlikely(ret)) |
784 | return ret; | 1074 | return ret; |
785 | 1075 | ||
786 | _calc_trunk_info(ios->layout, size, &ti); | 1076 | _calc_trunk_info(ios->layout, size, &ti); |
787 | 1077 | ||
788 | size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), | 1078 | size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), |
789 | GFP_KERNEL); | 1079 | GFP_KERNEL); |
790 | if (unlikely(!size_attrs)) { | 1080 | if (unlikely(!size_attrs)) { |
791 | ret = -ENOMEM; | 1081 | ret = -ENOMEM; |
792 | goto out; | 1082 | goto out; |
793 | } | 1083 | } |
794 | 1084 | ||
795 | ios->numdevs = ios->comps->numdevs; | 1085 | ios->numdevs = ios->oc->numdevs; |
796 | 1086 | ||
797 | for (i = 0; i < ti.max_devs; ++i) { | 1087 | for (i = 0; i < ios->numdevs; ++i) { |
798 | struct exofs_trunc_attr *size_attr = &size_attrs[i]; | 1088 | struct exofs_trunc_attr *size_attr = &size_attrs[i]; |
799 | u64 obj_size; | 1089 | u64 obj_size; |
800 | 1090 | ||
@@ -815,7 +1105,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, | |||
815 | size_attr->attr.val_ptr = &size_attr->newsize; | 1105 | size_attr->attr.val_ptr = &size_attr->newsize; |
816 | 1106 | ||
817 | ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", | 1107 | ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", |
818 | _LLU(comps->comps->obj.id), _LLU(obj_size), i); | 1108 | _LLU(oc->comps->obj.id), _LLU(obj_size), i); |
819 | ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, | 1109 | ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, |
820 | &size_attr->attr); | 1110 | &size_attr->attr); |
821 | if (unlikely(ret)) | 1111 | if (unlikely(ret)) |
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c new file mode 100644 index 000000000000..29c47e5c4a86 --- /dev/null +++ b/fs/exofs/ore_raid.c | |||
@@ -0,0 +1,660 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2011 | ||
3 | * Boaz Harrosh <bharrosh@panasas.com> | ||
4 | * | ||
5 | * This file is part of the objects raid engine (ore). | ||
6 | * | ||
7 | * It is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as published | ||
9 | * by the Free Software Foundation. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * along with "ore". If not, write to the Free Software Foundation, Inc: | ||
13 | * "Free Software Foundation <info@fsf.org>" | ||
14 | */ | ||
15 | |||
16 | #include <linux/gfp.h> | ||
17 | #include <linux/async_tx.h> | ||
18 | |||
19 | #include "ore_raid.h" | ||
20 | |||
21 | #undef ORE_DBGMSG2 | ||
22 | #define ORE_DBGMSG2 ORE_DBGMSG | ||
23 | |||
24 | struct page *_raid_page_alloc(void) | ||
25 | { | ||
26 | return alloc_page(GFP_KERNEL); | ||
27 | } | ||
28 | |||
/* Release a single (order-0) page, the counterpart of _raid_page_alloc().
 *
 * @p: the page to hand back to the page allocator.
 */
void _raid_page_free(struct page *p)
{
	__free_pages(p, 0);
}
33 | |||
34 | /* This struct is forward declare in ore_io_state, but is private to here. | ||
35 | * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. | ||
36 | * | ||
37 | * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. | ||
38 | * Ascending page index access is sp2d(p-minor, c-major). But storage is | ||
39 | * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor | ||
40 | * API. | ||
41 | */ | ||
42 | struct __stripe_pages_2d { | ||
43 | /* Cache some hot path repeated calculations */ | ||
44 | unsigned parity; | ||
45 | unsigned data_devs; | ||
46 | unsigned pages_in_unit; | ||
47 | |||
48 | bool needed ; | ||
49 | |||
50 | /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ | ||
51 | struct __1_page_stripe { | ||
52 | bool alloc; | ||
53 | unsigned write_count; | ||
54 | struct async_submit_ctl submit; | ||
55 | struct dma_async_tx_descriptor *tx; | ||
56 | |||
57 | /* The size of this array is data_devs + parity */ | ||
58 | struct page **pages; | ||
59 | struct page **scribble; | ||
60 | /* bool array, size of this array is data_devs */ | ||
61 | char *page_is_read; | ||
62 | } _1p_stripes[]; | ||
63 | }; | ||
64 | |||
65 | /* This can get bigger then a page. So support multiple page allocations | ||
66 | * _sp2d_free should be called even if _sp2d_alloc fails (by returning | ||
67 | * none-zero). | ||
68 | */ | ||
69 | static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, | ||
70 | unsigned parity, struct __stripe_pages_2d **psp2d) | ||
71 | { | ||
72 | struct __stripe_pages_2d *sp2d; | ||
73 | unsigned data_devs = group_width - parity; | ||
74 | struct _alloc_all_bytes { | ||
75 | struct __alloc_stripe_pages_2d { | ||
76 | struct __stripe_pages_2d sp2d; | ||
77 | struct __1_page_stripe _1p_stripes[pages_in_unit]; | ||
78 | } __asp2d; | ||
79 | struct __alloc_1p_arrays { | ||
80 | struct page *pages[group_width]; | ||
81 | struct page *scribble[group_width]; | ||
82 | char page_is_read[data_devs]; | ||
83 | } __a1pa[pages_in_unit]; | ||
84 | } *_aab; | ||
85 | struct __alloc_1p_arrays *__a1pa; | ||
86 | struct __alloc_1p_arrays *__a1pa_end; | ||
87 | const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); | ||
88 | unsigned num_a1pa, alloc_size, i; | ||
89 | |||
90 | /* FIXME: check these numbers in ore_verify_layout */ | ||
91 | BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); | ||
92 | BUG_ON(sizeof__a1pa > PAGE_SIZE); | ||
93 | |||
94 | if (sizeof(*_aab) > PAGE_SIZE) { | ||
95 | num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; | ||
96 | alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; | ||
97 | } else { | ||
98 | num_a1pa = pages_in_unit; | ||
99 | alloc_size = sizeof(*_aab); | ||
100 | } | ||
101 | |||
102 | _aab = kzalloc(alloc_size, GFP_KERNEL); | ||
103 | if (unlikely(!_aab)) { | ||
104 | ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); | ||
105 | return -ENOMEM; | ||
106 | } | ||
107 | |||
108 | sp2d = &_aab->__asp2d.sp2d; | ||
109 | *psp2d = sp2d; /* From here Just call _sp2d_free */ | ||
110 | |||
111 | __a1pa = _aab->__a1pa; | ||
112 | __a1pa_end = __a1pa + num_a1pa; | ||
113 | |||
114 | for (i = 0; i < pages_in_unit; ++i) { | ||
115 | if (unlikely(__a1pa >= __a1pa_end)) { | ||
116 | num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, | ||
117 | pages_in_unit - i); | ||
118 | |||
119 | __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); | ||
120 | if (unlikely(!__a1pa)) { | ||
121 | ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", | ||
122 | num_a1pa); | ||
123 | return -ENOMEM; | ||
124 | } | ||
125 | __a1pa_end = __a1pa + num_a1pa; | ||
126 | /* First *pages is marked for kfree of the buffer */ | ||
127 | sp2d->_1p_stripes[i].alloc = true; | ||
128 | } | ||
129 | |||
130 | sp2d->_1p_stripes[i].pages = __a1pa->pages; | ||
131 | sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; | ||
132 | sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; | ||
133 | ++__a1pa; | ||
134 | } | ||
135 | |||
136 | sp2d->parity = parity; | ||
137 | sp2d->data_devs = data_devs; | ||
138 | sp2d->pages_in_unit = pages_in_unit; | ||
139 | return 0; | ||
140 | } | ||
141 | |||
142 | static void _sp2d_reset(struct __stripe_pages_2d *sp2d, | ||
143 | const struct _ore_r4w_op *r4w, void *priv) | ||
144 | { | ||
145 | unsigned data_devs = sp2d->data_devs; | ||
146 | unsigned group_width = data_devs + sp2d->parity; | ||
147 | unsigned p; | ||
148 | |||
149 | if (!sp2d->needed) | ||
150 | return; | ||
151 | |||
152 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
153 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
154 | |||
155 | if (_1ps->write_count < group_width) { | ||
156 | unsigned c; | ||
157 | |||
158 | for (c = 0; c < data_devs; c++) | ||
159 | if (_1ps->page_is_read[c]) { | ||
160 | struct page *page = _1ps->pages[c]; | ||
161 | |||
162 | r4w->put_page(priv, page); | ||
163 | _1ps->page_is_read[c] = false; | ||
164 | } | ||
165 | } | ||
166 | |||
167 | memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); | ||
168 | _1ps->write_count = 0; | ||
169 | _1ps->tx = NULL; | ||
170 | } | ||
171 | |||
172 | sp2d->needed = false; | ||
173 | } | ||
174 | |||
175 | static void _sp2d_free(struct __stripe_pages_2d *sp2d) | ||
176 | { | ||
177 | unsigned i; | ||
178 | |||
179 | if (!sp2d) | ||
180 | return; | ||
181 | |||
182 | for (i = 0; i < sp2d->pages_in_unit; ++i) { | ||
183 | if (sp2d->_1p_stripes[i].alloc) | ||
184 | kfree(sp2d->_1p_stripes[i].pages); | ||
185 | } | ||
186 | |||
187 | kfree(sp2d); | ||
188 | } | ||
189 | |||
190 | static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) | ||
191 | { | ||
192 | unsigned p; | ||
193 | |||
194 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
195 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
196 | |||
197 | if (_1ps->write_count) | ||
198 | return p; | ||
199 | } | ||
200 | |||
201 | return ~0; | ||
202 | } | ||
203 | |||
204 | static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) | ||
205 | { | ||
206 | unsigned p; | ||
207 | |||
208 | for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { | ||
209 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
210 | |||
211 | if (_1ps->write_count) | ||
212 | return p; | ||
213 | } | ||
214 | |||
215 | return ~0; | ||
216 | } | ||
217 | |||
218 | static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) | ||
219 | { | ||
220 | unsigned p; | ||
221 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
222 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
223 | |||
224 | if (!_1ps->write_count) | ||
225 | continue; | ||
226 | |||
227 | init_async_submit(&_1ps->submit, | ||
228 | ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, | ||
229 | NULL, | ||
230 | NULL, NULL, | ||
231 | (addr_conv_t *)_1ps->scribble); | ||
232 | |||
233 | /* TODO: raid6 */ | ||
234 | _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, | ||
235 | 0, sp2d->data_devs, PAGE_SIZE, | ||
236 | &_1ps->submit); | ||
237 | } | ||
238 | |||
239 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
240 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
241 | /* NOTE: We wait for HW synchronously (I don't have such HW | ||
242 | * to test with.) Is parallelism needed with today's multi | ||
243 | * cores? | ||
244 | */ | ||
245 | async_tx_issue_pending(_1ps->tx); | ||
246 | } | ||
247 | } | ||
248 | |||
249 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, | ||
250 | struct ore_striping_info *si, struct page *page) | ||
251 | { | ||
252 | struct __1_page_stripe *_1ps; | ||
253 | |||
254 | sp2d->needed = true; | ||
255 | |||
256 | _1ps = &sp2d->_1p_stripes[si->cur_pg]; | ||
257 | _1ps->pages[si->cur_comp] = page; | ||
258 | ++_1ps->write_count; | ||
259 | |||
260 | si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; | ||
261 | /* si->cur_comp is advanced outside at main loop */ | ||
262 | } | ||
263 | |||
264 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, | ||
265 | bool not_last) | ||
266 | { | ||
267 | struct osd_sg_entry *sge; | ||
268 | |||
269 | ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " | ||
270 | "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", | ||
271 | per_dev->dev, cur_len, not_last, per_dev->cur_sg, | ||
272 | _LLU(per_dev->offset), per_dev->length, | ||
273 | per_dev->last_sgs_total); | ||
274 | |||
275 | if (!per_dev->cur_sg) { | ||
276 | sge = per_dev->sglist; | ||
277 | |||
278 | /* First time we prepare two entries */ | ||
279 | if (per_dev->length) { | ||
280 | ++per_dev->cur_sg; | ||
281 | sge->offset = per_dev->offset; | ||
282 | sge->len = per_dev->length; | ||
283 | } else { | ||
284 | /* Here the parity is the first unit of this object. | ||
285 | * This happens every time we reach a parity device on | ||
286 | * the same stripe as the per_dev->offset. We need to | ||
287 | * just skip this unit. | ||
288 | */ | ||
289 | per_dev->offset += cur_len; | ||
290 | return; | ||
291 | } | ||
292 | } else { | ||
293 | /* finalize the last one */ | ||
294 | sge = &per_dev->sglist[per_dev->cur_sg - 1]; | ||
295 | sge->len = per_dev->length - per_dev->last_sgs_total; | ||
296 | } | ||
297 | |||
298 | if (not_last) { | ||
299 | /* Partly prepare the next one */ | ||
300 | struct osd_sg_entry *next_sge = sge + 1; | ||
301 | |||
302 | ++per_dev->cur_sg; | ||
303 | next_sge->offset = sge->offset + sge->len + cur_len; | ||
304 | /* Save cur len so we know how mutch was added next time */ | ||
305 | per_dev->last_sgs_total = per_dev->length; | ||
306 | next_sge->len = 0; | ||
307 | } else if (!sge->len) { | ||
308 | /* Optimize for when the last unit is a parity */ | ||
309 | --per_dev->cur_sg; | ||
310 | } | ||
311 | } | ||
312 | |||
313 | static int _alloc_read_4_write(struct ore_io_state *ios) | ||
314 | { | ||
315 | struct ore_layout *layout = ios->layout; | ||
316 | int ret; | ||
317 | /* We want to only read those pages not in cache so worst case | ||
318 | * is a stripe populated with every other page | ||
319 | */ | ||
320 | unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; | ||
321 | |||
322 | ret = _ore_get_io_state(layout, ios->oc, | ||
323 | layout->group_width * layout->mirrors_p1, | ||
324 | sgs_per_dev, 0, &ios->ios_read_4_write); | ||
325 | return ret; | ||
326 | } | ||
327 | |||
328 | /* @si contains info of the to-be-inserted page. Update of @si should be | ||
329 | * maintained by caller. Specificaly si->dev, si->obj_offset, ... | ||
330 | */ | ||
331 | static int _add_to_read_4_write(struct ore_io_state *ios, | ||
332 | struct ore_striping_info *si, struct page *page) | ||
333 | { | ||
334 | struct request_queue *q; | ||
335 | struct ore_per_dev_state *per_dev; | ||
336 | struct ore_io_state *read_ios; | ||
337 | unsigned first_dev = si->dev - (si->dev % | ||
338 | (ios->layout->group_width * ios->layout->mirrors_p1)); | ||
339 | unsigned comp = si->dev - first_dev; | ||
340 | unsigned added_len; | ||
341 | |||
342 | if (!ios->ios_read_4_write) { | ||
343 | int ret = _alloc_read_4_write(ios); | ||
344 | |||
345 | if (unlikely(ret)) | ||
346 | return ret; | ||
347 | } | ||
348 | |||
349 | read_ios = ios->ios_read_4_write; | ||
350 | read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; | ||
351 | |||
352 | per_dev = &read_ios->per_dev[comp]; | ||
353 | if (!per_dev->length) { | ||
354 | per_dev->bio = bio_kmalloc(GFP_KERNEL, | ||
355 | ios->sp2d->pages_in_unit); | ||
356 | if (unlikely(!per_dev->bio)) { | ||
357 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", | ||
358 | ios->sp2d->pages_in_unit); | ||
359 | return -ENOMEM; | ||
360 | } | ||
361 | per_dev->offset = si->obj_offset; | ||
362 | per_dev->dev = si->dev; | ||
363 | } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { | ||
364 | u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); | ||
365 | |||
366 | _ore_add_sg_seg(per_dev, gap, true); | ||
367 | } | ||
368 | q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); | ||
369 | added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); | ||
370 | if (unlikely(added_len != PAGE_SIZE)) { | ||
371 | ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", | ||
372 | per_dev->bio->bi_vcnt); | ||
373 | return -ENOMEM; | ||
374 | } | ||
375 | |||
376 | per_dev->length += PAGE_SIZE; | ||
377 | return 0; | ||
378 | } | ||
379 | |||
380 | static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) | ||
381 | { | ||
382 | struct bio_vec *bv; | ||
383 | unsigned i, d; | ||
384 | |||
385 | /* loop on all devices all pages */ | ||
386 | for (d = 0; d < ios->numdevs; d++) { | ||
387 | struct bio *bio = ios->per_dev[d].bio; | ||
388 | |||
389 | if (!bio) | ||
390 | continue; | ||
391 | |||
392 | __bio_for_each_segment(bv, bio, i, 0) { | ||
393 | struct page *page = bv->bv_page; | ||
394 | |||
395 | SetPageUptodate(page); | ||
396 | if (PageError(page)) | ||
397 | ClearPageError(page); | ||
398 | } | ||
399 | } | ||
400 | } | ||
401 | |||
402 | /* read_4_write is hacked to read the start of the first stripe and/or | ||
403 | * the end of the last stripe. If needed, with an sg-gap at each device/page. | ||
404 | * It is assumed to be called after the to_be_written pages of the first stripe | ||
405 | * are populating ios->sp2d[][] | ||
406 | * | ||
407 | * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations | ||
408 | * These pages are held at sp2d[p].pages[c] but with | ||
409 | * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are | ||
410 | * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is | ||
411 | * @uptodate=true, so we don't need to read it, only unlock, after IO. | ||
412 | * | ||
413 | * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then | ||
414 | * to-be-written count, we should consider the xor-in-place mode. | ||
415 | * need_to_read_pages_count is the actual number of pages not present in cache. | ||
416 | * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough | ||
417 | * approximation? In this mode the read pages are put in the empty places of | ||
418 | * ios->sp2d[p][*], xor is calculated the same way. These pages are | ||
419 | * allocated/freed and don't go through cache | ||
420 | */ | ||
421 | static int _read_4_write(struct ore_io_state *ios) | ||
422 | { | ||
423 | struct ore_io_state *ios_read; | ||
424 | struct ore_striping_info read_si; | ||
425 | struct __stripe_pages_2d *sp2d = ios->sp2d; | ||
426 | u64 offset = ios->si.first_stripe_start; | ||
427 | u64 last_stripe_end; | ||
428 | unsigned bytes_in_stripe = ios->si.bytes_in_stripe; | ||
429 | unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; | ||
430 | int ret; | ||
431 | |||
432 | if (offset == ios->offset) /* Go to start collect $200 */ | ||
433 | goto read_last_stripe; | ||
434 | |||
435 | min_p = _sp2d_min_pg(sp2d); | ||
436 | max_p = _sp2d_max_pg(sp2d); | ||
437 | |||
438 | for (c = 0; ; c++) { | ||
439 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | ||
440 | read_si.obj_offset += min_p * PAGE_SIZE; | ||
441 | offset += min_p * PAGE_SIZE; | ||
442 | for (p = min_p; p <= max_p; p++) { | ||
443 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
444 | struct page **pp = &_1ps->pages[c]; | ||
445 | bool uptodate; | ||
446 | |||
447 | if (*pp) | ||
448 | /* to-be-written pages start here */ | ||
449 | goto read_last_stripe; | ||
450 | |||
451 | *pp = ios->r4w->get_page(ios->private, offset, | ||
452 | &uptodate); | ||
453 | if (unlikely(!*pp)) | ||
454 | return -ENOMEM; | ||
455 | |||
456 | if (!uptodate) | ||
457 | _add_to_read_4_write(ios, &read_si, *pp); | ||
458 | |||
459 | /* Mark read-pages to be cache_released */ | ||
460 | _1ps->page_is_read[c] = true; | ||
461 | read_si.obj_offset += PAGE_SIZE; | ||
462 | offset += PAGE_SIZE; | ||
463 | } | ||
464 | offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; | ||
465 | } | ||
466 | |||
467 | read_last_stripe: | ||
468 | offset = ios->offset + (ios->length + PAGE_SIZE - 1) / | ||
469 | PAGE_SIZE * PAGE_SIZE; | ||
470 | last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) | ||
471 | * bytes_in_stripe; | ||
472 | if (offset == last_stripe_end) /* Optimize for the aligned case */ | ||
473 | goto read_it; | ||
474 | |||
475 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | ||
476 | p = read_si.unit_off / PAGE_SIZE; | ||
477 | c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, | ||
478 | ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); | ||
479 | |||
480 | BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); | ||
481 | /* unaligned IO must be within a single stripe */ | ||
482 | |||
483 | if (min_p == sp2d->pages_in_unit) { | ||
484 | /* Didn't do it yet */ | ||
485 | min_p = _sp2d_min_pg(sp2d); | ||
486 | max_p = _sp2d_max_pg(sp2d); | ||
487 | } | ||
488 | |||
489 | while (offset < last_stripe_end) { | ||
490 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
491 | |||
492 | if ((min_p <= p) && (p <= max_p)) { | ||
493 | struct page *page; | ||
494 | bool uptodate; | ||
495 | |||
496 | BUG_ON(_1ps->pages[c]); | ||
497 | page = ios->r4w->get_page(ios->private, offset, | ||
498 | &uptodate); | ||
499 | if (unlikely(!page)) | ||
500 | return -ENOMEM; | ||
501 | |||
502 | _1ps->pages[c] = page; | ||
503 | /* Mark read-pages to be cache_released */ | ||
504 | _1ps->page_is_read[c] = true; | ||
505 | if (!uptodate) | ||
506 | _add_to_read_4_write(ios, &read_si, page); | ||
507 | } | ||
508 | |||
509 | offset += PAGE_SIZE; | ||
510 | if (p == (sp2d->pages_in_unit - 1)) { | ||
511 | ++c; | ||
512 | p = 0; | ||
513 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | ||
514 | } else { | ||
515 | read_si.obj_offset += PAGE_SIZE; | ||
516 | ++p; | ||
517 | } | ||
518 | } | ||
519 | |||
520 | read_it: | ||
521 | ios_read = ios->ios_read_4_write; | ||
522 | if (!ios_read) | ||
523 | return 0; | ||
524 | |||
525 | /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change | ||
526 | * to check for per_dev->bio | ||
527 | */ | ||
528 | ios_read->pages = ios->pages; | ||
529 | |||
530 | /* Now read these devices */ | ||
531 | for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { | ||
532 | ret = _ore_read_mirror(ios_read, i); | ||
533 | if (unlikely(ret)) | ||
534 | return ret; | ||
535 | } | ||
536 | |||
537 | ret = ore_io_execute(ios_read); /* Synchronus execution */ | ||
538 | if (unlikely(ret)) { | ||
539 | ORE_DBGMSG("!! ore_io_execute => %d\n", ret); | ||
540 | return ret; | ||
541 | } | ||
542 | |||
543 | _mark_read4write_pages_uptodate(ios_read, ret); | ||
544 | return 0; | ||
545 | } | ||
546 | |||
547 | /* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ | ||
548 | int _ore_add_parity_unit(struct ore_io_state *ios, | ||
549 | struct ore_striping_info *si, | ||
550 | struct ore_per_dev_state *per_dev, | ||
551 | unsigned cur_len) | ||
552 | { | ||
553 | if (ios->reading) { | ||
554 | BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); | ||
555 | _ore_add_sg_seg(per_dev, cur_len, true); | ||
556 | } else { | ||
557 | struct __stripe_pages_2d *sp2d = ios->sp2d; | ||
558 | struct page **pages = ios->parity_pages + ios->cur_par_page; | ||
559 | unsigned num_pages; | ||
560 | unsigned array_start = 0; | ||
561 | unsigned i; | ||
562 | int ret; | ||
563 | |||
564 | si->cur_pg = _sp2d_min_pg(sp2d); | ||
565 | num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; | ||
566 | |||
567 | if (!cur_len) /* If last stripe operate on parity comp */ | ||
568 | si->cur_comp = sp2d->data_devs; | ||
569 | |||
570 | if (!per_dev->length) { | ||
571 | per_dev->offset += si->cur_pg * PAGE_SIZE; | ||
572 | /* If first stripe, Read in all read4write pages | ||
573 | * (if needed) before we calculate the first parity. | ||
574 | */ | ||
575 | _read_4_write(ios); | ||
576 | } | ||
577 | |||
578 | for (i = 0; i < num_pages; i++) { | ||
579 | pages[i] = _raid_page_alloc(); | ||
580 | if (unlikely(!pages[i])) | ||
581 | return -ENOMEM; | ||
582 | |||
583 | ++(ios->cur_par_page); | ||
584 | } | ||
585 | |||
586 | BUG_ON(si->cur_comp != sp2d->data_devs); | ||
587 | BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); | ||
588 | |||
589 | ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, | ||
590 | per_dev, num_pages * PAGE_SIZE); | ||
591 | if (unlikely(ret)) | ||
592 | return ret; | ||
593 | |||
594 | /* TODO: raid6 if (last_parity_dev) */ | ||
595 | _gen_xor_unit(sp2d); | ||
596 | _sp2d_reset(sp2d, ios->r4w, ios->private); | ||
597 | } | ||
598 | return 0; | ||
599 | } | ||
600 | |||
601 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) | ||
602 | { | ||
603 | struct ore_layout *layout = ios->layout; | ||
604 | |||
605 | if (ios->parity_pages) { | ||
606 | unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; | ||
607 | unsigned stripe_size = ios->si.bytes_in_stripe; | ||
608 | u64 last_stripe, first_stripe; | ||
609 | |||
610 | if (_sp2d_alloc(pages_in_unit, layout->group_width, | ||
611 | layout->parity, &ios->sp2d)) { | ||
612 | return -ENOMEM; | ||
613 | } | ||
614 | |||
615 | BUG_ON(ios->offset % PAGE_SIZE); | ||
616 | |||
617 | /* Round io down to last full strip */ | ||
618 | first_stripe = div_u64(ios->offset, stripe_size); | ||
619 | last_stripe = div_u64(ios->offset + ios->length, stripe_size); | ||
620 | |||
621 | /* If an IO spans more then a single stripe it must end at | ||
622 | * a stripe boundary. The reminder at the end is pushed into the | ||
623 | * next IO. | ||
624 | */ | ||
625 | if (last_stripe != first_stripe) { | ||
626 | ios->length = last_stripe * stripe_size - ios->offset; | ||
627 | |||
628 | BUG_ON(!ios->length); | ||
629 | ios->nr_pages = (ios->length + PAGE_SIZE - 1) / | ||
630 | PAGE_SIZE; | ||
631 | ios->si.length = ios->length; /*make it consistent */ | ||
632 | } | ||
633 | } | ||
634 | return 0; | ||
635 | } | ||
636 | |||
637 | void _ore_free_raid_stuff(struct ore_io_state *ios) | ||
638 | { | ||
639 | if (ios->sp2d) { /* writing and raid */ | ||
640 | unsigned i; | ||
641 | |||
642 | for (i = 0; i < ios->cur_par_page; i++) { | ||
643 | struct page *page = ios->parity_pages[i]; | ||
644 | |||
645 | if (page) | ||
646 | _raid_page_free(page); | ||
647 | } | ||
648 | if (ios->extra_part_alloc) | ||
649 | kfree(ios->parity_pages); | ||
650 | /* If IO returned an error pages might need unlocking */ | ||
651 | _sp2d_reset(ios->sp2d, ios->r4w, ios->private); | ||
652 | _sp2d_free(ios->sp2d); | ||
653 | } else { | ||
654 | /* Will only be set if raid reading && sglist is big */ | ||
655 | if (ios->extra_part_alloc) | ||
656 | kfree(ios->per_dev[0].sglist); | ||
657 | } | ||
658 | if (ios->ios_read_4_write) | ||
659 | ore_put_io_state(ios->ios_read_4_write); | ||
660 | } | ||
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h new file mode 100644 index 000000000000..2ffd2c3c6e46 --- /dev/null +++ b/fs/exofs/ore_raid.h | |||
@@ -0,0 +1,79 @@ | |||
1 | /* | ||
2 | * Copyright (C) from 2011 | ||
3 | * Boaz Harrosh <bharrosh@panasas.com> | ||
4 | * | ||
5 | * This file is part of the objects raid engine (ore). | ||
6 | * | ||
7 | * It is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as published | ||
9 | * by the Free Software Foundation. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * along with "ore". If not, write to the Free Software Foundation, Inc: | ||
13 | * "Free Software Foundation <info@fsf.org>" | ||
14 | */ | ||
15 | |||
16 | #include <scsi/osd_ore.h> | ||
17 | |||
18 | #define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) | ||
19 | |||
20 | #ifdef CONFIG_EXOFS_DEBUG | ||
21 | #define ORE_DBGMSG(fmt, a...) \ | ||
22 | printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) | ||
23 | #else | ||
24 | #define ORE_DBGMSG(fmt, a...) \ | ||
25 | do { if (0) printk(fmt, ##a); } while (0) | ||
26 | #endif | ||
27 | |||
28 | /* u64 has problems with printk this will cast it to unsigned long long */ | ||
29 | #define _LLU(x) (unsigned long long)(x) | ||
30 | |||
31 | #define ORE_DBGMSG2(M...) do {} while (0) | ||
32 | /* #define ORE_DBGMSG2 ORE_DBGMSG */ | ||
33 | |||
/*
 * _dev_order - position of a device inside its stripe
 * @devs_in_group: number of devices in one group (mirrors included)
 * @mirrors_p1:    number of mirrors plus one
 * @par_dev:       device number of the parity device of this stripe
 * @dev:           device number being asked about
 *
 * Both @dev and @par_dev are first made relative to the start of the group
 * @dev belongs to.  When the relative parity device equals @devs_in_group
 * (the raid 0 case) the order is simply the relative device divided by
 * @mirrors_p1.  Otherwise (raid4/5/6) devices are counted starting right
 * after the parity device, wrapping around the group.
 */
static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
				  unsigned par_dev, unsigned dev)
{
	unsigned group_base = dev - dev % devs_in_group;
	unsigned rel_dev = dev - group_base;
	unsigned rel_par = par_dev - group_base;
	unsigned slot;

	if (rel_par == devs_in_group)
		slot = rel_dev;		/* The raid 0 case */
	else				/* raid4/5/6 case */
		slot = (devs_in_group + rel_dev - rel_par - mirrors_p1) %
								devs_in_group;

	return slot / mirrors_p1;
}
51 | |||
52 | /* ore_raid.c stuff needed by ore.c */ | ||
53 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); | ||
54 | void _ore_free_raid_stuff(struct ore_io_state *ios); | ||
55 | |||
56 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, | ||
57 | bool not_last); | ||
58 | int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, | ||
59 | struct ore_per_dev_state *per_dev, unsigned cur_len); | ||
60 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, | ||
61 | struct ore_striping_info *si, struct page *page); | ||
/*
 * Inline fast path in front of _ore_add_stripe_page(): when there is no
 * @sp2d there is no raid bookkeeping to do and nothing is called.
 */
static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
				struct ore_striping_info *si, struct page *page)
{
	if (sp2d)
		_ore_add_stripe_page(sp2d, si, page);
}
69 | |||
70 | /* ore.c stuff needed by ore_raid.c */ | ||
71 | int _ore_get_io_state(struct ore_layout *layout, | ||
72 | struct ore_components *oc, unsigned numdevs, | ||
73 | unsigned sgs_per_dev, unsigned num_par_pages, | ||
74 | struct ore_io_state **pios); | ||
75 | int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | ||
76 | unsigned pgbase, struct page **pages, | ||
77 | struct ore_per_dev_state *per_dev, int cur_len); | ||
78 | int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); | ||
79 | int ore_io_execute(struct ore_io_state *ios); | ||
diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 274894053b02..057b237b8b69 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c | |||
@@ -266,7 +266,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) | |||
266 | struct ore_io_state *ios; | 266 | struct ore_io_state *ios; |
267 | int ret; | 267 | int ret; |
268 | 268 | ||
269 | ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); | 269 | ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); |
270 | if (unlikely(ret)) { | 270 | if (unlikely(ret)) { |
271 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); | 271 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
272 | return ret; | 272 | return ret; |
@@ -321,7 +321,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) | |||
321 | struct ore_io_state *ios; | 321 | struct ore_io_state *ios; |
322 | int ret; | 322 | int ret; |
323 | 323 | ||
324 | ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); | 324 | ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); |
325 | if (unlikely(ret)) { | 325 | if (unlikely(ret)) { |
326 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); | 326 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
327 | return ret; | 327 | return ret; |
@@ -355,12 +355,12 @@ static const struct export_operations exofs_export_ops; | |||
355 | /* | 355 | /* |
356 | * Write the superblock to the OSD | 356 | * Write the superblock to the OSD |
357 | */ | 357 | */ |
358 | int exofs_sync_fs(struct super_block *sb, int wait) | 358 | static int exofs_sync_fs(struct super_block *sb, int wait) |
359 | { | 359 | { |
360 | struct exofs_sb_info *sbi; | 360 | struct exofs_sb_info *sbi; |
361 | struct exofs_fscb *fscb; | 361 | struct exofs_fscb *fscb; |
362 | struct ore_comp one_comp; | 362 | struct ore_comp one_comp; |
363 | struct ore_components comps; | 363 | struct ore_components oc; |
364 | struct ore_io_state *ios; | 364 | struct ore_io_state *ios; |
365 | int ret = -ENOMEM; | 365 | int ret = -ENOMEM; |
366 | 366 | ||
@@ -378,9 +378,9 @@ int exofs_sync_fs(struct super_block *sb, int wait) | |||
378 | * the writeable info is set in exofs_sbi_write_stats() above. | 378 | * the writeable info is set in exofs_sbi_write_stats() above. |
379 | */ | 379 | */ |
380 | 380 | ||
381 | exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID); | 381 | exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); |
382 | 382 | ||
383 | ret = ore_get_io_state(&sbi->layout, &comps, &ios); | 383 | ret = ore_get_io_state(&sbi->layout, &oc, &ios); |
384 | if (unlikely(ret)) | 384 | if (unlikely(ret)) |
385 | goto out; | 385 | goto out; |
386 | 386 | ||
@@ -429,19 +429,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path, | |||
429 | msg, dev_path ?: "", odi->osdname, _LLU(pid)); | 429 | msg, dev_path ?: "", odi->osdname, _LLU(pid)); |
430 | } | 430 | } |
431 | 431 | ||
432 | void exofs_free_sbi(struct exofs_sb_info *sbi) | 432 | static void exofs_free_sbi(struct exofs_sb_info *sbi) |
433 | { | 433 | { |
434 | while (sbi->comps.numdevs) { | 434 | unsigned numdevs = sbi->oc.numdevs; |
435 | int i = --sbi->comps.numdevs; | 435 | |
436 | struct osd_dev *od = sbi->comps.ods[i]; | 436 | while (numdevs) { |
437 | unsigned i = --numdevs; | ||
438 | struct osd_dev *od = ore_comp_dev(&sbi->oc, i); | ||
437 | 439 | ||
438 | if (od) { | 440 | if (od) { |
439 | sbi->comps.ods[i] = NULL; | 441 | ore_comp_set_dev(&sbi->oc, i, NULL); |
440 | osduld_put_device(od); | 442 | osduld_put_device(od); |
441 | } | 443 | } |
442 | } | 444 | } |
443 | if (sbi->comps.ods != sbi->_min_one_dev) | 445 | kfree(sbi->oc.ods); |
444 | kfree(sbi->comps.ods); | ||
445 | kfree(sbi); | 446 | kfree(sbi); |
446 | } | 447 | } |
447 | 448 | ||
@@ -468,7 +469,7 @@ static void exofs_put_super(struct super_block *sb) | |||
468 | msecs_to_jiffies(100)); | 469 | msecs_to_jiffies(100)); |
469 | } | 470 | } |
470 | 471 | ||
471 | _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0], | 472 | _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), |
472 | sbi->one_comp.obj.partition); | 473 | sbi->one_comp.obj.partition); |
473 | 474 | ||
474 | bdi_destroy(&sbi->bdi); | 475 | bdi_destroy(&sbi->bdi); |
@@ -479,76 +480,20 @@ static void exofs_put_super(struct super_block *sb) | |||
479 | static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, | 480 | static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, |
480 | struct exofs_device_table *dt) | 481 | struct exofs_device_table *dt) |
481 | { | 482 | { |
482 | u64 stripe_length; | 483 | int ret; |
483 | 484 | ||
484 | sbi->data_map.odm_num_comps = | 485 | sbi->layout.stripe_unit = |
485 | le32_to_cpu(dt->dt_data_map.cb_num_comps); | ||
486 | sbi->data_map.odm_stripe_unit = | ||
487 | le64_to_cpu(dt->dt_data_map.cb_stripe_unit); | 486 | le64_to_cpu(dt->dt_data_map.cb_stripe_unit); |
488 | sbi->data_map.odm_group_width = | 487 | sbi->layout.group_width = |
489 | le32_to_cpu(dt->dt_data_map.cb_group_width); | 488 | le32_to_cpu(dt->dt_data_map.cb_group_width); |
490 | sbi->data_map.odm_group_depth = | 489 | sbi->layout.group_depth = |
491 | le32_to_cpu(dt->dt_data_map.cb_group_depth); | 490 | le32_to_cpu(dt->dt_data_map.cb_group_depth); |
492 | sbi->data_map.odm_mirror_cnt = | 491 | sbi->layout.mirrors_p1 = |
493 | le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); | 492 | le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; |
494 | sbi->data_map.odm_raid_algorithm = | 493 | sbi->layout.raid_algorithm = |
495 | le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); | 494 | le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); |
496 | 495 | ||
497 | /* FIXME: Only raid0 for now. if not so, do not mount */ | 496 | ret = ore_verify_layout(numdevs, &sbi->layout); |
498 | if (sbi->data_map.odm_num_comps != numdevs) { | ||
499 | EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", | ||
500 | sbi->data_map.odm_num_comps, numdevs); | ||
501 | return -EINVAL; | ||
502 | } | ||
503 | if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { | ||
504 | EXOFS_ERR("Only RAID_0 for now\n"); | ||
505 | return -EINVAL; | ||
506 | } | ||
507 | if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { | ||
508 | EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", | ||
509 | numdevs, sbi->data_map.odm_mirror_cnt); | ||
510 | return -EINVAL; | ||
511 | } | ||
512 | |||
513 | if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { | ||
514 | EXOFS_ERR("Stripe Unit(0x%llx)" | ||
515 | " must be Multples of PAGE_SIZE(0x%lx)\n", | ||
516 | _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); | ||
517 | return -EINVAL; | ||
518 | } | ||
519 | |||
520 | sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; | ||
521 | sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; | ||
522 | |||
523 | if (sbi->data_map.odm_group_width) { | ||
524 | sbi->layout.group_width = sbi->data_map.odm_group_width; | ||
525 | sbi->layout.group_depth = sbi->data_map.odm_group_depth; | ||
526 | if (!sbi->layout.group_depth) { | ||
527 | EXOFS_ERR("group_depth == 0 && group_width != 0\n"); | ||
528 | return -EINVAL; | ||
529 | } | ||
530 | sbi->layout.group_count = sbi->data_map.odm_num_comps / | ||
531 | sbi->layout.mirrors_p1 / | ||
532 | sbi->data_map.odm_group_width; | ||
533 | } else { | ||
534 | if (sbi->data_map.odm_group_depth) { | ||
535 | printk(KERN_NOTICE "Warning: group_depth ignored " | ||
536 | "group_width == 0 && group_depth == %d\n", | ||
537 | sbi->data_map.odm_group_depth); | ||
538 | sbi->data_map.odm_group_depth = 0; | ||
539 | } | ||
540 | sbi->layout.group_width = sbi->data_map.odm_num_comps / | ||
541 | sbi->layout.mirrors_p1; | ||
542 | sbi->layout.group_depth = -1; | ||
543 | sbi->layout.group_count = 1; | ||
544 | } | ||
545 | |||
546 | stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; | ||
547 | if (stripe_length >= (1ULL << 32)) { | ||
548 | EXOFS_ERR("Total Stripe length(0x%llx)" | ||
549 | " >= 32bit is not supported\n", _LLU(stripe_length)); | ||
550 | return -EINVAL; | ||
551 | } | ||
552 | 497 | ||
553 | EXOFS_DBGMSG("exofs: layout: " | 498 | EXOFS_DBGMSG("exofs: layout: " |
554 | "num_comps=%u stripe_unit=0x%x group_width=%u " | 499 | "num_comps=%u stripe_unit=0x%x group_width=%u " |
@@ -558,8 +503,8 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, | |||
558 | sbi->layout.group_width, | 503 | sbi->layout.group_width, |
559 | _LLU(sbi->layout.group_depth), | 504 | _LLU(sbi->layout.group_depth), |
560 | sbi->layout.mirrors_p1, | 505 | sbi->layout.mirrors_p1, |
561 | sbi->data_map.odm_raid_algorithm); | 506 | sbi->layout.raid_algorithm); |
562 | return 0; | 507 | return ret; |
563 | } | 508 | } |
564 | 509 | ||
565 | static unsigned __ra_pages(struct ore_layout *layout) | 510 | static unsigned __ra_pages(struct ore_layout *layout) |
@@ -605,12 +550,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, | |||
605 | return !(odi->systemid_len || odi->osdname_len); | 550 | return !(odi->systemid_len || odi->osdname_len); |
606 | } | 551 | } |
607 | 552 | ||
553 | int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, | ||
554 | struct exofs_dev **peds) | ||
555 | { | ||
556 | struct __alloc_ore_devs_and_exofs_devs { | ||
557 | /* Twice bigger table: See exofs_init_comps() and comment at | ||
558 | * exofs_read_lookup_dev_table() | ||
559 | */ | ||
560 | struct ore_dev *oreds[numdevs * 2 - 1]; | ||
561 | struct exofs_dev eds[numdevs]; | ||
562 | } *aoded; | ||
563 | struct exofs_dev *eds; | ||
564 | unsigned i; | ||
565 | |||
566 | aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); | ||
567 | if (unlikely(!aoded)) { | ||
568 | EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", | ||
569 | numdevs); | ||
570 | return -ENOMEM; | ||
571 | } | ||
572 | |||
573 | sbi->oc.ods = aoded->oreds; | ||
574 | *peds = eds = aoded->eds; | ||
575 | for (i = 0; i < numdevs; ++i) | ||
576 | aoded->oreds[i] = &eds[i].ored; | ||
577 | return 0; | ||
578 | } | ||
579 | |||
608 | static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | 580 | static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, |
609 | struct osd_dev *fscb_od, | 581 | struct osd_dev *fscb_od, |
610 | unsigned table_count) | 582 | unsigned table_count) |
611 | { | 583 | { |
612 | struct ore_comp comp; | 584 | struct ore_comp comp; |
613 | struct exofs_device_table *dt; | 585 | struct exofs_device_table *dt; |
586 | struct exofs_dev *eds; | ||
614 | unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + | 587 | unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + |
615 | sizeof(*dt); | 588 | sizeof(*dt); |
616 | unsigned numdevs, i; | 589 | unsigned numdevs, i; |
@@ -623,7 +596,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
623 | return -ENOMEM; | 596 | return -ENOMEM; |
624 | } | 597 | } |
625 | 598 | ||
626 | sbi->comps.numdevs = 0; | 599 | sbi->oc.numdevs = 0; |
627 | 600 | ||
628 | comp.obj.partition = sbi->one_comp.obj.partition; | 601 | comp.obj.partition = sbi->one_comp.obj.partition; |
629 | comp.obj.id = EXOFS_DEVTABLE_ID; | 602 | comp.obj.id = EXOFS_DEVTABLE_ID; |
@@ -647,20 +620,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
647 | if (unlikely(ret)) | 620 | if (unlikely(ret)) |
648 | goto out; | 621 | goto out; |
649 | 622 | ||
650 | if (likely(numdevs > 1)) { | 623 | ret = __alloc_dev_table(sbi, numdevs, &eds); |
651 | unsigned size = numdevs * sizeof(sbi->comps.ods[0]); | 624 | if (unlikely(ret)) |
652 | 625 | goto out; | |
653 | /* Twice bigger table: See exofs_init_comps() and below | 626 | /* exofs round-robins the device table view according to inode |
654 | * comment | 627 | * number. We hold a: twice bigger table hence inodes can point |
655 | */ | 628 | * to any device and have a sequential view of the table |
656 | sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL); | 629 | * starting at this device. See exofs_init_comps() |
657 | if (unlikely(!sbi->comps.ods)) { | 630 | */ |
658 | EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", | 631 | memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], |
659 | numdevs); | 632 | (numdevs - 1) * sizeof(sbi->oc.ods[0])); |
660 | ret = -ENOMEM; | ||
661 | goto out; | ||
662 | } | ||
663 | } | ||
664 | 633 | ||
665 | for (i = 0; i < numdevs; i++) { | 634 | for (i = 0; i < numdevs; i++) { |
666 | struct exofs_fscb fscb; | 635 | struct exofs_fscb fscb; |
@@ -676,13 +645,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
676 | printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", | 645 | printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", |
677 | i, odi.osdname); | 646 | i, odi.osdname); |
678 | 647 | ||
648 | /* the exofs id is currently the table index */ | ||
649 | eds[i].did = i; | ||
650 | |||
679 | /* On all devices the device table is identical. The user can | 651 | /* On all devices the device table is identical. The user can |
680 | * specify any one of the participating devices on the command | 652 | * specify any one of the participating devices on the command |
681 | * line. We always keep them in device-table order. | 653 | * line. We always keep them in device-table order. |
682 | */ | 654 | */ |
683 | if (fscb_od && osduld_device_same(fscb_od, &odi)) { | 655 | if (fscb_od && osduld_device_same(fscb_od, &odi)) { |
684 | sbi->comps.ods[i] = fscb_od; | 656 | eds[i].ored.od = fscb_od; |
685 | ++sbi->comps.numdevs; | 657 | ++sbi->oc.numdevs; |
686 | fscb_od = NULL; | 658 | fscb_od = NULL; |
687 | continue; | 659 | continue; |
688 | } | 660 | } |
@@ -695,8 +667,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
695 | goto out; | 667 | goto out; |
696 | } | 668 | } |
697 | 669 | ||
698 | sbi->comps.ods[i] = od; | 670 | eds[i].ored.od = od; |
699 | ++sbi->comps.numdevs; | 671 | ++sbi->oc.numdevs; |
700 | 672 | ||
701 | /* Read the fscb of the other devices to make sure the FS | 673 | /* Read the fscb of the other devices to make sure the FS |
702 | * partition is there. | 674 | * partition is there. |
@@ -718,21 +690,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
718 | 690 | ||
719 | out: | 691 | out: |
720 | kfree(dt); | 692 | kfree(dt); |
721 | if (likely(!ret)) { | 693 | if (unlikely(fscb_od && !ret)) { |
722 | unsigned numdevs = sbi->comps.numdevs; | ||
723 | |||
724 | if (unlikely(fscb_od)) { | ||
725 | EXOFS_ERR("ERROR: Bad device-table container device not present\n"); | 694 | EXOFS_ERR("ERROR: Bad device-table container device not present\n"); |
726 | osduld_put_device(fscb_od); | 695 | osduld_put_device(fscb_od); |
727 | return -EINVAL; | 696 | return -EINVAL; |
728 | } | ||
729 | /* exofs round-robins the device table view according to inode | ||
730 | * number. We hold a: twice bigger table hence inodes can point | ||
731 | * to any device and have a sequential view of the table | ||
732 | * starting at this device. See exofs_init_comps() | ||
733 | */ | ||
734 | for (i = 0; i < numdevs - 1; ++i) | ||
735 | sbi->comps.ods[i + numdevs] = sbi->comps.ods[i]; | ||
736 | } | 697 | } |
737 | return ret; | 698 | return ret; |
738 | } | 699 | } |
@@ -783,10 +744,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
783 | sbi->one_comp.obj.partition = opts->pid; | 744 | sbi->one_comp.obj.partition = opts->pid; |
784 | sbi->one_comp.obj.id = 0; | 745 | sbi->one_comp.obj.id = 0; |
785 | exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); | 746 | exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); |
786 | sbi->comps.numdevs = 1; | 747 | sbi->oc.numdevs = 1; |
787 | sbi->comps.single_comp = EC_SINGLE_COMP; | 748 | sbi->oc.single_comp = EC_SINGLE_COMP; |
788 | sbi->comps.comps = &sbi->one_comp; | 749 | sbi->oc.comps = &sbi->one_comp; |
789 | sbi->comps.ods = sbi->_min_one_dev; | ||
790 | 750 | ||
791 | /* fill in some other data by hand */ | 751 | /* fill in some other data by hand */ |
792 | memset(sb->s_id, 0, sizeof(sb->s_id)); | 752 | memset(sb->s_id, 0, sizeof(sb->s_id)); |
@@ -835,7 +795,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
835 | if (unlikely(ret)) | 795 | if (unlikely(ret)) |
836 | goto free_sbi; | 796 | goto free_sbi; |
837 | } else { | 797 | } else { |
838 | sbi->comps.ods[0] = od; | 798 | struct exofs_dev *eds; |
799 | |||
800 | ret = __alloc_dev_table(sbi, 1, &eds); | ||
801 | if (unlikely(ret)) | ||
802 | goto free_sbi; | ||
803 | |||
804 | ore_comp_set_dev(&sbi->oc, 0, od); | ||
839 | } | 805 | } |
840 | 806 | ||
841 | __sbi_read_stats(sbi); | 807 | __sbi_read_stats(sbi); |
@@ -875,7 +841,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
875 | goto free_sbi; | 841 | goto free_sbi; |
876 | } | 842 | } |
877 | 843 | ||
878 | _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0], | 844 | _exofs_print_device("Mounting", opts->dev_name, |
845 | ore_comp_dev(&sbi->oc, 0), | ||
879 | sbi->one_comp.obj.partition); | 846 | sbi->one_comp.obj.partition); |
880 | return 0; | 847 | return 0; |
881 | 848 | ||
@@ -924,7 +891,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
924 | uint64_t used = ULLONG_MAX; | 891 | uint64_t used = ULLONG_MAX; |
925 | int ret; | 892 | int ret; |
926 | 893 | ||
927 | ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); | 894 | ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); |
928 | if (ret) { | 895 | if (ret) { |
929 | EXOFS_DBGMSG("ore_get_io_state failed.\n"); | 896 | EXOFS_DBGMSG("ore_get_io_state failed.\n"); |
930 | return ret; | 897 | return ret; |
@@ -981,7 +948,7 @@ static const struct super_operations exofs_sops = { | |||
981 | * EXPORT OPERATIONS | 948 | * EXPORT OPERATIONS |
982 | *****************************************************************************/ | 949 | *****************************************************************************/ |
983 | 950 | ||
984 | struct dentry *exofs_get_parent(struct dentry *child) | 951 | static struct dentry *exofs_get_parent(struct dentry *child) |
985 | { | 952 | { |
986 | unsigned long ino = exofs_parent_ino(child); | 953 | unsigned long ino = exofs_parent_ino(child); |
987 | 954 | ||
diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index c5c5e008e6de..f05fa826f89e 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h | |||
@@ -34,15 +34,30 @@ struct ore_comp { | |||
34 | 34 | ||
35 | struct ore_layout { | 35 | struct ore_layout { |
36 | /* Our way of looking at the data_map */ | 36 | /* Our way of looking at the data_map */ |
37 | enum pnfs_osd_raid_algorithm4 | ||
38 | raid_algorithm; | ||
37 | unsigned stripe_unit; | 39 | unsigned stripe_unit; |
38 | unsigned mirrors_p1; | 40 | unsigned mirrors_p1; |
39 | 41 | ||
40 | unsigned group_width; | 42 | unsigned group_width; |
43 | unsigned parity; | ||
41 | u64 group_depth; | 44 | u64 group_depth; |
42 | unsigned group_count; | 45 | unsigned group_count; |
46 | |||
47 | /* Cached often needed calculations filled in by | ||
48 | * ore_verify_layout | ||
49 | */ | ||
50 | unsigned long max_io_length; /* Max length that should be passed to | ||
51 | * ore_get_rw_state | ||
52 | */ | ||
53 | }; | ||
54 | |||
55 | struct ore_dev { | ||
56 | struct osd_dev *od; | ||
43 | }; | 57 | }; |
44 | 58 | ||
45 | struct ore_components { | 59 | struct ore_components { |
60 | unsigned first_dev; /* First logical device no */ | ||
46 | unsigned numdevs; /* Num of devices in array */ | 61 | unsigned numdevs; /* Num of devices in array */ |
47 | /* If @single_comp == EC_SINGLE_COMP, @comps points to a single | 62 | /* If @single_comp == EC_SINGLE_COMP, @comps points to a single |
48 | * component. else there are @numdevs components | 63 | * component. else there are @numdevs components |
@@ -51,20 +66,60 @@ struct ore_components { | |||
51 | EC_SINGLE_COMP = 0, EC_MULTPLE_COMPS = 0xffffffff | 66 | EC_SINGLE_COMP = 0, EC_MULTPLE_COMPS = 0xffffffff |
52 | } single_comp; | 67 | } single_comp; |
53 | struct ore_comp *comps; | 68 | struct ore_comp *comps; |
54 | struct osd_dev **ods; /* osd_dev array */ | 69 | |
70 | /* Array of pointers to ore_dev-* . User will usually have these pointed | ||
71 | * too a bigger struct which contain an "ore_dev ored" member and use | ||
72 | * container_of(oc->ods[i], struct foo_dev, ored) to access the bigger | ||
73 | * structure. | ||
74 | */ | ||
75 | struct ore_dev **ods; | ||
76 | }; | ||
77 | |||
78 | /* ore_comp_dev Recievies a logical device index */ | ||
79 | static inline struct osd_dev *ore_comp_dev( | ||
80 | const struct ore_components *oc, unsigned i) | ||
81 | { | ||
82 | BUG_ON((i < oc->first_dev) || (oc->first_dev + oc->numdevs <= i)); | ||
83 | return oc->ods[i - oc->first_dev]->od; | ||
84 | } | ||
85 | |||
86 | static inline void ore_comp_set_dev( | ||
87 | struct ore_components *oc, unsigned i, struct osd_dev *od) | ||
88 | { | ||
89 | oc->ods[i - oc->first_dev]->od = od; | ||
90 | } | ||
91 | |||
92 | struct ore_striping_info { | ||
93 | u64 offset; | ||
94 | u64 obj_offset; | ||
95 | u64 length; | ||
96 | u64 first_stripe_start; /* only used in raid writes */ | ||
97 | u64 M; /* for truncate */ | ||
98 | unsigned bytes_in_stripe; | ||
99 | unsigned dev; | ||
100 | unsigned par_dev; | ||
101 | unsigned unit_off; | ||
102 | unsigned cur_pg; | ||
103 | unsigned cur_comp; | ||
55 | }; | 104 | }; |
56 | 105 | ||
57 | struct ore_io_state; | 106 | struct ore_io_state; |
58 | typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); | 107 | typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); |
108 | struct _ore_r4w_op { | ||
109 | /* @Priv given here is passed ios->private */ | ||
110 | struct page * (*get_page)(void *priv, u64 page_index, bool *uptodate); | ||
111 | void (*put_page)(void *priv, struct page *page); | ||
112 | }; | ||
59 | 113 | ||
60 | struct ore_io_state { | 114 | struct ore_io_state { |
61 | struct kref kref; | 115 | struct kref kref; |
116 | struct ore_striping_info si; | ||
62 | 117 | ||
63 | void *private; | 118 | void *private; |
64 | ore_io_done_fn done; | 119 | ore_io_done_fn done; |
65 | 120 | ||
66 | struct ore_layout *layout; | 121 | struct ore_layout *layout; |
67 | struct ore_components *comps; | 122 | struct ore_components *oc; |
68 | 123 | ||
69 | /* Global read/write IO*/ | 124 | /* Global read/write IO*/ |
70 | loff_t offset; | 125 | loff_t offset; |
@@ -84,6 +139,16 @@ struct ore_io_state { | |||
84 | 139 | ||
85 | bool reading; | 140 | bool reading; |
86 | 141 | ||
142 | /* House keeping of Parity pages */ | ||
143 | bool extra_part_alloc; | ||
144 | struct page **parity_pages; | ||
145 | unsigned max_par_pages; | ||
146 | unsigned cur_par_page; | ||
147 | unsigned sgs_per_dev; | ||
148 | struct __stripe_pages_2d *sp2d; | ||
149 | struct ore_io_state *ios_read_4_write; | ||
150 | const struct _ore_r4w_op *r4w; | ||
151 | |||
87 | /* Variable array of size numdevs */ | 152 | /* Variable array of size numdevs */ |
88 | unsigned numdevs; | 153 | unsigned numdevs; |
89 | struct ore_per_dev_state { | 154 | struct ore_per_dev_state { |
@@ -91,7 +156,10 @@ struct ore_io_state { | |||
91 | struct bio *bio; | 156 | struct bio *bio; |
92 | loff_t offset; | 157 | loff_t offset; |
93 | unsigned length; | 158 | unsigned length; |
159 | unsigned last_sgs_total; | ||
94 | unsigned dev; | 160 | unsigned dev; |
161 | struct osd_sg_entry *sglist; | ||
162 | unsigned cur_sg; | ||
95 | } per_dev[]; | 163 | } per_dev[]; |
96 | }; | 164 | }; |
97 | 165 | ||
@@ -102,6 +170,9 @@ static inline unsigned ore_io_state_size(unsigned numdevs) | |||
102 | } | 170 | } |
103 | 171 | ||
104 | /* ore.c */ | 172 | /* ore.c */ |
173 | int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); | ||
174 | void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, | ||
175 | u64 length, struct ore_striping_info *si); | ||
105 | int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, | 176 | int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, |
106 | bool is_reading, u64 offset, u64 length, | 177 | bool is_reading, u64 offset, u64 length, |
107 | struct ore_io_state **ios); | 178 | struct ore_io_state **ios); |
@@ -109,7 +180,10 @@ int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, | |||
109 | struct ore_io_state **ios); | 180 | struct ore_io_state **ios); |
110 | void ore_put_io_state(struct ore_io_state *ios); | 181 | void ore_put_io_state(struct ore_io_state *ios); |
111 | 182 | ||
112 | int ore_check_io(struct ore_io_state *ios, u64 *resid); | 183 | typedef void (*ore_on_dev_error)(struct ore_io_state *ios, struct ore_dev *od, |
184 | unsigned dev_index, enum osd_err_priority oep, | ||
185 | u64 dev_offset, u64 dev_len); | ||
186 | int ore_check_io(struct ore_io_state *ios, ore_on_dev_error rep); | ||
113 | 187 | ||
114 | int ore_create(struct ore_io_state *ios); | 188 | int ore_create(struct ore_io_state *ios); |
115 | int ore_remove(struct ore_io_state *ios); | 189 | int ore_remove(struct ore_io_state *ios); |