Diffstat (limited to 'fs/exofs')
 -rw-r--r--  fs/exofs/Kbuild     |   3
 -rw-r--r--  fs/exofs/Kconfig    |   9
 -rw-r--r--  fs/exofs/exofs.h    |  26
 -rw-r--r--  fs/exofs/inode.c    | 235
 -rw-r--r--  fs/exofs/ore.c      | 657
 -rw-r--r--  fs/exofs/ore_raid.c | 660
 -rw-r--r--  fs/exofs/ore_raid.h |  79
 -rw-r--r--  fs/exofs/super.c    | 206
 8 files changed, 1508 insertions, 367 deletions
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index c5a5855a6c44..352ba149d23e 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -13,7 +13,8 @@
 #
 
 # ore module library
-obj-$(CONFIG_ORE) += ore.o
+libore-y := ore.o ore_raid.o
+obj-$(CONFIG_ORE) += libore.o
 
 exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index 70bae4149291..da42f32c49be 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,10 +1,17 @@
+# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
+# for every ORE user we do it like this. Any user should add itself here
+# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
+# selected here, and we default to "ON". So in effect it is like been
+# selected by any of the users.
 config ORE
         tristate
+        depends on EXOFS_FS || PNFS_OBJLAYOUT
+        select ASYNC_XOR
+        default SCSI_OSD_ULD
 
 config EXOFS_FS
         tristate "exofs: OSD based file system support"
         depends on SCSI_OSD_ULD
-        select ORE
 	help
 	  EXOFS is a file system that uses an OSD storage device,
 	  as its backing storage.
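
The note added at the top of this Kconfig spells out the contract for future ORE users: do not "select ORE", add yourself to its dependency line instead, so the "select ASYNC_XOR" stays in one place. A sketch of the intended pattern for a hypothetical later user (FOO_FS is an invented placeholder, not part of this patch):

config ORE
        tristate
        depends on EXOFS_FS || PNFS_OBJLAYOUT || FOO_FS
        select ASYNC_XOR
        default SCSI_OSD_ULD

With this shape, ORE is effectively enabled whenever any one of its users is, without each user carrying its own selects.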
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index f4e442ec7445..51f4b4c40f09 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -53,6 +53,10 @@
 /* u64 has problems with printk this will cast it to unsigned long long */
 #define _LLU(x) (unsigned long long)(x)
 
+struct exofs_dev {
+        struct ore_dev ored;
+        unsigned did;
+};
 /*
  * our extension to the in-memory superblock
  */
@@ -66,13 +70,9 @@ struct exofs_sb_info {
         u32 s_next_generation;          /* next gen # to use          */
         atomic_t s_curr_pending;        /* number of pending commands */
 
-        struct pnfs_osd_data_map data_map;      /* Default raid to use
-                                                 * FIXME: Needed ?
-                                                 */
         struct ore_layout layout;       /* Default files layout       */
         struct ore_comp one_comp;       /* id & cred of partition id=0*/
-        struct ore_components comps;    /* comps for the partition    */
-        struct osd_dev *_min_one_dev[1];        /* Place holder for one dev   */
+        struct ore_components oc;       /* comps for the partition    */
 };
 
 /*
@@ -86,7 +86,7 @@ struct exofs_i_info {
         uint32_t  i_dir_start_lookup; /* which page to start lookup      */
         uint64_t  i_commit_size;      /* the object's written length     */
         struct ore_comp one_comp;     /* same component for all devices  */
-        struct ore_components comps;  /* inode view of the device table  */
+        struct ore_components oc;     /* inode view of the device table  */
 };
 
 static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations;
  * bigger and that the device table repeats twice.
  * See: exofs_read_lookup_dev_table()
  */
-static inline void exofs_init_comps(struct ore_components *comps,
+static inline void exofs_init_comps(struct ore_components *oc,
                                     struct ore_comp *one_comp,
                                     struct exofs_sb_info *sbi, osd_id oid)
 {
@@ -217,13 +217,15 @@ static inline void exofs_init_comps(struct ore_components *comps,
         one_comp->obj.id = oid;
         exofs_make_credential(one_comp->cred, &one_comp->obj);
 
-        comps->numdevs = sbi->comps.numdevs;
-        comps->single_comp = EC_SINGLE_COMP;
-        comps->comps = one_comp;
+        oc->first_dev = 0;
+        oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
+                                                        sbi->layout.group_count;
+        oc->single_comp = EC_SINGLE_COMP;
+        oc->comps = one_comp;
 
         /* Round robin device view of the table */
-        first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs;
-        comps->ods = sbi->comps.ods + first_dev;
+        first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
+        oc->ods = &sbi->oc.ods[first_dev];
 }
 
 #endif
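
The last two lines of exofs_init_comps() are what give each inode its own rotated window into the superblock's shared device table. A small standalone sketch of that arithmetic (the helper name and example numbers are mine, not from the patch):

/* Sketch: where an object's component list starts in the device table.
 * Mirror copies stay adjacent because the object id is scaled by
 * mirrors_p1 before the modulo.
 */
static unsigned exofs_first_dev_of(osd_id oid, unsigned mirrors_p1,
                                   unsigned numdevs)
{
        unsigned dev_mod = (unsigned)oid;     /* as in exofs_init_comps() */

        return (dev_mod * mirrors_p1) % numdevs;
}

/* e.g. numdevs = 6, mirrors_p1 = 2: oid 0, 1, 2, 3 start at devices
 * 0, 2, 4, 0 -- successive objects spread round-robin across the table.
 */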
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f39a38fc2349..f6dbf7768ce6 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,11 +37,7 @@
 
 #define EXOFS_DBGMSG2(M...) do {} while (0)
 
-enum { BIO_MAX_PAGES_KMALLOC =
-        (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
-        MAX_PAGES_KMALLOC =
-        PAGE_SIZE / sizeof(struct page *),
-};
+enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), };
 
 unsigned exofs_max_io_pages(struct ore_layout *layout,
                             unsigned expected_pages)
@@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout,
         unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
 
         /* TODO: easily support bio chaining */
-        pages = min_t(unsigned, pages,
-                      layout->group_width * BIO_MAX_PAGES_KMALLOC);
+        pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE);
         return pages;
 }
 
@@ -68,6 +63,7 @@ struct page_collect {
         bool read_4_write; /* This means two things: that the read is sync
                             * And the pages should not be unlocked.
                             */
+        struct page *that_locked_page;
 };
 
 static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -86,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
         pcol->length = 0;
         pcol->pg_first = -1;
         pcol->read_4_write = false;
+        pcol->that_locked_page = NULL;
 }
 
 static void _pcol_reset(struct page_collect *pcol)
@@ -98,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol)
         pcol->length = 0;
         pcol->pg_first = -1;
         pcol->ios = NULL;
+        pcol->that_locked_page = NULL;
 
         /* this is probably the end of the loop but in writes
          * it might not end here. don't be left with nothing
@@ -149,14 +147,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page,
         return 0;
 }
 
+enum {PAGE_WAS_NOT_IN_IO = 17};
 static int update_read_page(struct page *page, int ret)
 {
-        if (ret == 0) {
+        switch (ret) {
+        case 0:
                 /* Everything is OK */
                 SetPageUptodate(page);
                 if (PageError(page))
                         ClearPageError(page);
-        } else if (ret == -EFAULT) {
+                break;
+        case -EFAULT:
                 /* In this case we were trying to read something that wasn't on
                  * disk yet - return a page full of zeroes. This should be OK,
                  * because the object should be empty (if there was a write
@@ -167,16 +168,22 @@ static int update_read_page(struct page *page, int ret)
                 SetPageUptodate(page);
                 if (PageError(page))
                         ClearPageError(page);
-                ret = 0; /* recovered error */
                 EXOFS_DBGMSG("recovered read error\n");
-        } else /* Error */
+                /* fall through */
+        case PAGE_WAS_NOT_IN_IO:
+                ret = 0; /* recovered error */
+                break;
+        default:
                 SetPageError(page);
-
+        }
         return ret;
 }
 
 static void update_write_page(struct page *page, int ret)
 {
+        if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
+                return; /* don't pass start don't collect $200 */
+
         if (ret) {
                 mapping_set_error(page->mapping, ret);
                 SetPageError(page);
@@ -190,15 +197,16 @@ static void update_write_page(struct page *page, int ret)
 static int __readpages_done(struct page_collect *pcol)
 {
         int i;
-        u64 resid;
         u64 good_bytes;
         u64 length = 0;
-        int ret = ore_check_io(pcol->ios, &resid);
+        int ret = ore_check_io(pcol->ios, NULL);
 
-        if (likely(!ret))
+        if (likely(!ret)) {
                 good_bytes = pcol->length;
-        else
-                good_bytes = pcol->length - resid;
+                ret = PAGE_WAS_NOT_IN_IO;
+        } else {
+                good_bytes = 0;
+        }
 
         EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
                      " length=0x%lx nr_pages=%u\n",
@@ -259,6 +267,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
         }
 }
 
+static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
+        struct page_collect *pcol_src, struct page_collect *pcol)
+{
+        /* length was wrong or offset was not page aligned */
+        BUG_ON(pcol_src->nr_pages < ios->nr_pages);
+
+        if (pcol_src->nr_pages > ios->nr_pages) {
+                struct page **src_page;
+                unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
+                unsigned long len_less = pcol_src->length - ios->length;
+                unsigned i;
+                int ret;
+
+                /* This IO was trimmed */
+                pcol_src->nr_pages = ios->nr_pages;
+                pcol_src->length = ios->length;
+
+                /* Left over pages are passed to the next io */
+                pcol->expected_pages += pages_less;
+                pcol->nr_pages = pages_less;
+                pcol->length = len_less;
+                src_page = pcol_src->pages + pcol_src->nr_pages;
+                pcol->pg_first = (*src_page)->index;
+
+                ret = pcol_try_alloc(pcol);
+                if (unlikely(ret))
+                        return ret;
+
+                for (i = 0; i < pages_less; ++i)
+                        pcol->pages[i] = *src_page++;
+
+                EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
+                        "pages_less=0x%x expected_pages=0x%x "
+                        "next_offset=0x%llx next_len=0x%lx\n",
+                        pcol_src->nr_pages, pages_less, pcol->expected_pages,
+                        pcol->pg_first * PAGE_SIZE, pcol->length);
+        }
+        return 0;
+}
+
 static int read_exec(struct page_collect *pcol)
 {
         struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -270,7 +318,7 @@ static int read_exec(struct page_collect *pcol)
                 return 0;
 
         if (!pcol->ios) {
-                int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true,
+                int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
                                              pcol->pg_first << PAGE_CACHE_SHIFT,
                                              pcol->length, &pcol->ios);
 
@@ -280,7 +328,6 @@ static int read_exec(struct page_collect *pcol)
 
         ios = pcol->ios;
         ios->pages = pcol->pages;
-        ios->nr_pages = pcol->nr_pages;
 
         if (pcol->read_4_write) {
                 ore_read(pcol->ios);
@@ -296,17 +343,23 @@
         *pcol_copy = *pcol;
         ios->done = readpages_done;
         ios->private = pcol_copy;
+
+        /* pages ownership was passed to pcol_copy */
+        _pcol_reset(pcol);
+
+        ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+        if (unlikely(ret))
+                goto err;
+
+        EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+                pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
         ret = ore_read(ios);
         if (unlikely(ret))
                 goto err;
 
         atomic_inc(&pcol->sbi->s_curr_pending);
 
-        EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
-                  oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
-
-        /* pages ownership was passed to pcol_copy */
-        _pcol_reset(pcol);
         return 0;
 
 err:
@@ -341,6 +394,8 @@ static int readpage_strip(void *data, struct page *page)
                 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
                           page->index);
 
+        pcol->that_locked_page = page;
+
         if (page->index < end_index)
                 len = PAGE_CACHE_SIZE;
         else if (page->index == end_index)
@@ -429,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
                 return ret;
         }
 
+        ret = read_exec(&pcol);
+        if (unlikely(ret))
+                return ret;
+
         return read_exec(&pcol);
 }
 
@@ -462,17 +521,18 @@ static void writepages_done(struct ore_io_state *ios, void *p)
 {
         struct page_collect *pcol = p;
         int i;
-        u64 resid;
         u64 good_bytes;
         u64 length = 0;
-        int ret = ore_check_io(ios, &resid);
+        int ret = ore_check_io(ios, NULL);
 
         atomic_dec(&pcol->sbi->s_curr_pending);
 
-        if (likely(!ret))
+        if (likely(!ret)) {
                 good_bytes = pcol->length;
-        else
-                good_bytes = pcol->length - resid;
+                ret = PAGE_WAS_NOT_IN_IO;
+        } else {
+                good_bytes = 0;
+        }
 
         EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
                      " length=0x%lx nr_pages=%u\n",
@@ -505,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p)
         EXOFS_DBGMSG2("writepages_done END\n");
 }
 
+static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
+{
+        struct page_collect *pcol = priv;
+        pgoff_t index = offset / PAGE_SIZE;
+
+        if (!pcol->that_locked_page ||
+            (pcol->that_locked_page->index != index)) {
+                struct page *page = find_get_page(pcol->inode->i_mapping, index);
+
+                if (!page) {
+                        page = find_or_create_page(pcol->inode->i_mapping,
+                                                   index, GFP_NOFS);
+                        if (unlikely(!page)) {
+                                EXOFS_DBGMSG("grab_cache_page Failed "
+                                        "index=0x%llx\n", _LLU(index));
+                                return NULL;
+                        }
+                        unlock_page(page);
+                }
+                if (PageDirty(page) || PageWriteback(page))
+                        *uptodate = true;
+                else
+                        *uptodate = PageUptodate(page);
+                EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate);
+                return page;
+        } else {
+                EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n",
+                             pcol->that_locked_page->index);
+                *uptodate = true;
+                return pcol->that_locked_page;
+        }
+}
+
+static void __r4w_put_page(void *priv, struct page *page)
+{
+        struct page_collect *pcol = priv;
+
+        if (pcol->that_locked_page != page) {
+                EXOFS_DBGMSG("index=0x%lx\n", page->index);
+                page_cache_release(page);
+                return;
+        }
+        EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index);
+}
+
+static const struct _ore_r4w_op _r4w_op = {
+        .get_page = &__r4w_get_page,
+        .put_page = &__r4w_put_page,
+};
+
 static int write_exec(struct page_collect *pcol)
 {
         struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -516,10 +626,9 @@ static int write_exec(struct page_collect *pcol)
                 return 0;
 
         BUG_ON(pcol->ios);
-        ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false,
+        ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
                                pcol->pg_first << PAGE_CACHE_SHIFT,
                                pcol->length, &pcol->ios);
-
         if (unlikely(ret))
                 goto err;
 
@@ -534,10 +643,20 @@ static int write_exec(struct page_collect *pcol)
 
         ios = pcol->ios;
         ios->pages = pcol_copy->pages;
-        ios->nr_pages = pcol_copy->nr_pages;
         ios->done = writepages_done;
+        ios->r4w = &_r4w_op;
         ios->private = pcol_copy;
 
+        /* pages ownership was passed to pcol_copy */
+        _pcol_reset(pcol);
+
+        ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+        if (unlikely(ret))
+                goto err;
+
+        EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+                pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
         ret = ore_write(ios);
         if (unlikely(ret)) {
                 EXOFS_ERR("write_exec: ore_write() Failed\n");
@@ -545,11 +664,6 @@ static int write_exec(struct page_collect *pcol)
         }
 
         atomic_inc(&pcol->sbi->s_curr_pending);
-        EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
-                  pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
-                  pcol->length);
-        /* pages ownership was passed to pcol_copy */
-        _pcol_reset(pcol);
         return 0;
 
 err:
@@ -689,14 +803,33 @@ static int exofs_writepages(struct address_space *mapping,
         _pcol_init(&pcol, expected_pages, mapping->host);
 
         ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
-        if (ret) {
+        if (unlikely(ret)) {
                 EXOFS_ERR("write_cache_pages => %d\n", ret);
                 return ret;
         }
 
-        return write_exec(&pcol);
+        ret = write_exec(&pcol);
+        if (unlikely(ret))
+                return ret;
+
+        if (wbc->sync_mode == WB_SYNC_ALL) {
+                return write_exec(&pcol); /* pump the last reminder */
+        } else if (pcol.nr_pages) {
+                /* not SYNC let the reminder join the next writeout */
+                unsigned i;
+
+                for (i = 0; i < pcol.nr_pages; i++) {
+                        struct page *page = pcol.pages[i];
+
+                        end_page_writeback(page);
+                        set_page_dirty(page);
+                        unlock_page(page);
+                }
+        }
+        return 0;
 }
 
+/*
 static int exofs_writepage(struct page *page, struct writeback_control *wbc)
 {
         struct page_collect pcol;
@@ -712,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
 
         return write_exec(&pcol);
 }
-
+*/
 /* i_mutex held using inode->i_size directly */
 static void _write_failed(struct inode *inode, loff_t to)
 {
@@ -818,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset)
 const struct address_space_operations exofs_aops = {
         .readpage       = exofs_readpage,
         .readpages      = exofs_readpages,
-        .writepage      = exofs_writepage,
+        .writepage      = NULL,
         .writepages     = exofs_writepages,
         .write_begin    = exofs_write_begin_export,
         .write_end      = exofs_write_end,
@@ -860,7 +993,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)
 
         inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
-        ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize);
+        ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
         if (likely(!ret))
                 truncate_setsize(inode, newsize);
 
@@ -927,14 +1060,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
         struct exofs_on_disk_inode_layout *layout;
         int ret;
 
-        ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+        ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
         if (unlikely(ret)) {
                 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
                 return ret;
         }
 
-        attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
-        attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
+        attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
+        attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
 
         ios->in_attr = attrs;
         ios->in_attr_len = ARRAY_SIZE(attrs);
@@ -1018,7 +1151,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
                 return inode;
         oi = exofs_i(inode);
         __oi_init(oi);
-        exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
+        exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
                          exofs_oi_objno(oi));
 
         /* read the inode from the osd */
@@ -1032,7 +1165,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
         inode->i_mode = le16_to_cpu(fcb.i_mode);
         inode->i_uid = le32_to_cpu(fcb.i_uid);
         inode->i_gid = le32_to_cpu(fcb.i_gid);
-        inode->i_nlink = le16_to_cpu(fcb.i_links_count);
+        set_nlink(inode, le16_to_cpu(fcb.i_links_count));
         inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
         inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
         inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
@@ -1172,13 +1305,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
         spin_unlock(&sbi->s_next_gen_lock);
         insert_inode_hash(inode);
 
-        exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
+        exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
                          exofs_oi_objno(oi));
         exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
 
         mark_inode_dirty(inode);
 
-        ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+        ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
         if (unlikely(ret)) {
                 EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
                 return ERR_PTR(ret);
@@ -1267,7 +1400,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
         } else
                 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
 
-        ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+        ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
         if (unlikely(ret)) {
                 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
                 goto free_args;
@@ -1350,7 +1483,7 @@ void exofs_evict_inode(struct inode *inode)
         /* ignore the error, attempt a remove anyway */
 
         /* Now Remove the OSD objects */
-        ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+        ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
         if (unlikely(ret)) {
                 EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
                 return;
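
With RAID5 in the picture, ore_get_rw_state() may trim an IO so it ends on a stripe or group boundary; _maybe_not_all_in_one_io() then moves the leftover pages into the next page_collect, which is why exofs_writepages() above calls write_exec() a second time in the WB_SYNC_ALL case. The shape of that contract, condensed into a sketch (submit_all is a hypothetical name, not in the patch; the real code is the two branches above):

static int submit_all(struct page_collect *pcol)
{
        int ret;

        do {
                /* each call may cover only a prefix of the collected
                 * range; the remainder is moved back into *pcol by
                 * _maybe_not_all_in_one_io()
                 */
                ret = write_exec(pcol);
                if (unlikely(ret))
                        return ret;
        } while (pcol->nr_pages);       /* leftover pages => another IO */

        return 0;
}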
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 25305af88198..d271ad837202 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -23,77 +23,289 @@
  */
 
 #include <linux/slab.h>
+#include <linux/module.h>
 #include <asm/div64.h>
+#include <linux/lcm.h>
 
-#include <scsi/osd_ore.h>
+#include "ore_raid.h"
 
-#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
+MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
+MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
+MODULE_LICENSE("GPL");
+
+/* ore_verify_layout does a couple of things:
+ * 1. Given a minimum number of needed parameters fixes up the rest of the
+ *    members to be operatonals for the ore. The needed parameters are those
+ *    that are defined by the pnfs-objects layout STD.
+ * 2. Check to see if the current ore code actually supports these parameters
+ *    for example stripe_unit must be a multple of the system PAGE_SIZE,
+ *    and etc...
+ * 3. Cache some havily used calculations that will be needed by users.
+ */
+
+enum { BIO_MAX_PAGES_KMALLOC =
+        (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
 
-#ifdef CONFIG_EXOFS_DEBUG
-#define ORE_DBGMSG(fmt, a...) \
-        printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define ORE_DBGMSG(fmt, a...) \
-        do { if (0) printk(fmt, ##a); } while (0)
-#endif
+int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
+{
+        u64 stripe_length;
+
+        switch (layout->raid_algorithm) {
+        case PNFS_OSD_RAID_0:
+                layout->parity = 0;
+                break;
+        case PNFS_OSD_RAID_5:
+                layout->parity = 1;
+                break;
+        case PNFS_OSD_RAID_PQ:
+        case PNFS_OSD_RAID_4:
+        default:
+                ORE_ERR("Only RAID_0/5 for now\n");
+                return -EINVAL;
+        }
+        if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
+                ORE_ERR("Stripe Unit(0x%llx)"
+                          " must be Multples of PAGE_SIZE(0x%lx)\n",
+                          _LLU(layout->stripe_unit), PAGE_SIZE);
+                return -EINVAL;
+        }
+        if (layout->group_width) {
+                if (!layout->group_depth) {
+                        ORE_ERR("group_depth == 0 && group_width != 0\n");
+                        return -EINVAL;
+                }
+                if (total_comps < (layout->group_width * layout->mirrors_p1)) {
+                        ORE_ERR("Data Map wrong, "
+                                "numdevs=%d < group_width=%d * mirrors=%d\n",
+                                total_comps, layout->group_width,
+                                layout->mirrors_p1);
+                        return -EINVAL;
+                }
+                layout->group_count = total_comps / layout->mirrors_p1 /
+                                                layout->group_width;
+        } else {
+                if (layout->group_depth) {
+                        printk(KERN_NOTICE "Warning: group_depth ignored "
+                                "group_width == 0 && group_depth == %lld\n",
+                                _LLU(layout->group_depth));
+                }
+                layout->group_width = total_comps / layout->mirrors_p1;
+                layout->group_depth = -1;
+                layout->group_count = 1;
+        }
 
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
+        stripe_length = (u64)layout->group_width * layout->stripe_unit;
+        if (stripe_length >= (1ULL << 32)) {
+                ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
+                        _LLU(stripe_length));
+                return -EINVAL;
+        }
 
-#define ORE_DBGMSG2(M...) do {} while (0)
-/* #define ORE_DBGMSG2 ORE_DBGMSG */
+        layout->max_io_length =
+                (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
+                                                        layout->group_width;
+        if (layout->parity) {
+                unsigned stripe_length =
+                                (layout->group_width - layout->parity) *
+                                layout->stripe_unit;
 
-MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
-MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
-MODULE_LICENSE("GPL");
+                layout->max_io_length /= stripe_length;
+                layout->max_io_length *= stripe_length;
+        }
+        return 0;
+}
+EXPORT_SYMBOL(ore_verify_layout);
 
 static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
 {
-        return ios->comps->comps[index & ios->comps->single_comp].cred;
+        return ios->oc->comps[index & ios->oc->single_comp].cred;
 }
 
 static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
 {
-        return &ios->comps->comps[index & ios->comps->single_comp].obj;
+        return &ios->oc->comps[index & ios->oc->single_comp].obj;
 }
 
 static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
 {
-        return ios->comps->ods[index];
+        ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
+                    ios->oc->first_dev, ios->oc->numdevs, index,
+                    ios->oc->ods);
+
+        return ore_comp_dev(ios->oc, index);
 }
 
-int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
+int _ore_get_io_state(struct ore_layout *layout,
+                        struct ore_components *oc, unsigned numdevs,
+                        unsigned sgs_per_dev, unsigned num_par_pages,
+                        struct ore_io_state **pios)
+{
+        struct ore_io_state *ios;
+        struct page **pages;
+        struct osd_sg_entry *sgilist;
+        struct __alloc_all_io_state {
+                struct ore_io_state ios;
+                struct ore_per_dev_state per_dev[numdevs];
+                union {
+                        struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+                        struct page *pages[num_par_pages];
+                };
+        } *_aios;
+
+        if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
+                _aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
+                if (unlikely(!_aios)) {
+                        ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
+                                   sizeof(*_aios));
+                        *pios = NULL;
+                        return -ENOMEM;
+                }
+                pages = num_par_pages ? _aios->pages : NULL;
+                sgilist = sgs_per_dev ? _aios->sglist : NULL;
+                ios = &_aios->ios;
+        } else {
+                struct __alloc_small_io_state {
+                        struct ore_io_state ios;
+                        struct ore_per_dev_state per_dev[numdevs];
+                } *_aio_small;
+                union __extra_part {
+                        struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+                        struct page *pages[num_par_pages];
+                } *extra_part;
+
+                _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
+                if (unlikely(!_aio_small)) {
+                        ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
+                                   sizeof(*_aio_small));
+                        *pios = NULL;
+                        return -ENOMEM;
+                }
+                extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
+                if (unlikely(!extra_part)) {
+                        ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
+                                   sizeof(*extra_part));
+                        kfree(_aio_small);
+                        *pios = NULL;
+                        return -ENOMEM;
+                }
+
+                pages = num_par_pages ? extra_part->pages : NULL;
+                sgilist = sgs_per_dev ? extra_part->sglist : NULL;
+                /* In this case the per_dev[0].sgilist holds the pointer to
+                 * be freed
+                 */
+                ios = &_aio_small->ios;
+                ios->extra_part_alloc = true;
+        }
+
+        if (pages) {
+                ios->parity_pages = pages;
+                ios->max_par_pages = num_par_pages;
+        }
+        if (sgilist) {
+                unsigned d;
+
+                for (d = 0; d < numdevs; ++d) {
+                        ios->per_dev[d].sglist = sgilist;
+                        sgilist += sgs_per_dev;
+                }
+                ios->sgs_per_dev = sgs_per_dev;
+        }
+
+        ios->layout = layout;
+        ios->oc = oc;
+        *pios = ios;
+        return 0;
+}
+
+/* Allocate an io_state for only a single group of devices
+ *
+ * If a user needs to call ore_read/write() this version must be used becase it
+ * allocates extra stuff for striping and raid.
+ * The ore might decide to only IO less then @length bytes do to alignmets
+ * and constrains as follows:
+ * - The IO cannot cross group boundary.
+ * - In raid5/6 The end of the IO must align at end of a stripe eg.
+ *   (@offset + @length) % strip_size == 0. Or the complete range is within a
+ *   single stripe.
+ * - Memory condition only permitted a shorter IO. (A user can use @length=~0
+ *   And check the returned ios->length for max_io_size.)
+ *
+ * The caller must check returned ios->length (and/or ios->nr_pages) and
+ * re-issue these pages that fall outside of ios->length
+ */
+int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
                      bool is_reading, u64 offset, u64 length,
                      struct ore_io_state **pios)
 {
         struct ore_io_state *ios;
+        unsigned numdevs = layout->group_width * layout->mirrors_p1;
+        unsigned sgs_per_dev = 0, max_par_pages = 0;
+        int ret;
 
-        /*TODO: Maybe use kmem_cach per sbi of size
-         * exofs_io_state_size(layout->s_numdevs)
-         */
-        ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL);
-        if (unlikely(!ios)) {
-                ORE_DBGMSG("Failed kzalloc bytes=%d\n",
-                           ore_io_state_size(comps->numdevs));
-                *pios = NULL;
-                return -ENOMEM;
+        if (layout->parity && length) {
+                unsigned data_devs = layout->group_width - layout->parity;
+                unsigned stripe_size = layout->stripe_unit * data_devs;
+                unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+                u32 remainder;
+                u64 num_stripes;
+                u64 num_raid_units;
+
+                num_stripes = div_u64_rem(length, stripe_size, &remainder);
+                if (remainder)
+                        ++num_stripes;
+
+                num_raid_units = num_stripes * layout->parity;
+
+                if (is_reading) {
+                        /* For reads add per_dev sglist array */
+                        /* TODO: Raid 6 we need twice more. Actually:
+                         * num_stripes / LCMdP(W,P);
+                         * if (W%P != 0) num_stripes *= parity;
+                         */
+
+                        /* first/last seg is split */
+                        num_raid_units += layout->group_width;
+                        sgs_per_dev = div_u64(num_raid_units, data_devs);
+                } else {
+                        /* For Writes add parity pages array. */
+                        max_par_pages = num_raid_units * pages_in_unit *
+                                                sizeof(struct page *);
+                }
         }
 
-        ios->layout = layout;
-        ios->comps = comps;
-        ios->offset = offset;
-        ios->length = length;
+        ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
+                                pios);
+        if (unlikely(ret))
+                return ret;
+
+        ios = *pios;
         ios->reading = is_reading;
+        ios->offset = offset;
+
+        if (length) {
+                ore_calc_stripe_info(layout, offset, length, &ios->si);
+                ios->length = ios->si.length;
+                ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+                if (layout->parity)
+                        _ore_post_alloc_raid_stuff(ios);
+        }
 
-        *pios = ios;
         return 0;
 }
 EXPORT_SYMBOL(ore_get_rw_state);
 
-int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps,
-                     struct ore_io_state **ios)
+/* Allocate an io_state for all the devices in the comps array
+ *
+ * This version of io_state allocation is used mostly by create/remove
+ * and trunc where we currently need all the devices. The only wastful
+ * bit is the read/write_attributes with no IO. Those sites should
+ * be converted to use ore_get_rw_state() with length=0
+ */
+int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
+                     struct ore_io_state **pios)
 {
-        return ore_get_rw_state(layout, comps, true, 0, 0, ios);
+        return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
 }
 EXPORT_SYMBOL(ore_get_io_state);
 
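
The comment block above ore_get_rw_state() defines the new short-IO contract. A sketch of a conforming caller (read_in_chunks is a hypothetical helper; ore_get_rw_state, ore_read and ore_put_io_state are the real entry points):

static int read_in_chunks(struct ore_layout *layout,
                          struct ore_components *oc,
                          u64 offset, u64 length, struct page **pages)
{
        while (length) {
                struct ore_io_state *ios;
                unsigned done_pages;
                u64 done;
                int ret;

                ret = ore_get_rw_state(layout, oc, true, offset, length, &ios);
                if (unlikely(ret))
                        return ret;

                /* the ore may have trimmed the IO: ios->length <= length */
                ios->pages = pages;
                ret = ore_read(ios);    /* sync, since ios->done == NULL */
                done = ios->length;
                done_pages = ios->nr_pages;
                ore_put_io_state(ios);
                if (unlikely(ret))
                        return ret;

                /* re-issue whatever fell outside of ios->length */
                offset += done;
                length -= done;
                pages += done_pages;
        }
        return 0;
}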
@@ -111,6 +323,7 @@ void ore_put_io_state(struct ore_io_state *ios)
                                 bio_put(per_dev->bio);
                 }
 
+                _ore_free_raid_stuff(ios);
                 kfree(ios);
         }
 }
@@ -138,7 +351,7 @@ static void _done_io(struct osd_request *or, void *p)
         kref_put(&ios->kref, _last_io);
 }
 
-static int ore_io_execute(struct ore_io_state *ios)
+int ore_io_execute(struct ore_io_state *ios)
 {
         DECLARE_COMPLETION_ONSTACK(wait);
         bool sync = (ios->done == NULL);
@@ -198,7 +411,7 @@ static void _clear_bio(struct bio *bio)
         }
 }
 
-int ore_check_io(struct ore_io_state *ios, u64 *resid)
+int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
 {
         enum osd_err_priority acumulated_osd_err = 0;
         int acumulated_lin_err = 0;
@@ -206,7 +419,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)
 
         for (i = 0; i < ios->numdevs; i++) {
                 struct osd_sense_info osi;
-                struct osd_request *or = ios->per_dev[i].or;
+                struct ore_per_dev_state *per_dev = &ios->per_dev[i];
+                struct osd_request *or = per_dev->or;
                 int ret;
 
                 if (unlikely(!or))
@@ -218,29 +432,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)
 
                 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
                         /* start read offset passed endof file */
-                        _clear_bio(ios->per_dev[i].bio);
+                        _clear_bio(per_dev->bio);
                         ORE_DBGMSG("start read offset passed end of file "
                                   "offset=0x%llx, length=0x%llx\n",
-                                  _LLU(ios->per_dev[i].offset),
-                                  _LLU(ios->per_dev[i].length));
+                                  _LLU(per_dev->offset),
+                                  _LLU(per_dev->length));
 
                         continue; /* we recovered */
                 }
 
+                if (on_dev_error) {
+                        u64 residual = ios->reading ?
+                                        or->in.residual : or->out.residual;
+                        u64 offset = (ios->offset + ios->length) - residual;
+                        struct ore_dev *od = ios->oc->ods[
+                                        per_dev->dev - ios->oc->first_dev];
+
+                        on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri,
+                                     offset, residual);
+                }
                 if (osi.osd_err_pri >= acumulated_osd_err) {
                         acumulated_osd_err = osi.osd_err_pri;
                         acumulated_lin_err = ret;
                 }
         }
 
-        /* TODO: raid specific residual calculations */
-        if (resid) {
-                if (likely(!acumulated_lin_err))
-                        *resid = 0;
-                else
-                        *resid = ios->length;
-        }
-
         return acumulated_lin_err;
 }
 EXPORT_SYMBOL(ore_check_io);
@@ -248,61 +464,65 @@ EXPORT_SYMBOL(ore_check_io);
 /*
  * L - logical offset into the file
  *
- * U - The number of bytes in a stripe within a group
+ * D - number of Data devices
+ *      D = group_width - parity
  *
- *      U = stripe_unit * group_width
+ * U - The number of bytes in a stripe within a group
+ *      U = stripe_unit * D
  *
  * T - The number of bytes striped within a group of component objects
  *      (before advancing to the next group)
- *
- *      T = stripe_unit * group_width * group_depth
+ *      T = U * group_depth
  *
  * S - The number of bytes striped across all component objects
  *      before the pattern repeats
+ *      S = T * group_count
  *
- *      S = stripe_unit * group_width * group_depth * group_count
- *
- * M - The "major" (i.e., across all components) stripe number
- *
+ * M - The "major" (i.e., across all components) cycle number
  *      M = L / S
 *
- * G - Counts the groups from the beginning of the major stripe
- *
+ * G - Counts the groups from the beginning of the major cycle
 *      G = (L - (M * S)) / T [or (L % S) / T]
 *
 * H - The byte offset within the group
- *
 *      H = (L - (M * S)) % T [or (L % S) % T]
 *
 * N - The "minor" (i.e., across the group) stripe number
- *
 *      N = H / U
 *
 * C - The component index coresponding to L
 *
- *      C = (H - (N * U)) / stripe_unit + G * group_width
- *      [or (L % U) / stripe_unit + G * group_width]
+ *      C = (H - (N * U)) / stripe_unit + G * D
+ *      [or (L % U) / stripe_unit + G * D]
 *
 * O - The component offset coresponding to L
- *
 *      O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
+ *
+ * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
+ *      divide by parity
+ *      LCMdP = lcm(group_width, parity) / parity
+ *
+ * R - The parity Rotation stripe
+ *      (Note parity cycle always starts at a group's boundary)
+ *      R = N % LCMdP
+ *
+ * I = the first parity device index
+ *      I = (group_width + group_width - R*parity - parity) % group_width
+ *
+ * Craid - The component index Rotated
+ *      Craid = (group_width + C - R*parity) % group_width
+ *      (We add the group_width to avoid negative numbers modulo math)
 */
-struct _striping_info {
-        u64 obj_offset;
-        u64 group_length;
-        u64 M; /* for truncate */
-        unsigned dev;
-        unsigned unit_off;
-};
-
-static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
-                              struct _striping_info *si)
+void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+                          u64 length, struct ore_striping_info *si)
 {
         u32 stripe_unit = layout->stripe_unit;
         u32 group_width = layout->group_width;
         u64 group_depth = layout->group_depth;
+        u32 parity = layout->parity;
 
-        u32 U = stripe_unit * group_width;
+        u32 D = group_width - parity;
+        u32 U = D * stripe_unit;
         u64 T = U * group_depth;
         u64 S = T * layout->group_count;
         u64 M = div64_u64(file_offset, S);
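
Plugging invented numbers into the rotation math above makes the RAID5 placement visible: with group_width = 4 and parity = 1, D = 3 and LCMdP = lcm(4, 1) / 1 = 4, so the parity unit walks one device to the left every minor stripe N. A sketch built from the formulas in the comment (parity_dev_of is a hypothetical helper, not in the patch; lcm() is the kernel's, from linux/lcm.h):

/* I = (group_width + group_width - R*parity - parity) % group_width */
static unsigned parity_dev_of(unsigned N, unsigned group_width,
                              unsigned parity)
{
        unsigned LCMdP = lcm(group_width, parity) / parity;
        unsigned RxP = (N % LCMdP) * parity;

        return (group_width + group_width - RxP - parity) % group_width;
}

/* group_width = 4, parity = 1: N = 0, 1, 2, 3 -> parity on devices
 * 3, 2, 1, 0, then the cycle repeats -- the classic rotating-parity
 * layout, matching the si->par_dev computation below.
 */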
| @@ -318,39 +538,65 @@ static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, | |||
| 318 | u32 N = div_u64(H, U); | 538 | u32 N = div_u64(H, U); |
| 319 | 539 | ||
| 320 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ | 540 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ |
| 321 | si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; | 541 | u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; |
| 322 | si->dev *= layout->mirrors_p1; | ||
| 323 | 542 | ||
| 324 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); | 543 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); |
| 325 | 544 | ||
| 326 | si->obj_offset = si->unit_off + (N * stripe_unit) + | 545 | si->obj_offset = si->unit_off + (N * stripe_unit) + |
| 327 | (M * group_depth * stripe_unit); | 546 | (M * group_depth * stripe_unit); |
| 328 | 547 | ||
| 329 | si->group_length = T - H; | 548 | if (parity) { |
| 549 | u32 LCMdP = lcm(group_width, parity) / parity; | ||
| 550 | /* R = N % LCMdP; */ | ||
| 551 | u32 RxP = (N % LCMdP) * parity; | ||
| 552 | u32 first_dev = C - C % group_width; | ||
| 553 | |||
| 554 | si->par_dev = (group_width + group_width - parity - RxP) % | ||
| 555 | group_width + first_dev; | ||
| 556 | si->dev = (group_width + C - RxP) % group_width + first_dev; | ||
| 557 | si->bytes_in_stripe = U; | ||
| 558 | si->first_stripe_start = M * S + G * T + N * U; | ||
| 559 | } else { | ||
| 560 | /* Make the math correct see _prepare_one_group */ | ||
| 561 | si->par_dev = group_width; | ||
| 562 | si->dev = C; | ||
| 563 | } | ||
| 564 | |||
| 565 | si->dev *= layout->mirrors_p1; | ||
| 566 | si->par_dev *= layout->mirrors_p1; | ||
| 567 | si->offset = file_offset; | ||
| 568 | si->length = T - H; | ||
| 569 | if (si->length > length) | ||
| 570 | si->length = length; | ||
| 330 | si->M = M; | 571 | si->M = M; |
| 331 | } | 572 | } |
| 573 | EXPORT_SYMBOL(ore_calc_stripe_info); | ||
| 332 | 574 | ||
| 333 | static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | 575 | int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, |
| 334 | unsigned pgbase, struct ore_per_dev_state *per_dev, | 576 | unsigned pgbase, struct page **pages, |
| 335 | int cur_len) | 577 | struct ore_per_dev_state *per_dev, int cur_len) |
| 336 | { | 578 | { |
| 337 | unsigned pg = *cur_pg; | 579 | unsigned pg = *cur_pg; |
| 338 | struct request_queue *q = | 580 | struct request_queue *q = |
| 339 | osd_request_queue(_ios_od(ios, per_dev->dev)); | 581 | osd_request_queue(_ios_od(ios, per_dev->dev)); |
| 340 | 582 | unsigned len = cur_len; | |
| 341 | per_dev->length += cur_len; | 583 | int ret; |
| 342 | 584 | ||
| 343 | if (per_dev->bio == NULL) { | 585 | if (per_dev->bio == NULL) { |
| 344 | unsigned pages_in_stripe = ios->layout->group_width * | 586 | unsigned pages_in_stripe = ios->layout->group_width * |
| 345 | (ios->layout->stripe_unit / PAGE_SIZE); | 587 | (ios->layout->stripe_unit / PAGE_SIZE); |
| 346 | unsigned bio_size = (ios->nr_pages + pages_in_stripe) / | 588 | unsigned nr_pages = ios->nr_pages * ios->layout->group_width / |
| 347 | ios->layout->group_width; | 589 | (ios->layout->group_width - |
| 590 | ios->layout->parity); | ||
| 591 | unsigned bio_size = (nr_pages + pages_in_stripe) / | ||
| 592 | ios->layout->group_width; | ||
| 348 | 593 | ||
| 349 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); | 594 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); |
| 350 | if (unlikely(!per_dev->bio)) { | 595 | if (unlikely(!per_dev->bio)) { |
| 351 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", | 596 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", |
| 352 | bio_size); | 597 | bio_size); |
| 353 | return -ENOMEM; | 598 | ret = -ENOMEM; |
| 599 | goto out; | ||
| 354 | } | 600 | } |
| 355 | } | 601 | } |
| 356 | 602 | ||
| @@ -358,64 +604,90 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | |||
| 358 | unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); | 604 | unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); |
| 359 | unsigned added_len; | 605 | unsigned added_len; |
| 360 | 606 | ||
| 361 | BUG_ON(ios->nr_pages <= pg); | ||
| 362 | cur_len -= pglen; | 607 | cur_len -= pglen; |
| 363 | 608 | ||
| 364 | added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], | 609 | added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], |
| 365 | pglen, pgbase); | 610 | pglen, pgbase); |
| 366 | if (unlikely(pglen != added_len)) | 611 | if (unlikely(pglen != added_len)) { |
| 367 | return -ENOMEM; | 612 | ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", |
| 613 | per_dev->bio->bi_vcnt); | ||
| 614 | ret = -ENOMEM; | ||
| 615 | goto out; | ||
| 616 | } | ||
| 617 | _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); | ||
| 618 | |||
| 368 | pgbase = 0; | 619 | pgbase = 0; |
| 369 | ++pg; | 620 | ++pg; |
| 370 | } | 621 | } |
| 371 | BUG_ON(cur_len); | 622 | BUG_ON(cur_len); |
| 372 | 623 | ||
| 624 | per_dev->length += len; | ||
| 373 | *cur_pg = pg; | 625 | *cur_pg = pg; |
| 374 | return 0; | 626 | ret = 0; |
| 627 | out: /* we fail the complete unit on an error, i.e. don't advance | ||
| 628 | * per_dev->length and cur_pg. This means that we might have a bigger | ||
| 629 | * bio than the CDB requested length (per_dev->length). That's fine; | ||
| 630 | * only the opposite is fatal. | ||
| 631 | */ | ||
| 632 | return ret; | ||
| 375 | } | 633 | } |
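
The bio sizing in this function inflates ios->nr_pages by group_width / (group_width - parity), since each device's bio must also carry its share of the parity pages. A quick stand-alone check of that arithmetic (all numbers illustrative, not from the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned group_width = 4, parity = 1, stripe_unit = 64 * 1024;
		unsigned page_size = 4096, io_pages = 48;
		unsigned pages_in_stripe = group_width * (stripe_unit / page_size);
		unsigned nr_pages = io_pages * group_width / (group_width - parity);
		unsigned bio_size = (nr_pages + pages_in_stripe) / group_width;

		/* 48 data pages become 64 total pages; 32 per-device slots */
		printf("per-device bio_size = %u pages\n", bio_size);
		return 0;
	}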
| 376 | 634 | ||
| 377 | static int _prepare_one_group(struct ore_io_state *ios, u64 length, | 635 | static int _prepare_for_striping(struct ore_io_state *ios) |
| 378 | struct _striping_info *si) | ||
| 379 | { | 636 | { |
| 637 | struct ore_striping_info *si = &ios->si; | ||
| 380 | unsigned stripe_unit = ios->layout->stripe_unit; | 638 | unsigned stripe_unit = ios->layout->stripe_unit; |
| 381 | unsigned mirrors_p1 = ios->layout->mirrors_p1; | 639 | unsigned mirrors_p1 = ios->layout->mirrors_p1; |
| 382 | unsigned devs_in_group = ios->layout->group_width * mirrors_p1; | 640 | unsigned group_width = ios->layout->group_width; |
| 641 | unsigned devs_in_group = group_width * mirrors_p1; | ||
| 383 | unsigned dev = si->dev; | 642 | unsigned dev = si->dev; |
| 384 | unsigned first_dev = dev - (dev % devs_in_group); | 643 | unsigned first_dev = dev - (dev % devs_in_group); |
| 385 | unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; | 644 | unsigned dev_order; |
| 386 | unsigned cur_pg = ios->pages_consumed; | 645 | unsigned cur_pg = ios->pages_consumed; |
| 646 | u64 length = ios->length; | ||
| 387 | int ret = 0; | 647 | int ret = 0; |
| 388 | 648 | ||
| 649 | if (!ios->pages) { | ||
| 650 | ios->numdevs = ios->layout->mirrors_p1; | ||
| 651 | return 0; | ||
| 652 | } | ||
| 653 | |||
| 654 | BUG_ON(length > si->length); | ||
| 655 | |||
| 656 | dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); | ||
| 657 | si->cur_comp = dev_order; | ||
| 658 | si->cur_pg = si->unit_off / PAGE_SIZE; | ||
| 659 | |||
| 389 | while (length) { | 660 | while (length) { |
| 390 | struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; | 661 | unsigned comp = dev - first_dev; |
| 662 | struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; | ||
| 391 | unsigned cur_len, page_off = 0; | 663 | unsigned cur_len, page_off = 0; |
| 392 | 664 | ||
| 393 | if (!per_dev->length) { | 665 | if (!per_dev->length) { |
| 394 | per_dev->dev = dev; | 666 | per_dev->dev = dev; |
| 395 | if (dev < si->dev) { | 667 | if (dev == si->dev) { |
| 396 | per_dev->offset = si->obj_offset + stripe_unit - | 668 | WARN_ON(dev == si->par_dev); |
| 397 | si->unit_off; | ||
| 398 | cur_len = stripe_unit; | ||
| 399 | } else if (dev == si->dev) { | ||
| 400 | per_dev->offset = si->obj_offset; | 669 | per_dev->offset = si->obj_offset; |
| 401 | cur_len = stripe_unit - si->unit_off; | 670 | cur_len = stripe_unit - si->unit_off; |
| 402 | page_off = si->unit_off & ~PAGE_MASK; | 671 | page_off = si->unit_off & ~PAGE_MASK; |
| 403 | BUG_ON(page_off && (page_off != ios->pgbase)); | 672 | BUG_ON(page_off && (page_off != ios->pgbase)); |
| 404 | } else { /* dev > si->dev */ | 673 | } else { |
| 405 | per_dev->offset = si->obj_offset - si->unit_off; | 674 | if (si->cur_comp > dev_order) |
| 675 | per_dev->offset = | ||
| 676 | si->obj_offset - si->unit_off; | ||
| 677 | else /* si->cur_comp < dev_order */ | ||
| 678 | per_dev->offset = | ||
| 679 | si->obj_offset + stripe_unit - | ||
| 680 | si->unit_off; | ||
| 406 | cur_len = stripe_unit; | 681 | cur_len = stripe_unit; |
| 407 | } | 682 | } |
| 408 | |||
| 409 | if (max_comp < dev) | ||
| 410 | max_comp = dev; | ||
| 411 | } else { | 683 | } else { |
| 412 | cur_len = stripe_unit; | 684 | cur_len = stripe_unit; |
| 413 | } | 685 | } |
| 414 | if (cur_len >= length) | 686 | if (cur_len >= length) |
| 415 | cur_len = length; | 687 | cur_len = length; |
| 416 | 688 | ||
| 417 | ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, | 689 | ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, |
| 418 | cur_len); | 690 | per_dev, cur_len); |
| 419 | if (unlikely(ret)) | 691 | if (unlikely(ret)) |
| 420 | goto out; | 692 | goto out; |
| 421 | 693 | ||
| @@ -423,60 +695,60 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, | |||
| 423 | dev = (dev % devs_in_group) + first_dev; | 695 | dev = (dev % devs_in_group) + first_dev; |
| 424 | 696 | ||
| 425 | length -= cur_len; | 697 | length -= cur_len; |
| 426 | } | ||
| 427 | out: | ||
| 428 | ios->numdevs = max_comp + mirrors_p1; | ||
| 429 | ios->pages_consumed = cur_pg; | ||
| 430 | return ret; | ||
| 431 | } | ||
| 432 | |||
| 433 | static int _prepare_for_striping(struct ore_io_state *ios) | ||
| 434 | { | ||
| 435 | u64 length = ios->length; | ||
| 436 | u64 offset = ios->offset; | ||
| 437 | struct _striping_info si; | ||
| 438 | int ret = 0; | ||
| 439 | 698 | ||
| 440 | if (!ios->pages) { | 699 | si->cur_comp = (si->cur_comp + 1) % group_width; |
| 441 | if (ios->kern_buff) { | 700 | if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { |
| 442 | struct ore_per_dev_state *per_dev = &ios->per_dev[0]; | 701 | if (!length && ios->sp2d) { |
| 702 | /* If we are writing and this is the very last | ||
| 703 | * stripe, then operate on the parity dev. | ||
| 704 | */ | ||
| 705 | dev = si->par_dev; | ||
| 706 | } | ||
| 707 | if (ios->sp2d) | ||
| 708 | /* In writes cur_len only signals whether this is the | ||
| 709 | * last unit. See _ore_add_parity_unit. | ||
| 710 | */ | ||
| 711 | cur_len = length; | ||
| 712 | per_dev = &ios->per_dev[dev - first_dev]; | ||
| 713 | if (!per_dev->length) { | ||
| 714 | /* Only/always the parity unit of the first | ||
| 715 | * stripe will be empty. So this is a chance to | ||
| 716 | * initialize the per_dev info. | ||
| 717 | */ | ||
| 718 | per_dev->dev = dev; | ||
| 719 | per_dev->offset = si->obj_offset - si->unit_off; | ||
| 720 | } | ||
| 443 | 721 | ||
| 444 | _calc_stripe_info(ios->layout, ios->offset, &si); | 722 | ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); |
| 445 | per_dev->offset = si.obj_offset; | 723 | if (unlikely(ret)) |
| 446 | per_dev->dev = si.dev; | 724 | goto out; |
| 447 | 725 | ||
| 448 | /* no cross device without page array */ | 726 | /* Rotate next par_dev backwards with wrapping */
| 449 | BUG_ON((ios->layout->group_width > 1) && | 727 | si->par_dev = (devs_in_group + si->par_dev - |
| 450 | (si.unit_off + ios->length > | 728 | ios->layout->parity * mirrors_p1) % |
| 451 | ios->layout->stripe_unit)); | 729 | devs_in_group + first_dev; |
| 730 | /* Next stripe, start fresh */ | ||
| 731 | si->cur_comp = 0; | ||
| 732 | si->cur_pg = 0; | ||
| 452 | } | 733 | } |
| 453 | ios->numdevs = ios->layout->mirrors_p1; | ||
| 454 | return 0; | ||
| 455 | } | ||
| 456 | |||
| 457 | while (length) { | ||
| 458 | _calc_stripe_info(ios->layout, offset, &si); | ||
| 459 | |||
| 460 | if (length < si.group_length) | ||
| 461 | si.group_length = length; | ||
| 462 | |||
| 463 | ret = _prepare_one_group(ios, si.group_length, &si); | ||
| 464 | if (unlikely(ret)) | ||
| 465 | goto out; | ||
| 466 | |||
| 467 | offset += si.group_length; | ||
| 468 | length -= si.group_length; | ||
| 469 | } | 734 | } |
| 470 | |||
| 471 | out: | 735 | out: |
| 472 | return ret; | 736 | ios->numdevs = devs_in_group; |
| 737 | ios->pages_consumed = cur_pg; | ||
| 738 | if (unlikely(ret)) { | ||
| 739 | if (length == ios->length) | ||
| 740 | return ret; | ||
| 741 | else | ||
| 742 | ios->length -= length; | ||
| 743 | } | ||
| 744 | return 0; | ||
| 473 | } | 745 | } |
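
The backwards par_dev rotation at the end of each stripe is what yields the layout shown in the sketch after ore_calc_stripe_info. A toy replay of just that step (assumed devs_in_group=4, parity=1, mirrors_p1=1, first_dev=0; values are illustrative):

	#include <stdio.h>

	int main(void)
	{
		unsigned devs_in_group = 4, parity = 1, mirrors_p1 = 1;
		unsigned first_dev = 0;
		unsigned par_dev = 3;	/* stripe 0 parity on the last dev */

		for (unsigned stripe = 0; stripe < 6; stripe++) {
			printf("stripe %u: par_dev=%u\n", stripe, par_dev);
			/* same expression as in _prepare_for_striping */
			par_dev = (devs_in_group + par_dev -
				   parity * mirrors_p1) % devs_in_group +
				  first_dev;
		}
		return 0;
	}

This prints par_dev = 3, 2, 1, 0, 3, 2: the parity slot wraps around once every devs_in_group/parity stripes.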
| 474 | 746 | ||
| 475 | int ore_create(struct ore_io_state *ios) | 747 | int ore_create(struct ore_io_state *ios) |
| 476 | { | 748 | { |
| 477 | int i, ret; | 749 | int i, ret; |
| 478 | 750 | ||
| 479 | for (i = 0; i < ios->comps->numdevs; i++) { | 751 | for (i = 0; i < ios->oc->numdevs; i++) { |
| 480 | struct osd_request *or; | 752 | struct osd_request *or; |
| 481 | 753 | ||
| 482 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); | 754 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); |
| @@ -501,7 +773,7 @@ int ore_remove(struct ore_io_state *ios) | |||
| 501 | { | 773 | { |
| 502 | int i, ret; | 774 | int i, ret; |
| 503 | 775 | ||
| 504 | for (i = 0; i < ios->comps->numdevs; i++) { | 776 | for (i = 0; i < ios->oc->numdevs; i++) { |
| 505 | struct osd_request *or; | 777 | struct osd_request *or; |
| 506 | 778 | ||
| 507 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); | 779 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); |
| @@ -543,7 +815,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) | |||
| 543 | goto out; | 815 | goto out; |
| 544 | } | 816 | } |
| 545 | per_dev->or = or; | 817 | per_dev->or = or; |
| 546 | per_dev->offset = master_dev->offset; | ||
| 547 | 818 | ||
| 548 | if (ios->pages) { | 819 | if (ios->pages) { |
| 549 | struct bio *bio; | 820 | struct bio *bio; |
| @@ -562,6 +833,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) | |||
| 562 | __bio_clone(bio, master_dev->bio); | 833 | __bio_clone(bio, master_dev->bio); |
| 563 | bio->bi_bdev = NULL; | 834 | bio->bi_bdev = NULL; |
| 564 | bio->bi_next = NULL; | 835 | bio->bi_next = NULL; |
| 836 | per_dev->offset = master_dev->offset; | ||
| 565 | per_dev->length = master_dev->length; | 837 | per_dev->length = master_dev->length; |
| 566 | per_dev->bio = bio; | 838 | per_dev->bio = bio; |
| 567 | per_dev->dev = dev; | 839 | per_dev->dev = dev; |
| @@ -579,7 +851,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) | |||
| 579 | _LLU(per_dev->offset), | 851 | _LLU(per_dev->offset), |
| 580 | _LLU(per_dev->length), dev); | 852 | _LLU(per_dev->length), dev); |
| 581 | } else if (ios->kern_buff) { | 853 | } else if (ios->kern_buff) { |
| 582 | ret = osd_req_write_kern(or, _ios_obj(ios, dev), | 854 | per_dev->offset = ios->si.obj_offset; |
| 855 | per_dev->dev = ios->si.dev + dev; | ||
| 856 | |||
| 857 | /* no cross device without page array */ | ||
| 858 | BUG_ON((ios->layout->group_width > 1) && | ||
| 859 | (ios->si.unit_off + ios->length > | ||
| 860 | ios->layout->stripe_unit)); | ||
| 861 | |||
| 862 | ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), | ||
| 583 | per_dev->offset, | 863 | per_dev->offset, |
| 584 | ios->kern_buff, ios->length); | 864 | ios->kern_buff, ios->length); |
| 585 | if (unlikely(ret)) | 865 | if (unlikely(ret)) |
| @@ -588,7 +868,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) | |||
| 588 | "length=0x%llx dev=%d\n", | 868 | "length=0x%llx dev=%d\n", |
| 589 | _LLU(_ios_obj(ios, dev)->id), | 869 | _LLU(_ios_obj(ios, dev)->id), |
| 590 | _LLU(per_dev->offset), | 870 | _LLU(per_dev->offset), |
| 591 | _LLU(ios->length), dev); | 871 | _LLU(ios->length), per_dev->dev); |
| 592 | } else { | 872 | } else { |
| 593 | osd_req_set_attributes(or, _ios_obj(ios, dev)); | 873 | osd_req_set_attributes(or, _ios_obj(ios, dev)); |
| 594 | ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", | 874 | ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", |
| @@ -614,6 +894,14 @@ int ore_write(struct ore_io_state *ios) | |||
| 614 | int i; | 894 | int i; |
| 615 | int ret; | 895 | int ret; |
| 616 | 896 | ||
| 897 | if (unlikely(ios->sp2d && !ios->r4w)) { | ||
| 898 | /* A library is attempting a RAID-write without providing | ||
| 899 | * a pages lock interface. | ||
| 900 | */ | ||
| 901 | WARN_ON_ONCE(1); | ||
| 902 | return -ENOTSUPP; | ||
| 903 | } | ||
| 904 | |||
| 617 | ret = _prepare_for_striping(ios); | 905 | ret = _prepare_for_striping(ios); |
| 618 | if (unlikely(ret)) | 906 | if (unlikely(ret)) |
| 619 | return ret; | 907 | return ret; |
| @@ -629,7 +917,7 @@ int ore_write(struct ore_io_state *ios) | |||
| 629 | } | 917 | } |
| 630 | EXPORT_SYMBOL(ore_write); | 918 | EXPORT_SYMBOL(ore_write); |
| 631 | 919 | ||
| 632 | static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) | 920 | int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) |
| 633 | { | 921 | { |
| 634 | struct osd_request *or; | 922 | struct osd_request *or; |
| 635 | struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; | 923 | struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; |
| @@ -648,22 +936,27 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) | |||
| 648 | per_dev->or = or; | 936 | per_dev->or = or; |
| 649 | 937 | ||
| 650 | if (ios->pages) { | 938 | if (ios->pages) { |
| 651 | osd_req_read(or, obj, per_dev->offset, | 939 | if (per_dev->cur_sg) { |
| 652 | per_dev->bio, per_dev->length); | 940 | /* finalize the last sg_entry */ |
| 941 | _ore_add_sg_seg(per_dev, 0, false); | ||
| 942 | if (unlikely(!per_dev->cur_sg)) | ||
| 943 | return 0; /* Skip parity only device */ | ||
| 944 | |||
| 945 | osd_req_read_sg(or, obj, per_dev->bio, | ||
| 946 | per_dev->sglist, per_dev->cur_sg); | ||
| 947 | } else { | ||
| 948 | /* The no raid case */ | ||
| 949 | osd_req_read(or, obj, per_dev->offset, | ||
| 950 | per_dev->bio, per_dev->length); | ||
| 951 | } | ||
| 952 | |||
| 653 | ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" | 953 | ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" |
| 654 | " dev=%d\n", _LLU(obj->id), | 954 | " dev=%d sg_len=%d\n", _LLU(obj->id), |
| 655 | _LLU(per_dev->offset), _LLU(per_dev->length), | 955 | _LLU(per_dev->offset), _LLU(per_dev->length), |
| 656 | first_dev); | 956 | first_dev, per_dev->cur_sg); |
| 657 | } else if (ios->kern_buff) { | ||
| 658 | int ret = osd_req_read_kern(or, obj, per_dev->offset, | ||
| 659 | ios->kern_buff, ios->length); | ||
| 660 | ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx " | ||
| 661 | "length=0x%llx dev=%d ret=>%d\n", | ||
| 662 | _LLU(obj->id), _LLU(per_dev->offset), | ||
| 663 | _LLU(ios->length), first_dev, ret); | ||
| 664 | if (unlikely(ret)) | ||
| 665 | return ret; | ||
| 666 | } else { | 957 | } else { |
| 958 | BUG_ON(ios->kern_buff); | ||
| 959 | |||
| 667 | osd_req_get_attributes(or, obj); | 960 | osd_req_get_attributes(or, obj); |
| 668 | ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", | 961 | ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", |
| 669 | _LLU(obj->id), | 962 | _LLU(obj->id), |
| @@ -688,7 +981,7 @@ int ore_read(struct ore_io_state *ios) | |||
| 688 | return ret; | 981 | return ret; |
| 689 | 982 | ||
| 690 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | 983 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { |
| 691 | ret = _read_mirror(ios, i); | 984 | ret = _ore_read_mirror(ios, i); |
| 692 | if (unlikely(ret)) | 985 | if (unlikely(ret)) |
| 693 | return ret; | 986 | return ret; |
| 694 | } | 987 | } |
| @@ -744,31 +1037,29 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, | |||
| 744 | } | 1037 | } |
| 745 | 1038 | ||
| 746 | struct _trunc_info { | 1039 | struct _trunc_info { |
| 747 | struct _striping_info si; | 1040 | struct ore_striping_info si; |
| 748 | u64 prev_group_obj_off; | 1041 | u64 prev_group_obj_off; |
| 749 | u64 next_group_obj_off; | 1042 | u64 next_group_obj_off; |
| 750 | 1043 | ||
| 751 | unsigned first_group_dev; | 1044 | unsigned first_group_dev; |
| 752 | unsigned nex_group_dev; | 1045 | unsigned nex_group_dev; |
| 753 | unsigned max_devs; | ||
| 754 | }; | 1046 | }; |
| 755 | 1047 | ||
| 756 | void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, | 1048 | static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, |
| 757 | struct _trunc_info *ti) | 1049 | struct _trunc_info *ti) |
| 758 | { | 1050 | { |
| 759 | unsigned stripe_unit = layout->stripe_unit; | 1051 | unsigned stripe_unit = layout->stripe_unit; |
| 760 | 1052 | ||
| 761 | _calc_stripe_info(layout, file_offset, &ti->si); | 1053 | ore_calc_stripe_info(layout, file_offset, 0, &ti->si); |
| 762 | 1054 | ||
| 763 | ti->prev_group_obj_off = ti->si.M * stripe_unit; | 1055 | ti->prev_group_obj_off = ti->si.M * stripe_unit; |
| 764 | ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; | 1056 | ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; |
| 765 | 1057 | ||
| 766 | ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); | 1058 | ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); |
| 767 | ti->nex_group_dev = ti->first_group_dev + layout->group_width; | 1059 | ti->nex_group_dev = ti->first_group_dev + layout->group_width; |
| 768 | ti->max_devs = layout->group_width * layout->group_count; | ||
| 769 | } | 1060 | } |
| 770 | 1061 | ||
| 771 | int ore_truncate(struct ore_layout *layout, struct ore_components *comps, | 1062 | int ore_truncate(struct ore_layout *layout, struct ore_components *oc, |
| 772 | u64 size) | 1063 | u64 size) |
| 773 | { | 1064 | { |
| 774 | struct ore_io_state *ios; | 1065 | struct ore_io_state *ios; |
| @@ -779,22 +1070,22 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, | |||
| 779 | struct _trunc_info ti; | 1070 | struct _trunc_info ti; |
| 780 | int i, ret; | 1071 | int i, ret; |
| 781 | 1072 | ||
| 782 | ret = ore_get_io_state(layout, comps, &ios); | 1073 | ret = ore_get_io_state(layout, oc, &ios); |
| 783 | if (unlikely(ret)) | 1074 | if (unlikely(ret)) |
| 784 | return ret; | 1075 | return ret; |
| 785 | 1076 | ||
| 786 | _calc_trunk_info(ios->layout, size, &ti); | 1077 | _calc_trunk_info(ios->layout, size, &ti); |
| 787 | 1078 | ||
| 788 | size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), | 1079 | size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), |
| 789 | GFP_KERNEL); | 1080 | GFP_KERNEL); |
| 790 | if (unlikely(!size_attrs)) { | 1081 | if (unlikely(!size_attrs)) { |
| 791 | ret = -ENOMEM; | 1082 | ret = -ENOMEM; |
| 792 | goto out; | 1083 | goto out; |
| 793 | } | 1084 | } |
| 794 | 1085 | ||
| 795 | ios->numdevs = ios->comps->numdevs; | 1086 | ios->numdevs = ios->oc->numdevs; |
| 796 | 1087 | ||
| 797 | for (i = 0; i < ti.max_devs; ++i) { | 1088 | for (i = 0; i < ios->numdevs; ++i) { |
| 798 | struct exofs_trunc_attr *size_attr = &size_attrs[i]; | 1089 | struct exofs_trunc_attr *size_attr = &size_attrs[i]; |
| 799 | u64 obj_size; | 1090 | u64 obj_size; |
| 800 | 1091 | ||
| @@ -815,7 +1106,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, | |||
| 815 | size_attr->attr.val_ptr = &size_attr->newsize; | 1106 | size_attr->attr.val_ptr = &size_attr->newsize; |
| 816 | 1107 | ||
| 817 | ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", | 1108 | ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", |
| 818 | _LLU(comps->comps->obj.id), _LLU(obj_size), i); | 1109 | _LLU(oc->comps->obj.id), _LLU(obj_size), i); |
| 819 | ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, | 1110 | ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, |
| 820 | &size_attr->attr); | 1111 | &size_attr->attr); |
| 821 | if (unlikely(ret)) | 1112 | if (unlikely(ret)) |
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c new file mode 100644 index 000000000000..29c47e5c4a86 --- /dev/null +++ b/fs/exofs/ore_raid.c | |||
| @@ -0,0 +1,660 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2011 | ||
| 3 | * Boaz Harrosh <bharrosh@panasas.com> | ||
| 4 | * | ||
| 5 | * This file is part of the objects raid engine (ore). | ||
| 6 | * | ||
| 7 | * It is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License version 2 as published | ||
| 9 | * by the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * You should have received a copy of the GNU General Public License | ||
| 12 | * along with "ore". If not, write to the Free Software Foundation, Inc: | ||
| 13 | * "Free Software Foundation <info@fsf.org>" | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <linux/gfp.h> | ||
| 17 | #include <linux/async_tx.h> | ||
| 18 | |||
| 19 | #include "ore_raid.h" | ||
| 20 | |||
| 21 | #undef ORE_DBGMSG2 | ||
| 22 | #define ORE_DBGMSG2 ORE_DBGMSG | ||
| 23 | |||
| 24 | struct page *_raid_page_alloc(void) | ||
| 25 | { | ||
| 26 | return alloc_page(GFP_KERNEL); | ||
| 27 | } | ||
| 28 | |||
| 29 | void _raid_page_free(struct page *p) | ||
| 30 | { | ||
| 31 | __free_page(p); | ||
| 32 | } | ||
| 33 | |||
| 34 | /* This struct is forward declared in ore_io_state, but is private to here. | ||
| 35 | * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. | ||
| 36 | * | ||
| 37 | * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. | ||
| 38 | * Ascending page index access is sp2d(p-minor, c-major). But storage is | ||
| 39 | * sp2d[p-minor][c-major], so it can be properly presented to the async-xor | ||
| 40 | * API. | ||
| 41 | */ | ||
| 42 | struct __stripe_pages_2d { | ||
| 43 | /* Cache some hot path repeated calculations */ | ||
| 44 | unsigned parity; | ||
| 45 | unsigned data_devs; | ||
| 46 | unsigned pages_in_unit; | ||
| 47 | |||
| 48 | bool needed; | ||
| 49 | |||
| 50 | /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ | ||
| 51 | struct __1_page_stripe { | ||
| 52 | bool alloc; | ||
| 53 | unsigned write_count; | ||
| 54 | struct async_submit_ctl submit; | ||
| 55 | struct dma_async_tx_descriptor *tx; | ||
| 56 | |||
| 57 | /* The size of this array is data_devs + parity */ | ||
| 58 | struct page **pages; | ||
| 59 | struct page **scribble; | ||
| 60 | /* bool array, size of this array is data_devs */ | ||
| 61 | char *page_is_read; | ||
| 62 | } _1p_stripes[]; | ||
| 63 | }; | ||
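
The "corner turn" comment is worth unpacking: pages arrive in file order (the page-row index p runs fastest within a stripe unit, then the component c advances), but async_xor wants, for each page-row p, one contiguous array across all components. A self-contained toy model with illustrative sizes shows the transposition:

	#include <stdio.h>

	#define NP 4	/* pages_in_unit (p, minor index in file order) */
	#define NC 3	/* group_width   (c, major index in file order) */

	struct row { int pages[NC]; };	/* models _1p_stripes[p].pages[] */

	int main(void)
	{
		struct row sp2d[NP];
		int file_page = 0;

		/* file order: p runs fastest within a unit, then c advances */
		for (int c = 0; c < NC; c++)
			for (int p = 0; p < NP; p++)
				sp2d[p].pages[c] = file_page++;

		/* xor order: each row p is one contiguous pages[] array */
		for (int p = 0; p < NP; p++)
			printf("row %d: %d %d %d\n", p, sp2d[p].pages[0],
			       sp2d[p].pages[1], sp2d[p].pages[2]);
		return 0;
	}

Row p collects page p of every device (0 4 8, then 1 5 9, ...), so one async_xor call per row covers the whole stripe unit.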
| 64 | |||
| 65 | /* This can get bigger than a page. So support multiple page allocations. | ||
| 66 | * _sp2d_free should be called even if _sp2d_alloc fails (by returning | ||
| 67 | * non-zero). | ||
| 68 | */ | ||
| 69 | static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, | ||
| 70 | unsigned parity, struct __stripe_pages_2d **psp2d) | ||
| 71 | { | ||
| 72 | struct __stripe_pages_2d *sp2d; | ||
| 73 | unsigned data_devs = group_width - parity; | ||
| 74 | struct _alloc_all_bytes { | ||
| 75 | struct __alloc_stripe_pages_2d { | ||
| 76 | struct __stripe_pages_2d sp2d; | ||
| 77 | struct __1_page_stripe _1p_stripes[pages_in_unit]; | ||
| 78 | } __asp2d; | ||
| 79 | struct __alloc_1p_arrays { | ||
| 80 | struct page *pages[group_width]; | ||
| 81 | struct page *scribble[group_width]; | ||
| 82 | char page_is_read[data_devs]; | ||
| 83 | } __a1pa[pages_in_unit]; | ||
| 84 | } *_aab; | ||
| 85 | struct __alloc_1p_arrays *__a1pa; | ||
| 86 | struct __alloc_1p_arrays *__a1pa_end; | ||
| 87 | const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); | ||
| 88 | unsigned num_a1pa, alloc_size, i; | ||
| 89 | |||
| 90 | /* FIXME: check these numbers in ore_verify_layout */ | ||
| 91 | BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); | ||
| 92 | BUG_ON(sizeof__a1pa > PAGE_SIZE); | ||
| 93 | |||
| 94 | if (sizeof(*_aab) > PAGE_SIZE) { | ||
| 95 | num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; | ||
| 96 | alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; | ||
| 97 | } else { | ||
| 98 | num_a1pa = pages_in_unit; | ||
| 99 | alloc_size = sizeof(*_aab); | ||
| 100 | } | ||
| 101 | |||
| 102 | _aab = kzalloc(alloc_size, GFP_KERNEL); | ||
| 103 | if (unlikely(!_aab)) { | ||
| 104 | ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); | ||
| 105 | return -ENOMEM; | ||
| 106 | } | ||
| 107 | |||
| 108 | sp2d = &_aab->__asp2d.sp2d; | ||
| 109 | *psp2d = sp2d; /* From here Just call _sp2d_free */ | ||
| 110 | |||
| 111 | __a1pa = _aab->__a1pa; | ||
| 112 | __a1pa_end = __a1pa + num_a1pa; | ||
| 113 | |||
| 114 | for (i = 0; i < pages_in_unit; ++i) { | ||
| 115 | if (unlikely(__a1pa >= __a1pa_end)) { | ||
| 116 | num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, | ||
| 117 | pages_in_unit - i); | ||
| 118 | |||
| 119 | __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); | ||
| 120 | if (unlikely(!__a1pa)) { | ||
| 121 | ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", | ||
| 122 | num_a1pa); | ||
| 123 | return -ENOMEM; | ||
| 124 | } | ||
| 125 | __a1pa_end = __a1pa + num_a1pa; | ||
| 126 | /* First *pages is marked for kfree of the buffer */ | ||
| 127 | sp2d->_1p_stripes[i].alloc = true; | ||
| 128 | } | ||
| 129 | |||
| 130 | sp2d->_1p_stripes[i].pages = __a1pa->pages; | ||
| 131 | sp2d->_1p_stripes[i].scribble = __a1pa->scribble; | ||
| 132 | sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; | ||
| 133 | ++__a1pa; | ||
| 134 | } | ||
| 135 | |||
| 136 | sp2d->parity = parity; | ||
| 137 | sp2d->data_devs = data_devs; | ||
| 138 | sp2d->pages_in_unit = pages_in_unit; | ||
| 139 | return 0; | ||
| 140 | } | ||
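
To get a feel for when the single-allocation path above overflows a page, this stand-alone sketch mirrors __alloc_1p_arrays with an illustrative group_width of 10 (parity 1) and lets sizeof account for whatever padding the compiler inserts:

	#include <stdio.h>

	#define GROUP_WIDTH 10
	#define DATA_DEVS (GROUP_WIDTH - 1)	/* parity = 1, assumed */

	struct a1pa {				/* models __alloc_1p_arrays */
		void *pages[GROUP_WIDTH];
		void *scribble[GROUP_WIDTH];
		char page_is_read[DATA_DEVS];
	};

	int main(void)
	{
		/* on a 64-bit build: 176 bytes after padding, ~23 per 4K page */
		printf("sizeof(a1pa)=%zu, ~%zu per 4K page\n",
		       sizeof(struct a1pa), (size_t)4096 / sizeof(struct a1pa));
		return 0;
	}

Anything beyond the first page worth of row-arrays is kzalloc'ed in chunks, with _1p_stripes[i].alloc marking each chunk head for kfree.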
| 141 | |||
| 142 | static void _sp2d_reset(struct __stripe_pages_2d *sp2d, | ||
| 143 | const struct _ore_r4w_op *r4w, void *priv) | ||
| 144 | { | ||
| 145 | unsigned data_devs = sp2d->data_devs; | ||
| 146 | unsigned group_width = data_devs + sp2d->parity; | ||
| 147 | unsigned p; | ||
| 148 | |||
| 149 | if (!sp2d->needed) | ||
| 150 | return; | ||
| 151 | |||
| 152 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
| 153 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
| 154 | |||
| 155 | if (_1ps->write_count < group_width) { | ||
| 156 | unsigned c; | ||
| 157 | |||
| 158 | for (c = 0; c < data_devs; c++) | ||
| 159 | if (_1ps->page_is_read[c]) { | ||
| 160 | struct page *page = _1ps->pages[c]; | ||
| 161 | |||
| 162 | r4w->put_page(priv, page); | ||
| 163 | _1ps->page_is_read[c] = false; | ||
| 164 | } | ||
| 165 | } | ||
| 166 | |||
| 167 | memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); | ||
| 168 | _1ps->write_count = 0; | ||
| 169 | _1ps->tx = NULL; | ||
| 170 | } | ||
| 171 | |||
| 172 | sp2d->needed = false; | ||
| 173 | } | ||
| 174 | |||
| 175 | static void _sp2d_free(struct __stripe_pages_2d *sp2d) | ||
| 176 | { | ||
| 177 | unsigned i; | ||
| 178 | |||
| 179 | if (!sp2d) | ||
| 180 | return; | ||
| 181 | |||
| 182 | for (i = 0; i < sp2d->pages_in_unit; ++i) { | ||
| 183 | if (sp2d->_1p_stripes[i].alloc) | ||
| 184 | kfree(sp2d->_1p_stripes[i].pages); | ||
| 185 | } | ||
| 186 | |||
| 187 | kfree(sp2d); | ||
| 188 | } | ||
| 189 | |||
| 190 | static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) | ||
| 191 | { | ||
| 192 | unsigned p; | ||
| 193 | |||
| 194 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
| 195 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
| 196 | |||
| 197 | if (_1ps->write_count) | ||
| 198 | return p; | ||
| 199 | } | ||
| 200 | |||
| 201 | return ~0; | ||
| 202 | } | ||
| 203 | |||
| 204 | static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) | ||
| 205 | { | ||
| 206 | unsigned p; | ||
| 207 | |||
| 208 | for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { | ||
| 209 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
| 210 | |||
| 211 | if (_1ps->write_count) | ||
| 212 | return p; | ||
| 213 | } | ||
| 214 | |||
| 215 | return ~0; | ||
| 216 | } | ||
| 217 | |||
| 218 | static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) | ||
| 219 | { | ||
| 220 | unsigned p; | ||
| 221 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
| 222 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
| 223 | |||
| 224 | if (!_1ps->write_count) | ||
| 225 | continue; | ||
| 226 | |||
| 227 | init_async_submit(&_1ps->submit, | ||
| 228 | ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, | ||
| 229 | NULL, | ||
| 230 | NULL, NULL, | ||
| 231 | (addr_conv_t *)_1ps->scribble); | ||
| 232 | |||
| 233 | /* TODO: raid6 */ | ||
| 234 | _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, | ||
| 235 | 0, sp2d->data_devs, PAGE_SIZE, | ||
| 236 | &_1ps->submit); | ||
| 237 | } | ||
| 238 | |||
| 239 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
| 240 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
| 241 | /* NOTE: We wait for HW synchronously (I don't have such HW | ||
| 242 | * to test with.) Is parallelism needed with today's | ||
| 243 | * multi-core CPUs? | ||
| 244 | */ | ||
| 245 | async_tx_issue_pending(_1ps->tx); | ||
| 246 | } | ||
| 247 | } | ||
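
Per page-row, async_xor reduces the data_devs pages into the parity page by XOR. Stripped of the DMA-offload machinery, the computation is just this (a user-space sketch with a toy 16-byte "page"):

	#include <stdio.h>
	#include <string.h>

	#define PAGE 16				/* toy "page" size */

	static void xor_parity(unsigned char *parity,
			       unsigned char bufs[][PAGE], unsigned data_devs)
	{
		memset(parity, 0, PAGE);
		for (unsigned d = 0; d < data_devs; d++)
			for (unsigned i = 0; i < PAGE; i++)
				parity[i] ^= bufs[d][i];
	}

	int main(void)
	{
		unsigned char data[3][PAGE] = { "aaaaaaaaaaaaaaa",
						"bbbbbbbbbbbbbbb",
						"ccccccccccccccc" };
		unsigned char parity[PAGE];

		xor_parity(parity, data, 3);
		/* losing any one buffer, XOR of the other two plus parity
		 * reconstructs it: the RAID-5 recovery property */
		printf("parity[0]=0x%02x\n", parity[0]);
		return 0;
	}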
| 248 | |||
| 249 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, | ||
| 250 | struct ore_striping_info *si, struct page *page) | ||
| 251 | { | ||
| 252 | struct __1_page_stripe *_1ps; | ||
| 253 | |||
| 254 | sp2d->needed = true; | ||
| 255 | |||
| 256 | _1ps = &sp2d->_1p_stripes[si->cur_pg]; | ||
| 257 | _1ps->pages[si->cur_comp] = page; | ||
| 258 | ++_1ps->write_count; | ||
| 259 | |||
| 260 | si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; | ||
| 261 | /* si->cur_comp is advanced outside at main loop */ | ||
| 262 | } | ||
| 263 | |||
| 264 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, | ||
| 265 | bool not_last) | ||
| 266 | { | ||
| 267 | struct osd_sg_entry *sge; | ||
| 268 | |||
| 269 | ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " | ||
| 270 | "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", | ||
| 271 | per_dev->dev, cur_len, not_last, per_dev->cur_sg, | ||
| 272 | _LLU(per_dev->offset), per_dev->length, | ||
| 273 | per_dev->last_sgs_total); | ||
| 274 | |||
| 275 | if (!per_dev->cur_sg) { | ||
| 276 | sge = per_dev->sglist; | ||
| 277 | |||
| 278 | /* First time we prepare two entries */ | ||
| 279 | if (per_dev->length) { | ||
| 280 | ++per_dev->cur_sg; | ||
| 281 | sge->offset = per_dev->offset; | ||
| 282 | sge->len = per_dev->length; | ||
| 283 | } else { | ||
| 284 | /* Here the parity is the first unit of this object. | ||
| 285 | * This happens every time we reach a parity device on | ||
| 286 | * the same stripe as the per_dev->offset. We need to | ||
| 287 | * just skip this unit. | ||
| 288 | */ | ||
| 289 | per_dev->offset += cur_len; | ||
| 290 | return; | ||
| 291 | } | ||
| 292 | } else { | ||
| 293 | /* finalize the last one */ | ||
| 294 | sge = &per_dev->sglist[per_dev->cur_sg - 1]; | ||
| 295 | sge->len = per_dev->length - per_dev->last_sgs_total; | ||
| 296 | } | ||
| 297 | |||
| 298 | if (not_last) { | ||
| 299 | /* Partly prepare the next one */ | ||
| 300 | struct osd_sg_entry *next_sge = sge + 1; | ||
| 301 | |||
| 302 | ++per_dev->cur_sg; | ||
| 303 | next_sge->offset = sge->offset + sge->len + cur_len; | ||
| 304 | /* Save the current length so we know how much was added next time */ | ||
| 305 | per_dev->last_sgs_total = per_dev->length; | ||
| 306 | next_sge->len = 0; | ||
| 307 | } else if (!sge->len) { | ||
| 308 | /* Optimize for when the last unit is a parity */ | ||
| 309 | --per_dev->cur_sg; | ||
| 310 | } | ||
| 311 | } | ||
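
The sg-entry bookkeeping above is subtle: an entry's length is finalized one call late, and skipped parity units become gaps between entries. A user-space replay of the same logic (a sketch; the struct names are local stand-ins for osd_sg_entry and the per_dev state) for the sequence "data unit, parity hole, data unit" produces the expected two-entry list:

	#include <stdio.h>
	#include <stdint.h>

	struct sge { uint64_t offset; uint32_t len; };

	struct dev {
		uint64_t offset;
		uint32_t length, last_sgs_total, cur_sg;
		struct sge sglist[8];
	};

	static void add_sg_seg(struct dev *d, unsigned cur_len, int not_last)
	{
		struct sge *sge;

		if (!d->cur_sg) {
			sge = d->sglist;
			if (d->length) {
				++d->cur_sg;
				sge->offset = d->offset;
				sge->len = d->length;
			} else {
				d->offset += cur_len;	/* leading parity: skip */
				return;
			}
		} else {
			sge = &d->sglist[d->cur_sg - 1];
			sge->len = d->length - d->last_sgs_total;
		}

		if (not_last) {
			struct sge *next = sge + 1;

			++d->cur_sg;
			next->offset = sge->offset + sge->len + cur_len;
			d->last_sgs_total = d->length;
			next->len = 0;
		} else if (!sge->len) {
			--d->cur_sg;	/* trailing parity: drop empty entry */
		}
	}

	int main(void)
	{
		struct dev d = { .offset = 0 };

		d.length = 0x1000;		/* one data unit queued... */
		add_sg_seg(&d, 0x1000, 1);	/* ...then a parity hole */
		d.length += 0x1000;		/* one more data unit */
		add_sg_seg(&d, 0, 0);		/* finalize */

		for (unsigned i = 0; i < d.cur_sg; i++)
			printf("sge[%u]: offset=0x%llx len=0x%x\n", i,
			       (unsigned long long)d.sglist[i].offset,
			       d.sglist[i].len);
		return 0;
	}

The result is {0x0000, 0x1000} and {0x2000, 0x1000}: two extents with the parity unit's hole between them.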
| 312 | |||
| 313 | static int _alloc_read_4_write(struct ore_io_state *ios) | ||
| 314 | { | ||
| 315 | struct ore_layout *layout = ios->layout; | ||
| 316 | int ret; | ||
| 317 | /* We want to only read those pages not in cache so worst case | ||
| 318 | * is a stripe populated with every other page | ||
| 319 | */ | ||
| 320 | unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; | ||
| 321 | |||
| 322 | ret = _ore_get_io_state(layout, ios->oc, | ||
| 323 | layout->group_width * layout->mirrors_p1, | ||
| 324 | sgs_per_dev, 0, &ios->ios_read_4_write); | ||
| 325 | return ret; | ||
| 326 | } | ||
| 327 | |||
| 328 | /* @si contains info of the to-be-inserted page. Updating @si is the | ||
| 329 | * caller's responsibility. Specifically si->dev, si->obj_offset, ... | ||
| 330 | */ | ||
| 331 | static int _add_to_read_4_write(struct ore_io_state *ios, | ||
| 332 | struct ore_striping_info *si, struct page *page) | ||
| 333 | { | ||
| 334 | struct request_queue *q; | ||
| 335 | struct ore_per_dev_state *per_dev; | ||
| 336 | struct ore_io_state *read_ios; | ||
| 337 | unsigned first_dev = si->dev - (si->dev % | ||
| 338 | (ios->layout->group_width * ios->layout->mirrors_p1)); | ||
| 339 | unsigned comp = si->dev - first_dev; | ||
| 340 | unsigned added_len; | ||
| 341 | |||
| 342 | if (!ios->ios_read_4_write) { | ||
| 343 | int ret = _alloc_read_4_write(ios); | ||
| 344 | |||
| 345 | if (unlikely(ret)) | ||
| 346 | return ret; | ||
| 347 | } | ||
| 348 | |||
| 349 | read_ios = ios->ios_read_4_write; | ||
| 350 | read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; | ||
| 351 | |||
| 352 | per_dev = &read_ios->per_dev[comp]; | ||
| 353 | if (!per_dev->length) { | ||
| 354 | per_dev->bio = bio_kmalloc(GFP_KERNEL, | ||
| 355 | ios->sp2d->pages_in_unit); | ||
| 356 | if (unlikely(!per_dev->bio)) { | ||
| 357 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", | ||
| 358 | ios->sp2d->pages_in_unit); | ||
| 359 | return -ENOMEM; | ||
| 360 | } | ||
| 361 | per_dev->offset = si->obj_offset; | ||
| 362 | per_dev->dev = si->dev; | ||
| 363 | } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { | ||
| 364 | u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); | ||
| 365 | |||
| 366 | _ore_add_sg_seg(per_dev, gap, true); | ||
| 367 | } | ||
| 368 | q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); | ||
| 369 | added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); | ||
| 370 | if (unlikely(added_len != PAGE_SIZE)) { | ||
| 371 | ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", | ||
| 372 | per_dev->bio->bi_vcnt); | ||
| 373 | return -ENOMEM; | ||
| 374 | } | ||
| 375 | |||
| 376 | per_dev->length += PAGE_SIZE; | ||
| 377 | return 0; | ||
| 378 | } | ||
| 379 | |||
| 380 | static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) | ||
| 381 | { | ||
| 382 | struct bio_vec *bv; | ||
| 383 | unsigned i, d; | ||
| 384 | |||
| 385 | /* loop on all devices all pages */ | ||
| 386 | for (d = 0; d < ios->numdevs; d++) { | ||
| 387 | struct bio *bio = ios->per_dev[d].bio; | ||
| 388 | |||
| 389 | if (!bio) | ||
| 390 | continue; | ||
| 391 | |||
| 392 | __bio_for_each_segment(bv, bio, i, 0) { | ||
| 393 | struct page *page = bv->bv_page; | ||
| 394 | |||
| 395 | SetPageUptodate(page); | ||
| 396 | if (PageError(page)) | ||
| 397 | ClearPageError(page); | ||
| 398 | } | ||
| 399 | } | ||
| 400 | } | ||
| 401 | |||
| 402 | /* read_4_write is hacked to read the start of the first stripe and/or | ||
| 403 | * the end of the last stripe. If needed, with an sg-gap at each device/page. | ||
| 404 | * It is assumed to be called after the to_be_written pages of the first stripe | ||
| 405 | * are populating ios->sp2d[][] | ||
| 406 | * | ||
| 407 | * NOTE: We call ios->r4w->get_page for all pages needed for parity calculations. | ||
| 408 | * These pages are held at sp2d[p].pages[c] but with | ||
| 409 | * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are released | ||
| 410 | * via ios->r4w->put_page(). The ios->r4w->get_page call might signal that the | ||
| 411 | * page is @uptodate=true, so we don't need to read it, only unlock, after IO. | ||
| 412 | * | ||
| 413 | * TODO: The read_4_write should calc a need_to_read_pages_count; if bigger than | ||
| 414 | * the to-be-written count, we should consider the xor-in-place mode. | ||
| 415 | * need_to_read_pages_count is the actual number of pages not present in cache. | ||
| 416 | * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough | ||
| 417 | * approximation? In this mode the read pages are put in the empty places of | ||
| 418 | * ios->sp2d[p][*], xor is calculated the same way. These pages are | ||
| 419 | * allocated/freed and don't go through cache | ||
| 420 | */ | ||
| 421 | static int _read_4_write(struct ore_io_state *ios) | ||
| 422 | { | ||
| 423 | struct ore_io_state *ios_read; | ||
| 424 | struct ore_striping_info read_si; | ||
| 425 | struct __stripe_pages_2d *sp2d = ios->sp2d; | ||
| 426 | u64 offset = ios->si.first_stripe_start; | ||
| 427 | u64 last_stripe_end; | ||
| 428 | unsigned bytes_in_stripe = ios->si.bytes_in_stripe; | ||
| 429 | unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; | ||
| 430 | int ret; | ||
| 431 | |||
| 432 | if (offset == ios->offset) /* Go to start collect $200 */ | ||
| 433 | goto read_last_stripe; | ||
| 434 | |||
| 435 | min_p = _sp2d_min_pg(sp2d); | ||
| 436 | max_p = _sp2d_max_pg(sp2d); | ||
| 437 | |||
| 438 | for (c = 0; ; c++) { | ||
| 439 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | ||
| 440 | read_si.obj_offset += min_p * PAGE_SIZE; | ||
| 441 | offset += min_p * PAGE_SIZE; | ||
| 442 | for (p = min_p; p <= max_p; p++) { | ||
| 443 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
| 444 | struct page **pp = &_1ps->pages[c]; | ||
| 445 | bool uptodate; | ||
| 446 | |||
| 447 | if (*pp) | ||
| 448 | /* to-be-written pages start here */ | ||
| 449 | goto read_last_stripe; | ||
| 450 | |||
| 451 | *pp = ios->r4w->get_page(ios->private, offset, | ||
| 452 | &uptodate); | ||
| 453 | if (unlikely(!*pp)) | ||
| 454 | return -ENOMEM; | ||
| 455 | |||
| 456 | if (!uptodate) | ||
| 457 | _add_to_read_4_write(ios, &read_si, *pp); | ||
| 458 | |||
| 459 | /* Mark read-pages to be cache_released */ | ||
| 460 | _1ps->page_is_read[c] = true; | ||
| 461 | read_si.obj_offset += PAGE_SIZE; | ||
| 462 | offset += PAGE_SIZE; | ||
| 463 | } | ||
| 464 | offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; | ||
| 465 | } | ||
| 466 | |||
| 467 | read_last_stripe: | ||
| 468 | offset = ios->offset + (ios->length + PAGE_SIZE - 1) / | ||
| 469 | PAGE_SIZE * PAGE_SIZE; | ||
| 470 | last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) | ||
| 471 | * bytes_in_stripe; | ||
| 472 | if (offset == last_stripe_end) /* Optimize for the aligned case */ | ||
| 473 | goto read_it; | ||
| 474 | |||
| 475 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | ||
| 476 | p = read_si.unit_off / PAGE_SIZE; | ||
| 477 | c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, | ||
| 478 | ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); | ||
| 479 | |||
| 480 | BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); | ||
| 481 | /* unaligned IO must be within a single stripe */ | ||
| 482 | |||
| 483 | if (min_p == sp2d->pages_in_unit) { | ||
| 484 | /* Didn't do it yet */ | ||
| 485 | min_p = _sp2d_min_pg(sp2d); | ||
| 486 | max_p = _sp2d_max_pg(sp2d); | ||
| 487 | } | ||
| 488 | |||
| 489 | while (offset < last_stripe_end) { | ||
| 490 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
| 491 | |||
| 492 | if ((min_p <= p) && (p <= max_p)) { | ||
| 493 | struct page *page; | ||
| 494 | bool uptodate; | ||
| 495 | |||
| 496 | BUG_ON(_1ps->pages[c]); | ||
| 497 | page = ios->r4w->get_page(ios->private, offset, | ||
| 498 | &uptodate); | ||
| 499 | if (unlikely(!page)) | ||
| 500 | return -ENOMEM; | ||
| 501 | |||
| 502 | _1ps->pages[c] = page; | ||
| 503 | /* Mark read-pages to be cache_released */ | ||
| 504 | _1ps->page_is_read[c] = true; | ||
| 505 | if (!uptodate) | ||
| 506 | _add_to_read_4_write(ios, &read_si, page); | ||
| 507 | } | ||
| 508 | |||
| 509 | offset += PAGE_SIZE; | ||
| 510 | if (p == (sp2d->pages_in_unit - 1)) { | ||
| 511 | ++c; | ||
| 512 | p = 0; | ||
| 513 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | ||
| 514 | } else { | ||
| 515 | read_si.obj_offset += PAGE_SIZE; | ||
| 516 | ++p; | ||
| 517 | } | ||
| 518 | } | ||
| 519 | |||
| 520 | read_it: | ||
| 521 | ios_read = ios->ios_read_4_write; | ||
| 522 | if (!ios_read) | ||
| 523 | return 0; | ||
| 524 | |||
| 525 | /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change | ||
| 526 | * to check for per_dev->bio | ||
| 527 | */ | ||
| 528 | ios_read->pages = ios->pages; | ||
| 529 | |||
| 530 | /* Now read these devices */ | ||
| 531 | for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { | ||
| 532 | ret = _ore_read_mirror(ios_read, i); | ||
| 533 | if (unlikely(ret)) | ||
| 534 | return ret; | ||
| 535 | } | ||
| 536 | |||
| 537 | ret = ore_io_execute(ios_read); /* Synchronous execution */ | ||
| 538 | if (unlikely(ret)) { | ||
| 539 | ORE_DBGMSG("!! ore_io_execute => %d\n", ret); | ||
| 540 | return ret; | ||
| 541 | } | ||
| 542 | |||
| 543 | _mark_read4write_pages_uptodate(ios_read, ret); | ||
| 544 | return 0; | ||
| 545 | } | ||
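
The read_last_stripe arithmetic rounds the IO end up to a page and then up to a stripe boundary; everything between the two must be fetched before parity can be computed. A stand-alone check with illustrative numbers (0x30000-byte stripe, i.e. three 64K data units; offsets invented for the example):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t bytes_in_stripe = 0x30000;
		uint64_t offset = 0x4000, length = 0x29000;
		uint64_t end = offset + length;

		/* round the IO end up to PAGE_SIZE, then up to the stripe end */
		end = (end + 4095) / 4096 * 4096;
		uint64_t last_stripe_end =
			(end + bytes_in_stripe - 1) / bytes_in_stripe *
			bytes_in_stripe;

		/* end=0x2d000, last_stripe_end=0x30000: 3 pages per data
		 * device still need a read_4_write pass */
		printf("end=0x%llx last_stripe_end=0x%llx\n",
		       (unsigned long long)end,
		       (unsigned long long)last_stripe_end);
		return 0;
	}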
| 546 | |||
| 547 | /* In writes @cur_len means length left, i.e. cur_len==0 is the last parity unit */ | ||
| 548 | int _ore_add_parity_unit(struct ore_io_state *ios, | ||
| 549 | struct ore_striping_info *si, | ||
| 550 | struct ore_per_dev_state *per_dev, | ||
| 551 | unsigned cur_len) | ||
| 552 | { | ||
| 553 | if (ios->reading) { | ||
| 554 | BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); | ||
| 555 | _ore_add_sg_seg(per_dev, cur_len, true); | ||
| 556 | } else { | ||
| 557 | struct __stripe_pages_2d *sp2d = ios->sp2d; | ||
| 558 | struct page **pages = ios->parity_pages + ios->cur_par_page; | ||
| 559 | unsigned num_pages; | ||
| 560 | unsigned array_start = 0; | ||
| 561 | unsigned i; | ||
| 562 | int ret; | ||
| 563 | |||
| 564 | si->cur_pg = _sp2d_min_pg(sp2d); | ||
| 565 | num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; | ||
| 566 | |||
| 567 | if (!cur_len) /* If last stripe operate on parity comp */ | ||
| 568 | si->cur_comp = sp2d->data_devs; | ||
| 569 | |||
| 570 | if (!per_dev->length) { | ||
| 571 | per_dev->offset += si->cur_pg * PAGE_SIZE; | ||
| 572 | /* If first stripe, read in all read4write pages | ||
| 573 | * (if needed) before we calculate the first parity. | ||
| 574 | */ | ||
| 575 | _read_4_write(ios); | ||
| 576 | } | ||
| 577 | |||
| 578 | for (i = 0; i < num_pages; i++) { | ||
| 579 | pages[i] = _raid_page_alloc(); | ||
| 580 | if (unlikely(!pages[i])) | ||
| 581 | return -ENOMEM; | ||
| 582 | |||
| 583 | ++(ios->cur_par_page); | ||
| 584 | } | ||
| 585 | |||
| 586 | BUG_ON(si->cur_comp != sp2d->data_devs); | ||
| 587 | BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); | ||
| 588 | |||
| 589 | ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, | ||
| 590 | per_dev, num_pages * PAGE_SIZE); | ||
| 591 | if (unlikely(ret)) | ||
| 592 | return ret; | ||
| 593 | |||
| 594 | /* TODO: raid6 if (last_parity_dev) */ | ||
| 595 | _gen_xor_unit(sp2d); | ||
| 596 | _sp2d_reset(sp2d, ios->r4w, ios->private); | ||
| 597 | } | ||
| 598 | return 0; | ||
| 599 | } | ||
| 600 | |||
| 601 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) | ||
| 602 | { | ||
| 603 | struct ore_layout *layout = ios->layout; | ||
| 604 | |||
| 605 | if (ios->parity_pages) { | ||
| 606 | unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; | ||
| 607 | unsigned stripe_size = ios->si.bytes_in_stripe; | ||
| 608 | u64 last_stripe, first_stripe; | ||
| 609 | |||
| 610 | if (_sp2d_alloc(pages_in_unit, layout->group_width, | ||
| 611 | layout->parity, &ios->sp2d)) { | ||
| 612 | return -ENOMEM; | ||
| 613 | } | ||
| 614 | |||
| 615 | BUG_ON(ios->offset % PAGE_SIZE); | ||
| 616 | |||
| 617 | /* Round the IO down to the last full stripe */ | ||
| 618 | first_stripe = div_u64(ios->offset, stripe_size); | ||
| 619 | last_stripe = div_u64(ios->offset + ios->length, stripe_size); | ||
| 620 | |||
| 621 | /* If an IO spans more than a single stripe it must end at | ||
| 622 | * a stripe boundary. The remainder at the end is pushed into the | ||
| 623 | * next IO. | ||
| 624 | */ | ||
| 625 | if (last_stripe != first_stripe) { | ||
| 626 | ios->length = last_stripe * stripe_size - ios->offset; | ||
| 627 | |||
| 628 | BUG_ON(!ios->length); | ||
| 629 | ios->nr_pages = (ios->length + PAGE_SIZE - 1) / | ||
| 630 | PAGE_SIZE; | ||
| 631 | ios->si.length = ios->length; /* make it consistent */ | ||
| 632 | } | ||
| 633 | } | ||
| 634 | return 0; | ||
| 635 | } | ||
| 636 | |||
| 637 | void _ore_free_raid_stuff(struct ore_io_state *ios) | ||
| 638 | { | ||
| 639 | if (ios->sp2d) { /* writing and raid */ | ||
| 640 | unsigned i; | ||
| 641 | |||
| 642 | for (i = 0; i < ios->cur_par_page; i++) { | ||
| 643 | struct page *page = ios->parity_pages[i]; | ||
| 644 | |||
| 645 | if (page) | ||
| 646 | _raid_page_free(page); | ||
| 647 | } | ||
| 648 | if (ios->extra_part_alloc) | ||
| 649 | kfree(ios->parity_pages); | ||
| 650 | /* If IO returned an error, pages might need unlocking */ | ||
| 651 | _sp2d_reset(ios->sp2d, ios->r4w, ios->private); | ||
| 652 | _sp2d_free(ios->sp2d); | ||
| 653 | } else { | ||
| 654 | /* Will only be set if raid reading && sglist is big */ | ||
| 655 | if (ios->extra_part_alloc) | ||
| 656 | kfree(ios->per_dev[0].sglist); | ||
| 657 | } | ||
| 658 | if (ios->ios_read_4_write) | ||
| 659 | ore_put_io_state(ios->ios_read_4_write); | ||
| 660 | } | ||
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h new file mode 100644 index 000000000000..2ffd2c3c6e46 --- /dev/null +++ b/fs/exofs/ore_raid.h | |||
| @@ -0,0 +1,79 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) from 2011 | ||
| 3 | * Boaz Harrosh <bharrosh@panasas.com> | ||
| 4 | * | ||
| 5 | * This file is part of the objects raid engine (ore). | ||
| 6 | * | ||
| 7 | * It is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License version 2 as published | ||
| 9 | * by the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * You should have received a copy of the GNU General Public License | ||
| 12 | * along with "ore". If not, write to the Free Software Foundation, Inc: | ||
| 13 | * "Free Software Foundation <info@fsf.org>" | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <scsi/osd_ore.h> | ||
| 17 | |||
| 18 | #define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) | ||
| 19 | |||
| 20 | #ifdef CONFIG_EXOFS_DEBUG | ||
| 21 | #define ORE_DBGMSG(fmt, a...) \ | ||
| 22 | printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) | ||
| 23 | #else | ||
| 24 | #define ORE_DBGMSG(fmt, a...) \ | ||
| 25 | do { if (0) printk(fmt, ##a); } while (0) | ||
| 26 | #endif | ||
| 27 | |||
| 28 | /* u64 has problems with printk this will cast it to unsigned long long */ | ||
| 29 | #define _LLU(x) (unsigned long long)(x) | ||
| 30 | |||
| 31 | #define ORE_DBGMSG2(M...) do {} while (0) | ||
| 32 | /* #define ORE_DBGMSG2 ORE_DBGMSG */ | ||
| 33 | |||
| 34 | /* Calculate the component order in a stripe, i.e. the logical data unit | ||
| 35 | * address within the stripe of @dev given the @par_dev of this stripe. | ||
| 36 | */ | ||
| 37 | static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, | ||
| 38 | unsigned par_dev, unsigned dev) | ||
| 39 | { | ||
| 40 | unsigned first_dev = dev - dev % devs_in_group; | ||
| 41 | |||
| 42 | dev -= first_dev; | ||
| 43 | par_dev -= first_dev; | ||
| 44 | |||
| 45 | if (devs_in_group == par_dev) /* The raid 0 case */ | ||
| 46 | return dev / mirrors_p1; | ||
| 47 | /* raid4/5/6 case */ | ||
| 48 | return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / | ||
| 49 | mirrors_p1; | ||
| 50 | } | ||
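
A few spot checks of _dev_order (a user-space copy of the function above, fed illustrative values): with devs_in_group=4 and par_dev=3, devices 0..2 map to logical units 0..2; once the parity rotates onto device 0, device 1 becomes logical unit 0:

	#include <stdio.h>

	static unsigned dev_order(unsigned devs_in_group, unsigned mirrors_p1,
				  unsigned par_dev, unsigned dev)
	{
		unsigned first_dev = dev - dev % devs_in_group;

		dev -= first_dev;
		par_dev -= first_dev;

		if (devs_in_group == par_dev)		/* the RAID-0 case */
			return dev / mirrors_p1;
		/* RAID-4/5/6 case */
		return ((devs_in_group + dev - par_dev - mirrors_p1) %
			devs_in_group) / mirrors_p1;
	}

	int main(void)
	{
		printf("%u %u %u\n",
		       dev_order(4, 1, 3, 0),	/* -> 0 */
		       dev_order(4, 1, 3, 2),	/* -> 2 */
		       dev_order(4, 1, 0, 1));	/* -> 0 */
		return 0;
	}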
| 51 | |||
| 52 | /* ios_raid.c stuff needed by ios.c */ | ||
| 53 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); | ||
| 54 | void _ore_free_raid_stuff(struct ore_io_state *ios); | ||
| 55 | |||
| 56 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, | ||
| 57 | bool not_last); | ||
| 58 | int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, | ||
| 59 | struct ore_per_dev_state *per_dev, unsigned cur_len); | ||
| 60 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, | ||
| 61 | struct ore_striping_info *si, struct page *page); | ||
| 62 | static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, | ||
| 63 | struct ore_striping_info *si, struct page *page) | ||
| 64 | { | ||
| 65 | if (!sp2d) /* Inline the fast path */ | ||
| 66 | return; /* No raid stuff here */ | ||
| 67 | _ore_add_stripe_page(sp2d, si, page); | ||
| 68 | } | ||
| 69 | |||
| 70 | /* ios.c stuff needed by ios_raid.c */ | ||
| 71 | int _ore_get_io_state(struct ore_layout *layout, | ||
| 72 | struct ore_components *oc, unsigned numdevs, | ||
| 73 | unsigned sgs_per_dev, unsigned num_par_pages, | ||
| 74 | struct ore_io_state **pios); | ||
| 75 | int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | ||
| 76 | unsigned pgbase, struct page **pages, | ||
| 77 | struct ore_per_dev_state *per_dev, int cur_len); | ||
| 78 | int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); | ||
| 79 | int ore_io_execute(struct ore_io_state *ios); | ||
diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 274894053b02..e6085ec192d6 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c | |||
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/parser.h> | 35 | #include <linux/parser.h> |
| 36 | #include <linux/vfs.h> | 36 | #include <linux/vfs.h> |
| 37 | #include <linux/random.h> | 37 | #include <linux/random.h> |
| 38 | #include <linux/module.h> | ||
| 38 | #include <linux/exportfs.h> | 39 | #include <linux/exportfs.h> |
| 39 | #include <linux/slab.h> | 40 | #include <linux/slab.h> |
| 40 | 41 | ||
| @@ -266,7 +267,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) | |||
| 266 | struct ore_io_state *ios; | 267 | struct ore_io_state *ios; |
| 267 | int ret; | 268 | int ret; |
| 268 | 269 | ||
| 269 | ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); | 270 | ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); |
| 270 | if (unlikely(ret)) { | 271 | if (unlikely(ret)) { |
| 271 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); | 272 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
| 272 | return ret; | 273 | return ret; |
| @@ -321,7 +322,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) | |||
| 321 | struct ore_io_state *ios; | 322 | struct ore_io_state *ios; |
| 322 | int ret; | 323 | int ret; |
| 323 | 324 | ||
| 324 | ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); | 325 | ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); |
| 325 | if (unlikely(ret)) { | 326 | if (unlikely(ret)) { |
| 326 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); | 327 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
| 327 | return ret; | 328 | return ret; |
| @@ -355,12 +356,12 @@ static const struct export_operations exofs_export_ops; | |||
| 355 | /* | 356 | /* |
| 356 | * Write the superblock to the OSD | 357 | * Write the superblock to the OSD |
| 357 | */ | 358 | */ |
| 358 | int exofs_sync_fs(struct super_block *sb, int wait) | 359 | static int exofs_sync_fs(struct super_block *sb, int wait) |
| 359 | { | 360 | { |
| 360 | struct exofs_sb_info *sbi; | 361 | struct exofs_sb_info *sbi; |
| 361 | struct exofs_fscb *fscb; | 362 | struct exofs_fscb *fscb; |
| 362 | struct ore_comp one_comp; | 363 | struct ore_comp one_comp; |
| 363 | struct ore_components comps; | 364 | struct ore_components oc; |
| 364 | struct ore_io_state *ios; | 365 | struct ore_io_state *ios; |
| 365 | int ret = -ENOMEM; | 366 | int ret = -ENOMEM; |
| 366 | 367 | ||
| @@ -378,9 +379,9 @@ int exofs_sync_fs(struct super_block *sb, int wait) | |||
| 378 | * the writeable info is set in exofs_sbi_write_stats() above. | 379 | * the writeable info is set in exofs_sbi_write_stats() above. |
| 379 | */ | 380 | */ |
| 380 | 381 | ||
| 381 | exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID); | 382 | exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); |
| 382 | 383 | ||
| 383 | ret = ore_get_io_state(&sbi->layout, &comps, &ios); | 384 | ret = ore_get_io_state(&sbi->layout, &oc, &ios); |
| 384 | if (unlikely(ret)) | 385 | if (unlikely(ret)) |
| 385 | goto out; | 386 | goto out; |
| 386 | 387 | ||
| @@ -429,19 +430,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path, | |||
| 429 | msg, dev_path ?: "", odi->osdname, _LLU(pid)); | 430 | msg, dev_path ?: "", odi->osdname, _LLU(pid)); |
| 430 | } | 431 | } |
| 431 | 432 | ||
| 432 | void exofs_free_sbi(struct exofs_sb_info *sbi) | 433 | static void exofs_free_sbi(struct exofs_sb_info *sbi) |
| 433 | { | 434 | { |
| 434 | while (sbi->comps.numdevs) { | 435 | unsigned numdevs = sbi->oc.numdevs; |
| 435 | int i = --sbi->comps.numdevs; | 436 | |
| 436 | struct osd_dev *od = sbi->comps.ods[i]; | 437 | while (numdevs) { |
| 438 | unsigned i = --numdevs; | ||
| 439 | struct osd_dev *od = ore_comp_dev(&sbi->oc, i); | ||
| 437 | 440 | ||
| 438 | if (od) { | 441 | if (od) { |
| 439 | sbi->comps.ods[i] = NULL; | 442 | ore_comp_set_dev(&sbi->oc, i, NULL); |
| 440 | osduld_put_device(od); | 443 | osduld_put_device(od); |
| 441 | } | 444 | } |
| 442 | } | 445 | } |
| 443 | if (sbi->comps.ods != sbi->_min_one_dev) | 446 | kfree(sbi->oc.ods); |
| 444 | kfree(sbi->comps.ods); | ||
| 445 | kfree(sbi); | 447 | kfree(sbi); |
| 446 | } | 448 | } |
| 447 | 449 | ||
| @@ -468,7 +470,7 @@ static void exofs_put_super(struct super_block *sb) | |||
| 468 | msecs_to_jiffies(100)); | 470 | msecs_to_jiffies(100)); |
| 469 | } | 471 | } |
| 470 | 472 | ||
| 471 | _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0], | 473 | _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), |
| 472 | sbi->one_comp.obj.partition); | 474 | sbi->one_comp.obj.partition); |
| 473 | 475 | ||
| 474 | bdi_destroy(&sbi->bdi); | 476 | bdi_destroy(&sbi->bdi); |
| @@ -479,76 +481,20 @@ static void exofs_put_super(struct super_block *sb) | |||
| 479 | static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, | 481 | static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, |
| 480 | struct exofs_device_table *dt) | 482 | struct exofs_device_table *dt) |
| 481 | { | 483 | { |
| 482 | u64 stripe_length; | 484 | int ret; |
| 483 | 485 | ||
| 484 | sbi->data_map.odm_num_comps = | 486 | sbi->layout.stripe_unit = |
| 485 | le32_to_cpu(dt->dt_data_map.cb_num_comps); | ||
| 486 | sbi->data_map.odm_stripe_unit = | ||
| 487 | le64_to_cpu(dt->dt_data_map.cb_stripe_unit); | 487 | le64_to_cpu(dt->dt_data_map.cb_stripe_unit); |
| 488 | sbi->data_map.odm_group_width = | 488 | sbi->layout.group_width = |
| 489 | le32_to_cpu(dt->dt_data_map.cb_group_width); | 489 | le32_to_cpu(dt->dt_data_map.cb_group_width); |
| 490 | sbi->data_map.odm_group_depth = | 490 | sbi->layout.group_depth = |
| 491 | le32_to_cpu(dt->dt_data_map.cb_group_depth); | 491 | le32_to_cpu(dt->dt_data_map.cb_group_depth); |
| 492 | sbi->data_map.odm_mirror_cnt = | 492 | sbi->layout.mirrors_p1 = |
| 493 | le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); | 493 | le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; |
| 494 | sbi->data_map.odm_raid_algorithm = | 494 | sbi->layout.raid_algorithm = |
| 495 | le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); | 495 | le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); |
| 496 | 496 | ||
| 497 | /* FIXME: Only raid0 for now. if not so, do not mount */ | 497 | ret = ore_verify_layout(numdevs, &sbi->layout); |
| 498 | if (sbi->data_map.odm_num_comps != numdevs) { | ||
| 499 | EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", | ||
| 500 | sbi->data_map.odm_num_comps, numdevs); | ||
| 501 | return -EINVAL; | ||
| 502 | } | ||
| 503 | if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { | ||
| 504 | EXOFS_ERR("Only RAID_0 for now\n"); | ||
| 505 | return -EINVAL; | ||
| 506 | } | ||
| 507 | if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { | ||
| 508 | EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", | ||
| 509 | numdevs, sbi->data_map.odm_mirror_cnt); | ||
| 510 | return -EINVAL; | ||
| 511 | } | ||
| 512 | |||
| 513 | if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { | ||
| 514 | EXOFS_ERR("Stripe Unit(0x%llx)" | ||
| 515 | " must be Multples of PAGE_SIZE(0x%lx)\n", | ||
| 516 | _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); | ||
| 517 | return -EINVAL; | ||
| 518 | } | ||
| 519 | |||
| 520 | sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; | ||
| 521 | sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; | ||
| 522 | |||
| 523 | if (sbi->data_map.odm_group_width) { | ||
| 524 | sbi->layout.group_width = sbi->data_map.odm_group_width; | ||
| 525 | sbi->layout.group_depth = sbi->data_map.odm_group_depth; | ||
| 526 | if (!sbi->layout.group_depth) { | ||
| 527 | EXOFS_ERR("group_depth == 0 && group_width != 0\n"); | ||
| 528 | return -EINVAL; | ||
| 529 | } | ||
| 530 | sbi->layout.group_count = sbi->data_map.odm_num_comps / | ||
| 531 | sbi->layout.mirrors_p1 / | ||
| 532 | sbi->data_map.odm_group_width; | ||
| 533 | } else { | ||
| 534 | if (sbi->data_map.odm_group_depth) { | ||
| 535 | printk(KERN_NOTICE "Warning: group_depth ignored " | ||
| 536 | "group_width == 0 && group_depth == %d\n", | ||
| 537 | sbi->data_map.odm_group_depth); | ||
| 538 | sbi->data_map.odm_group_depth = 0; | ||
| 539 | } | ||
| 540 | sbi->layout.group_width = sbi->data_map.odm_num_comps / | ||
| 541 | sbi->layout.mirrors_p1; | ||
| 542 | sbi->layout.group_depth = -1; | ||
| 543 | sbi->layout.group_count = 1; | ||
| 544 | } | ||
| 545 | |||
| 546 | stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; | ||
| 547 | if (stripe_length >= (1ULL << 32)) { | ||
| 548 | EXOFS_ERR("Total Stripe length(0x%llx)" | ||
| 549 | " >= 32bit is not supported\n", _LLU(stripe_length)); | ||
| 550 | return -EINVAL; | ||
| 551 | } | ||
| 552 | 498 | ||
| 553 | EXOFS_DBGMSG("exofs: layout: " | 499 | EXOFS_DBGMSG("exofs: layout: " |
| 554 | "num_comps=%u stripe_unit=0x%x group_width=%u " | 500 | "num_comps=%u stripe_unit=0x%x group_width=%u " |
| @@ -558,8 +504,8 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, | |||
| 558 | sbi->layout.group_width, | 504 | sbi->layout.group_width, |
| 559 | _LLU(sbi->layout.group_depth), | 505 | _LLU(sbi->layout.group_depth), |
| 560 | sbi->layout.mirrors_p1, | 506 | sbi->layout.mirrors_p1, |
| 561 | sbi->data_map.odm_raid_algorithm); | 507 | sbi->layout.raid_algorithm); |
| 562 | return 0; | 508 | return ret; |
| 563 | } | 509 | } |
| 564 | 510 | ||
| 565 | static unsigned __ra_pages(struct ore_layout *layout) | 511 | static unsigned __ra_pages(struct ore_layout *layout) |
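The sanity checks deleted above do not disappear; they move behind ore_verify_layout() so every ORE user shares them. A condensed sketch reconstructed from the removed lines (the real routine in ore.c also derives values such as the maximum I/O length, which is not shown here):

/* Sketch reconstructed from the checks removed in this hunk. */
static int sketch_verify_layout(unsigned numdevs, struct ore_layout *layout)
{
	u64 stripe_length;

	if (layout->stripe_unit & ~PAGE_MASK)
		return -EINVAL;	/* stripe_unit must be page aligned */

	if (numdevs % layout->mirrors_p1)
		return -EINVAL;	/* devices must divide into mirror sets */

	if (layout->group_width) {
		if (!layout->group_depth)
			return -EINVAL;	/* width set but no depth */
		layout->group_count = numdevs / layout->mirrors_p1 /
						layout->group_width;
	} else {
		layout->group_width = numdevs / layout->mirrors_p1;
		layout->group_depth = -1;
		layout->group_count = 1;
	}

	stripe_length = (u64)layout->group_width * layout->stripe_unit;
	if (stripe_length >= (1ULL << 32))
		return -EINVAL;	/* full stripe must fit in 32 bits */

	return 0;
}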
| @@ -605,12 +551,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, | |||
| 605 | return !(odi->systemid_len || odi->osdname_len); | 551 | return !(odi->systemid_len || odi->osdname_len); |
| 606 | } | 552 | } |
| 607 | 553 | ||
| 554 | static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, | ||
| 555 | struct exofs_dev **peds) | ||
| 556 | { | ||
| 557 | struct __alloc_ore_devs_and_exofs_devs { | ||
| 558 | /* Twice bigger table: See exofs_init_comps() and comment at | ||
| 559 | * exofs_read_lookup_dev_table() | ||
| 560 | */ | ||
| 561 | struct ore_dev *oreds[numdevs * 2 - 1]; | ||
| 562 | struct exofs_dev eds[numdevs]; | ||
| 563 | } *aoded; | ||
| 564 | struct exofs_dev *eds; | ||
| 565 | unsigned i; | ||
| 566 | |||
| 567 | aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); | ||
| 568 | if (unlikely(!aoded)) { | ||
| 569 | EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", | ||
| 570 | numdevs); | ||
| 571 | return -ENOMEM; | ||
| 572 | } | ||
| 573 | |||
| 574 | sbi->oc.ods = aoded->oreds; | ||
| 575 | *peds = eds = aoded->eds; | ||
| 576 | for (i = 0; i < numdevs; ++i) | ||
| 577 | aoded->oreds[i] = &eds[i].ored; | ||
| 578 | return 0; | ||
| 579 | } | ||
| 580 | |||
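The local struct in __alloc_dev_table() is a sizing trick: one kzalloc() carries both the (2 * numdevs - 1)-entry ore_dev pointer table and the exofs_dev array the pointers resolve to, so teardown is the single kfree(sbi->oc.ods) seen in exofs_free_sbi(). The same layout spelled out without the VLA-in-struct, purely to make the arithmetic explicit (illustrative, not what the driver compiles):

/* Illustrative equivalent of __alloc_dev_table()'s co-allocation. */
static int sketch_alloc_dev_table(struct exofs_sb_info *sbi,
				  unsigned numdevs, struct exofs_dev **peds)
{
	size_t oreds_bytes = (numdevs * 2 - 1) * sizeof(struct ore_dev *);
	struct ore_dev **oreds;
	struct exofs_dev *eds;
	unsigned i;

	oreds = kzalloc(oreds_bytes + numdevs * sizeof(*eds), GFP_KERNEL);
	if (unlikely(!oreds))
		return -ENOMEM;

	eds = (struct exofs_dev *)((char *)oreds + oreds_bytes);
	for (i = 0; i < numdevs; ++i)
		oreds[i] = &eds[i].ored;	/* slot i -> embedded ore_dev */

	sbi->oc.ods = oreds;
	*peds = eds;
	return 0;
}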
| 608 | static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | 581 | static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, |
| 609 | struct osd_dev *fscb_od, | 582 | struct osd_dev *fscb_od, |
| 610 | unsigned table_count) | 583 | unsigned table_count) |
| 611 | { | 584 | { |
| 612 | struct ore_comp comp; | 585 | struct ore_comp comp; |
| 613 | struct exofs_device_table *dt; | 586 | struct exofs_device_table *dt; |
| 587 | struct exofs_dev *eds; | ||
| 614 | unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + | 588 | unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + |
| 615 | sizeof(*dt); | 589 | sizeof(*dt); |
| 616 | unsigned numdevs, i; | 590 | unsigned numdevs, i; |
| @@ -623,7 +597,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
| 623 | return -ENOMEM; | 597 | return -ENOMEM; |
| 624 | } | 598 | } |
| 625 | 599 | ||
| 626 | sbi->comps.numdevs = 0; | 600 | sbi->oc.numdevs = 0; |
| 627 | 601 | ||
| 628 | comp.obj.partition = sbi->one_comp.obj.partition; | 602 | comp.obj.partition = sbi->one_comp.obj.partition; |
| 629 | comp.obj.id = EXOFS_DEVTABLE_ID; | 603 | comp.obj.id = EXOFS_DEVTABLE_ID; |
| @@ -647,20 +621,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
| 647 | if (unlikely(ret)) | 621 | if (unlikely(ret)) |
| 648 | goto out; | 622 | goto out; |
| 649 | 623 | ||
| 650 | if (likely(numdevs > 1)) { | 624 | ret = __alloc_dev_table(sbi, numdevs, &eds); |
| 651 | unsigned size = numdevs * sizeof(sbi->comps.ods[0]); | 625 | if (unlikely(ret)) |
| 652 | 626 | goto out; | |
| 653 | /* Twice bigger table: See exofs_init_comps() and below | 627 | /* exofs round-robins the device table view according to inode |
| 654 | * comment | 628 | * number. We hold a twice-bigger table; hence inodes can point |
| 655 | */ | 629 | * to any device and have a sequential view of the table |
| 656 | sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL); | 630 | * starting at this device. See exofs_init_comps() |
| 657 | if (unlikely(!sbi->comps.ods)) { | 631 | */ |
| 658 | EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", | 632 | memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], |
| 659 | numdevs); | 633 | (numdevs - 1) * sizeof(sbi->oc.ods[0])); |
| 660 | ret = -ENOMEM; | ||
| 661 | goto out; | ||
| 662 | } | ||
| 663 | } | ||
| 664 | 634 | ||
| 665 | for (i = 0; i < numdevs; i++) { | 635 | for (i = 0; i < numdevs; i++) { |
| 666 | struct exofs_fscb fscb; | 636 | struct exofs_fscb fscb; |
| @@ -676,13 +646,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
| 676 | printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", | 646 | printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", |
| 677 | i, odi.osdname); | 647 | i, odi.osdname); |
| 678 | 648 | ||
| 649 | /* the exofs id is currently the table index */ | ||
| 650 | eds[i].did = i; | ||
| 651 | |||
| 679 | /* On all devices the device table is identical. The user can | 652 | /* On all devices the device table is identical. The user can |
| 680 | * specify any one of the participating devices on the command | 653 | * specify any one of the participating devices on the command |
| 681 | * line. We always keep them in device-table order. | 654 | * line. We always keep them in device-table order. |
| 682 | */ | 655 | */ |
| 683 | if (fscb_od && osduld_device_same(fscb_od, &odi)) { | 656 | if (fscb_od && osduld_device_same(fscb_od, &odi)) { |
| 684 | sbi->comps.ods[i] = fscb_od; | 657 | eds[i].ored.od = fscb_od; |
| 685 | ++sbi->comps.numdevs; | 658 | ++sbi->oc.numdevs; |
| 686 | fscb_od = NULL; | 659 | fscb_od = NULL; |
| 687 | continue; | 660 | continue; |
| 688 | } | 661 | } |
| @@ -695,8 +668,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
| 695 | goto out; | 668 | goto out; |
| 696 | } | 669 | } |
| 697 | 670 | ||
| 698 | sbi->comps.ods[i] = od; | 671 | eds[i].ored.od = od; |
| 699 | ++sbi->comps.numdevs; | 672 | ++sbi->oc.numdevs; |
| 700 | 673 | ||
| 701 | /* Read the fscb of the other devices to make sure the FS | 674 | /* Read the fscb of the other devices to make sure the FS |
| 702 | * partition is there. | 675 | * partition is there. |
| @@ -718,21 +691,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
| 718 | 691 | ||
| 719 | out: | 692 | out: |
| 720 | kfree(dt); | 693 | kfree(dt); |
| 721 | if (likely(!ret)) { | 694 | if (unlikely(fscb_od && !ret)) { |
| 722 | unsigned numdevs = sbi->comps.numdevs; | ||
| 723 | |||
| 724 | if (unlikely(fscb_od)) { | ||
| 725 | EXOFS_ERR("ERROR: Bad device-table container device not present\n"); | 695 | EXOFS_ERR("ERROR: Bad device-table container device not present\n"); |
| 726 | osduld_put_device(fscb_od); | 696 | osduld_put_device(fscb_od); |
| 727 | return -EINVAL; | 697 | return -EINVAL; |
| 728 | } | ||
| 729 | /* exofs round-robins the device table view according to inode | ||
| 730 | * number. We hold a: twice bigger table hence inodes can point | ||
| 731 | * to any device and have a sequential view of the table | ||
| 732 | * starting at this device. See exofs_init_comps() | ||
| 733 | */ | ||
| 734 | for (i = 0; i < numdevs - 1; ++i) | ||
| 735 | sbi->comps.ods[i + numdevs] = sbi->comps.ods[i]; | ||
| 736 | } | 698 | } |
| 737 | return ret; | 699 | return ret; |
| 738 | } | 700 | } |
| @@ -783,10 +745,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 783 | sbi->one_comp.obj.partition = opts->pid; | 745 | sbi->one_comp.obj.partition = opts->pid; |
| 784 | sbi->one_comp.obj.id = 0; | 746 | sbi->one_comp.obj.id = 0; |
| 785 | exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); | 747 | exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); |
| 786 | sbi->comps.numdevs = 1; | 748 | sbi->oc.numdevs = 1; |
| 787 | sbi->comps.single_comp = EC_SINGLE_COMP; | 749 | sbi->oc.single_comp = EC_SINGLE_COMP; |
| 788 | sbi->comps.comps = &sbi->one_comp; | 750 | sbi->oc.comps = &sbi->one_comp; |
| 789 | sbi->comps.ods = sbi->_min_one_dev; | ||
| 790 | 751 | ||
| 791 | /* fill in some other data by hand */ | 752 | /* fill in some other data by hand */ |
| 792 | memset(sb->s_id, 0, sizeof(sb->s_id)); | 753 | memset(sb->s_id, 0, sizeof(sb->s_id)); |
| @@ -835,7 +796,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 835 | if (unlikely(ret)) | 796 | if (unlikely(ret)) |
| 836 | goto free_sbi; | 797 | goto free_sbi; |
| 837 | } else { | 798 | } else { |
| 838 | sbi->comps.ods[0] = od; | 799 | struct exofs_dev *eds; |
| 800 | |||
| 801 | ret = __alloc_dev_table(sbi, 1, &eds); | ||
| 802 | if (unlikely(ret)) | ||
| 803 | goto free_sbi; | ||
| 804 | |||
| 805 | ore_comp_set_dev(&sbi->oc, 0, od); | ||
| 839 | } | 806 | } |
| 840 | 807 | ||
| 841 | __sbi_read_stats(sbi); | 808 | __sbi_read_stats(sbi); |
| @@ -875,7 +842,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 875 | goto free_sbi; | 842 | goto free_sbi; |
| 876 | } | 843 | } |
| 877 | 844 | ||
| 878 | _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0], | 845 | _exofs_print_device("Mounting", opts->dev_name, |
| 846 | ore_comp_dev(&sbi->oc, 0), | ||
| 879 | sbi->one_comp.obj.partition); | 847 | sbi->one_comp.obj.partition); |
| 880 | return 0; | 848 | return 0; |
| 881 | 849 | ||
| @@ -924,7 +892,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 924 | uint64_t used = ULLONG_MAX; | 892 | uint64_t used = ULLONG_MAX; |
| 925 | int ret; | 893 | int ret; |
| 926 | 894 | ||
| 927 | ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); | 895 | ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); |
| 928 | if (ret) { | 896 | if (ret) { |
| 929 | EXOFS_DBGMSG("ore_get_io_state failed.\n"); | 897 | EXOFS_DBGMSG("ore_get_io_state failed.\n"); |
| 930 | return ret; | 898 | return ret; |
| @@ -981,7 +949,7 @@ static const struct super_operations exofs_sops = { | |||
| 981 | * EXPORT OPERATIONS | 949 | * EXPORT OPERATIONS |
| 982 | *****************************************************************************/ | 950 | *****************************************************************************/ |
| 983 | 951 | ||
| 984 | struct dentry *exofs_get_parent(struct dentry *child) | 952 | static struct dentry *exofs_get_parent(struct dentry *child) |
| 985 | { | 953 | { |
| 986 | unsigned long ino = exofs_parent_ino(child); | 954 | unsigned long ino = exofs_parent_ino(child); |
| 987 | 955 | ||
