diff options
author | Boaz Harrosh <bharrosh@panasas.com> | 2011-09-28 04:55:51 -0400 |
---|---|---|
committer | Boaz Harrosh <bharrosh@panasas.com> | 2011-10-14 12:52:50 -0400 |
commit | b916c5cd4d895a27b47a652648958f73e4f23ac6 (patch) | |
tree | 9fe6e59edd44119c79a18b9df0b02a0c4dacb6d1 /fs/exofs/inode.c | |
parent | d866d875f68fdeae63df334d291fe138dc636d96 (diff) |
ore: Only IO one group at a time (API change)
Usually a single IO is confined to one group of devices
(group_width), and at the boundary of a raid group it can
spill into a second group. The current code would allocate a
full device_table size array at each io_state so it can
comply with requests that span two groups. Needless to say,
that is very wasteful, especially when the device_table count
can get very large (hundreds, even thousands), while a
group_width is usually 8 or 10.
* Change ore API to trim an IO that spans two raid groups.
The user passes offset+length to ore_get_rw_state; the
ore might trim that length if it spans a group boundary.
The user must check ios->length or ios->nr_pages to see
how much IO will be performed. It is the responsibility
of the user to re-issue the remainder of the IO.
* Modify exofs to copy spilled pages onto the next IO.
This means one last kick is needed after all coalescing
of pages is done.
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Diffstat (limited to 'fs/exofs/inode.c')
-rw-r--r-- | fs/exofs/inode.c | 100 |
1 files changed, 85 insertions, 15 deletions
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 61b2f7e5cdbd..d87c1f7562fb 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c | |||
@@ -259,6 +259,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) | |||
259 | } | 259 | } |
260 | } | 260 | } |
261 | 261 | ||
262 | static int _maybe_not_all_in_one_io(struct ore_io_state *ios, | ||
263 | struct page_collect *pcol_src, struct page_collect *pcol) | ||
264 | { | ||
265 | /* length was wrong or offset was not page aligned */ | ||
266 | BUG_ON(pcol_src->nr_pages < ios->nr_pages); | ||
267 | |||
268 | if (pcol_src->nr_pages > ios->nr_pages) { | ||
269 | struct page **src_page; | ||
270 | unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; | ||
271 | unsigned long len_less = pcol_src->length - ios->length; | ||
272 | unsigned i; | ||
273 | int ret; | ||
274 | |||
275 | /* This IO was trimmed */ | ||
276 | pcol_src->nr_pages = ios->nr_pages; | ||
277 | pcol_src->length = ios->length; | ||
278 | |||
279 | /* Left over pages are passed to the next io */ | ||
280 | pcol->expected_pages += pages_less; | ||
281 | pcol->nr_pages = pages_less; | ||
282 | pcol->length = len_less; | ||
283 | src_page = pcol_src->pages + pcol_src->nr_pages; | ||
284 | pcol->pg_first = (*src_page)->index; | ||
285 | |||
286 | ret = pcol_try_alloc(pcol); | ||
287 | if (unlikely(ret)) | ||
288 | return ret; | ||
289 | |||
290 | for (i = 0; i < pages_less; ++i) | ||
291 | pcol->pages[i] = *src_page++; | ||
292 | |||
293 | EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " | ||
294 | "pages_less=0x%x expected_pages=0x%x " | ||
295 | "next_offset=0x%llx next_len=0x%lx\n", | ||
296 | pcol_src->nr_pages, pages_less, pcol->expected_pages, | ||
297 | pcol->pg_first * PAGE_SIZE, pcol->length); | ||
298 | } | ||
299 | return 0; | ||
300 | } | ||
301 | |||
262 | static int read_exec(struct page_collect *pcol) | 302 | static int read_exec(struct page_collect *pcol) |
263 | { | 303 | { |
264 | struct exofs_i_info *oi = exofs_i(pcol->inode); | 304 | struct exofs_i_info *oi = exofs_i(pcol->inode); |
@@ -280,7 +320,6 @@ static int read_exec(struct page_collect *pcol) | |||
280 | 320 | ||
281 | ios = pcol->ios; | 321 | ios = pcol->ios; |
282 | ios->pages = pcol->pages; | 322 | ios->pages = pcol->pages; |
283 | ios->nr_pages = pcol->nr_pages; | ||
284 | 323 | ||
285 | if (pcol->read_4_write) { | 324 | if (pcol->read_4_write) { |
286 | ore_read(pcol->ios); | 325 | ore_read(pcol->ios); |
@@ -296,17 +335,23 @@ static int read_exec(struct page_collect *pcol) | |||
296 | *pcol_copy = *pcol; | 335 | *pcol_copy = *pcol; |
297 | ios->done = readpages_done; | 336 | ios->done = readpages_done; |
298 | ios->private = pcol_copy; | 337 | ios->private = pcol_copy; |
338 | |||
339 | /* pages ownership was passed to pcol_copy */ | ||
340 | _pcol_reset(pcol); | ||
341 | |||
342 | ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); | ||
343 | if (unlikely(ret)) | ||
344 | goto err; | ||
345 | |||
346 | EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", | ||
347 | pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); | ||
348 | |||
299 | ret = ore_read(ios); | 349 | ret = ore_read(ios); |
300 | if (unlikely(ret)) | 350 | if (unlikely(ret)) |
301 | goto err; | 351 | goto err; |
302 | 352 | ||
303 | atomic_inc(&pcol->sbi->s_curr_pending); | 353 | atomic_inc(&pcol->sbi->s_curr_pending); |
304 | 354 | ||
305 | EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", | ||
306 | oi->one_comp.obj.id, _LLU(ios->offset), pcol->length); | ||
307 | |||
308 | /* pages ownership was passed to pcol_copy */ | ||
309 | _pcol_reset(pcol); | ||
310 | return 0; | 355 | return 0; |
311 | 356 | ||
312 | err: | 357 | err: |
@@ -429,6 +474,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping, | |||
429 | return ret; | 474 | return ret; |
430 | } | 475 | } |
431 | 476 | ||
477 | ret = read_exec(&pcol); | ||
478 | if (unlikely(ret)) | ||
479 | return ret; | ||
480 | |||
432 | return read_exec(&pcol); | 481 | return read_exec(&pcol); |
433 | } | 482 | } |
434 | 483 | ||
@@ -519,7 +568,6 @@ static int write_exec(struct page_collect *pcol) | |||
519 | ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, | 568 | ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, |
520 | pcol->pg_first << PAGE_CACHE_SHIFT, | 569 | pcol->pg_first << PAGE_CACHE_SHIFT, |
521 | pcol->length, &pcol->ios); | 570 | pcol->length, &pcol->ios); |
522 | |||
523 | if (unlikely(ret)) | 571 | if (unlikely(ret)) |
524 | goto err; | 572 | goto err; |
525 | 573 | ||
@@ -534,10 +582,19 @@ static int write_exec(struct page_collect *pcol) | |||
534 | 582 | ||
535 | ios = pcol->ios; | 583 | ios = pcol->ios; |
536 | ios->pages = pcol_copy->pages; | 584 | ios->pages = pcol_copy->pages; |
537 | ios->nr_pages = pcol_copy->nr_pages; | ||
538 | ios->done = writepages_done; | 585 | ios->done = writepages_done; |
539 | ios->private = pcol_copy; | 586 | ios->private = pcol_copy; |
540 | 587 | ||
588 | /* pages ownership was passed to pcol_copy */ | ||
589 | _pcol_reset(pcol); | ||
590 | |||
591 | ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); | ||
592 | if (unlikely(ret)) | ||
593 | goto err; | ||
594 | |||
595 | EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n", | ||
596 | pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); | ||
597 | |||
541 | ret = ore_write(ios); | 598 | ret = ore_write(ios); |
542 | if (unlikely(ret)) { | 599 | if (unlikely(ret)) { |
543 | EXOFS_ERR("write_exec: ore_write() Failed\n"); | 600 | EXOFS_ERR("write_exec: ore_write() Failed\n"); |
@@ -545,11 +602,6 @@ static int write_exec(struct page_collect *pcol) | |||
545 | } | 602 | } |
546 | 603 | ||
547 | atomic_inc(&pcol->sbi->s_curr_pending); | 604 | atomic_inc(&pcol->sbi->s_curr_pending); |
548 | EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", | ||
549 | pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), | ||
550 | pcol->length); | ||
551 | /* pages ownership was passed to pcol_copy */ | ||
552 | _pcol_reset(pcol); | ||
553 | return 0; | 605 | return 0; |
554 | 606 | ||
555 | err: | 607 | err: |
@@ -689,12 +741,30 @@ static int exofs_writepages(struct address_space *mapping, | |||
689 | _pcol_init(&pcol, expected_pages, mapping->host); | 741 | _pcol_init(&pcol, expected_pages, mapping->host); |
690 | 742 | ||
691 | ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); | 743 | ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); |
692 | if (ret) { | 744 | if (unlikely(ret)) { |
693 | EXOFS_ERR("write_cache_pages => %d\n", ret); | 745 | EXOFS_ERR("write_cache_pages => %d\n", ret); |
694 | return ret; | 746 | return ret; |
695 | } | 747 | } |
696 | 748 | ||
697 | return write_exec(&pcol); | 749 | ret = write_exec(&pcol); |
750 | if (unlikely(ret)) | ||
751 | return ret; | ||
752 | |||
753 | if (wbc->sync_mode == WB_SYNC_ALL) { | ||
754 | return write_exec(&pcol); /* pump the last reminder */ | ||
755 | } else if (pcol.nr_pages) { | ||
756 | /* not SYNC let the reminder join the next writeout */ | ||
757 | unsigned i; | ||
758 | |||
759 | for (i = 0; i < pcol.nr_pages; i++) { | ||
760 | struct page *page = pcol.pages[i]; | ||
761 | |||
762 | end_page_writeback(page); | ||
763 | set_page_dirty(page); | ||
764 | unlock_page(page); | ||
765 | } | ||
766 | } | ||
767 | return 0; | ||
698 | } | 768 | } |
699 | 769 | ||
700 | static int exofs_writepage(struct page *page, struct writeback_control *wbc) | 770 | static int exofs_writepage(struct page *page, struct writeback_control *wbc) |