author	Boaz Harrosh <bharrosh@panasas.com>	2011-09-28 04:55:51 -0400
committer	Boaz Harrosh <bharrosh@panasas.com>	2011-10-14 12:52:50 -0400
commit	b916c5cd4d895a27b47a652648958f73e4f23ac6 (patch)
tree	9fe6e59edd44119c79a18b9df0b02a0c4dacb6d1 /fs
parent	d866d875f68fdeae63df334d291fe138dc636d96 (diff)
ore: Only IO one group at a time (API change)
Usually a single IO is confined to one group of devices (group_width); only at the boundary of a raid group can it spill into a second group. Current code allocates a full device_table-sized array in each io_state so it can satisfy requests that span two groups. Needless to say, that is very wasteful, especially when the device_table count can get very large (hundreds, even thousands), while a group_width is usually 8 or 10.

* Change the ORE API to trim an IO that spans two raid groups. The user passes offset+length to ore_get_rw_state; the ORE may trim that length if it crosses a group boundary. The user must check ios->length or ios->nr_pages to see how much IO will actually be performed. It is the responsibility of the user to re-issue the remainder of the IO.

* Modify exofs to copy spilled pages onto the next IO. This means one last kick is needed after all coalescing of pages is done.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
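To make the new calling convention concrete, below is a minimal sketch (not part of this patch) of how a caller might loop over a range, honoring a possibly trimmed ios->length and re-issuing the remainder. The helper submit_whole_range() and the read-only usage are illustrative assumptions; only ore_get_rw_state(), ore_read() and ore_put_io_state() are real ORE entry points, and attaching the data pages (ios->pages) is elided.

/* Illustrative sketch only (not from this patch): keep re-issuing the
 * remainder after the ORE trims an IO at a raid-group boundary.
 * Setting up ios->pages for the data transfer is elided for brevity.
 */
#include <scsi/osd_ore.h>

static int submit_whole_range(struct ore_layout *layout,
			      struct ore_components *oc,
			      u64 offset, u64 length)
{
	while (length) {
		struct ore_io_state *ios;
		u64 done;
		int ret;

		ret = ore_get_rw_state(layout, oc, true /* reading */,
				       offset, length, &ios);
		if (unlikely(ret))
			return ret;

		/* The ORE may have trimmed the IO: ios->length (and
		 * ios->nr_pages) say how much is performed this pass.
		 */
		done = ios->length;

		ret = ore_read(ios);	/* synchronous: no ios->done set */
		ore_put_io_state(ios);
		if (unlikely(ret))
			return ret;

		offset += done;
		length -= done;
	}
	return 0;
}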
Diffstat (limited to 'fs')
-rw-r--r--	fs/exofs/inode.c	100
-rw-r--r--	fs/exofs/ore.c	105
2 files changed, 154 insertions(+), 51 deletions(-)
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 61b2f7e5cdbd..d87c1f7562fb 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -259,6 +259,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
 	}
 }
 
+static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
+	struct page_collect *pcol_src, struct page_collect *pcol)
+{
+	/* length was wrong or offset was not page aligned */
+	BUG_ON(pcol_src->nr_pages < ios->nr_pages);
+
+	if (pcol_src->nr_pages > ios->nr_pages) {
+		struct page **src_page;
+		unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
+		unsigned long len_less = pcol_src->length - ios->length;
+		unsigned i;
+		int ret;
+
+		/* This IO was trimmed */
+		pcol_src->nr_pages = ios->nr_pages;
+		pcol_src->length = ios->length;
+
+		/* Left over pages are passed to the next io */
+		pcol->expected_pages += pages_less;
+		pcol->nr_pages = pages_less;
+		pcol->length = len_less;
+		src_page = pcol_src->pages + pcol_src->nr_pages;
+		pcol->pg_first = (*src_page)->index;
+
+		ret = pcol_try_alloc(pcol);
+		if (unlikely(ret))
+			return ret;
+
+		for (i = 0; i < pages_less; ++i)
+			pcol->pages[i] = *src_page++;
+
+		EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
+			"pages_less=0x%x expected_pages=0x%x "
+			"next_offset=0x%llx next_len=0x%lx\n",
+			pcol_src->nr_pages, pages_less, pcol->expected_pages,
+			pcol->pg_first * PAGE_SIZE, pcol->length);
+	}
+	return 0;
+}
+
 static int read_exec(struct page_collect *pcol)
 {
 	struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -280,7 +320,6 @@ static int read_exec(struct page_collect *pcol)
 
 	ios = pcol->ios;
 	ios->pages = pcol->pages;
-	ios->nr_pages = pcol->nr_pages;
 
 	if (pcol->read_4_write) {
 		ore_read(pcol->ios);
@@ -296,17 +335,23 @@ static int read_exec(struct page_collect *pcol)
 	*pcol_copy = *pcol;
 	ios->done = readpages_done;
 	ios->private = pcol_copy;
+
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+
+	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+	if (unlikely(ret))
+		goto err;
+
+	EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+		pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
 	ret = ore_read(ios);
 	if (unlikely(ret))
 		goto err;
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
 
-	EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
-		  oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
-
-	/* pages ownership was passed to pcol_copy */
-	_pcol_reset(pcol);
 	return 0;
 
 err:
@@ -429,6 +474,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
 		return ret;
 	}
 
+	ret = read_exec(&pcol);
+	if (unlikely(ret))
+		return ret;
+
 	return read_exec(&pcol);
 }
 
@@ -519,7 +568,6 @@ static int write_exec(struct page_collect *pcol)
 	ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
 				 pcol->pg_first << PAGE_CACHE_SHIFT,
 				 pcol->length, &pcol->ios);
-
 	if (unlikely(ret))
 		goto err;
 
@@ -534,10 +582,19 @@ static int write_exec(struct page_collect *pcol)
 
 	ios = pcol->ios;
 	ios->pages = pcol_copy->pages;
-	ios->nr_pages = pcol_copy->nr_pages;
 	ios->done = writepages_done;
 	ios->private = pcol_copy;
 
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+
+	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+	if (unlikely(ret))
+		goto err;
+
+	EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+		pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
 	ret = ore_write(ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("write_exec: ore_write() Failed\n");
@@ -545,11 +602,6 @@ static int write_exec(struct page_collect *pcol)
 	}
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
-	EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
-		  pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
-		  pcol->length);
-	/* pages ownership was passed to pcol_copy */
-	_pcol_reset(pcol);
 	return 0;
 
 err:
@@ -689,12 +741,30 @@ static int exofs_writepages(struct address_space *mapping,
 	_pcol_init(&pcol, expected_pages, mapping->host);
 
 	ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
-	if (ret) {
+	if (unlikely(ret)) {
 		EXOFS_ERR("write_cache_pages => %d\n", ret);
 		return ret;
 	}
 
-	return write_exec(&pcol);
+	ret = write_exec(&pcol);
+	if (unlikely(ret))
+		return ret;
+
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		return write_exec(&pcol); /* pump the last reminder */
+	} else if (pcol.nr_pages) {
+		/* not SYNC let the reminder join the next writeout */
+		unsigned i;
+
+		for (i = 0; i < pcol.nr_pages; i++) {
+			struct page *page = pcol.pages[i];
+
+			end_page_writeback(page);
+			set_page_dirty(page);
+			unlock_page(page);
+		}
+	}
+	return 0;
 }
 
 static int exofs_writepage(struct page *page, struct writeback_control *wbc)
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index a7d79257fc65..c1c2cc607adf 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -47,6 +47,9 @@ MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
 MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
 MODULE_LICENSE("GPL");
 
+static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+				 struct ore_striping_info *si);
+
 static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
 {
 	return ios->oc->comps[index & ios->oc->single_comp].cred;
@@ -62,38 +65,85 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
 	return ore_comp_dev(ios->oc, index);
 }
 
-int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
-		     bool is_reading, u64 offset, u64 length,
-		     struct ore_io_state **pios)
+static int _get_io_state(struct ore_layout *layout,
+			 struct ore_components *oc, unsigned numdevs,
+			 struct ore_io_state **pios)
 {
 	struct ore_io_state *ios;
 
 	/*TODO: Maybe use kmem_cach per sbi of size
 	 * exofs_io_state_size(layout->s_numdevs)
 	 */
-	ios = kzalloc(ore_io_state_size(oc->numdevs), GFP_KERNEL);
+	ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL);
 	if (unlikely(!ios)) {
 		ORE_DBGMSG("Failed kzalloc bytes=%d\n",
-			   ore_io_state_size(oc->numdevs));
+			   ore_io_state_size(numdevs));
 		*pios = NULL;
 		return -ENOMEM;
 	}
 
 	ios->layout = layout;
 	ios->oc = oc;
-	ios->offset = offset;
-	ios->length = length;
+	*pios = ios;
+	return 0;
+}
+
+/* Allocate an io_state for only a single group of devices
+ *
+ * If a user needs to call ore_read/write() this version must be used becase it
+ * allocates extra stuff for striping and raid.
+ * The ore might decide to only IO less then @length bytes do to alignmets
+ * and constrains as follows:
+ * - The IO cannot cross group boundary.
+ * - In raid5/6 The end of the IO must align at end of a stripe eg.
+ *   (@offset + @length) % strip_size == 0. Or the complete range is within a
+ *   single stripe.
+ * - Memory condition only permitted a shorter IO. (A user can use @length=~0
+ *   And check the returned ios->length for max_io_size.)
+ *
+ * The caller must check returned ios->length (and/or ios->nr_pages) and
+ * re-issue these pages that fall outside of ios->length
+ */
+int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
+		     bool is_reading, u64 offset, u64 length,
+		     struct ore_io_state **pios)
+{
+	struct ore_io_state *ios;
+	unsigned numdevs = layout->group_width * layout->mirrors_p1;
+	int ret;
+
+	ret = _get_io_state(layout, oc, numdevs, pios);
+	if (unlikely(ret))
+		return ret;
+
+	ios = *pios;
 	ios->reading = is_reading;
+	ios->offset = offset;
+
+	if (length) {
+		struct ore_striping_info si;
+
+		ore_calc_stripe_info(layout, offset, &si);
+		ios->length = (length <= si.group_length) ? length :
+							si.group_length;
+		ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+	}
 
-	*pios = ios;
 	return 0;
 }
 EXPORT_SYMBOL(ore_get_rw_state);
 
+/* Allocate an io_state for all the devices in the comps array
+ *
+ * This version of io_state allocation is used mostly by create/remove
+ * and trunc where we currently need all the devices. The only wastful
+ * bit is the read/write_attributes with no IO. Those sites should
+ * be converted to use ore_get_rw_state() with length=0
+ */
 int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
-		     struct ore_io_state **ios)
+		     struct ore_io_state **pios)
 {
-	return ore_get_rw_state(layout, oc, true, 0, 0, ios);
+	return _get_io_state(layout, oc, oc->numdevs, pios);
 }
 EXPORT_SYMBOL(ore_get_io_state);
 
@@ -374,12 +424,12 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
 	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
 	unsigned dev = si->dev;
 	unsigned first_dev = dev - (dev % devs_in_group);
-	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
 	unsigned cur_pg = ios->pages_consumed;
 	int ret = 0;
 
 	while (length) {
-		struct ore_per_dev_state *per_dev = &ios->per_dev[dev];
+		unsigned comp = dev - first_dev;
+		struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
 		unsigned cur_len, page_off = 0;
 
 		if (!per_dev->length) {
@@ -397,9 +447,6 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
 				per_dev->offset = si->obj_offset - si->unit_off;
 				cur_len = stripe_unit;
 			}
-
-			if (max_comp < dev)
-				max_comp = dev;
 		} else {
 			cur_len = stripe_unit;
 		}
@@ -417,17 +464,15 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
 		length -= cur_len;
 	}
 out:
-	ios->numdevs = max_comp + mirrors_p1;
+	ios->numdevs = devs_in_group;
 	ios->pages_consumed = cur_pg;
 	return ret;
 }
 
 static int _prepare_for_striping(struct ore_io_state *ios)
 {
-	u64 length = ios->length;
-	u64 offset = ios->offset;
 	struct ore_striping_info si;
-	int ret = 0;
+	int ret;
 
 	if (!ios->pages) {
 		if (ios->kern_buff) {
@@ -446,21 +491,11 @@ static int _prepare_for_striping(struct ore_io_state *ios)
 		return 0;
 	}
 
-	while (length) {
-		ore_calc_stripe_info(ios->layout, offset, &si);
-
-		if (length < si.group_length)
-			si.group_length = length;
+	ore_calc_stripe_info(ios->layout, ios->offset, &si);
 
-		ret = _prepare_one_group(ios, si.group_length, &si);
-		if (unlikely(ret))
-			goto out;
+	BUG_ON(ios->length > si.group_length);
+	ret = _prepare_one_group(ios, ios->length, &si);
 
-		offset += si.group_length;
-		length -= si.group_length;
-	}
-
-out:
 	return ret;
 }
 
@@ -742,7 +777,6 @@ struct _trunc_info {
 
 	unsigned first_group_dev;
 	unsigned nex_group_dev;
-	unsigned max_devs;
 };
 
 static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
@@ -757,7 +791,6 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
 
 	ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
 	ti->nex_group_dev = ti->first_group_dev + layout->group_width;
-	ti->max_devs = layout->group_width * layout->group_count;
 }
 
 int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
@@ -777,7 +810,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
 
 	_calc_trunk_info(ios->layout, size, &ti);
 
-	size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs),
+	size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
 			     GFP_KERNEL);
 	if (unlikely(!size_attrs)) {
 		ret = -ENOMEM;
@@ -786,7 +819,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
 
 	ios->numdevs = ios->oc->numdevs;
 
-	for (i = 0; i < ti.max_devs; ++i) {
+	for (i = 0; i < ios->numdevs; ++i) {
 		struct exofs_trunc_attr *size_attr = &size_attrs[i];
 		u64 obj_size;
 