author	Boaz Harrosh <bharrosh@panasas.com>	2011-09-28 04:55:51 -0400
committer	Boaz Harrosh <bharrosh@panasas.com>	2011-10-14 12:52:50 -0400
commit	b916c5cd4d895a27b47a652648958f73e4f23ac6 (patch)
tree	9fe6e59edd44119c79a18b9df0b02a0c4dacb6d1 /fs/exofs/inode.c
parent	d866d875f68fdeae63df334d291fe138dc636d96 (diff)
ore: Only IO one group at a time (API change)
Usually a single IO is confined to one group of devices (group_width), and
only at the boundary of a raid group can it spill into a second group. The
current code would allocate a full device_table-sized array at each io_state
so it could comply with requests that span two groups. Needless to say, that
is very wasteful, especially when the device_table count can get very large
(hundreds, even thousands), while a group_width is usually 8 or 10.

* Change the ore API to trim an IO that spans two raid groups.
  The user passes offset+length to ore_get_rw_state; the ore
  might trim that length if it crosses a group boundary. The user
  must check ios->length or ios->nr_pages to see how much IO will
  actually be performed. It is the user's responsibility to
  re-issue the remainder of the IO.

* Modify exofs to copy spilled pages onto the next IO.
  This means one last kick is needed after all coalescing
  of pages is done.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
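For context, the new calling convention looks roughly like the sketch below. This is not code from this patch: submit_whole_range() is a hypothetical caller, and only ore_get_rw_state(), ore_read()/ore_write(), ios->length and ios->nr_pages are taken from the description above. The caller loops, letting the ORE trim each io_state at a raid-group boundary and re-issuing the remainder itself:

/* Minimal sketch of the trimmed-IO loop described above. Page setup
 * (ios->pages), completion handling (ios->done) and ore_put_io_state()
 * are deliberately omitted; submit_whole_range() is hypothetical.
 */
#include <scsi/osd_ore.h>

static int submit_whole_range(struct ore_layout *layout,
			      struct ore_components *oc,
			      u64 offset, u64 length)
{
	while (length) {
		struct ore_io_state *ios;
		int ret;

		/* Ask for the whole remaining range; the ORE may trim
		 * ios->length so the io_state never spans more than one
		 * raid group.
		 */
		ret = ore_get_rw_state(layout, oc, true /* reading */,
				       offset, length, &ios);
		if (unlikely(ret))
			return ret;

		/* ios->length and ios->nr_pages now say how much of the
		 * request this io_state will actually perform.
		 */
		ret = ore_read(ios);	/* ore_write() on the write path */
		if (unlikely(ret))
			return ret;

		/* Re-issue the remainder, if the IO was trimmed at a
		 * group boundary, on the next iteration.
		 */
		offset += ios->length;
		length -= ios->length;
	}
	return 0;
}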
Diffstat (limited to 'fs/exofs/inode.c')
-rw-r--r--	fs/exofs/inode.c	100
1 file changed, 85 insertions(+), 15 deletions(-)
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 61b2f7e5cdbd..d87c1f7562fb 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -259,6 +259,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
 	}
 }
 
+static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
+	struct page_collect *pcol_src, struct page_collect *pcol)
+{
+	/* length was wrong or offset was not page aligned */
+	BUG_ON(pcol_src->nr_pages < ios->nr_pages);
+
+	if (pcol_src->nr_pages > ios->nr_pages) {
+		struct page **src_page;
+		unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
+		unsigned long len_less = pcol_src->length - ios->length;
+		unsigned i;
+		int ret;
+
+		/* This IO was trimmed */
+		pcol_src->nr_pages = ios->nr_pages;
+		pcol_src->length = ios->length;
+
+		/* Left over pages are passed to the next io */
+		pcol->expected_pages += pages_less;
+		pcol->nr_pages = pages_less;
+		pcol->length = len_less;
+		src_page = pcol_src->pages + pcol_src->nr_pages;
+		pcol->pg_first = (*src_page)->index;
+
+		ret = pcol_try_alloc(pcol);
+		if (unlikely(ret))
+			return ret;
+
+		for (i = 0; i < pages_less; ++i)
+			pcol->pages[i] = *src_page++;
+
+		EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
+			"pages_less=0x%x expected_pages=0x%x "
+			"next_offset=0x%llx next_len=0x%lx\n",
+			pcol_src->nr_pages, pages_less, pcol->expected_pages,
+			pcol->pg_first * PAGE_SIZE, pcol->length);
+	}
+	return 0;
+}
+
 static int read_exec(struct page_collect *pcol)
 {
 	struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -280,7 +320,6 @@ static int read_exec(struct page_collect *pcol)
 
 	ios = pcol->ios;
 	ios->pages = pcol->pages;
-	ios->nr_pages = pcol->nr_pages;
 
 	if (pcol->read_4_write) {
 		ore_read(pcol->ios);
@@ -296,17 +335,23 @@ static int read_exec(struct page_collect *pcol)
 	*pcol_copy = *pcol;
 	ios->done = readpages_done;
 	ios->private = pcol_copy;
+
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+
+	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+	if (unlikely(ret))
+		goto err;
+
+	EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+		pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
 	ret = ore_read(ios);
 	if (unlikely(ret))
 		goto err;
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
 
-	EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
-		oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
-
-	/* pages ownership was passed to pcol_copy */
-	_pcol_reset(pcol);
 	return 0;
 
 err:
@@ -429,6 +474,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
 		return ret;
 	}
 
+	ret = read_exec(&pcol);
+	if (unlikely(ret))
+		return ret;
+
 	return read_exec(&pcol);
 }
 
@@ -519,7 +568,6 @@ static int write_exec(struct page_collect *pcol)
 	ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
 				pcol->pg_first << PAGE_CACHE_SHIFT,
 				pcol->length, &pcol->ios);
-
 	if (unlikely(ret))
 		goto err;
 
@@ -534,10 +582,19 @@ static int write_exec(struct page_collect *pcol)
 
 	ios = pcol->ios;
 	ios->pages = pcol_copy->pages;
-	ios->nr_pages = pcol_copy->nr_pages;
 	ios->done = writepages_done;
 	ios->private = pcol_copy;
 
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+
+	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+	if (unlikely(ret))
+		goto err;
+
+	EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+		pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
 	ret = ore_write(ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("write_exec: ore_write() Failed\n");
@@ -545,11 +602,6 @@ static int write_exec(struct page_collect *pcol)
 	}
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
-	EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
-		pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
-		pcol->length);
-	/* pages ownership was passed to pcol_copy */
-	_pcol_reset(pcol);
 	return 0;
 
 err:
@@ -689,12 +741,30 @@ static int exofs_writepages(struct address_space *mapping,
 	_pcol_init(&pcol, expected_pages, mapping->host);
 
 	ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
-	if (ret) {
+	if (unlikely(ret)) {
 		EXOFS_ERR("write_cache_pages => %d\n", ret);
 		return ret;
 	}
 
-	return write_exec(&pcol);
+	ret = write_exec(&pcol);
+	if (unlikely(ret))
+		return ret;
+
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		return write_exec(&pcol); /* pump the last remainder */
+	} else if (pcol.nr_pages) {
+		/* not SYNC, let the remainder join the next writeout */
+		unsigned i;
+
+		for (i = 0; i < pcol.nr_pages; i++) {
+			struct page *page = pcol.pages[i];
+
+			end_page_writeback(page);
+			set_page_dirty(page);
+			unlock_page(page);
+		}
+	}
+	return 0;
 }
 
 static int exofs_writepage(struct page *page, struct writeback_control *wbc)