exofs: address_space_operations

OK. Now we start to read from and write to osd-objects. We try to collect as many contiguous pages as possible into a single write/read. The index of the first page gives the offset within the object. TODO: On 64-bit a single bio can carry at most 128 pages; add support for chaining multiple bios. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
author: Boaz Harrosh <bharrosh@panasas.com> 2008-10-27 13:31:34 -0400
committer: Boaz Harrosh <bharrosh@panasas.com> 2009-03-31 12:44:29 -0400
commit: beaec07ba6af35d387643b76a2920a7a6e22207b (patch)
tree: 74ffd4738198424f698ae238e4d3164938ef5af7 /fs/exofs/inode.c
parent: 982980d753223fda3864038236b7b94e246895cb (diff)
1 files changed, 697 insertions, 0 deletions
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index b0bda1e91225..a3691d8bfb98 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -35,6 +35,7 @@
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
+#include <scsi/scsi_device.h>
 #include "exofs.h"
@@ -42,6 +43,702 @@
 #  define EXOFS_DEBUG_OBJ_ISIZE 1
 #endif
+/* Accumulates the pages of one osd read or write request.
+ * Set up by _pcol_init(), grown by pcol_add_page() and emptied again by
+ * _pcol_reset() once the collected pages have been handed to a request.
+ */
+struct page_collect {
+        struct exofs_sb_info *sbi; /* inode->i_sb->s_fs_info of @inode */
+        struct request_queue *req_q; /* queue handed to bio_add_pc_page() */
+        struct inode *inode; /* inode whose pages are being collected */
+        unsigned expected_pages; /* sizing hint used when allocating @bio */
+        struct bio *bio; /* bio under construction, NULL when there is none */
+        unsigned nr_pages; /* number of pages added to @bio so far */
+        unsigned long length; /* number of bytes added to @bio so far */
+        /* index of the first collected page, -1 while nothing is collected */
+        loff_t pg_first; /* keep 64bit also in 32-arches */
+};
+/* Prepare @pcol for collecting pages of @inode.
+ * @expected_pages is recorded as the sizing hint for later bio allocation.
+ * The collection starts out empty: no bio, no pages, pg_first == -1.
+ */
+static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
+                struct inode *inode)
+{
+        /* who the collection belongs to */
+        pcol->sbi = inode->i_sb->s_fs_info;
+        pcol->req_q = pcol->sbi->s_dev->scsi_device->request_queue;
+        pcol->inode = inode;
+        pcol->expected_pages = expected_pages;
+        /* nothing has been collected yet */
+        pcol->pg_first = -1;
+        pcol->length = 0;
+        pcol->nr_pages = 0;
+        pcol->bio = NULL;
+        EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
+                     expected_pages);
+}
+/* Empty @pcol after its pages were handed over to a request.
+ * The pages just consumed are subtracted from the expected_pages hint
+ * (never going below zero) and the bio pointer is dropped without being
+ * released; callers do this after copying @pcol for the in-flight request.
+ */
+static void _pcol_reset(struct page_collect *pcol)
+{
+        unsigned remaining = pcol->expected_pages;
+        if (pcol->nr_pages < remaining)
+                remaining -= pcol->nr_pages;
+        else
+                remaining = 0;
+        pcol->expected_pages = remaining;
+        pcol->pg_first = -1;
+        pcol->length = 0;
+        pcol->nr_pages = 0;
+        pcol->bio = NULL;
+        EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
+                     pcol->inode->i_ino, pcol->expected_pages);
+        /* Usually the loop is over by now, but for writes more pages may
+         * still arrive, so never leave the hint at zero.
+         */
+        if (pcol->expected_pages == 0)
+                pcol->expected_pages = 128;
+}
+/* Allocate pcol->bio for the pages that are about to be collected.
+ * Starts with min(expected_pages, BIO_MAX_PAGES) vectors and halves the
+ * request after every failed bio_alloc() until one succeeds.
+ * Returns 0 on success, -ENOMEM if no allocation succeeded (which includes
+ * the case of expected_pages being 0, where nothing is attempted).
+ */
+static int pcol_try_alloc(struct page_collect *pcol)
+{
+        int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
+        for (; pages; pages >>= 1) {
+                pcol->bio = bio_alloc(GFP_KERNEL, pages);
+                if (likely(pcol->bio))
+                        return 0;
+        }
+        /* name the call that actually failed, it is not kcalloc */
+        EXOFS_ERR("Failed to bio_alloc expected_pages=%u\n",
+                  pcol->expected_pages);
+        return -ENOMEM;
+}
+/* Drop the reference on pcol->bio and forget the pointer.
+ * The bio is passed to bio_put() unconditionally, so callers only use this
+ * while a bio is attached.
+ */
+static void pcol_free(struct page_collect *pcol)
+{
+        bio_put(pcol->bio);
+        pcol->bio = NULL;
+}
+/* Append @len bytes of @page (from offset 0) to the bio being built.
+ * On success the page and byte counters of @pcol are advanced and 0 is
+ * returned. If bio_add_pc_page() did not take exactly @len bytes the
+ * counters are left alone and -ENOMEM is returned; the callers react to
+ * that by submitting what they have and retrying the page.
+ */
+static int pcol_add_page(struct page_collect *pcol, struct page *page,
+                         unsigned len)
+{
+        int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
+        if (likely(len == added_len)) {
+                pcol->length += len;
+                pcol->nr_pages++;
+                return 0;
+        }
+        return -ENOMEM;
+}
+/* Set the state of @page according to the read status @ret.
+ *   0        - the page is marked up to date and any error flag is cleared.
+ *   -EFAULT  - treated as a read of data that is not on disk yet: the page
+ *              is zeroed, marked up to date and the error is forgiven.
+ *   other    - the page is flagged with an error.
+ * Returns 0 when the page ended up valid, otherwise @ret unchanged.
+ */
+static int update_read_page(struct page *page, int ret)
+{
+        const bool recovered = (ret == -EFAULT);
+        if (ret != 0 && !recovered) {
+                /* genuine failure */
+                SetPageError(page);
+                return ret;
+        }
+        /* Nothing on disk yet: hand back zeroes. This should be fine as
+         * the object should be empty (a write preceding this read would
+         * have kept the read waiting on the page lock).
+         */
+        if (recovered)
+                clear_highpage(page);
+        SetPageUptodate(page);
+        if (PageError(page))
+                ClearPageError(page);
+        if (recovered) {
+                EXOFS_DBGMSG("recovered read error\n");
+        }
+        return 0;
+}
+/* Finish the writeback of @page with status @ret.
+ * A non-zero @ret is recorded on the page's mapping and flags the page with
+ * an error; writeback is ended on the page in either case.
+ */
+static void update_write_page(struct page *page, int ret)
+{
+        if (ret) {
+                mapping_set_error(page->mapping, ret);
+                SetPageError(page);
+        }
+        end_page_writeback(page);
+}
+/* Called at the end of reads, to optionally unlock pages and update their
+ * status.
+ * @or:        the finished osd request; it is ended here.
+ * @pcol:      collection whose bio was read into; the bio is released here.
+ * @do_unlock: when true, every processed page is also unlocked.
+ * Returns the result of the last update_read_page() call made in the loop,
+ * or the exofs_check_ok_resid() status if the loop processed no page.
+ */
+static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
+                            bool do_unlock)
+{
+        struct bio_vec *bvec;
+        int i;
+        u64 resid;
+        u64 good_bytes;
+        u64 length = 0;
+        int ret = exofs_check_ok_resid(or, &resid, NULL);
+        osd_end_request(or);
+        /* how many leading bytes of the bio are considered good */
+        if (likely(!ret))
+                good_bytes = pcol->length;
+        else if (!resid)
+                /* failure without a residual count: trust nothing */
+                good_bytes = 0;
+        else
+                good_bytes = pcol->length - resid;
+        EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
+                     " length=0x%lx nr_pages=%u\n",
+                     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
+                     pcol->nr_pages);
+        __bio_for_each_segment(bvec, pcol->bio, i, 0) {
+                struct page *page = bvec->bv_page;
+                struct inode *inode = page->mapping->host;
+                int page_stat;
+                if (inode != pcol->inode)
+                        continue; /* osd might add more pages at end */
+                /* segments starting below good_bytes are fine, the others
+                 * take the current value of ret
+                 */
+                if (likely(length < good_bytes))
+                        page_stat = 0;
+                else
+                        page_stat = ret;
+                EXOFS_DBGMSG("    readpages_done(0x%lx, 0x%lx) %s\n",
+                          inode->i_ino, page->index,
+                          page_stat ? "bad_bytes" : "good_bytes");
+                /* ret is overwritten here, so a later page_stat = ret uses
+                 * the outcome of this page, not the original request status
+                 */
+                ret = update_read_page(page, page_stat);
+                if (do_unlock)
+                        unlock_page(page);
+                length += bvec->bv_len;
+        }
+        pcol_free(pcol);
+        EXOFS_DBGMSG("readpages_done END\n");
+        return ret;
+}
+/* callback of async reads
+ * @p is the heap copy of the page_collect made by read_exec(). The pages
+ * are updated and unlocked, the pending counter of the super block is
+ * decremented and the copy is freed.
+ */
+static void readpages_done(struct osd_request *or, void *p)
+{
+        struct page_collect *pcol = p;
+        __readpages_done(or, pcol, true);
+        atomic_dec(&pcol->sbi->s_curr_pending);
+        kfree(p);
+}
+/* Error-path helper: give every page of pcol->bio the status @ret, unlock
+ * it, and release the bio.
+ * @rw selects the per-page update: READ uses update_read_page(), anything
+ * else uses update_write_page().
+ */
+static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
+{
+        const bool reading = (rw == READ);
+        struct bio_vec *seg;
+        int idx;
+        __bio_for_each_segment(seg, pcol->bio, idx, 0) {
+                struct page *pg = seg->bv_page;
+                if (!reading)
+                        update_write_page(pg, ret);
+                else
+                        update_read_page(pg, ret);
+                unlock_page(pg);
+        }
+        pcol_free(pcol);
+}
+/* Submit the pages collected in @pcol as one osd read.
+ * Does nothing and returns 0 when no bio has been built.
+ * @is_sync true:  the read is executed synchronously and the pages are
+ *                 updated, but left locked, by __readpages_done(); its
+ *                 result is returned.
+ * @is_sync false: @pcol is copied to the heap, the request is started
+ *                 asynchronously with readpages_done() as its callback and
+ *                 @pcol is reset for further collection. Returns 0.
+ * On failure a negative error is returned. In the async case the collected
+ * pages are marked with the error and unlocked; in the sync case the pages
+ * are left to the caller and only the bio is released.
+ */
+static int read_exec(struct page_collect *pcol, bool is_sync)
+{
+        struct exofs_i_info *oi = exofs_i(pcol->inode);
+        struct osd_obj_id obj = {pcol->sbi->s_pid,
+                                        pcol->inode->i_ino + EXOFS_OBJ_OFF};
+        struct osd_request *or = NULL;
+        struct page_collect *pcol_copy = NULL;
+        loff_t i_start;
+        int ret;
+        if (!pcol->bio)
+                return 0;
+        /* Computed only after the check above: an empty collection has
+         * pg_first == -1 and shifting a negative value is undefined.
+         */
+        i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
+        /* see comment in _readpage() about sync reads */
+        WARN_ON(is_sync && (pcol->nr_pages != 1));
+        or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
+        if (unlikely(!or)) {
+                ret = -ENOMEM;
+                goto err;
+        }
+        osd_req_read(or, &obj, pcol->bio, i_start);
+        if (is_sync) {
+                exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
+                return __readpages_done(or, pcol, false);
+        }
+        pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
+        if (!pcol_copy) {
+                ret = -ENOMEM;
+                goto err;
+        }
+        *pcol_copy = *pcol;
+        ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
+        if (unlikely(ret))
+                goto err;
+        atomic_inc(&pcol->sbi->s_curr_pending);
+        EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
+                  obj.id, _LLU(i_start), pcol->length);
+        /* pages ownership was passed to pcol_copy */
+        _pcol_reset(pcol);
+        return 0;
+err:
+        if (!is_sync)
+                _unlock_pcol_pages(pcol, ret, READ);
+        else /* pages stay with the caller, but the bio must not leak */
+                pcol_free(pcol);
+        kfree(pcol_copy);
+        if (or)
+                osd_end_request(or);
+        return ret;
+}
+/* readpage_strip is called either directly from _readpage() or by the VFS from
+ * within read_cache_pages(), to add one more page to be read. It will try to
+ * collect as many contiguous pages as possible. If a discontinuity is
+ * encountered, or it runs out of resources, it will submit the previous segment
+ * and will start a new collection. Eventually caller must submit the last
+ * segment if present.
+ * @data is the struct page_collect being filled, @page the page to add.
+ * Returns 0 on success; on failure the page is unlocked and a negative error
+ * is returned.
+ */
+static int readpage_strip(void *data, struct page *page)
+{
+        struct page_collect *pcol = data;
+        struct inode *inode = pcol->inode;
+        struct exofs_i_info *oi = exofs_i(inode);
+        loff_t i_size = i_size_read(inode);
+        pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+        size_t len;
+        int ret;
+        /* FIXME: Just for debugging, will be removed */
+        if (PageUptodate(page))
+                EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
+                          page->index);
+        /* number of bytes of this page that lie inside i_size */
+        if (page->index < end_index)
+                len = PAGE_CACHE_SIZE;
+        else if (page->index == end_index)
+                len = i_size & ~PAGE_CACHE_MASK;
+        else
+                len = 0;
+        if (!len || !obj_created(oi)) {
+                /* this will be out of bounds, or doesn't exist yet.
+                 * Current page is cleared and the request is split
+                 */
+                clear_highpage(page);
+                SetPageUptodate(page);
+                if (PageError(page))
+                        ClearPageError(page);
+                /* NOTE(review): the page is unlocked here even when the
+                 * caller is the sync _readpage() path - confirm callers
+                 * cope with getting the page back unlocked.
+                 */
+                unlock_page(page);
+                EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
+                             " splitting\n", inode->i_ino, page->index);
+                return read_exec(pcol, false);
+        }
+try_again:
+        if (unlikely(pcol->pg_first == -1)) {
+                /* first page of a new collection */
+                pcol->pg_first = page->index;
+        } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
+                   page->index)) {
+                /* Discontinuity detected, split the request */
+                ret = read_exec(pcol, false);
+                if (unlikely(ret))
+                        goto fail;
+                goto try_again;
+        }
+        if (!pcol->bio) {
+                ret = pcol_try_alloc(pcol);
+                if (unlikely(ret))
+                        goto fail;
+        }
+        /* partial last page: zero the tail that will not be read */
+        if (len != PAGE_CACHE_SIZE)
+                zero_user(page, len, PAGE_CACHE_SIZE - len);
+        EXOFS_DBGMSG("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+                     inode->i_ino, page->index, len);
+        ret = pcol_add_page(pcol, page, len);
+        if (ret) {
+                EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
+                          "this_len=0x%zx nr_pages=%u length=0x%lx\n",
+                          page, len, pcol->nr_pages, pcol->length);
+                /* split the request, and start again with current page */
+                ret = read_exec(pcol, false);
+                if (unlikely(ret))
+                        goto fail;
+                goto try_again;
+        }
+        return 0;
+fail:
+        /* SetPageError(page); ??? */
+        unlock_page(page);
+        return ret;
+}
+/* ->readpages: feed every page of @pages through readpage_strip() and then
+ * submit whatever is still collected as an async read.
+ * @file is not used. Returns 0 or a negative error.
+ */
+static int exofs_readpages(struct file *file, struct address_space *mapping,
+                           struct list_head *pages, unsigned nr_pages)
+{
+        struct page_collect pcol;
+        int err;
+        _pcol_init(&pcol, nr_pages, mapping->host);
+        err = read_cache_pages(mapping, pages, readpage_strip, &pcol);
+        if (!err)
+                return read_exec(&pcol, false); /* flush the last segment */
+        EXOFS_ERR("read_cache_pages => %d\n", err);
+        return err;
+}
+/* Read a single @page, synchronously when @is_sync is true.
+ * Returns 0 or a negative error.
+ */
+static int _readpage(struct page *page, bool is_sync)
+{
+        struct page_collect pcol;
+        int err;
+        _pcol_init(&pcol, 1, page->mapping->host);
+        /* readpage_strip() may itself call read_exec(, false) in several
+         * places. That is considered safe for a sync read as well: with
+         * a single page those inner calls are expected to find nothing
+         * collected yet and so to do nothing.
+         */
+        err = readpage_strip(&pcol, page);
+        if (!err)
+                return read_exec(&pcol, is_sync);
+        EXOFS_ERR("_readpage => %d\n", err);
+        return err;
+}
+/*
+ * ->readpage: start an asynchronous read of @page through _readpage().
+ * We don't need the file
+ */
+static int exofs_readpage(struct file *file, struct page *page)
+{
+        return _readpage(page, false);
+}
+/* Callback for osd_write. All writes are asynchronous.
+ * @p is the heap copy of the page_collect made by write_exec(). The request
+ * is ended, the pending counter of the super block is decremented, every
+ * page of the bio has its writeback ended and is unlocked, and finally the
+ * bio and the copy are released.
+ */
+static void writepages_done(struct osd_request *or, void *p)
+{
+        struct page_collect *pcol = p;
+        struct bio_vec *bvec;
+        int i;
+        u64 resid;
+        u64  good_bytes;
+        u64  length = 0;
+        int ret = exofs_check_ok_resid(or, NULL, &resid);
+        osd_end_request(or);
+        atomic_dec(&pcol->sbi->s_curr_pending);
+        /* how many leading bytes of the bio are considered written */
+        if (likely(!ret))
+                good_bytes = pcol->length;
+        else if (!resid)
+                /* failure without a residual count: trust nothing */
+                good_bytes = 0;
+        else
+                good_bytes = pcol->length - resid;
+        EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
+                     " length=0x%lx nr_pages=%u\n",
+                     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
+                     pcol->nr_pages);
+        __bio_for_each_segment(bvec, pcol->bio, i, 0) {
+                struct page *page = bvec->bv_page;
+                struct inode *inode = page->mapping->host;
+                int page_stat;
+                if (inode != pcol->inode)
+                        continue; /* osd might add more pages to a bio */
+                /* segments starting below good_bytes succeeded, the others
+                 * get the request error
+                 */
+                if (likely(length < good_bytes))
+                        page_stat = 0;
+                else
+                        page_stat = ret;
+                update_write_page(page, page_stat);
+                unlock_page(page);
+                EXOFS_DBGMSG("    writepages_done(0x%lx, 0x%lx) status=%d\n",
+                             inode->i_ino, page->index, page_stat);
+                length += bvec->bv_len;
+        }
+        pcol_free(pcol);
+        kfree(pcol);
+        EXOFS_DBGMSG("writepages_done END\n");
+}
+/* Submit the pages collected in @pcol as one asynchronous osd write.
+ * Does nothing and returns 0 when no bio has been built. Otherwise @pcol is
+ * copied to the heap, the request is started with writepages_done() as its
+ * callback and @pcol is reset for further collection.
+ * Returns 0 on success. On failure the collected pages get the error, have
+ * their writeback ended and are unlocked, and a negative error is returned.
+ */
+static int write_exec(struct page_collect *pcol)
+{
+        struct exofs_i_info *oi = exofs_i(pcol->inode);
+        struct osd_obj_id obj = {pcol->sbi->s_pid,
+                                        pcol->inode->i_ino + EXOFS_OBJ_OFF};
+        struct osd_request *or = NULL;
+        struct page_collect *pcol_copy = NULL;
+        loff_t i_start;
+        int ret;
+        if (!pcol->bio)
+                return 0;
+        /* Computed only after the check above: an empty collection has
+         * pg_first == -1 and shifting a negative value is undefined.
+         */
+        i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
+        or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
+        if (unlikely(!or)) {
+                EXOFS_ERR("write_exec: Failed to osd_start_request()\n");
+                ret = -ENOMEM;
+                goto err;
+        }
+        /* copy first, so the request is built on the copy's bio pointer */
+        pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
+        if (!pcol_copy) {
+                EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
+                ret = -ENOMEM;
+                goto err;
+        }
+        *pcol_copy = *pcol;
+        osd_req_write(or, &obj, pcol_copy->bio, i_start);
+        ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
+        if (unlikely(ret)) {
+                EXOFS_ERR("write_exec: exofs_async_op() Failed\n");
+                goto err;
+        }
+        atomic_inc(&pcol->sbi->s_curr_pending);
+        EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
+                  pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
+                  pcol->length);
+        /* pages ownership was passed to pcol_copy */
+        _pcol_reset(pcol);
+        return 0;
+err:
+        _unlock_pcol_pages(pcol, ret, WRITE);
+        kfree(pcol_copy);
+        if (or)
+                osd_end_request(or);
+        return ret;
+}
+/* writepage_strip is called either directly from writepage() or by the VFS from
+ * within write_cache_pages(), to add one more page to be written to storage.
+ * It will try to collect as many contiguous pages as possible. If a
+ * discontinuity is encountered or it runs out of resources it will submit the
+ * previous segment and will start a new collection.
+ * Eventually caller must submit the last segment if present.
+ * @page must be locked on entry, @wbc_unused is ignored and @data is the
+ * struct page_collect being filled.
+ * Returns 0 on success, with the page either put under writeback or, when it
+ * lies beyond i_size, simply unlocked. On failure AS_EIO is set on the
+ * mapping, the page is unlocked and a negative error is returned.
+ */
+static int writepage_strip(struct page *page,
+                           struct writeback_control *wbc_unused, void *data)
+{
+        struct page_collect *pcol = data;
+        struct inode *inode = pcol->inode;
+        struct exofs_i_info *oi = exofs_i(inode);
+        loff_t i_size = i_size_read(inode);
+        pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+        size_t len;
+        int ret;
+        BUG_ON(!PageLocked(page));
+        ret = wait_obj_created(oi);
+        if (unlikely(ret))
+                goto fail;
+        if (page->index < end_index)
+                /* in this case, the page is within the limits of the file */
+                len = PAGE_CACHE_SIZE;
+        else {
+                len = i_size & ~PAGE_CACHE_MASK;
+                if (page->index > end_index || !len) {
+                        /* in this case, the page is outside the limits
+                         * (truncate in progress)
+                         */
+                        ret = write_exec(pcol);
+                        if (unlikely(ret))
+                                goto fail;
+                        if (PageError(page))
+                                ClearPageError(page);
+                        unlock_page(page);
+                        return 0;
+                }
+        }
+try_again:
+        if (unlikely(pcol->pg_first == -1)) {
+                /* first page of a new collection */
+                pcol->pg_first = page->index;
+        } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
+                   page->index)) {
+                /* Discontinuity detected, split the request */
+                ret = write_exec(pcol);
+                if (unlikely(ret))
+                        goto fail;
+                goto try_again;
+        }
+        if (!pcol->bio) {
+                ret = pcol_try_alloc(pcol);
+                if (unlikely(ret))
+                        goto fail;
+        }
+        EXOFS_DBGMSG("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+                     inode->i_ino, page->index, len);
+        ret = pcol_add_page(pcol, page, len);
+        if (unlikely(ret)) {
+                EXOFS_DBGMSG("Failed pcol_add_page "
+                             "nr_pages=%u total_length=0x%lx\n",
+                             pcol->nr_pages, pcol->length);
+                /* split the request, next loop will start again */
+                ret = write_exec(pcol);
+                if (unlikely(ret)) {
+                        /* terminate the line so log entries do not merge */
+                        EXOFS_DBGMSG("write_exec failed => %d\n", ret);
+                        goto fail;
+                }
+                goto try_again;
+        }
+        /* the page is now part of the bio: mark it as being written */
+        BUG_ON(PageWriteback(page));
+        set_page_writeback(page);
+        return 0;
+fail:
+        set_bit(AS_EIO, &page->mapping->flags);
+        unlock_page(page);
+        return ret;
+}
+/* ->writepages: push the dirty pages selected by @wbc through
+ * writepage_strip() and submit whatever is still collected afterwards.
+ * The expected number of pages is estimated from the range in @wbc and is
+ * capped at 32 whenever the range is not 0..0.
+ * Returns 0 or a negative error.
+ */
+static int exofs_writepages(struct address_space *mapping,
+                       struct writeback_control *wbc)
+{
+        struct page_collect pcol;
+        long first, last, expected_pages;
+        int err;
+        first = wbc->range_start >> PAGE_CACHE_SHIFT;
+        if (wbc->range_end == LLONG_MAX)
+                /* open-ended range: assume every cached page may follow */
+                last = first + mapping->nrpages;
+        else
+                last = wbc->range_end >> PAGE_CACHE_SHIFT;
+        if (!first && !last)
+                expected_pages = mapping->nrpages;
+        else
+                expected_pages = min(last - first + 1, 32L);
+        EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
+                     " m->nrpages=%lu start=0x%lx end=0x%lx\n",
+                     mapping->host->i_ino, wbc->range_start, wbc->range_end,
+                     mapping->nrpages, first, last);
+        _pcol_init(&pcol, expected_pages, mapping->host);
+        err = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
+        if (!err)
+                return write_exec(&pcol); /* flush the last segment */
+        EXOFS_ERR("write_cache_pages => %d\n", err);
+        return err;
+}
+/* ->writepage: collect the single @page and submit it right away.
+ * @wbc is not passed on. Returns 0 or a negative error.
+ */
+static int exofs_writepage(struct page *page, struct writeback_control *wbc)
+{
+        struct page_collect pcol;
+        int err;
+        _pcol_init(&pcol, 1, page->mapping->host);
+        err = writepage_strip(page, NULL, &pcol);
+        if (!err)
+                return write_exec(&pcol);
+        EXOFS_ERR("exofs_writepage => %d\n", err);
+        return err;
+}
+/* Prepare a page for a write of @len bytes at @pos.
+ * When *pagep is NULL on entry the page is obtained through
+ * simple_write_begin(); otherwise the supplied page is used as it is.
+ * A page that is not up to date and is not going to be written in full
+ * (len != PAGE_CACHE_SIZE) is first read synchronously via _readpage().
+ * Returns 0 on success, or the error of simple_write_begin() / _readpage().
+ */
+int exofs_write_begin(struct file *file, struct address_space *mapping,
+                loff_t pos, unsigned len, unsigned flags,
+                struct page **pagep, void **fsdata)
+{
+        int ret = 0;
+        struct page *page;
+        page = *pagep;
+        if (page == NULL) {
+                ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
+                                         fsdata);
+                if (ret) {
+                        EXOFS_DBGMSG("simple_write_begin faild\n");
+                        return ret;
+                }
+                page = *pagep;
+        }
+         /* read modify write */
+        if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+                ret = _readpage(page, true);
+                if (ret) {
+                        /*SetPageError was done by _readpage. Is it ok?*/
+                        /* NOTE(review): readpage_strip()'s fail path also
+                         * unlocks the page - confirm this cannot end up
+                         * unlocking the same page twice.
+                         */
+                        unlock_page(page);
+                        EXOFS_DBGMSG("__readpage_filler faild\n");
+                }
+        }
+        return ret;
+}
+/* ->write_begin entry point: clears *pagep so that exofs_write_begin()
+ * always obtains the page itself, then forwards all arguments to it.
+ */
+static int exofs_write_begin_export(struct file *file,
+                struct address_space *mapping,
+                loff_t pos, unsigned len, unsigned flags,
+                struct page **pagep, void **fsdata)
+{
+        *pagep = NULL;
+        return exofs_write_begin(file, mapping, pos, len, flags, pagep,
+                                        fsdata);
+}
+/* Address-space operations of exofs: reads and writes are served by the
+ * page collecting functions of this file, write_end by simple_write_end().
+ */
+const struct address_space_operations exofs_aops = {
+        .readpage       = exofs_readpage,
+        .readpages      = exofs_readpages,
+        .writepage      = exofs_writepage,
+        .writepages     = exofs_writepages,
+        .write_begin    = exofs_write_begin_export,
+        .write_end      = simple_write_end,
+};
 /******************************************************************************
 * INODE OPERATIONS
 *****************************************************************************/
author	Boaz Harrosh <bharrosh@panasas.com>	2008-10-27 13:31:34 -0400
committer	Boaz Harrosh <bharrosh@panasas.com>	2009-03-31 12:44:29 -0400
commit	beaec07ba6af35d387643b76a2920a7a6e22207b (patch)
tree	74ffd4738198424f698ae238e4d3164938ef5af7 /fs/exofs/inode.c
parent	982980d753223fda3864038236b7b94e246895cb (diff)

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index b0bda1e91225..a3691d8bfb98 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c
@@ -35,6 +35,7 @@
35		35
36	#include <linux/writeback.h>	36	#include <linux/writeback.h>
37	#include <linux/buffer_head.h>	37	#include <linux/buffer_head.h>
		38	#include <scsi/scsi_device.h>
38		39
39	#include "exofs.h"	40	#include "exofs.h"
40		41
@@ -42,6 +43,702 @@
42	# define EXOFS_DEBUG_OBJ_ISIZE 1	43	# define EXOFS_DEBUG_OBJ_ISIZE 1
43	#endif	44	#endif
44		45
		46	struct page_collect {
		47	struct exofs_sb_info *sbi;
		48	struct request_queue *req_q;
		49	struct inode *inode;
		50	unsigned expected_pages;
		51
		52	struct bio *bio;
		53	unsigned nr_pages;
		54	unsigned long length;
		55	loff_t pg_first; /* keep 64bit also in 32-arches */
		56	};
		57
		58	static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
		59	struct inode *inode)
		60	{
		61	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
		62	struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
		63
		64	pcol->sbi = sbi;
		65	pcol->req_q = req_q;
		66	pcol->inode = inode;
		67	pcol->expected_pages = expected_pages;
		68
		69	pcol->bio = NULL;
		70	pcol->nr_pages = 0;
		71	pcol->length = 0;
		72	pcol->pg_first = -1;
		73
		74	EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
		75	expected_pages);
		76	}
		77
		78	static void _pcol_reset(struct page_collect *pcol)
		79	{
		80	pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
		81
		82	pcol->bio = NULL;
		83	pcol->nr_pages = 0;
		84	pcol->length = 0;
		85	pcol->pg_first = -1;
		86	EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
		87	pcol->inode->i_ino, pcol->expected_pages);
		88
		89	/* this is probably the end of the loop but in writes
		90	* it might not end here. don't be left with nothing
		91	*/
		92	if (!pcol->expected_pages)
		93	pcol->expected_pages = 128;
		94	}
		95
		96	static int pcol_try_alloc(struct page_collect *pcol)
		97	{
		98	int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
		99
		100	for (; pages; pages >>= 1) {
		101	pcol->bio = bio_alloc(GFP_KERNEL, pages);
		102	if (likely(pcol->bio))
		103	return 0;
		104	}
		105
		106	EXOFS_ERR("Failed to kcalloc expected_pages=%u\n",
		107	pcol->expected_pages);
		108	return -ENOMEM;
		109	}
		110
		111	static void pcol_free(struct page_collect *pcol)
		112	{
		113	bio_put(pcol->bio);
		114	pcol->bio = NULL;
		115	}
		116
		117	static int pcol_add_page(struct page_collect *pcol, struct page *page,
		118	unsigned len)
		119	{
		120	int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
		121	if (unlikely(len != added_len))
		122	return -ENOMEM;
		123
		124	++pcol->nr_pages;
		125	pcol->length += len;
		126	return 0;
		127	}
		128
		129	static int update_read_page(struct page *page, int ret)
		130	{
		131	if (ret == 0) {
		132	/* Everything is OK */
		133	SetPageUptodate(page);
		134	if (PageError(page))
		135	ClearPageError(page);
		136	} else if (ret == -EFAULT) {
		137	/* In this case we were trying to read something that wasn't on
		138	* disk yet - return a page full of zeroes. This should be OK,
		139	* because the object should be empty (if there was a write
		140	* before this read, the read would be waiting with the page
		141	* locked */
		142	clear_highpage(page);
		143
		144	SetPageUptodate(page);
		145	if (PageError(page))
		146	ClearPageError(page);
		147	ret = 0; /* recovered error */
		148	EXOFS_DBGMSG("recovered read error\n");
		149	} else /* Error */
		150	SetPageError(page);
		151
		152	return ret;
		153	}
		154
		155	static void update_write_page(struct page *page, int ret)
		156	{
		157	if (ret) {
		158	mapping_set_error(page->mapping, ret);
		159	SetPageError(page);
		160	}
		161	end_page_writeback(page);
		162	}
		163
		164	/* Called at the end of reads, to optionally unlock pages and update their
		165	* status.
		166	*/
		167	static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
		168	bool do_unlock)
		169	{
		170	struct bio_vec *bvec;
		171	int i;
		172	u64 resid;
		173	u64 good_bytes;
		174	u64 length = 0;
		175	int ret = exofs_check_ok_resid(or, &resid, NULL);
		176
		177	osd_end_request(or);
		178
		179	if (likely(!ret))
		180	good_bytes = pcol->length;
		181	else if (!resid)
		182	good_bytes = 0;
		183	else
		184	good_bytes = pcol->length - resid;
		185
		186	EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
		187	" length=0x%lx nr_pages=%u\n",
		188	pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
		189	pcol->nr_pages);
		190
		191	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
		192	struct page *page = bvec->bv_page;
		193	struct inode *inode = page->mapping->host;
		194	int page_stat;
		195
		196	if (inode != pcol->inode)
		197	continue; /* osd might add more pages at end */
		198
		199	if (likely(length < good_bytes))
		200	page_stat = 0;
		201	else
		202	page_stat = ret;
		203
		204	EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n",
		205	inode->i_ino, page->index,
		206	page_stat ? "bad_bytes" : "good_bytes");
		207
		208	ret = update_read_page(page, page_stat);
		209	if (do_unlock)
		210	unlock_page(page);
		211	length += bvec->bv_len;
		212	}
		213
		214	pcol_free(pcol);
		215	EXOFS_DBGMSG("readpages_done END\n");
		216	return ret;
		217	}
		218
		219	/* callback of async reads */
		220	static void readpages_done(struct osd_request *or, void *p)
		221	{
		222	struct page_collect *pcol = p;
		223
		224	__readpages_done(or, pcol, true);
		225	atomic_dec(&pcol->sbi->s_curr_pending);
		226	kfree(p);
		227	}
		228
		229	static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
		230	{
		231	struct bio_vec *bvec;
		232	int i;
		233
		234	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
		235	struct page *page = bvec->bv_page;
		236
		237	if (rw == READ)
		238	update_read_page(page, ret);
		239	else
		240	update_write_page(page, ret);
		241
		242	unlock_page(page);
		243	}
		244	pcol_free(pcol);
		245	}
		246
		247	static int read_exec(struct page_collect *pcol, bool is_sync)
		248	{
		249	struct exofs_i_info *oi = exofs_i(pcol->inode);
		250	struct osd_obj_id obj = {pcol->sbi->s_pid,
		251	pcol->inode->i_ino + EXOFS_OBJ_OFF};
		252	struct osd_request *or = NULL;
		253	struct page_collect *pcol_copy = NULL;
		254	loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
		255	int ret;
		256
		257	if (!pcol->bio)
		258	return 0;
		259
		260	/* see comment in _readpage() about sync reads */
		261	WARN_ON(is_sync && (pcol->nr_pages != 1));
		262
		263	or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
		264	if (unlikely(!or)) {
		265	ret = -ENOMEM;
		266	goto err;
		267	}
		268
		269	osd_req_read(or, &obj, pcol->bio, i_start);
		270
		271	if (is_sync) {
		272	exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
		273	return __readpages_done(or, pcol, false);
		274	}
		275
		276	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
		277	if (!pcol_copy) {
		278	ret = -ENOMEM;
		279	goto err;
		280	}
		281
		282	*pcol_copy = *pcol;
		283	ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
		284	if (unlikely(ret))
		285	goto err;
		286
		287	atomic_inc(&pcol->sbi->s_curr_pending);
		288
		289	EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
		290	obj.id, _LLU(i_start), pcol->length);
		291
		292	/* pages ownership was passed to pcol_copy */
		293	_pcol_reset(pcol);
		294	return 0;
		295
		296	err:
		297	if (!is_sync)
		298	_unlock_pcol_pages(pcol, ret, READ);
		299	kfree(pcol_copy);
		300	if (or)
		301	osd_end_request(or);
		302	return ret;
		303	}
		304
		305	/* readpage_strip is called either directly from readpage() or by the VFS from
		306	* within read_cache_pages(), to add one more page to be read. It will try to
		307	* collect as many contiguous pages as posible. If a discontinuity is
		308	* encountered, or it runs out of resources, it will submit the previous segment
		309	* and will start a new collection. Eventually caller must submit the last
		310	* segment if present.
		311	*/
		312	static int readpage_strip(void *data, struct page *page)
		313	{
		314	struct page_collect *pcol = data;
		315	struct inode *inode = pcol->inode;
		316	struct exofs_i_info *oi = exofs_i(inode);
		317	loff_t i_size = i_size_read(inode);
		318	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
		319	size_t len;
		320	int ret;
		321
		322	/* FIXME: Just for debugging, will be removed */
		323	if (PageUptodate(page))
		324	EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
		325	page->index);
		326
		327	if (page->index < end_index)
		328	len = PAGE_CACHE_SIZE;
		329	else if (page->index == end_index)
		330	len = i_size & ~PAGE_CACHE_MASK;
		331	else
		332	len = 0;
		333
		334	if (!len \|\| !obj_created(oi)) {
		335	/* this will be out of bounds, or doesn't exist yet.
		336	* Current page is cleared and the request is split
		337	*/
		338	clear_highpage(page);
		339
		340	SetPageUptodate(page);
		341	if (PageError(page))
		342	ClearPageError(page);
		343
		344	unlock_page(page);
		345	EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
		346	" splitting\n", inode->i_ino, page->index);
		347
		348	return read_exec(pcol, false);
		349	}
		350
		351	try_again:
		352
		353	if (unlikely(pcol->pg_first == -1)) {
		354	pcol->pg_first = page->index;
		355	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
		356	page->index)) {
		357	/* Discontinuity detected, split the request */
		358	ret = read_exec(pcol, false);
		359	if (unlikely(ret))
		360	goto fail;
		361	goto try_again;
		362	}
		363
		364	if (!pcol->bio) {
		365	ret = pcol_try_alloc(pcol);
		366	if (unlikely(ret))
		367	goto fail;
		368	}
		369
		370	if (len != PAGE_CACHE_SIZE)
		371	zero_user(page, len, PAGE_CACHE_SIZE - len);
		372
		373	EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
		374	inode->i_ino, page->index, len);
		375
		376	ret = pcol_add_page(pcol, page, len);
		377	if (ret) {
		378	EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
		379	"this_len=0x%zx nr_pages=%u length=0x%lx\n",
		380	page, len, pcol->nr_pages, pcol->length);
		381
		382	/* split the request, and start again with current page */
		383	ret = read_exec(pcol, false);
		384	if (unlikely(ret))
		385	goto fail;
		386
		387	goto try_again;
		388	}
		389
		390	return 0;
		391
		392	fail:
		393	/* SetPageError(page); ??? */
		394	unlock_page(page);
		395	return ret;
		396	}
		397
		398	static int exofs_readpages(struct file file, struct address_space mapping,
		399	struct list_head *pages, unsigned nr_pages)
		400	{
		401	struct page_collect pcol;
		402	int ret;
		403
		404	_pcol_init(&pcol, nr_pages, mapping->host);
		405
		406	ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
		407	if (ret) {
		408	EXOFS_ERR("read_cache_pages => %d\n", ret);
		409	return ret;
		410	}
		411
		412	return read_exec(&pcol, false);
		413	}
		414
		415	static int _readpage(struct page *page, bool is_sync)
		416	{
		417	struct page_collect pcol;
		418	int ret;
		419
		420	_pcol_init(&pcol, 1, page->mapping->host);
		421
		422	/* readpage_strip might call read_exec(,async) inside at several places
		423	* but this is safe for is_async=0 since read_exec will not do anything
		424	* when we have a single page.
		425	*/
		426	ret = readpage_strip(&pcol, page);
		427	if (ret) {
		428	EXOFS_ERR("_readpage => %d\n", ret);
		429	return ret;
		430	}
		431
		432	return read_exec(&pcol, is_sync);
		433	}
		434
		435	/*
		436	* We don't need the file
		437	*/
		438	static int exofs_readpage(struct file file, struct page page)
		439	{
		440	return _readpage(page, false);
		441	}
		442
		443	/* Callback for osd_write. All writes are asynchronouse */
		444	static void writepages_done(struct osd_request or, void p)
		445	{
		446	struct page_collect *pcol = p;
		447	struct bio_vec *bvec;
		448	int i;
		449	u64 resid;
		450	u64 good_bytes;
		451	u64 length = 0;
		452
		453	int ret = exofs_check_ok_resid(or, NULL, &resid);
		454
		455	osd_end_request(or);
		456	atomic_dec(&pcol->sbi->s_curr_pending);
		457
		458	if (likely(!ret))
		459	good_bytes = pcol->length;
		460	else if (!resid)
		461	good_bytes = 0;
		462	else
		463	good_bytes = pcol->length - resid;
		464
		465	EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
		466	" length=0x%lx nr_pages=%u\n",
		467	pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
		468	pcol->nr_pages);
		469
		470	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
		471	struct page *page = bvec->bv_page;
		472	struct inode *inode = page->mapping->host;
		473	int page_stat;
		474
		475	if (inode != pcol->inode)
		476	continue; /* osd might add more pages to a bio */
		477
		478	if (likely(length < good_bytes))
		479	page_stat = 0;
		480	else
		481	page_stat = ret;
		482
		483	update_write_page(page, page_stat);
		484	unlock_page(page);
		485	EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n",
		486	inode->i_ino, page->index, page_stat);
		487
		488	length += bvec->bv_len;
		489	}
		490
		491	pcol_free(pcol);
		492	kfree(pcol);
		493	EXOFS_DBGMSG("writepages_done END\n");
		494	}
		495
		496	static int write_exec(struct page_collect *pcol)
		497	{
		498	struct exofs_i_info *oi = exofs_i(pcol->inode);
		499	struct osd_obj_id obj = {pcol->sbi->s_pid,
		500	pcol->inode->i_ino + EXOFS_OBJ_OFF};
		501	struct osd_request *or = NULL;
		502	struct page_collect *pcol_copy = NULL;
		503	loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
		504	int ret;
		505
		506	if (!pcol->bio)
		507	return 0;
		508
		509	or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
		510	if (unlikely(!or)) {
		511	EXOFS_ERR("write_exec: Faild to osd_start_request()\n");
		512	ret = -ENOMEM;
		513	goto err;
		514	}
		515
		516	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
		517	if (!pcol_copy) {
		518	EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
		519	ret = -ENOMEM;
		520	goto err;
		521	}
		522
		523	pcol_copy = pcol;
		524
		525	osd_req_write(or, &obj, pcol_copy->bio, i_start);
		526	ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
		527	if (unlikely(ret)) {
		528	EXOFS_ERR("write_exec: exofs_async_op() Faild\n");
		529	goto err;
		530	}
		531
		532	atomic_inc(&pcol->sbi->s_curr_pending);
		533	EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
		534	pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
		535	pcol->length);
		536	/* pages ownership was passed to pcol_copy */
		537	_pcol_reset(pcol);
		538	return 0;
		539
		540	err:
		541	_unlock_pcol_pages(pcol, ret, WRITE);
		542	kfree(pcol_copy);
		543	if (or)
		544	osd_end_request(or);
		545	return ret;
		546	}
		547
		548	/* writepage_strip is called either directly from writepage() or by the VFS from
		549	* within write_cache_pages(), to add one more page to be written to storage.
		550	* It will try to collect as many contiguous pages as possible. If a
		551	* discontinuity is encountered or it runs out of resources it will submit the
		552	* previous segment and will start a new collection.
		553	* Eventually caller must submit the last segment if present.
		554	*/
		555	static int writepage_strip(struct page *page,
		556	struct writeback_control wbc_unused, void data)
		557	{
		558	struct page_collect *pcol = data;
		559	struct inode *inode = pcol->inode;
		560	struct exofs_i_info *oi = exofs_i(inode);
		561	loff_t i_size = i_size_read(inode);
		562	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
		563	size_t len;
		564	int ret;
		565
		566	BUG_ON(!PageLocked(page));
		567
		568	ret = wait_obj_created(oi);
		569	if (unlikely(ret))
		570	goto fail;
		571
		572	if (page->index < end_index)
		573	/* in this case, the page is within the limits of the file */
		574	len = PAGE_CACHE_SIZE;
		575	else {
		576	len = i_size & ~PAGE_CACHE_MASK;
		577
		578	if (page->index > end_index \|\| !len) {
		579	/* in this case, the page is outside the limits
		580	* (truncate in progress)
		581	*/
		582	ret = write_exec(pcol);
		583	if (unlikely(ret))
		584	goto fail;
		585	if (PageError(page))
		586	ClearPageError(page);
		587	unlock_page(page);
		588	return 0;
		589	}
		590	}
		591
		592	try_again:
		593
		594	if (unlikely(pcol->pg_first == -1)) {
		595	pcol->pg_first = page->index;
		596	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
		597	page->index)) {
		598	/* Discontinuity detected, split the request */
		599	ret = write_exec(pcol);
		600	if (unlikely(ret))
		601	goto fail;
		602	goto try_again;
		603	}
		604
		605	if (!pcol->bio) {
		606	ret = pcol_try_alloc(pcol);
		607	if (unlikely(ret))
		608	goto fail;
		609	}
		610
		611	EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
		612	inode->i_ino, page->index, len);
		613
		614	ret = pcol_add_page(pcol, page, len);
		615	if (unlikely(ret)) {
		616	EXOFS_DBGMSG("Failed pcol_add_page "
		617	"nr_pages=%u total_length=0x%lx\n",
		618	pcol->nr_pages, pcol->length);
		619
		620	/* split the request, next loop will start again */
		621	ret = write_exec(pcol);
		622	if (unlikely(ret)) {
		623	EXOFS_DBGMSG("write_exec faild => %d", ret);
		624	goto fail;
		625	}
		626
		627	goto try_again;
		628	}
		629
		630	BUG_ON(PageWriteback(page));
		631	set_page_writeback(page);
		632
		633	return 0;
		634
		635	fail:
		636	set_bit(AS_EIO, &page->mapping->flags);
		637	unlock_page(page);
		638	return ret;
		639	}
		640
		641	static int exofs_writepages(struct address_space *mapping,
		642	struct writeback_control *wbc)
		643	{
		644	struct page_collect pcol;
		645	long start, end, expected_pages;
		646	int ret;
		647
		648	start = wbc->range_start >> PAGE_CACHE_SHIFT;
		649	end = (wbc->range_end == LLONG_MAX) ?
		650	start + mapping->nrpages :
		651	wbc->range_end >> PAGE_CACHE_SHIFT;
		652
		653	if (start \|\| end)
		654	expected_pages = min(end - start + 1, 32L);
		655	else
		656	expected_pages = mapping->nrpages;
		657
		658	EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
		659	" m->nrpages=%lu start=0x%lx end=0x%lx\n",
		660	mapping->host->i_ino, wbc->range_start, wbc->range_end,
		661	mapping->nrpages, start, end);
		662
		663	_pcol_init(&pcol, expected_pages, mapping->host);
		664
		665	ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
		666	if (ret) {
		667	EXOFS_ERR("write_cache_pages => %d\n", ret);
		668	return ret;
		669	}
		670
		671	return write_exec(&pcol);
		672	}
		673
		674	static int exofs_writepage(struct page page, struct writeback_control wbc)
		675	{
		676	struct page_collect pcol;
		677	int ret;
		678
		679	_pcol_init(&pcol, 1, page->mapping->host);
		680
		681	ret = writepage_strip(page, NULL, &pcol);
		682	if (ret) {
		683	EXOFS_ERR("exofs_writepage => %d\n", ret);
		684	return ret;
		685	}
		686
		687	return write_exec(&pcol);
		688	}
		689
		690	int exofs_write_begin(struct file file, struct address_space mapping,
		691	loff_t pos, unsigned len, unsigned flags,
		692	struct page pagep, void fsdata)
		693	{
		694	int ret = 0;
		695	struct page *page;
		696
		697	page = *pagep;
		698	if (page == NULL) {
		699	ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
		700	fsdata);
		701	if (ret) {
		702	EXOFS_DBGMSG("simple_write_begin faild\n");
		703	return ret;
		704	}
		705
		706	page = *pagep;
		707	}
		708
		709	/* read modify write */
		710	if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
		711	ret = _readpage(page, true);
		712	if (ret) {
		713	/SetPageError was done by _readpage. Is it ok?/
		714	unlock_page(page);
		715	EXOFS_DBGMSG("__readpage_filler faild\n");
		716	}
		717	}
		718
		719	return ret;
		720	}
		721
		722	static int exofs_write_begin_export(struct file *file,
		723	struct address_space *mapping,
		724	loff_t pos, unsigned len, unsigned flags,
		725	struct page pagep, void fsdata)
		726	{
		727	*pagep = NULL;
		728
		729	return exofs_write_begin(file, mapping, pos, len, flags, pagep,
		730	fsdata);
		731	}
		732
		733	const struct address_space_operations exofs_aops = {
		734	.readpage = exofs_readpage,
		735	.readpages = exofs_readpages,
		736	.writepage = exofs_writepage,
		737	.writepages = exofs_writepages,
		738	.write_begin = exofs_write_begin_export,
		739	.write_end = simple_write_end,
		740	};
		741
/******************************************************************************
 * INODE OPERATIONS
 *****************************************************************************/