ocfs2: teach ocfs2_file_aio_write() about sparse files

Unfortunately, ocfs2 can no longer make use of generic_file_aio_write_nolock() because allocating writes will require zeroing of pages adjacent to the I/O for cluster sizes greater than page size. Implement a custom file write here, which can order page locks for zeroing. This also has the advantage that cluster locks can easily be ordered outside of the page locks. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
author: Mark Fasheh <mark.fasheh@oracle.com> 2007-02-09 23:24:12 -0500
committer: Mark Fasheh <mark.fasheh@oracle.com> 2007-04-26 18:02:08 -0400
commit: 9517bac6cc7a7aa4fee63cb38a32cb6014e264c7 (patch)
tree: 3cac0c18d0cacc316e0e8a60f483282d6f991779 /fs/ocfs2/aops.c
parent: 89488984ac23b0580f959b9ee549f2fcb1c2f194 (diff)
1 files changed, 663 insertions, 16 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f3b0cc5cba1a..5ffb3702b5e9 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -24,6 +24,7 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <asm/byteorder.h>
+#include <linux/swap.h>
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -37,6 +38,7 @@
 #include "file.h"
 #include "inode.h"
 #include "journal.h"
+#include "suballoc.h"
 #include "super.h"
 #include "symlink.h"
@@ -645,23 +647,27 @@ static ssize_t ocfs2_direct_IO(int rw,
        mlog_entry_void();
-        /*
+        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
-         * We get PR data locks even for O_DIRECT.  This allows
+                /*
-         * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
+                 * We get PR data locks even for O_DIRECT.  This
-         * extending and buffered zeroing writes race.  If they did
+                 * allows concurrent O_DIRECT I/O but doesn't let
-         * race then the buffered zeroing could be written back after
+                 * O_DIRECT with extending and buffered zeroing writes
-         * the O_DIRECT I/O.  It's one thing to tell people not to mix
+                 * race.  If they did race then the buffered zeroing
-         * buffered and O_DIRECT writes, but expecting them to
+                 * could be written back after the O_DIRECT I/O.  It's
-         * understand that file extension is also an implicit buffered
+                 * one thing to tell people not to mix buffered and
-         * write is too much.  By getting the PR we force writeback of
+                 * O_DIRECT writes, but expecting them to understand
-         * the buffered zeroing before proceeding.
+                 * that file extension is also an implicit buffered
-         */
+                 * write is too much.  By getting the PR we force
-        ret = ocfs2_data_lock(inode, 0);
+                 * writeback of the buffered zeroing before
-        if (ret < 0) {
+                 * proceeding.
-                mlog_errno(ret);
+                 */
-                goto out;
+                ret = ocfs2_data_lock(inode, 0);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                ocfs2_data_unlock(inode, 0);
        }
-        ocfs2_data_unlock(inode, 0);
        ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
                                            inode->i_sb->s_bdev, iov, offset,
@@ -673,6 +679,647 @@ out:
        return ret;
 }
+/*
+ * Work out which byte range of a page is covered by logical cluster
+ * 'cpos'.
+ *
+ * When a cluster is at least as large as a page, the range is the
+ * whole page.  When several clusters fit inside one page, the range
+ * is the cluster-sized slot selected by cpos modulo the number of
+ * clusters per page.
+ *
+ * Either output pointer may be NULL if the caller has no use for it.
+ */
+static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
+                                            u32 cpos,
+                                            unsigned int *start,
+                                            unsigned int *end)
+{
+        unsigned int lo = 0;
+        unsigned int hi = PAGE_CACHE_SIZE;
+        if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
+                unsigned int clusters_per_page =
+                        1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
+                /* Slot index within the page, scaled up to a byte offset. */
+                lo = cpos % clusters_per_page;
+                lo = lo << osb->s_clustersize_bits;
+                hi = lo + osb->s_clustersize;
+        }
+        /* Neither boundary may ever fall outside the page. */
+        BUG_ON(lo > PAGE_SIZE);
+        BUG_ON(hi > PAGE_SIZE);
+        if (start != NULL)
+                *start = lo;
+        if (end != NULL)
+                *end = hi;
+}
+/*
+ * Zero the parts of a page's cluster region that will not receive
+ * data.
+ *
+ * [from, to) is the byte range inside the page that must be left
+ * alone.  Only bytes inside the cluster boundaries computed by
+ * ocfs2_figure_cluster_boundaries() are ever touched, so with
+ * pagesize > clustersize neighbouring clusters in the same page are
+ * not zeroed.
+ *
+ * Passing from == to == 0 means "zero the entire cluster region".
+ */
+static void ocfs2_clear_page_regions(struct page *page,
+                                     struct ocfs2_super *osb, u32 cpos,
+                                     unsigned from, unsigned to)
+{
+        void *vaddr;
+        unsigned int lo, hi;
+        ocfs2_figure_cluster_boundaries(osb, cpos, &lo, &hi);
+        vaddr = kmap_atomic(page, KM_USER0);
+        if (!from && !to) {
+                /* Nothing to preserve - wipe the whole cluster region. */
+                memset(vaddr + lo, 0, hi - lo);
+        } else {
+                /* Head of the cluster region, in front of the kept range. */
+                if (lo < from)
+                        memset(vaddr + lo, 0, from - lo);
+                /* Tail of the cluster region, behind the kept range. */
+                if (hi > to)
+                        memset(vaddr + to, 0, hi - to);
+        }
+        kunmap_atomic(vaddr, KM_USER0);
+}
+/*
+ * Some of this taken from block_prepare_write(). We already have our
+ * mapping by now though, and the entire write will be allocating or
+ * it won't, so not much need to use BH_New.
+ *
+ * This will also skip zeroing, which is handled externally.
+ *
+ * Buffers are attached to the page if it has none. Every buffer that
+ * overlaps the byte range [from, to) of the page is then mapped to
+ * consecutive disk blocks starting at *p_blkno (unless it is already
+ * mapped), and *p_blkno is advanced by one for each such buffer.
+ *
+ * A buffer that is only partially covered by [from, to) and is not
+ * up to date is read from disk first. Only the buffer containing
+ * 'from' and the buffer containing 'to' can be partially covered,
+ * which is why wait[] needs just two slots.
+ *
+ * Returns 0 on success or -EIO if one of those reads failed. When
+ * 'new' is set and a read failed, the buffers overlapping [from, to)
+ * are zeroed, marked up to date and dirtied before returning.
+ */
+static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+                                 struct inode *inode, unsigned int from,
+                                 unsigned int to, int new)
+{
+        int ret = 0;
+        struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
+        unsigned int block_end, block_start;
+        unsigned int bsize = 1 << inode->i_blkbits;
+        if (!page_has_buffers(page))
+                create_empty_buffers(page, bsize, 0);
+        head = page_buffers(page);
+        for (bh = head, block_start = 0; bh != head || !block_start;
+             bh = bh->b_this_page, block_start += bsize) {
+                block_end = block_start + bsize;
+                /*
+                 * Ignore blocks outside of our i/o range -
+                 * they may belong to unallocated clusters.
+                 *
+                 * *p_blkno is deliberately not advanced for these:
+                 * it only counts blocks inside [from, to).
+                 */
+                if (block_start >= to ||
+                    (block_start + bsize) <= from) {
+                        if (PageUptodate(page))
+                                set_buffer_uptodate(bh);
+                        continue;
+                }
+                /*
+                 * For an allocating write with cluster size >= page
+                 * size, we always write the entire page.
+                 */
+                if (buffer_new(bh))
+                        clear_buffer_new(bh);
+                if (!buffer_mapped(bh)) {
+                        map_bh(bh, inode->i_sb, *p_blkno);
+                        unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+                }
+                if (PageUptodate(page)) {
+                        if (!buffer_uptodate(bh))
+                                set_buffer_uptodate(bh);
+                } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
+                     (block_start < from || block_end > to)) {
+                        ll_rw_block(READ, 1, &bh);
+                        *wait_bh++=bh;
+                }
+                *p_blkno = *p_blkno + 1;
+        }
+        /*
+         * If we issued read requests - let them complete.
+         *
+         * Every queued buffer is waited on even after a failure, so
+         * no read is still in flight when we return.
+         */
+        while(wait_bh > wait) {
+                wait_on_buffer(*--wait_bh);
+                if (!buffer_uptodate(*wait_bh))
+                        ret = -EIO;
+        }
+        if (ret == 0 || !new)
+                return ret;
+        /*
+         * If we get -EIO above, zero out any newly allocated blocks
+         * to avoid exposing stale data.
+         *
+         * Only buffers overlapping [from, to) are zeroed; the error
+         * is still returned to the caller afterwards.
+         */
+        bh = head;
+        block_start = 0;
+        do {
+                void *kaddr;
+                block_end = block_start + bsize;
+                if (block_end <= from)
+                        goto next_bh;
+                if (block_start >= to)
+                        break;
+                kaddr = kmap_atomic(page, KM_USER0);
+                memset(kaddr+block_start, 0, bh->b_size);
+                flush_dcache_page(page);
+                kunmap_atomic(kaddr, KM_USER0);
+                set_buffer_uptodate(bh);
+                mark_buffer_dirty(bh);
+next_bh:
+                block_start = block_end;
+                bh = bh->b_this_page;
+        } while (bh != head);
+        return ret;
+}
+/*
+ * Page writer callback for buffered writes: copy one chunk of the
+ * source data described by the buffered write context (wc->w_private)
+ * into wc->w_this_page.
+ *
+ * The chunk starts at wc->w_pos and is trimmed so that it stays within
+ * the source page, the current iovec, the remaining byte count
+ * (wc->w_count) and the destination page - or the destination cluster
+ * when one page holds several clusters (wc->w_large_pages).
+ *
+ * Before copying, the page's buffers are mapped to disk blocks starting
+ * at *p_blkno; ocfs2_map_page_blocks() advances *p_blkno as it goes.
+ * A new page (wc->w_this_page_new) is mapped over its whole cluster
+ * region, an existing page only over the bytes being written.
+ *
+ * On success the written byte range within the page is stored in
+ * *ret_from / *ret_to, wc->w_finished_copy is set and the number of
+ * bytes copied is returned.
+ *
+ * If mapping the page fails, nothing is copied, *ret_from / *ret_to and
+ * wc->w_finished_copy are left untouched and the negative error code is
+ * returned.
+ */
+int ocfs2_map_and_write_user_data(struct inode *inode,
+                                  struct ocfs2_write_ctxt *wc, u64 *p_blkno,
+                                  unsigned int *ret_from, unsigned int *ret_to)
+{
+        int ret;
+        unsigned int to, from, cluster_start, cluster_end;
+        unsigned long bytes, src_from;
+        char *dst;
+        struct ocfs2_buffered_write_priv *bp = wc->w_private;
+        const struct iovec *cur_iov = bp->b_cur_iov;
+        char __user *buf;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
+                                        &cluster_end);
+        buf = cur_iov->iov_base + bp->b_cur_off;
+        src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
+        from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
+        /*
+         * This is a lot of comparisons, but it reads quite
+         * easily, which is important here.
+         */
+        /* Stay within the src page */
+        bytes = PAGE_SIZE - src_from;
+        /* Stay within the vector */
+        bytes = min(bytes,
+                    (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
+        /* Stay within count */
+        bytes = min(bytes, (unsigned long)wc->w_count);
+        if (wc->w_large_pages) {
+                /*
+                 * For cluster size < page size, we have to
+                 * calculate pos within the cluster and obey
+                 * the rightmost boundary.
+                 */
+                bytes = min(bytes, (unsigned long)(osb->s_clustersize
+                                   - (wc->w_pos & (osb->s_clustersize - 1))));
+        } else {
+                /*
+                 * cluster size > page size is the most common
+                 * case - we just stay within the target page
+                 * boundary.
+                 */
+                bytes = min(bytes, PAGE_CACHE_SIZE - from);
+        }
+        to = from + bytes;
+        if (wc->w_this_page_new)
+                ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+                                            cluster_start, cluster_end, 1);
+        else
+                ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+                                            from, to, 0);
+        if (ret) {
+                /*
+                 * Report the failure itself. 'bytes' is already
+                 * non-zero here, so returning it would tell the
+                 * caller that data was copied when it was not.
+                 */
+                mlog_errno(ret);
+                return ret;
+        }
+        BUG_ON(from > PAGE_CACHE_SIZE);
+        BUG_ON(to > PAGE_CACHE_SIZE);
+        BUG_ON(from > osb->s_clustersize);
+        BUG_ON(to > osb->s_clustersize);
+        dst = kmap(wc->w_this_page);
+        memcpy(dst + from, bp->b_src_buf + src_from, bytes);
+        kunmap(wc->w_this_page);
+        /*
+         * XXX: This is slow, but simple. The caller of
+         * ocfs2_buffered_write_cluster() is responsible for
+         * passing through the iovecs, so it's difficult to
+         * predict what our next step is in here after our
+         * initial write. A future version should be pushing
+         * that iovec manipulation further down.
+         *
+         * By setting this, we indicate that a copy from user
+         * data was done, and subsequent calls for this
+         * cluster will skip copying more data.
+         */
+        wc->w_finished_copy = 1;
+        *ret_from = from;
+        *ret_to = to;
+        /* 'bytes' never exceeds PAGE_SIZE, so it always fits in an int. */
+        return (int)bytes;
+}
+/*
+ * Map, fill and write a page to disk.
+ *
+ * The work of copying data is done via callback.  Newly allocated
+ * pages which don't take user data will be zero'd (set 'new' to
+ * indicate an allocating write)
+ *
+ * The callback (wc->w_write_data_page) is only invoked for the page
+ * that contains wc->w_pos, and only while wc->w_finished_copy is
+ * still clear. Any other page must belong to a new cluster and is
+ * mapped over its whole cluster region without taking user data.
+ *
+ * *p_blkno is the disk block for the first buffer to be mapped in
+ * this page; it is advanced by the mapping code.
+ *
+ * Returns a negative error code or the number of bytes copied into
+ * the page.
+ *
+ * Once bytes have been copied that count is returned even if the
+ * later journalling or commit step reported an error; those errors
+ * are only logged. For a page that took no user data, the return
+ * value is whatever block_commit_write() returned.
+ */
+int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
+                          u64 *p_blkno, struct page *page,
+                          struct ocfs2_write_ctxt *wc, int new)
+{
+        int ret, copied = 0;
+        unsigned int from = 0, to = 0;
+        unsigned int cluster_start, cluster_end;
+        unsigned int zero_from = 0, zero_to = 0;
+        ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
+                                        &cluster_start, &cluster_end);
+        if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
+            && !wc->w_finished_copy) {
+                wc->w_this_page = page;
+                wc->w_this_page_new = new;
+                ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                copied = ret;
+                zero_from = from;
+                zero_to = to;
+                if (new) {
+                        from = cluster_start;
+                        to = cluster_end;
+                }
+        } else {
+                /*
+                 * If we haven't allocated the new page yet, we
+                 * shouldn't be writing it out without copying user
+                 * data. This is likely a math error from the caller.
+                 */
+                BUG_ON(!new);
+                from = cluster_start;
+                to = cluster_end;
+                ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+                                            cluster_start, cluster_end, 1);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        /*
+         * Parts of newly allocated pages need to be zero'd.
+         *
+         * Above, we have also rewritten 'to' and 'from' - as far as
+         * the rest of the function is concerned, the entire cluster
+         * range inside of a page needs to be written.
+         *
+         * We can skip this if the page is up to date - it's already
+         * been zero'd from being read in as a hole.
+         *
+         * zero_from / zero_to still hold the range the callback
+         * filled with user data, so that range is preserved. They
+         * are both 0 when no data was copied, which makes
+         * ocfs2_clear_page_regions() zero the whole cluster region.
+         */
+        if (new && !PageUptodate(page))
+                ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
+                                         wc->w_cpos, zero_from, zero_to);
+        flush_dcache_page(page);
+        if (ocfs2_should_order_data(inode)) {
+                ret = walk_page_buffers(handle,
+                                        page_buffers(page),
+                                        from, to, NULL,
+                                        ocfs2_journal_dirty_data);
+                if (ret < 0)
+                        mlog_errno(ret);
+        }
+        /*
+         * We don't use generic_commit_write() because we need to
+         * handle our own i_size update.
+         */
+        ret = block_commit_write(page, from, to);
+        if (ret)
+                mlog_errno(ret);
+out:
+        return copied ? copied : ret;
+}
+/*
+ * Do the actual write of some data into an inode. Optionally allocate
+ * in order to fulfill the write.
+ *
+ * cpos is the logical cluster offset within the file to write at
+ *
+ * 'phys' is the physical mapping of that offset. a 'phys' value of
+ * zero indicates that allocation is required. In this case, data_ac
+ * and meta_ac should be valid (meta_ac can be null if metadata
+ * allocation isn't required).
+ *
+ * All page cache pages touched by the write are grabbed (and thereby
+ * locked) before the extent is added, and are unlocked and released
+ * again before returning, on success as well as on error.
+ *
+ * Returns the number of bytes copied into the page cache if any were
+ * copied, otherwise 0 or a negative error code.
+ */
+static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
+                           struct buffer_head *di_bh,
+                           struct ocfs2_alloc_context *data_ac,
+                           struct ocfs2_alloc_context *meta_ac,
+                           struct ocfs2_write_ctxt *wc)
+{
+        int ret, i, numpages = 1, new;
+        unsigned int copied = 0;
+        u32 tmp_pos;
+        u64 v_blkno, p_blkno;
+        struct address_space *mapping = file->f_mapping;
+        struct inode *inode = mapping->host;
+        unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+        unsigned long index, start;
+        struct page **cpages;
+        new = phys == 0 ? 1 : 0;
+        /*
+         * Figure out how many pages we'll be manipulating here. For
+         * non-allocating write, or any writes where cluster size is
+         * less than page size, we only need one page. Otherwise,
+         * allocating writes of cluster size larger than page size
+         * need cluster size pages.
+         */
+        if (new && !wc->w_large_pages)
+                numpages = (1 << cbits) / PAGE_SIZE;
+        cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
+        if (!cpages) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                return ret;
+        }
+        /*
+         * Fill our page array first. That way we've grabbed enough so
+         * that we can zero and flush if we error after adding the
+         * extent.
+         */
+        if (new) {
+                start = ocfs2_align_clusters_to_page_index(inode->i_sb,
+                                                           wc->w_cpos);
+                v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
+        } else {
+                start = wc->w_pos >> PAGE_CACHE_SHIFT;
+                v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
+        }
+        for(i = 0; i < numpages; i++) {
+                index = start + i;
+                cpages[i] = grab_cache_page(mapping, index);
+                if (!cpages[i]) {
+                        ret = -ENOMEM;
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        if (new) {
+                /*
+                 * This is safe to call with the page locks - it won't take
+                 * any additional semaphores or cluster locks.
+                 */
+                tmp_pos = wc->w_cpos;
+                ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
+                                                 &tmp_pos, 1, di_bh, handle,
+                                                 data_ac, meta_ac, NULL);
+                /*
+                 * This shouldn't happen because we must have already
+                 * calculated the correct meta data allocation required. The
+                 * internal tree allocation code should know how to increase
+                 * transaction credits itself.
+                 *
+                 * If need be, we could handle -EAGAIN for a
+                 * RESTART_TRANS here.
+                 */
+                mlog_bug_on_msg(ret == -EAGAIN,
+                                "Inode %llu: EAGAIN return during allocation.\n",
+                                (unsigned long long)OCFS2_I(inode)->ip_blkno);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL);
+        if (ret < 0) {
+                /*
+                 * XXX: Should we go readonly here?
+                 */
+                mlog_errno(ret);
+                goto out;
+        }
+        BUG_ON(p_blkno == 0);
+        for(i = 0; i < numpages; i++) {
+                ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
+                                            wc, new);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                copied += ret;
+        }
+out:
+        for(i = 0; i < numpages; i++) {
+                /*
+                 * grab_cache_page() may have failed part way through
+                 * filling the array. kzalloc() left the remaining
+                 * slots NULL, and those must not be unlocked or
+                 * released.
+                 */
+                if (cpages[i] == NULL)
+                        continue;
+                unlock_page(cpages[i]);
+                mark_page_accessed(cpages[i]);
+                page_cache_release(cpages[i]);
+        }
+        kfree(cpages);
+        return copied ? copied : ret;
+}
+/*
+ * Prepare a write context for a write of 'count' bytes at file offset
+ * 'pos'.
+ *
+ * Records the page writer callback 'cb' with its private data, derives
+ * the logical cluster of 'pos', clears the "copy finished" flag and
+ * notes whether a page is larger than a cluster on this file system.
+ */
+static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
+                                  struct ocfs2_super *osb, loff_t pos,
+                                  size_t count, ocfs2_page_writer *cb,
+                                  void *cb_priv)
+{
+        /* Who copies the data, and what it gets handed. */
+        wc->w_write_data_page = cb;
+        wc->w_private = cb_priv;
+        /* Where the write goes and how long it is. */
+        wc->w_pos = pos;
+        wc->w_count = count;
+        wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
+        wc->w_finished_copy = 0;
+        /* Set only when more than one cluster fits into a page. */
+        wc->w_large_pages = 0;
+        if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+                wc->w_large_pages = 1;
+}
+/*
+ * Write a cluster to an inode. The cluster may not be allocated yet,
+ * in which case it will be. This only exists for buffered writes -
+ * O_DIRECT takes a more "traditional" path through the kernel.
+ *
+ * The caller is responsible for incrementing pos, written counts, etc
+ *
+ * For file systems that don't support sparse files, pre-allocation
+ * and page zeroing up until cpos should be done prior to this
+ * function call.
+ *
+ * Callers should be holding i_sem, and the rw cluster lock.
+ *
+ * 'actor' is the page writer callback that copies the data into the
+ * page; 'priv' is handed to it through the write context.
+ *
+ * Resources are taken in this order: meta lock, ip_alloc_sem (for
+ * writing), allocators (only if the cluster is not allocated yet),
+ * data lock, journal transaction. The labels at the bottom of the
+ * function undo them in reverse.
+ *
+ * After a successful write the in-memory inode and the on-disk dinode
+ * get their size (if the write went past i_size), block count and
+ * mtime/ctime updated, and the dinode buffer is journalled.
+ *
+ * Returns the number of user bytes written, or less than zero for
+ * error.
+ *
+ * A non-zero byte count from ocfs2_write() is returned even if the
+ * inode update that follows it failed; that failure is only logged.
+ */
+ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
+                                     size_t count, ocfs2_page_writer *actor,
+                                     void *priv)
+{
+        int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+        ssize_t written = 0;
+        u32 phys;
+        struct inode *inode = file->f_mapping->host;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct buffer_head *di_bh = NULL;
+        struct ocfs2_dinode *di;
+        struct ocfs2_alloc_context *data_ac = NULL;
+        struct ocfs2_alloc_context *meta_ac = NULL;
+        handle_t *handle;
+        struct ocfs2_write_ctxt wc;
+        ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
+        ret = ocfs2_meta_lock(inode, &di_bh, 1);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        di = (struct ocfs2_dinode *)di_bh->b_data;
+        /*
+         * Take alloc sem here to prevent concurrent lookups. That way
+         * the mapping, zeroing and tree manipulation within
+         * ocfs2_write() will be safe against ->readpage(). This
+         * should also serve to lock out allocation from a shared
+         * writeable region.
+         */
+        down_write(&OCFS2_I(inode)->ip_alloc_sem);
+        ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_meta;
+        }
+        /* phys == 0 means that allocation is required. */
+        if (phys == 0) {
+                ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out_meta;
+                }
+                credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
+        }
+        ret = ocfs2_data_lock(inode, 1);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_meta;
+        }
+        handle = ocfs2_start_trans(osb, credits);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto out_data;
+        }
+        written = ocfs2_write(file, phys, handle, di_bh, data_ac,
+                              meta_ac, &wc);
+        if (written < 0) {
+                ret = written;
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        ret = ocfs2_journal_access(handle, inode, di_bh,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        pos += written;
+        if (pos > inode->i_size) {
+                i_size_write(inode, pos);
+                mark_inode_dirty(inode);
+        }
+        inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
+        di->i_size = cpu_to_le64((u64)i_size_read(inode));
+        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+        di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+        di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+        ret = ocfs2_journal_dirty(handle, di_bh);
+        if (ret)
+                mlog_errno(ret);
+out_commit:
+        ocfs2_commit_trans(osb, handle);
+out_data:
+        ocfs2_data_unlock(inode, 1);
+out_meta:
+        up_write(&OCFS2_I(inode)->ip_alloc_sem);
+        ocfs2_meta_unlock(inode, 1);
+out:
+        brelse(di_bh);
+        if (data_ac)
+                ocfs2_free_alloc_context(data_ac);
+        if (meta_ac)
+                ocfs2_free_alloc_context(meta_ac);
+        return written ? written : ret;
+}
 const struct address_space_operations ocfs2_aops = {
        .readpage       = ocfs2_readpage,
        .writepage      = ocfs2_writepage,
author	Mark Fasheh <mark.fasheh@oracle.com>	2007-02-09 23:24:12 -0500
committer	Mark Fasheh <mark.fasheh@oracle.com>	2007-04-26 18:02:08 -0400
commit	9517bac6cc7a7aa4fee63cb38a32cb6014e264c7 (patch)
tree	3cac0c18d0cacc316e0e8a60f483282d6f991779 /fs/ocfs2/aops.c
parent	89488984ac23b0580f959b9ee549f2fcb1c2f194 (diff)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f3b0cc5cba1a..5ffb3702b5e9 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c
@@ -24,6 +24,7 @@
24	#include <linux/highmem.h>	24	#include <linux/highmem.h>
25	#include <linux/pagemap.h>	25	#include <linux/pagemap.h>
26	#include <asm/byteorder.h>	26	#include <asm/byteorder.h>
		27	#include <linux/swap.h>
27		28
28	#define MLOG_MASK_PREFIX ML_FILE_IO	29	#define MLOG_MASK_PREFIX ML_FILE_IO
29	#include <cluster/masklog.h>	30	#include <cluster/masklog.h>
@@ -37,6 +38,7 @@
37	#include "file.h"	38	#include "file.h"
38	#include "inode.h"	39	#include "inode.h"
39	#include "journal.h"	40	#include "journal.h"
		41	#include "suballoc.h"
40	#include "super.h"	42	#include "super.h"
41	#include "symlink.h"	43	#include "symlink.h"
42		44
@@ -645,23 +647,27 @@ static ssize_t ocfs2_direct_IO(int rw,
645		647
646	mlog_entry_void();	648	mlog_entry_void();
647		649
648	/*	650	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
649	* We get PR data locks even for O_DIRECT. This allows	651	/*
650	* concurrent O_DIRECT I/O but doesn't let O_DIRECT with	652	* We get PR data locks even for O_DIRECT. This
651	* extending and buffered zeroing writes race. If they did	653	* allows concurrent O_DIRECT I/O but doesn't let
652	* race then the buffered zeroing could be written back after	654	* O_DIRECT with extending and buffered zeroing writes
653	* the O_DIRECT I/O. It's one thing to tell people not to mix	655	* race. If they did race then the buffered zeroing
654	* buffered and O_DIRECT writes, but expecting them to	656	* could be written back after the O_DIRECT I/O. It's
655	* understand that file extension is also an implicit buffered	657	* one thing to tell people not to mix buffered and
656	* write is too much. By getting the PR we force writeback of	658	* O_DIRECT writes, but expecting them to understand
657	* the buffered zeroing before proceeding.	659	* that file extension is also an implicit buffered
658	*/	660	* write is too much. By getting the PR we force
659	ret = ocfs2_data_lock(inode, 0);	661	* writeback of the buffered zeroing before
660	if (ret < 0) {	662	* proceeding.
661	mlog_errno(ret);	663	*/
662	goto out;	664	ret = ocfs2_data_lock(inode, 0);
		665	if (ret < 0) {
		666	mlog_errno(ret);
		667	goto out;
		668	}
		669	ocfs2_data_unlock(inode, 0);
663	}	670	}
664	ocfs2_data_unlock(inode, 0);
665		671
666	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,	672	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
667	inode->i_sb->s_bdev, iov, offset,	673	inode->i_sb->s_bdev, iov, offset,
@@ -673,6 +679,647 @@ out:
673	return ret;	679	return ret;
674	}	680	}
675		681
		682	static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
		683	u32 cpos,
		684	unsigned int *start,
		685	unsigned int *end)
		686	{
		687	unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
		688
		689	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
		690	unsigned int cpp;
		691
		692	cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
		693
		694	cluster_start = cpos % cpp;
		695	cluster_start = cluster_start << osb->s_clustersize_bits;
		696
		697	cluster_end = cluster_start + osb->s_clustersize;
		698	}
		699
		700	BUG_ON(cluster_start > PAGE_SIZE);
		701	BUG_ON(cluster_end > PAGE_SIZE);
		702
		703	if (start)
		704	*start = cluster_start;
		705	if (end)
		706	*end = cluster_end;
		707	}
		708
		709	/*
		710	* 'from' and 'to' are the region in the page to avoid zeroing.
		711	*
		712	* If pagesize > clustersize, this function will avoid zeroing outside
		713	* of the cluster boundary.
		714	*
		715	* from == to == 0 is code for "zero the entire cluster region"
		716	*/
		717	static void ocfs2_clear_page_regions(struct page *page,
		718	struct ocfs2_super *osb, u32 cpos,
		719	unsigned from, unsigned to)
		720	{
		721	void *kaddr;
		722	unsigned int cluster_start, cluster_end;
		723
		724	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
		725
		726	kaddr = kmap_atomic(page, KM_USER0);
		727
		728	if (from \|\| to) {
		729	if (from > cluster_start)
		730	memset(kaddr + cluster_start, 0, from - cluster_start);
		731	if (to < cluster_end)
		732	memset(kaddr + to, 0, cluster_end - to);
		733	} else {
		734	memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
		735	}
		736
		737	kunmap_atomic(kaddr, KM_USER0);
		738	}
		739
		740	/*
		741	* Some of this taken from block_prepare_write(). We already have our
		742	* mapping by now though, and the entire write will be allocating or
		743	* it won't, so not much need to use BH_New.
		744	*
		745	* This will also skip zeroing, which is handled externally.
		746	*/
		747	static int ocfs2_map_page_blocks(struct page page, u64 p_blkno,
		748	struct inode *inode, unsigned int from,
		749	unsigned int to, int new)
		750	{
		751	int ret = 0;
		752	struct buffer_head head, bh, wait[2], *wait_bh = wait;
		753	unsigned int block_end, block_start;
		754	unsigned int bsize = 1 << inode->i_blkbits;
		755
		756	if (!page_has_buffers(page))
		757	create_empty_buffers(page, bsize, 0);
		758
		759	head = page_buffers(page);
		760	for (bh = head, block_start = 0; bh != head \|\| !block_start;
		761	bh = bh->b_this_page, block_start += bsize) {
		762	block_end = block_start + bsize;
		763
		764	/*
		765	* Ignore blocks outside of our i/o range -
		766	* they may belong to unallocated clusters.
		767	*/
		768	if (block_start >= to \|\|
		769	(block_start + bsize) <= from) {
		770	if (PageUptodate(page))
		771	set_buffer_uptodate(bh);
		772	continue;
		773	}
		774
		775	/*
		776	* For an allocating write with cluster size >= page
		777	* size, we always write the entire page.
		778	*/
		779
		780	if (buffer_new(bh))
		781	clear_buffer_new(bh);
		782
		783	if (!buffer_mapped(bh)) {
		784	map_bh(bh, inode->i_sb, *p_blkno);
		785	unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
		786	}
		787
		788	if (PageUptodate(page)) {
		789	if (!buffer_uptodate(bh))
		790	set_buffer_uptodate(bh);
		791	} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		792	(block_start < from \|\| block_end > to)) {
		793	ll_rw_block(READ, 1, &bh);
		794	*wait_bh++=bh;
		795	}
		796
		797	p_blkno = p_blkno + 1;
		798	}
		799
		800	/*
		801	* If we issued read requests - let them complete.
		802	*/
		803	while(wait_bh > wait) {
		804	wait_on_buffer(*--wait_bh);
		805	if (!buffer_uptodate(*wait_bh))
		806	ret = -EIO;
		807	}
		808
		809	if (ret == 0 \|\| !new)
		810	return ret;
		811
		812	/*
		813	* If we get -EIO above, zero out any newly allocated blocks
		814	* to avoid exposing stale data.
		815	*/
		816	bh = head;
		817	block_start = 0;
		818	do {
		819	void *kaddr;
		820
		821	block_end = block_start + bsize;
		822	if (block_end <= from)
		823	goto next_bh;
		824	if (block_start >= to)
		825	break;
		826
		827	kaddr = kmap_atomic(page, KM_USER0);
		828	memset(kaddr+block_start, 0, bh->b_size);
		829	flush_dcache_page(page);
		830	kunmap_atomic(kaddr, KM_USER0);
		831	set_buffer_uptodate(bh);
		832	mark_buffer_dirty(bh);
		833
		834	next_bh:
		835	block_start = block_end;
		836	bh = bh->b_this_page;
		837	} while (bh != head);
		838
		839	return ret;
		840	}
		841
		842	/*
		843	* This will copy user data from the iovec in the buffered write
		844	* context.
		845	*/
		846	int ocfs2_map_and_write_user_data(struct inode *inode,
		847	struct ocfs2_write_ctxt wc, u64 p_blkno,
		848	unsigned int ret_from, unsigned int ret_to)
		849	{
		850	int ret;
		851	unsigned int to, from, cluster_start, cluster_end;
		852	unsigned long bytes, src_from;
		853	char *dst;
		854	struct ocfs2_buffered_write_priv *bp = wc->w_private;
		855	const struct iovec *cur_iov = bp->b_cur_iov;
		856	char __user *buf;
		857	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
		858
		859	ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
		860	&cluster_end);
		861
		862	buf = cur_iov->iov_base + bp->b_cur_off;
		863	src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
		864
		865	from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
		866
		867	/*
		868	* This is a lot of comparisons, but it reads quite
		869	* easily, which is important here.
		870	*/
		871	/* Stay within the src page */
		872	bytes = PAGE_SIZE - src_from;
		873	/* Stay within the vector */
		874	bytes = min(bytes,
		875	(unsigned long)(cur_iov->iov_len - bp->b_cur_off));
		876	/* Stay within count */
		877	bytes = min(bytes, (unsigned long)wc->w_count);
		878	/*
		879	* For clustersize > page size, just stay within
		880	* target page, otherwise we have to calculate pos
		881	* within the cluster and obey the rightmost
		882	* boundary.
		883	*/
		884	if (wc->w_large_pages) {
		885	/*
		886	* For cluster size < page size, we have to
		887	* calculate pos within the cluster and obey
		888	* the rightmost boundary.
		889	*/
		890	bytes = min(bytes, (unsigned long)(osb->s_clustersize
		891	- (wc->w_pos & (osb->s_clustersize - 1))));
		892	} else {
		893	/*
		894	* cluster size > page size is the most common
		895	* case - we just stay within the target page
		896	* boundary.
		897	*/
		898	bytes = min(bytes, PAGE_CACHE_SIZE - from);
		899	}
		900
		901	to = from + bytes;
		902
		903	if (wc->w_this_page_new)
		904	ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
		905	cluster_start, cluster_end, 1);
		906	else
		907	ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
		908	from, to, 0);
		909	if (ret) {
		910	mlog_errno(ret);
		911	goto out;
		912	}
		913
		914	BUG_ON(from > PAGE_CACHE_SIZE);
		915	BUG_ON(to > PAGE_CACHE_SIZE);
		916	BUG_ON(from > osb->s_clustersize);
		917	BUG_ON(to > osb->s_clustersize);
		918
		919	dst = kmap(wc->w_this_page);
		920	memcpy(dst + from, bp->b_src_buf + src_from, bytes);
		921	kunmap(wc->w_this_page);
		922
		923	/*
		924	* XXX: This is slow, but simple. The caller of
		925	* ocfs2_buffered_write_cluster() is responsible for
		926	* passing through the iovecs, so it's difficult to
		927	* predict what our next step is in here after our
		928	* initial write. A future version should be pushing
		929	* that iovec manipulation further down.
		930	*
		931	* By setting this, we indicate that a copy from user
		932	* data was done, and subsequent calls for this
		933	* cluster will skip copying more data.
		934	*/
		935	wc->w_finished_copy = 1;
		936
		937	*ret_from = from;
		938	*ret_to = to;
		939	out:
		940
		941	return bytes ? (unsigned int)bytes : ret;
		942	}
		943
		944	/*
		945	* Map, fill and write a page to disk.
		946	*
		947	* The work of copying data is done via callback. Newly allocated
		948	* pages which don't take user data will be zero'd (set 'new' to
		949	* indicate an allocating write)
		950	*
		951	* Returns a negative error code or the number of bytes copied into
		952	* the page.
		953	*/
		954	int ocfs2_write_data_page(struct inode inode, handle_t handle,
		955	u64 p_blkno, struct page page,
		956	struct ocfs2_write_ctxt *wc, int new)
		957	{
		958	int ret, copied = 0;
		959	unsigned int from = 0, to = 0;
		960	unsigned int cluster_start, cluster_end;
		961	unsigned int zero_from = 0, zero_to = 0;
		962
		963	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
		964	&cluster_start, &cluster_end);
		965
		966	if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
		967	&& !wc->w_finished_copy) {
		968
		969	wc->w_this_page = page;
		970	wc->w_this_page_new = new;
		971	ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
		972	if (ret < 0) {
		973	mlog_errno(ret);
		974	goto out;
		975	}
		976
		977	copied = ret;
		978
		979	zero_from = from;
		980	zero_to = to;
		981	if (new) {
		982	from = cluster_start;
		983	to = cluster_end;
		984	}
		985	} else {
		986	/*
		987	* If we haven't allocated the new page yet, we
		988	* shouldn't be writing it out without copying user
		989	* data. This is likely a math error from the caller.
		990	*/
		991	BUG_ON(!new);
		992
		993	from = cluster_start;
		994	to = cluster_end;
		995
		996	ret = ocfs2_map_page_blocks(page, p_blkno, inode,
		997	cluster_start, cluster_end, 1);
		998	if (ret) {
		999	mlog_errno(ret);
		1000	goto out;
		1001	}
		1002	}
		1003
		1004	/*
		1005	* Parts of newly allocated pages need to be zero'd.
		1006	*
		1007	* Above, we have also rewritten 'to' and 'from' - as far as
		1008	* the rest of the function is concerned, the entire cluster
		1009	* range inside of a page needs to be written.
		1010	*
		1011	* We can skip this if the page is up to date - it's already
		1012	* been zero'd from being read in as a hole.
		1013	*/
		1014	if (new && !PageUptodate(page))
		1015	ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
		1016	wc->w_cpos, zero_from, zero_to);
		1017
		1018	flush_dcache_page(page);
		1019
		1020	if (ocfs2_should_order_data(inode)) {
		1021	ret = walk_page_buffers(handle,
		1022	page_buffers(page),
		1023	from, to, NULL,
		1024	ocfs2_journal_dirty_data);
		1025	if (ret < 0)
		1026	mlog_errno(ret);
		1027	}
		1028
		1029	/*
		1030	* We don't use generic_commit_write() because we need to
		1031	* handle our own i_size update.
		1032	*/
		1033	ret = block_commit_write(page, from, to);
		1034	if (ret)
		1035	mlog_errno(ret);
		1036	out:
		1037
		1038	return copied ? copied : ret;
		1039	}
		1040
		1041	/*
		1042	* Do the actual write of some data into an inode. Optionally allocate
		1043	* in order to fulfill the write.
		1044	*
		1045	* cpos is the logical cluster offset within the file to write at
		1046	*
		1047	* 'phys' is the physical mapping of that offset. a 'phys' value of
		1048	* zero indicates that allocation is required. In this case, data_ac
		1049	* and meta_ac should be valid (meta_ac can be null if metadata
		1050	* allocation isn't required).
		1051	*/
		1052	static ssize_t ocfs2_write(struct file file, u32 phys, handle_t handle,
		1053	struct buffer_head *di_bh,
		1054	struct ocfs2_alloc_context *data_ac,
		1055	struct ocfs2_alloc_context *meta_ac,
		1056	struct ocfs2_write_ctxt *wc)
		1057	{
		1058	int ret, i, numpages = 1, new;
		1059	unsigned int copied = 0;
		1060	u32 tmp_pos;
		1061	u64 v_blkno, p_blkno;
		1062	struct address_space *mapping = file->f_mapping;
		1063	struct inode *inode = mapping->host;
		1064	unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
		1065	unsigned long index, start;
		1066	struct page **cpages;
		1067
		1068	new = phys == 0 ? 1 : 0;
		1069
		1070	/*
		1071	* Figure out how many pages we'll be manipulating here. For
		1072	* non-allocating write, or any writes where cluster size is
		1073	* less than page size, we only need one page. Otherwise,
		1074	* allocating writes of cluster size larger than page size
		1075	* need cluster size pages.
		1076	*/
		1077	if (new && !wc->w_large_pages)
		1078	numpages = (1 << cbits) / PAGE_SIZE;
		1079
		1080	cpages = kzalloc(sizeof(cpages) numpages, GFP_NOFS);
		1081	if (!cpages) {
		1082	ret = -ENOMEM;
		1083	mlog_errno(ret);
		1084	return ret;
		1085	}
		1086
		1087	/*
		1088	* Fill our page array first. That way we've grabbed enough so
		1089	* that we can zero and flush if we error after adding the
		1090	* extent.
		1091	*/
		1092	if (new) {
		1093	start = ocfs2_align_clusters_to_page_index(inode->i_sb,
		1094	wc->w_cpos);
		1095	v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
		1096	} else {
		1097	start = wc->w_pos >> PAGE_CACHE_SHIFT;
		1098	v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
		1099	}
		1100
		1101	for(i = 0; i < numpages; i++) {
		1102	index = start + i;
		1103
		1104	cpages[i] = grab_cache_page(mapping, index);
		1105	if (!cpages[i]) {
		1106	ret = -ENOMEM;
		1107	mlog_errno(ret);
		1108	goto out;
		1109	}
		1110	}
		1111
		1112	if (new) {
		1113	/*
		1114	* This is safe to call with the page locks - it won't take
		1115	* any additional semaphores or cluster locks.
		1116	*/
		1117	tmp_pos = wc->w_cpos;
		1118	ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
		1119	&tmp_pos, 1, di_bh, handle,
		1120	data_ac, meta_ac, NULL);
		1121	/*
		1122	* This shouldn't happen because we must have already
		1123	* calculated the correct meta data allocation required. The
		1124	* internal tree allocation code should know how to increase
		1125	* transaction credits itself.
		1126	*
		1127	* If need be, we could handle -EAGAIN for a
		1128	* RESTART_TRANS here.
		1129	*/
		1130	mlog_bug_on_msg(ret == -EAGAIN,
		1131	"Inode %llu: EAGAIN return during allocation.\n",
		1132	(unsigned long long)OCFS2_I(inode)->ip_blkno);
		1133	if (ret < 0) {
		1134	mlog_errno(ret);
		1135	goto out;
		1136	}
		1137	}
		1138
		1139	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL);
		1140	if (ret < 0) {
		1141
		1142	/*
		1143	* XXX: Should we go readonly here?
		1144	*/
		1145
		1146	mlog_errno(ret);
		1147	goto out;
		1148	}
		1149
		1150	BUG_ON(p_blkno == 0);
		1151
		1152	for(i = 0; i < numpages; i++) {
		1153	ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
		1154	wc, new);
		1155	if (ret < 0) {
		1156	mlog_errno(ret);
		1157	goto out;
		1158	}
		1159
		1160	copied += ret;
		1161	}
		1162
		1163	out:
		1164	for(i = 0; i < numpages; i++) {
		1165	unlock_page(cpages[i]);
		1166	mark_page_accessed(cpages[i]);
		1167	page_cache_release(cpages[i]);
		1168	}
		1169	kfree(cpages);
		1170
		1171	return copied ? copied : ret;
		1172	}
		1173
		1174	static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
		1175	struct ocfs2_super *osb, loff_t pos,
		1176	size_t count, ocfs2_page_writer *cb,
		1177	void *cb_priv)
		1178	{
		1179	wc->w_count = count;
		1180	wc->w_pos = pos;
		1181	wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
		1182	wc->w_finished_copy = 0;
		1183
		1184	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
		1185	wc->w_large_pages = 1;
		1186	else
		1187	wc->w_large_pages = 0;
		1188
		1189	wc->w_write_data_page = cb;
		1190	wc->w_private = cb_priv;
		1191	}
		1192
		1193	/*
		1194	* Write a cluster to an inode. The cluster may not be allocated yet,
		1195	* in which case it will be. This only exists for buffered writes -
		1196	* O_DIRECT takes a more "traditional" path through the kernel.
		1197	*
		1198	* The caller is responsible for incrementing pos, written counts, etc
		1199	*
		1200	* For file systems that don't support sparse files, pre-allocation
		1201	* and page zeroing up until cpos should be done prior to this
		1202	* function call.
		1203	*
		1204	* Callers should be holding i_sem, and the rw cluster lock.
		1205	*
		1206	* Returns the number of user bytes written, or less than zero for
		1207	* error.
		1208	*/
		1209	ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
		1210	size_t count, ocfs2_page_writer *actor,
		1211	void *priv)
		1212	{
		1213	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
		1214	ssize_t written = 0;
		1215	u32 phys;
		1216	struct inode *inode = file->f_mapping->host;
		1217	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
		1218	struct buffer_head *di_bh = NULL;
		1219	struct ocfs2_dinode *di;
		1220	struct ocfs2_alloc_context *data_ac = NULL;
		1221	struct ocfs2_alloc_context *meta_ac = NULL;
		1222	handle_t *handle;
		1223	struct ocfs2_write_ctxt wc;
		1224
		1225	ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
		1226
		1227	ret = ocfs2_meta_lock(inode, &di_bh, 1);
		1228	if (ret) {
		1229	mlog_errno(ret);
		1230	goto out;
		1231	}
		1232	di = (struct ocfs2_dinode *)di_bh->b_data;
		1233
		1234	/*
		1235	* Take alloc sem here to prevent concurrent lookups. That way
		1236	* the mapping, zeroing and tree manipulation within
		1237	* ocfs2_write() will be safe against ->readpage(). This
		1238	* should also serve to lock out allocation from a shared
		1239	* writeable region.
		1240	*/
		1241	down_write(&OCFS2_I(inode)->ip_alloc_sem);
		1242
		1243	ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL);
		1244	if (ret) {
		1245	mlog_errno(ret);
		1246	goto out_meta;
		1247	}
		1248
		1249	/* phys == 0 means that allocation is required. */
		1250	if (phys == 0) {
		1251	ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
		1252	if (ret) {
		1253	mlog_errno(ret);
		1254	goto out_meta;
		1255	}
		1256
		1257	credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
		1258	}
		1259
		1260	ret = ocfs2_data_lock(inode, 1);
		1261	if (ret) {
		1262	mlog_errno(ret);
		1263	goto out_meta;
		1264	}
		1265
		1266	handle = ocfs2_start_trans(osb, credits);
		1267	if (IS_ERR(handle)) {
		1268	ret = PTR_ERR(handle);
		1269	mlog_errno(ret);
		1270	goto out_data;
		1271	}
		1272
		1273	written = ocfs2_write(file, phys, handle, di_bh, data_ac,
		1274	meta_ac, &wc);
		1275	if (written < 0) {
		1276	ret = written;
		1277	mlog_errno(ret);
		1278	goto out_commit;
		1279	}
		1280
		1281	ret = ocfs2_journal_access(handle, inode, di_bh,
		1282	OCFS2_JOURNAL_ACCESS_WRITE);
		1283	if (ret) {
		1284	mlog_errno(ret);
		1285	goto out_commit;
		1286	}
		1287
		1288	pos += written;
		1289	if (pos > inode->i_size) {
		1290	i_size_write(inode, pos);
		1291	mark_inode_dirty(inode);
		1292	}
		1293	inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
		1294	di->i_size = cpu_to_le64((u64)i_size_read(inode));
		1295	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		1296	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
		1297	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
		1298
		1299	ret = ocfs2_journal_dirty(handle, di_bh);
		1300	if (ret)
		1301	mlog_errno(ret);
		1302
		1303	out_commit:
		1304	ocfs2_commit_trans(osb, handle);
		1305
		1306	out_data:
		1307	ocfs2_data_unlock(inode, 1);
		1308
		1309	out_meta:
		1310	up_write(&OCFS2_I(inode)->ip_alloc_sem);
		1311	ocfs2_meta_unlock(inode, 1);
		1312
		1313	out:
		1314	brelse(di_bh);
		1315	if (data_ac)
		1316	ocfs2_free_alloc_context(data_ac);
		1317	if (meta_ac)
		1318	ocfs2_free_alloc_context(meta_ac);
		1319
		1320	return written ? written : ret;
		1321	}
		1322
676	const struct address_space_operations ocfs2_aops = {	1323	const struct address_space_operations ocfs2_aops = {
677	.readpage = ocfs2_readpage,	1324	.readpage = ocfs2_readpage,
678	.writepage = ocfs2_writepage,	1325	.writepage = ocfs2_writepage,