ocfs2: teach ocfs2_file_aio_write() about sparse files

Unfortunately, ocfs2 can no longer make use of generic_file_aio_write_nolock() because allocating writes will require zeroing of pages adjacent to the I/O for cluster sizes greater than page size. Implement a custom file write here, which can order page locks for zeroing. This also has the advantage that cluster locks can easily be ordered outside of the page locks. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
author: Mark Fasheh <mark.fasheh@oracle.com> 2007-02-09 23:24:12 -0500
committer: Mark Fasheh <mark.fasheh@oracle.com> 2007-04-26 18:02:08 -0400
commit: 9517bac6cc7a7aa4fee63cb38a32cb6014e264c7 (patch)
tree: 3cac0c18d0cacc316e0e8a60f483282d6f991779 /fs/ocfs2/file.c
parent: 89488984ac23b0580f959b9ee549f2fcb1c2f194 (diff)
1 files changed, 335 insertions, 39 deletions
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3bcf3629265..667e5a869bf 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -33,6 +33,7 @@
 #include <linux/sched.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mount.h>
+#include <linux/writeback.h>
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -485,13 +486,13 @@ leave:
 * accessed, and lock them, reserving the appropriate number of bits.
 *
 * Called from ocfs2_extend_allocation() for file systems which don't
- * support holes, and from ocfs2_prepare_write() for file systems
+ * support holes, and from ocfs2_write() for file systems which
- * which understand sparse inodes.
+ * understand sparse inodes.
 */
-static int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
-                                 u32 clusters_to_add,
+                          u32 clusters_to_add,
-                                 struct ocfs2_alloc_context **data_ac,
+                          struct ocfs2_alloc_context **data_ac,
-                                 struct ocfs2_alloc_context **meta_ac)
+                          struct ocfs2_alloc_context **meta_ac)
 {
        int ret, num_free_extents;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -518,7 +519,7 @@ static int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
         * a cluster lock (because we ran out of room for another
         * extent) will violate ordering rules.
         *
-         * Most of the time we'll only be seeing this 1 page at a time
+         * Most of the time we'll only be seeing this 1 cluster at a time
         * anyway.
         */
        if (!num_free_extents ||
@@ -596,13 +597,6 @@ static int ocfs2_extend_allocation(struct inode *inode,
 restart_all:
        BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
-        status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
-                                       &meta_ac);
-        if (status) {
-                mlog_errno(status);
-                goto leave;
-        }
        /* blocks peope in read/write from reading our allocation
         * until we're done changing it. We depend on i_mutex to block
         * other extend/truncate calls while we're here. Ordering wrt
@@ -610,6 +604,13 @@ restart_all:
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
        drop_alloc_sem = 1;
+        status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
+                                       &meta_ac);
+        if (status) {
+                mlog_errno(status);
+                goto leave;
+        }
        credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
        handle = ocfs2_start_trans(osb, credits);
        if (IS_ERR(handle)) {
@@ -1088,10 +1089,49 @@ out:
        return ret;
 }
+/*
+ * Walk the clusters backing the byte range that starts at pos and runs
+ * for count bytes, looking for holes and unwritten extents.
+ *
+ * Returns 1 as soon as a cluster with no physical mapping (phys_cpos of
+ * zero) or an extent flagged OCFS2_EXT_UNWRITTEN is found, the negative
+ * value from ocfs2_get_clusters() if a lookup fails, and otherwise the
+ * status of the last successful lookup (0 if the range was empty).
+ */
+static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
+                                       size_t count)
+{
+        struct super_block *sb = inode->i_sb;
+        unsigned int flags;
+        u32 cur, remaining, len, phys;
+        int status = 0;
+
+        cur = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+        remaining = ocfs2_clusters_for_bytes(sb, pos + count) - cur;
+
+        for (; remaining; remaining -= len, cur += len) {
+                status = ocfs2_get_clusters(inode, cur, &phys, &len, &flags);
+                if (status < 0) {
+                        mlog_errno(status);
+                        return status;
+                }
+
+                if (phys == 0 || (flags & OCFS2_EXT_UNWRITTEN))
+                        return 1;
+
+                /* Never step past the end of the requested range. */
+                if (len > remaining)
+                        len = remaining;
+        }
+
+        return status;
+}
 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                                         loff_t *ppos,
                                         size_t count,
-                                         int appending)
+                                         int appending,
+                                         int *direct_io)
 {
        int ret = 0, meta_level = appending;
        struct inode *inode = dentry->d_inode;
@@ -1143,12 +1183,47 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                        saved_pos = *ppos;
                }
+                if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+                        loff_t end = saved_pos + count;
+                        /*
+                         * Skip the O_DIRECT checks if we don't need
+                         * them.
+                         */
+                        if (!direct_io || !(*direct_io))
+                                break;
+                        /*
+                         * Allowing concurrent direct writes means
+                         * i_size changes wouldn't be synchronized, so
+                         * one node could wind up truncating another
+                         * nodes writes.
+                         */
+                        if (end > i_size_read(inode)) {
+                                *direct_io = 0;
+                                break;
+                        }
+                        /*
+                         * We don't fill holes during direct io, so
+                         * check for them here. If any are found, the
+                         * caller will have to retake some cluster
+                         * locks and initiate the io as buffered.
+                         */
+                        ret = ocfs2_check_range_for_holes(inode, saved_pos,
+                                                          count);
+                        if (ret == 1) {
+                                *direct_io = 0;
+                                ret = 0;
+                        } else if (ret < 0)
+                                mlog_errno(ret);
+                        break;
+                }
                /*
                 * The rest of this loop is concerned with legacy file
                 * systems which don't support sparse files.
                 */
-                if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
-                        break;
                newsize = count + saved_pos;
@@ -1202,55 +1277,264 @@ out:
        return ret;
 }
+/*
+ * Advance an iovec cursor by 'bytes' bytes.
+ *
+ * @iovp:  in/out, the segment the cursor currently points at
+ * @basep: in/out, offset already consumed within *iovp
+ * @bytes: number of bytes to step over
+ *
+ * When a segment is consumed exactly, the cursor moves to offset 0 of
+ * the following segment. The loop body runs once even for bytes == 0,
+ * so a cursor sitting at the very end of a segment (including a
+ * zero-length one) is still stepped to the next segment.
+ *
+ * Nothing here knows how many segments exist: the caller must not pass
+ * more bytes than remain in the array.
+ */
+static inline void
+ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
+{
+        const struct iovec *iov = *iovp;
+        size_t base = *basep;
+
+        do {
+                /*
+                 * Keep the step in size_t. Both operands are size_t, and
+                 * narrowing the result to int would corrupt 'bytes' and
+                 * 'base' for segments larger than INT_MAX.
+                 */
+                size_t copy = min(bytes, iov->iov_len - base);
+
+                bytes -= copy;
+                base += copy;
+                if (iov->iov_len == base) {
+                        iov++;
+                        base = 0;
+                }
+        } while (bytes);
+
+        *iovp = iov;
+        *basep = base;
+}
+/*
+ * Set up bp->b_src_buf as the source of the next chunk of a buffered
+ * write, taken from cur_iov at byte offset iov_offset.
+ *
+ * If the address limit is KERNEL_DS the buffer is used directly and
+ * NULL is returned (no page was pinned). Otherwise the single user page
+ * containing the buffer address is pinned and kmap()ed, b_src_buf is
+ * set to that mapping, and the pinned page is returned; the caller must
+ * hand it to ocfs2_put_write_source() when done. Returns
+ * ERR_PTR(-EFAULT) if the user page could not be pinned.
+ */
+static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
+                                            const struct iovec *cur_iov,
+                                            size_t iov_offset)
+{
+        int ret;
+        char *buf;
+        struct page *src_page = NULL;
+
+        buf = cur_iov->iov_base + iov_offset;
+
+        if (!segment_eq(get_fs(), KERNEL_DS)) {
+                /*
+                 * Pull in the user page. We want to do this outside
+                 * of the meta data locks in order to preserve locking
+                 * order in case of page fault.
+                 *
+                 * get_user_pages() walks the address space of
+                 * current->mm and must be called with that mm's
+                 * mmap_sem held, so take it for read around the call.
+                 */
+                down_read(&current->mm->mmap_sem);
+                ret = get_user_pages(current, current->mm,
+                                     (unsigned long)buf & PAGE_CACHE_MASK, 1,
+                                     0, 0, &src_page, NULL);
+                up_read(&current->mm->mmap_sem);
+                if (ret == 1)
+                        bp->b_src_buf = kmap(src_page);
+                else
+                        src_page = ERR_PTR(-EFAULT);
+        } else {
+                bp->b_src_buf = buf;
+        }
+
+        return src_page;
+}
+/*
+ * Release a source page: unmap it and drop the page reference. A NULL
+ * page is a no-op. bp is accepted but not used here.
+ */
+static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
+                                   struct page *page)
+{
+        if (!page)
+                return;
+
+        kunmap(page);
+        page_cache_release(page);
+}
+/*
+ * Buffered write of count bytes from the iovec array, starting at file
+ * offset *ppos.
+ *
+ * The iovec cursor is first advanced by o_direct_written bytes, to
+ * account for a partial direct write. Each pass then sets up a source
+ * buffer, hands it to ocfs2_buffered_write_cluster(), and advances
+ * *ppos, the remaining count and the iovec cursor by however many bytes
+ * that call reported. At least one pass is always made.
+ *
+ * Returns the total number of bytes written if that is non-zero;
+ * otherwise the error that stopped the loop (or 0). nr_segs is not
+ * consulted.
+ */
+static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
+                                         const struct iovec *iov,
+                                         unsigned long nr_segs,
+                                         size_t count,
+                                         ssize_t o_direct_written)
+{
+        struct ocfs2_buffered_write_priv bp;
+        const struct iovec *seg = iov;
+        size_t seg_off = 0;
+        ssize_t done = 0;
+        int status = 0;
+
+        /* Step over whatever a partial direct write already consumed. */
+        ocfs2_set_next_iovec(&seg, &seg_off, o_direct_written);
+
+        for (;;) {
+                struct page *src;
+                ssize_t nr;
+
+                bp.b_cur_off = seg_off;
+                bp.b_cur_iov = seg;
+
+                src = ocfs2_get_write_source(&bp, seg, seg_off);
+                if (IS_ERR(src)) {
+                        status = PTR_ERR(src);
+                        break;
+                }
+
+                nr = ocfs2_buffered_write_cluster(file, *ppos, count,
+                                                  ocfs2_map_and_write_user_data,
+                                                  &bp);
+
+                /* The source is released whether or not the write worked. */
+                ocfs2_put_write_source(&bp, src);
+
+                if (nr < 0) {
+                        mlog_errno(nr);
+                        status = nr;
+                        break;
+                }
+
+                done += nr;
+                *ppos += nr;
+                count -= nr;
+
+                ocfs2_set_next_iovec(&seg, &seg_off, nr);
+
+                if (!count)
+                        break;
+        }
+
+        return done ? done : status;
+}
+/*
+ * Validate the segments of an iovec array and total up their lengths.
+ *
+ * @iov:     the segment array
+ * @counted: out, sum of the lengths of the accepted segments
+ * @nr_segs: in/out, number of segments; reduced to the count of leading
+ *           segments that passed access_ok() if a later one fails
+ *
+ * Returns -EINVAL if any segment length, or the running total, is
+ * negative when viewed as ssize_t; -EFAULT if the very first segment
+ * fails access_ok(); 0 otherwise. *counted is only written on a 0
+ * return.
+ */
+static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
+                             unsigned long *nr_segs)
+{
+        size_t total = 0;
+        unsigned long i;
+
+        for (i = 0; i < *nr_segs; i++) {
+                const struct iovec *cur = iov + i;
+                size_t with_cur = total + cur->iov_len;
+
+                if (unlikely((ssize_t)(with_cur | cur->iov_len) < 0))
+                        return -EINVAL;
+
+                if (!access_ok(VERIFY_READ, cur->iov_base, cur->iov_len)) {
+                        if (i == 0)
+                                return -EFAULT;
+                        /* Keep the good prefix, drop this segment and the rest. */
+                        *nr_segs = i;
+                        break;
+                }
+
+                total = with_cur;
+        }
+
+        *counted = total;
+        return 0;
+}
 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
                                    const struct iovec *iov,
                                    unsigned long nr_segs,
                                    loff_t pos)
 {
-        int ret, rw_level, have_alloc_sem = 0;
+        int ret, direct_io, appending, rw_level, have_alloc_sem  = 0;
-        struct file *filp = iocb->ki_filp;
+        int can_do_direct, sync = 0;
-        struct inode *inode = filp->f_path.dentry->d_inode;
+        ssize_t written = 0;
-        int appending = filp->f_flags & O_APPEND ? 1 : 0;
+        size_t ocount;          /* original count */
+        size_t count;           /* after file limit checks */
-        mlog_entry("(0x%p, %u, '%.*s')\n", filp,
+        loff_t *ppos = &iocb->ki_pos;
+        struct file *file = iocb->ki_filp;
+        struct inode *inode = file->f_path.dentry->d_inode;
+        mlog_entry("(0x%p, %u, '%.*s')\n", file,
                   (unsigned int)nr_segs,
-                   filp->f_path.dentry->d_name.len,
+                   file->f_path.dentry->d_name.len,
-                   filp->f_path.dentry->d_name.name);
+                   file->f_path.dentry->d_name.name);
-        /* happy write of zero bytes */
        if (iocb->ki_left == 0)
                return 0;
+        ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
+        if (ret)
+                return ret;
+        count = ocount;
+        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+        appending = file->f_flags & O_APPEND ? 1 : 0;
+        direct_io = file->f_flags & O_DIRECT ? 1 : 0;
        mutex_lock(&inode->i_mutex);
+relock:
        /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
-        if (filp->f_flags & O_DIRECT) {
+        if (direct_io) {
-                have_alloc_sem = 1;
                down_read(&inode->i_alloc_sem);
+                have_alloc_sem = 1;
        }
        /* concurrent O_DIRECT writes are allowed */
-        rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
+        rw_level = !direct_io;
        ret = ocfs2_rw_lock(inode, rw_level);
        if (ret < 0) {
-                rw_level = -1;
                mlog_errno(ret);
-                goto out;
+                goto out_sems;
        }
-        ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos,
+        can_do_direct = direct_io;
-                                            iocb->ki_left, appending);
+        ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+                                            iocb->ki_left, appending,
+                                            &can_do_direct);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }
+        /*
+         * We can't complete the direct I/O as requested, fall back to
+         * buffered I/O.
+         */
+        if (direct_io && !can_do_direct) {
+                ocfs2_rw_unlock(inode, rw_level);
+                up_read(&inode->i_alloc_sem);
+                have_alloc_sem = 0;
+                rw_level = -1;
+                direct_io = 0;
+                sync = 1;
+                goto relock;
+        }
+        if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
+                sync = 1;
+        /*
+         * XXX: Is it ok to execute these checks a second time?
+         */
+        ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
+        if (ret)
+                goto out;
+        /*
+         * Set pos so that sync_page_range_nolock() below understands
+         * where to start from. We might've moved it around via the
+         * calls above. The range we want to actually sync starts from
+         * *ppos here.
+         *
+         */
+        pos = *ppos;
        /* communicate with ocfs2_dio_end_io */
        ocfs2_iocb_set_rw_locked(iocb);
-        ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos);
+        if (direct_io) {
+                written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
+                                                    ppos, count, ocount);
+                if (written < 0) {
+                        ret = written;
+                        goto out_dio;
+                }
+        } else {
+                written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
+                                                    count, written);
+                if (written < 0) {
+                        ret = written;
+                        if (ret != -EFAULT || ret != -ENOSPC)
+                                mlog_errno(ret);
+                        goto out;
+                }
+        }
+out_dio:
        /* buffered aio wouldn't have proper lock coverage today */
-        BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+        BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
        /* 
         * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -1268,14 +1552,25 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        }
 out:
+        if (rw_level != -1)
+                ocfs2_rw_unlock(inode, rw_level);
+out_sems:
        if (have_alloc_sem)
                up_read(&inode->i_alloc_sem);
-        if (rw_level != -1) 
-                ocfs2_rw_unlock(inode, rw_level);
+        if (written > 0 && sync) {
+                ssize_t err;
+                err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
+                if (err < 0)
+                        written = err;
+        }
        mutex_unlock(&inode->i_mutex);
        mlog_exit(ret);
-        return ret;
+        return written ? written : ret;
 }
 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
@@ -1300,7 +1595,8 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
                goto out;
        }
-        ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0);
+        ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
+                                            NULL);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_unlock;
author	Mark Fasheh <mark.fasheh@oracle.com>	2007-02-09 23:24:12 -0500
committer	Mark Fasheh <mark.fasheh@oracle.com>	2007-04-26 18:02:08 -0400
commit	9517bac6cc7a7aa4fee63cb38a32cb6014e264c7 (patch)
tree	3cac0c18d0cacc316e0e8a60f483282d6f991779 /fs/ocfs2/file.c
parent	89488984ac23b0580f959b9ee549f2fcb1c2f194 (diff)

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 3bcf3629265..667e5a869bf 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c
@@ -33,6 +33,7 @@
33	#include <linux/sched.h>	33	#include <linux/sched.h>
34	#include <linux/pipe_fs_i.h>	34	#include <linux/pipe_fs_i.h>
35	#include <linux/mount.h>	35	#include <linux/mount.h>
		36	#include <linux/writeback.h>
36		37
37	#define MLOG_MASK_PREFIX ML_INODE	38	#define MLOG_MASK_PREFIX ML_INODE
38	#include <cluster/masklog.h>	39	#include <cluster/masklog.h>
@@ -485,13 +486,13 @@ leave:
485	* accessed, and lock them, reserving the appropriate number of bits.	486	* accessed, and lock them, reserving the appropriate number of bits.
486	*	487	*
487	* Called from ocfs2_extend_allocation() for file systems which don't	488	* Called from ocfs2_extend_allocation() for file systems which don't
488	* support holes, and from ocfs2_prepare_write() for file systems	489	* support holes, and from ocfs2_write() for file systems which
489	* which understand sparse inodes.	490	* understand sparse inodes.
490	*/	491	*/
491	static int ocfs2_lock_allocators(struct inode inode, struct ocfs2_dinode di,	492	int ocfs2_lock_allocators(struct inode inode, struct ocfs2_dinode di,
492	u32 clusters_to_add,	493	u32 clusters_to_add,
493	struct ocfs2_alloc_context **data_ac,	494	struct ocfs2_alloc_context **data_ac,
494	struct ocfs2_alloc_context **meta_ac)	495	struct ocfs2_alloc_context **meta_ac)
495	{	496	{
496	int ret, num_free_extents;	497	int ret, num_free_extents;
497	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);	498	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -518,7 +519,7 @@ static int ocfs2_lock_allocators(struct inode inode, struct ocfs2_dinode di,
518	* a cluster lock (because we ran out of room for another	519	* a cluster lock (because we ran out of room for another
519	* extent) will violate ordering rules.	520	* extent) will violate ordering rules.
520	*	521	*
521	* Most of the time we'll only be seeing this 1 page at a time	522	* Most of the time we'll only be seeing this 1 cluster at a time
522	* anyway.	523	* anyway.
523	*/	524	*/
524	if (!num_free_extents \|\|	525	if (!num_free_extents \|\|
@@ -596,13 +597,6 @@ static int ocfs2_extend_allocation(struct inode *inode,
596	restart_all:	597	restart_all:
597	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);	598	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
598		599
599	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
600	&meta_ac);
601	if (status) {
602	mlog_errno(status);
603	goto leave;
604	}
605
606	/* blocks peope in read/write from reading our allocation	600	/* blocks peope in read/write from reading our allocation
607	* until we're done changing it. We depend on i_mutex to block	601	* until we're done changing it. We depend on i_mutex to block
608	* other extend/truncate calls while we're here. Ordering wrt	602	* other extend/truncate calls while we're here. Ordering wrt
@@ -610,6 +604,13 @@ restart_all:
610	down_write(&OCFS2_I(inode)->ip_alloc_sem);	604	down_write(&OCFS2_I(inode)->ip_alloc_sem);
611	drop_alloc_sem = 1;	605	drop_alloc_sem = 1;
612		606
		607	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
		608	&meta_ac);
		609	if (status) {
		610	mlog_errno(status);
		611	goto leave;
		612	}
		613
613	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);	614	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
614	handle = ocfs2_start_trans(osb, credits);	615	handle = ocfs2_start_trans(osb, credits);
615	if (IS_ERR(handle)) {	616	if (IS_ERR(handle)) {
@@ -1088,10 +1089,49 @@ out:
1088	return ret;	1089	return ret;
1089	}	1090	}
1090		1091
		1092	/*
		1093	* Will look for holes and unwritten extents in the range starting at
		1094	* pos for count bytes (inclusive).
		1095	*/
		1096	static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
		1097	size_t count)
		1098	{
		1099	int ret = 0;
		1100	unsigned int extent_flags;
		1101	u32 cpos, clusters, extent_len, phys_cpos;
		1102	struct super_block *sb = inode->i_sb;
		1103
		1104	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
		1105	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
		1106
		1107	while (clusters) {
		1108	ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
		1109	&extent_flags);
		1110	if (ret < 0) {
		1111	mlog_errno(ret);
		1112	goto out;
		1113	}
		1114
		1115	if (phys_cpos == 0 \|\| (extent_flags & OCFS2_EXT_UNWRITTEN)) {
		1116	ret = 1;
		1117	break;
		1118	}
		1119
		1120	if (extent_len > clusters)
		1121	extent_len = clusters;
		1122
		1123	clusters -= extent_len;
		1124	cpos += extent_len;
		1125	}
		1126	out:
		1127	return ret;
		1128	}
		1129
1091	static int ocfs2_prepare_inode_for_write(struct dentry *dentry,	1130	static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1092	loff_t *ppos,	1131	loff_t *ppos,
1093	size_t count,	1132	size_t count,
1094	int appending)	1133	int appending,
		1134	int *direct_io)
1095	{	1135	{
1096	int ret = 0, meta_level = appending;	1136	int ret = 0, meta_level = appending;
1097	struct inode *inode = dentry->d_inode;	1137	struct inode *inode = dentry->d_inode;
@@ -1143,12 +1183,47 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1143	saved_pos = *ppos;	1183	saved_pos = *ppos;
1144	}	1184	}
1145		1185
		1186	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
		1187	loff_t end = saved_pos + count;
		1188
		1189	/*
		1190	* Skip the O_DIRECT checks if we don't need
		1191	* them.
		1192	*/
		1193	if (!direct_io \|\| !(*direct_io))
		1194	break;
		1195
		1196	/*
		1197	* Allowing concurrent direct writes means
		1198	* i_size changes wouldn't be synchronized, so
		1199	* one node could wind up truncating another
		1200	* nodes writes.
		1201	*/
		1202	if (end > i_size_read(inode)) {
		1203	*direct_io = 0;
		1204	break;
		1205	}
		1206
		1207	/*
		1208	* We don't fill holes during direct io, so
		1209	* check for them here. If any are found, the
		1210	* caller will have to retake some cluster
		1211	* locks and initiate the io as buffered.
		1212	*/
		1213	ret = ocfs2_check_range_for_holes(inode, saved_pos,
		1214	count);
		1215	if (ret == 1) {
		1216	*direct_io = 0;
		1217	ret = 0;
		1218	} else if (ret < 0)
		1219	mlog_errno(ret);
		1220	break;
		1221	}
		1222
1146	/*	1223	/*
1147	* The rest of this loop is concerned with legacy file	1224	* The rest of this loop is concerned with legacy file
1148	* systems which don't support sparse files.	1225	* systems which don't support sparse files.
1149	*/	1226	*/
1150	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
1151	break;
1152		1227
1153	newsize = count + saved_pos;	1228	newsize = count + saved_pos;
1154		1229
@@ -1202,55 +1277,264 @@ out:
1202	return ret;	1277	return ret;
1203	}	1278	}
1204		1279
		1280	static inline void
		1281	ocfs2_set_next_iovec(const struct iovec *iovp, size_t basep, size_t bytes)
		1282	{
		1283	const struct iovec iov = iovp;
		1284	size_t base = *basep;
		1285
		1286	do {
		1287	int copy = min(bytes, iov->iov_len - base);
		1288
		1289	bytes -= copy;
		1290	base += copy;
		1291	if (iov->iov_len == base) {
		1292	iov++;
		1293	base = 0;
		1294	}
		1295	} while (bytes);
		1296	*iovp = iov;
		1297	*basep = base;
		1298	}
		1299
		1300	static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
		1301	const struct iovec *cur_iov,
		1302	size_t iov_offset)
		1303	{
		1304	int ret;
		1305	char *buf;
		1306	struct page *src_page = NULL;
		1307
		1308	buf = cur_iov->iov_base + iov_offset;
		1309
		1310	if (!segment_eq(get_fs(), KERNEL_DS)) {
		1311	/*
		1312	* Pull in the user page. We want to do this outside
		1313	* of the meta data locks in order to preserve locking
		1314	* order in case of page fault.
		1315	*/
		1316	ret = get_user_pages(current, current->mm,
		1317	(unsigned long)buf & PAGE_CACHE_MASK, 1,
		1318	0, 0, &src_page, NULL);
		1319	if (ret == 1)
		1320	bp->b_src_buf = kmap(src_page);
		1321	else
		1322	src_page = ERR_PTR(-EFAULT);
		1323	} else {
		1324	bp->b_src_buf = buf;
		1325	}
		1326
		1327	return src_page;
		1328	}
		1329
		1330	static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
		1331	struct page *page)
		1332	{
		1333	if (page) {
		1334	kunmap(page);
		1335	page_cache_release(page);
		1336	}
		1337	}
		1338
		1339	static ssize_t ocfs2_file_buffered_write(struct file file, loff_t ppos,
		1340	const struct iovec *iov,
		1341	unsigned long nr_segs,
		1342	size_t count,
		1343	ssize_t o_direct_written)
		1344	{
		1345	int ret = 0;
		1346	ssize_t copied, total = 0;
		1347	size_t iov_offset = 0;
		1348	const struct iovec *cur_iov = iov;
		1349	struct ocfs2_buffered_write_priv bp;
		1350	struct page *page;
		1351
		1352	/*
		1353	* handle partial DIO write. Adjust cur_iov if needed.
		1354	*/
		1355	ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
		1356
		1357	do {
		1358	bp.b_cur_off = iov_offset;
		1359	bp.b_cur_iov = cur_iov;
		1360
		1361	page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
		1362	if (IS_ERR(page)) {
		1363	ret = PTR_ERR(page);
		1364	goto out;
		1365	}
		1366
		1367	copied = ocfs2_buffered_write_cluster(file, *ppos, count,
		1368	ocfs2_map_and_write_user_data,
		1369	&bp);
		1370
		1371	ocfs2_put_write_source(&bp, page);
		1372
		1373	if (copied < 0) {
		1374	mlog_errno(copied);
		1375	ret = copied;
		1376	goto out;
		1377	}
		1378
		1379	total += copied;
		1380	ppos = ppos + copied;
		1381	count -= copied;
		1382
		1383	ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
		1384	} while(count);
		1385
		1386	out:
		1387	return total ? total : ret;
		1388	}
		1389
		1390	static int ocfs2_check_iovec(const struct iovec iov, size_t counted,
		1391	unsigned long *nr_segs)
		1392	{
		1393	size_t ocount; /* original count */
		1394	unsigned long seg;
		1395
		1396	ocount = 0;
		1397	for (seg = 0; seg < *nr_segs; seg++) {
		1398	const struct iovec *iv = &iov[seg];
		1399
		1400	/*
		1401	* If any segment has a negative length, or the cumulative
		1402	* length ever wraps negative then return -EINVAL.
		1403	*/
		1404	ocount += iv->iov_len;
		1405	if (unlikely((ssize_t)(ocount\|iv->iov_len) < 0))
		1406	return -EINVAL;
		1407	if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
		1408	continue;
		1409	if (seg == 0)
		1410	return -EFAULT;
		1411	*nr_segs = seg;
		1412	ocount -= iv->iov_len; /* This segment is no good */
		1413	break;
		1414	}
		1415
		1416	*counted = ocount;
		1417	return 0;
		1418	}
		1419
1205	static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,	1420	static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1206	const struct iovec *iov,	1421	const struct iovec *iov,
1207	unsigned long nr_segs,	1422	unsigned long nr_segs,
1208	loff_t pos)	1423	loff_t pos)
1209	{	1424	{
1210	int ret, rw_level, have_alloc_sem = 0;	1425	int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1211	struct file *filp = iocb->ki_filp;	1426	int can_do_direct, sync = 0;
1212	struct inode *inode = filp->f_path.dentry->d_inode;	1427	ssize_t written = 0;
1213	int appending = filp->f_flags & O_APPEND ? 1 : 0;	1428	size_t ocount; /* original count */
1214		1429	size_t count; /* after file limit checks */
1215	mlog_entry("(0x%p, %u, '%.*s')\n", filp,	1430	loff_t *ppos = &iocb->ki_pos;
		1431	struct file *file = iocb->ki_filp;
		1432	struct inode *inode = file->f_path.dentry->d_inode;
		1433
		1434	mlog_entry("(0x%p, %u, '%.*s')\n", file,
1216	(unsigned int)nr_segs,	1435	(unsigned int)nr_segs,
1217	filp->f_path.dentry->d_name.len,	1436	file->f_path.dentry->d_name.len,
1218	filp->f_path.dentry->d_name.name);	1437	file->f_path.dentry->d_name.name);
1219		1438
1220	/* happy write of zero bytes */
1221	if (iocb->ki_left == 0)	1439	if (iocb->ki_left == 0)
1222	return 0;	1440	return 0;
1223		1441
		1442	ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
		1443	if (ret)
		1444	return ret;
		1445
		1446	count = ocount;
		1447
		1448	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
		1449
		1450	appending = file->f_flags & O_APPEND ? 1 : 0;
		1451	direct_io = file->f_flags & O_DIRECT ? 1 : 0;
		1452
1224	mutex_lock(&inode->i_mutex);	1453	mutex_lock(&inode->i_mutex);
		1454
		1455	relock:
1225	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */	1456	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
1226	if (filp->f_flags & O_DIRECT) {	1457	if (direct_io) {
1227	have_alloc_sem = 1;
1228	down_read(&inode->i_alloc_sem);	1458	down_read(&inode->i_alloc_sem);
		1459	have_alloc_sem = 1;
1229	}	1460	}
1230		1461
1231	/* concurrent O_DIRECT writes are allowed */	1462	/* concurrent O_DIRECT writes are allowed */
1232	rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;	1463	rw_level = !direct_io;
1233	ret = ocfs2_rw_lock(inode, rw_level);	1464	ret = ocfs2_rw_lock(inode, rw_level);
1234	if (ret < 0) {	1465	if (ret < 0) {
1235	rw_level = -1;
1236	mlog_errno(ret);	1466	mlog_errno(ret);
1237	goto out;	1467	goto out_sems;
1238	}	1468	}
1239		1469
1240	ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos,	1470	can_do_direct = direct_io;
1241	iocb->ki_left, appending);	1471	ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
		1472	iocb->ki_left, appending,
		1473	&can_do_direct);
1242	if (ret < 0) {	1474	if (ret < 0) {
1243	mlog_errno(ret);	1475	mlog_errno(ret);
1244	goto out;	1476	goto out;
1245	}	1477	}
1246		1478
		1479	/*
		1480	* We can't complete the direct I/O as requested, fall back to
		1481	* buffered I/O.
		1482	*/
		1483	if (direct_io && !can_do_direct) {
		1484	ocfs2_rw_unlock(inode, rw_level);
		1485	up_read(&inode->i_alloc_sem);
		1486
		1487	have_alloc_sem = 0;
		1488	rw_level = -1;
		1489
		1490	direct_io = 0;
		1491	sync = 1;
		1492	goto relock;
		1493	}
		1494
		1495	if (!sync && ((file->f_flags & O_SYNC) \|\| IS_SYNC(inode)))
		1496	sync = 1;
		1497
		1498	/*
		1499	* XXX: Is it ok to execute these checks a second time?
		1500	*/
		1501	ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
		1502	if (ret)
		1503	goto out;
		1504
		1505	/*
		1506	* Set pos so that sync_page_range_nolock() below understands
		1507	* where to start from. We might've moved it around via the
		1508	* calls above. The range we want to actually sync starts from
		1509	* *ppos here.
		1510	*
		1511	*/
		1512	pos = *ppos;
		1513
1247	/* communicate with ocfs2_dio_end_io */	1514	/* communicate with ocfs2_dio_end_io */
1248	ocfs2_iocb_set_rw_locked(iocb);	1515	ocfs2_iocb_set_rw_locked(iocb);
1249		1516
1250	ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos);	1517	if (direct_io) {
		1518	written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
		1519	ppos, count, ocount);
		1520	if (written < 0) {
		1521	ret = written;
		1522	goto out_dio;
		1523	}
		1524	} else {
		1525	written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
		1526	count, written);
		1527	if (written < 0) {
		1528	ret = written;
		1529	if (ret != -EFAULT \|\| ret != -ENOSPC)
		1530	mlog_errno(ret);
		1531	goto out;
		1532	}
		1533	}
1251		1534
		1535	out_dio:
1252	/* buffered aio wouldn't have proper lock coverage today */	1536	/* buffered aio wouldn't have proper lock coverage today */
1253	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));	1537	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1254		1538
1255	/*	1539	/*
1256	* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io	1540	* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -1268,14 +1552,25 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1268	}	1552	}
1269		1553
1270	out:	1554	out:
		1555	if (rw_level != -1)
		1556	ocfs2_rw_unlock(inode, rw_level);
		1557
		1558	out_sems:
1271	if (have_alloc_sem)	1559	if (have_alloc_sem)
1272	up_read(&inode->i_alloc_sem);	1560	up_read(&inode->i_alloc_sem);
1273	if (rw_level != -1)	1561
1274	ocfs2_rw_unlock(inode, rw_level);	1562	if (written > 0 && sync) {
		1563	ssize_t err;
		1564
		1565	err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
		1566	if (err < 0)
		1567	written = err;
		1568	}
		1569
1275	mutex_unlock(&inode->i_mutex);	1570	mutex_unlock(&inode->i_mutex);
1276		1571
1277	mlog_exit(ret);	1572	mlog_exit(ret);
1278	return ret;	1573	return written ? written : ret;
1279	}	1574	}
1280		1575
1281	static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,	1576	static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
@@ -1300,7 +1595,8 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1300	goto out;	1595	goto out;
1301	}	1596	}
1302		1597
1303	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0);	1598	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
		1599	NULL);
1304	if (ret < 0) {	1600	if (ret < 0) {
1305	mlog_errno(ret);	1601	mlog_errno(ret);
1306	goto out_unlock;	1602	goto out_unlock;