4 files changed, 71 insertions, 134 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9f34bb9b1ecb..4012885d027f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
 *
 * If blkfactor is zero then the user's request was aligned to the filesystem's
 * blocksize.
- *
- * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
- * This determines whether we need to do the fancy locking which prevents
- * direct-IO from being able to read uninitialised disk blocks.  If its zero
- * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
- * not held for the entire direct write (taken briefly, initially, during a
- * direct read though, but its never held for the duration of a direct-IO).
 */
 struct dio {
@@ -68,7 +61,7 @@ struct dio {
        struct inode *inode;
        int rw;
        loff_t i_size;                  /* i_size when submitted */
-        int lock_type;                  /* doesn't change */
+        int flags;                      /* doesn't change */
        unsigned blkbits;               /* doesn't change */
        unsigned blkfactor;             /* When we're using an alignment which
                                           is finer than the filesystem's soft
@@ -246,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
        if (dio->end_io && dio->result)
                dio->end_io(dio->iocb, offset, transferred,
                            dio->map_bh.b_private);
-        if (dio->lock_type == DIO_LOCKING)
+        if (dio->flags & DIO_LOCKING)
                /* lockdep: non-owner release */
                up_read_non_owner(&dio->inode->i_alloc_sem);
@@ -521,21 +515,24 @@ static int get_more_blocks(struct dio *dio)
                map_bh->b_state = 0;
                map_bh->b_size = fs_count << dio->inode->i_blkbits;
+                /*
+                 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
+                 * forbid block creations: only overwrites are permitted.
+                 * We will return early to the caller once we see an
+                 * unmapped buffer head returned, and the caller will fall
+                 * back to buffered I/O.
+                 *
+                 * Otherwise the decision is left to the get_block method,
+                 * which may decide to handle it or also return an unmapped
+                 * buffer head.
+                 */
                create = dio->rw & WRITE;
-                if (dio->lock_type == DIO_LOCKING) {
+                if (dio->flags & DIO_SKIP_HOLES) {
                        if (dio->block_in_file < (i_size_read(dio->inode) >>
                                                        dio->blkbits))
                                create = 0;
-                } else if (dio->lock_type == DIO_NO_LOCKING) {
-                        create = 0;
                }
-                /*
-                 * For writes inside i_size we forbid block creations: only
-                 * overwrites are permitted.  We fall back to buffered writes
-                 * at a higher level for inside-i_size block-instantiating
-                 * writes.
-                 */
                ret = (*dio->get_block)(dio->inode, fs_startblk,
                                                map_bh, create);
        }
@@ -1045,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
         * we can let i_mutex go now that its achieved its purpose
         * of protecting us from looking up uninitialized blocks.
         */
-        if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
+        if (rw == READ && (dio->flags & DIO_LOCKING))
                mutex_unlock(&dio->inode->i_mutex);
        /*
@@ -1092,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 /*
 * This is a library function for use by filesystem drivers.
- * The locking rules are governed by the dio_lock_type parameter.
 *
- * DIO_NO_LOCKING (no locking, for raw block device access)
+ * The locking rules are governed by the flags parameter:
- * For writes, i_mutex is not held on entry; it is never taken.
+ *  - if the flags value contains DIO_LOCKING we use a fancy locking
+ *    scheme for dumb filesystems.
+ *    For writes this function is called under i_mutex and returns with
+ *    i_mutex held; for reads, i_mutex is not held on entry, but it is
+ *    taken and dropped again before returning.
+ *    For reads and writes i_alloc_sem is taken in shared mode and released
+ *    on I/O completion (which may happen asynchronously after returning to
+ *    the caller).
 *
- * DIO_LOCKING (simple locking for regular files)
+ *  - if the flags value does NOT contain DIO_LOCKING we don't use any
- * For writes we are called under i_mutex and return with i_mutex held, even
+ *    internal locking but rather rely on the filesystem to synchronize
- * though it is internally dropped.
+ *    direct I/O reads/writes versus each other and truncate.
- * For reads, i_mutex is not held on entry, but it is taken and dropped before
+ *    For reads and writes neither i_mutex nor i_alloc_sem is held on
- * returning.
+ *    entry, and neither is ever taken.
- *
- * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
- *      uninitialised data, allowing parallel direct readers and writers)
- * For writes we are called without i_mutex, return without it, never touch it.
- * For reads we are called under i_mutex and return with i_mutex held, even
- * though it may be internally dropped.
- *
- * Additional i_alloc_sem locking requirements described inline below.
 */
 ssize_t
 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        struct block_device *bdev, const struct iovec *iov, loff_t offset, 
        unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-        int dio_lock_type)
+        int flags)
 {
        int seg;
        size_t size;
@@ -1126,8 +1121,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        ssize_t retval = -EINVAL;
        loff_t end = offset;
        struct dio *dio;
-        int release_i_mutex = 0;
-        int acquire_i_mutex = 0;
        if (rw & WRITE)
                rw = WRITE_ODIRECT_PLUG;
@@ -1168,43 +1161,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
         */
        memset(dio, 0, offsetof(struct dio, pages));
-        /*
+        dio->flags = flags;
-         * For block device access DIO_NO_LOCKING is used,
+        if (dio->flags & DIO_LOCKING) {
-         *      neither readers nor writers do any locking at all
-         * For regular files using DIO_LOCKING,
-         *      readers need to grab i_mutex and i_alloc_sem
-         *      writers need to grab i_alloc_sem only (i_mutex is already held)
-         * For regular files using DIO_OWN_LOCKING,
-         *      neither readers nor writers take any locks here
-         */
-        dio->lock_type = dio_lock_type;
-        if (dio_lock_type != DIO_NO_LOCKING) {
                /* watch out for a 0 len io from a tricksy fs */
                if (rw == READ && end > offset) {
-                        struct address_space *mapping;
+                        struct address_space *mapping =
+                                        iocb->ki_filp->f_mapping;
-                        mapping = iocb->ki_filp->f_mapping;
+                        /* will be released by direct_io_worker */
-                        if (dio_lock_type != DIO_OWN_LOCKING) {
+                        mutex_lock(&inode->i_mutex);
-                                mutex_lock(&inode->i_mutex);
-                                release_i_mutex = 1;
-                        }
                        retval = filemap_write_and_wait_range(mapping, offset,
                                                              end - 1);
                        if (retval) {
+                                mutex_unlock(&inode->i_mutex);
                                kfree(dio);
                                goto out;
                        }
-                        if (dio_lock_type == DIO_OWN_LOCKING) {
-                                mutex_unlock(&inode->i_mutex);
-                                acquire_i_mutex = 1;
-                        }
                }
-                if (dio_lock_type == DIO_LOCKING)
+                /*
-                        /* lockdep: not the owner will release it */
+                 * Will be released at I/O completion, possibly in a
-                        down_read_non_owner(&inode->i_alloc_sem);
+                 * different thread.
+                 */
+                down_read_non_owner(&inode->i_alloc_sem);
        }
        /*
@@ -1222,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        /*
         * In case of error extending write may have instantiated a few
         * blocks outside i_size. Trim these off again for DIO_LOCKING.
-         * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
+         *
-         * it's own meaner.
+         * NOTE: filesystems with their own locking have to handle this
+         * on their own.
         */
-        if (unlikely(retval < 0 && (rw & WRITE))) {
+        if (dio->flags & DIO_LOCKING) {
-                loff_t isize = i_size_read(inode);
+                if (unlikely((rw & WRITE) && retval < 0)) {
+                        loff_t isize = i_size_read(inode);
-                if (end > isize && dio_lock_type == DIO_LOCKING)
+                        if (end > isize)
-                        vmtruncate(inode, isize);
+                                vmtruncate(inode, isize);
+                }
        }
-        if (rw == READ && dio_lock_type == DIO_LOCKING)
-                release_i_mutex = 0;
 out:
-        if (release_i_mutex)
-                mutex_unlock(&inode->i_mutex);
-        else if (acquire_i_mutex)
-                mutex_lock(&inode->i_mutex);
        return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index deb2b132ae5e..3dae4a13f6e4 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -547,6 +547,9 @@ bail:
 *
 * called like this: dio->get_blocks(dio->inode, fs_startblk,
 *                                      fs_count, map_bh, dio->rw == WRITE);
+ *
+ * Note that we never bother to allocate blocks here, and thus ignore the
+ * create argument.
 */
 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
                                     struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
        inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-        /*
-         * Any write past EOF is not allowed because we'd be extending.
-         */
-        if (create && (iblock + max_blocks) > inode_blocks) {
-                ret = -EIO;
-                goto bail;
-        }
        /* This figures out the size of the next contiguous block, and
         * our logical offset */
        ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
                goto bail;
        }
-        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
-                ocfs2_error(inode->i_sb,
-                            "Inode %llu has a hole at block %llu\n",
-                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                            (unsigned long long)iblock);
-                ret = -EROFS;
-                goto bail;
-        }
        /* We should already CoW the refcounted extent. */
        BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
        /*
@@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
         */
        if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
                map_bh(bh_result, inode->i_sb, p_blkno);
-        else {
+        else
-                /*
-                 * ocfs2_prepare_inode_for_write() should have caught
-                 * the case where we'd be filling a hole and triggered
-                 * a buffered write instead.
-                 */
-                if (create) {
-                        ret = -EIO;
-                        mlog_errno(ret);
-                        goto bail;
-                }
                clear_buffer_mapped(bh_result);
-        }
        /* make sure we don't map more than max_blocks blocks here as
           that's all the kernel will handle at this point. */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d798c54296eb..66abe36c1213 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1474,19 +1474,13 @@ xfs_vm_direct_IO(
        bdev = xfs_find_bdev_for_inode(XFS_I(inode));
-        if (rw == WRITE) {
+        iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
-                iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
+                                        IOMAP_UNWRITTEN : IOMAP_READ);
-                ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
-                        bdev, iov, offset, nr_segs,
+        ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
-                        xfs_get_blocks_direct,
+                                            offset, nr_segs,
-                        xfs_end_io_direct);
+                                            xfs_get_blocks_direct,
-        } else {
+                                            xfs_end_io_direct);
-                iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
-                ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
-                        bdev, iov, offset, nr_segs,
-                        xfs_get_blocks_direct,
-                        xfs_end_io_direct);
-        }
        if (unlikely(ret != -EIOCBQUEUED && iocb->private))
                xfs_destroy_ioend(iocb->private);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a057f48eb156..b23a7018eb90 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2264,9 +2264,11 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        int lock_type);
 enum {
-        DIO_LOCKING = 1, /* need locking between buffered and direct access */
+        /* need locking between buffered and direct access */
-        DIO_NO_LOCKING,  /* bdev; no locking at all between buffered/direct */
+        DIO_LOCKING     = 0x01,
-        DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
+        /* filesystem does not support filling holes */
+        DIO_SKIP_HOLES  = 0x02,
 };
 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
@@ -2275,7 +2277,8 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
        dio_iodone_t end_io)
 {
        return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-                                nr_segs, get_block, end_io, DIO_LOCKING);
+                                    nr_segs, get_block, end_io,
+                                    DIO_LOCKING | DIO_SKIP_HOLES);
 }
 static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
@@ -2284,16 +2287,7 @@ static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
        dio_iodone_t end_io)
 {
        return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-                                nr_segs, get_block, end_io, DIO_NO_LOCKING);
+                                nr_segs, get_block, end_io, 0);
-}
-static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
-        struct inode *inode, struct block_device *bdev, const struct iovec *iov,
-        loff_t offset, unsigned long nr_segs, get_block_t get_block,
-        dio_iodone_t end_io)
-{
-        return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-                                nr_segs, get_block, end_io, DIO_OWN_LOCKING);
 }
 #endif

diff --git a/fs/direct-io.c b/fs/direct-io.c index 9f34bb9b1ecb..4012885d027f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
53	*	53	*
54	* If blkfactor is zero then the user's request was aligned to the filesystem's	54	* If blkfactor is zero then the user's request was aligned to the filesystem's
55	* blocksize.	55	* blocksize.
56	*
57	* lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
58	* This determines whether we need to do the fancy locking which prevents
59	* direct-IO from being able to read uninitialised disk blocks. If its zero
60	* (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
61	* not held for the entire direct write (taken briefly, initially, during a
62	* direct read though, but its never held for the duration of a direct-IO).
63	*/	56	*/
64		57
65	struct dio {	58	struct dio {
@@ -68,7 +61,7 @@ struct dio {
68	struct inode *inode;	61	struct inode *inode;
69	int rw;	62	int rw;
70	loff_t i_size; /* i_size when submitted */	63	loff_t i_size; /* i_size when submitted */
71	int lock_type; /* doesn't change */	64	int flags; /* doesn't change */
72	unsigned blkbits; /* doesn't change */	65	unsigned blkbits; /* doesn't change */
73	unsigned blkfactor; /* When we're using an alignment which	66	unsigned blkfactor; /* When we're using an alignment which
74	is finer than the filesystem's soft	67	is finer than the filesystem's soft
@@ -246,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
246	if (dio->end_io && dio->result)	239	if (dio->end_io && dio->result)
247	dio->end_io(dio->iocb, offset, transferred,	240	dio->end_io(dio->iocb, offset, transferred,
248	dio->map_bh.b_private);	241	dio->map_bh.b_private);
249	if (dio->lock_type == DIO_LOCKING)	242
		243	if (dio->flags & DIO_LOCKING)
250	/* lockdep: non-owner release */	244	/* lockdep: non-owner release */
251	up_read_non_owner(&dio->inode->i_alloc_sem);	245	up_read_non_owner(&dio->inode->i_alloc_sem);
252		246
@@ -521,21 +515,24 @@ static int get_more_blocks(struct dio *dio)
521	map_bh->b_state = 0;	515	map_bh->b_state = 0;
522	map_bh->b_size = fs_count << dio->inode->i_blkbits;	516	map_bh->b_size = fs_count << dio->inode->i_blkbits;
523		517
		518	/*
		519	* For writes inside i_size on a DIO_SKIP_HOLES filesystem we
		520	* forbid block creations: only overwrites are permitted.
		521	* We will return early to the caller once we see an
		522	* unmapped buffer head returned, and the caller will fall
		523	* back to buffered I/O.
		524	*
		525	* Otherwise the decision is left to the get_blocks method,
		526	* which may decide to handle it or also return an unmapped
		527	* buffer head.
		528	*/
524	create = dio->rw & WRITE;	529	create = dio->rw & WRITE;
525	if (dio->lock_type == DIO_LOCKING) {	530	if (dio->flags & DIO_SKIP_HOLES) {
526	if (dio->block_in_file < (i_size_read(dio->inode) >>	531	if (dio->block_in_file < (i_size_read(dio->inode) >>
527	dio->blkbits))	532	dio->blkbits))
528	create = 0;	533	create = 0;
529	} else if (dio->lock_type == DIO_NO_LOCKING) {
530	create = 0;
531	}	534	}
532		535
533	/*
534	* For writes inside i_size we forbid block creations: only
535	* overwrites are permitted. We fall back to buffered writes
536	* at a higher level for inside-i_size block-instantiating
537	* writes.
538	*/
539	ret = (*dio->get_block)(dio->inode, fs_startblk,	536	ret = (*dio->get_block)(dio->inode, fs_startblk,
540	map_bh, create);	537	map_bh, create);
541	}	538	}
@@ -1045,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb iocb, struct inode inode,
1045	* we can let i_mutex go now that its achieved its purpose	1042	* we can let i_mutex go now that its achieved its purpose
1046	* of protecting us from looking up uninitialized blocks.	1043	* of protecting us from looking up uninitialized blocks.
1047	*/	1044	*/
1048	if ((rw == READ) && (dio->lock_type == DIO_LOCKING))	1045	if (rw == READ && (dio->flags & DIO_LOCKING))
1049	mutex_unlock(&dio->inode->i_mutex);	1046	mutex_unlock(&dio->inode->i_mutex);
1050		1047
1051	/*	1048	/*
@@ -1092,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb iocb, struct inode inode,
1092		1089
1093	/*	1090	/*
1094	* This is a library function for use by filesystem drivers.	1091	* This is a library function for use by filesystem drivers.
1095	* The locking rules are governed by the dio_lock_type parameter.
1096	*	1092	*
1097	* DIO_NO_LOCKING (no locking, for raw block device access)	1093	* The locking rules are governed by the flags parameter:
1098	* For writes, i_mutex is not held on entry; it is never taken.	1094	* - if the flags value contains DIO_LOCKING we use a fancy locking
		1095	* scheme for dumb filesystems.
		1096	* For writes this function is called under i_mutex and returns with
		1097	* i_mutex held, for reads, i_mutex is not held on entry, but it is
		1098	* taken and dropped again before returning.
		1099	* For reads and writes i_alloc_sem is taken in shared mode and released
		1100	* on I/O completion (which may happen asynchronously after returning to
		1101	* the caller).
1099	*	1102	*
1100	* DIO_LOCKING (simple locking for regular files)	1103	* - if the flags value does NOT contain DIO_LOCKING we don't use any
1101	* For writes we are called under i_mutex and return with i_mutex held, even	1104	* internal locking but rather rely on the filesystem to synchronize
1102	* though it is internally dropped.	1105	* direct I/O reads/writes versus each other and truncate.
1103	* For reads, i_mutex is not held on entry, but it is taken and dropped before	1106	* For reads and writes both i_mutex and i_alloc_sem are not held on
1104	* returning.	1107	* entry and are never taken.
1105	*
1106	* DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
1107	* uninitialised data, allowing parallel direct readers and writers)
1108	* For writes we are called without i_mutex, return without it, never touch it.
1109	* For reads we are called under i_mutex and return with i_mutex held, even
1110	* though it may be internally dropped.
1111	*
1112	* Additional i_alloc_sem locking requirements described inline below.
1113	*/	1108	*/
1114	ssize_t	1109	ssize_t
1115	__blockdev_direct_IO(int rw, struct kiocb iocb, struct inode inode,	1110	__blockdev_direct_IO(int rw, struct kiocb iocb, struct inode inode,
1116	struct block_device bdev, const struct iovec iov, loff_t offset,	1111	struct block_device bdev, const struct iovec iov, loff_t offset,
1117	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,	1112	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1118	int dio_lock_type)	1113	int flags)
1119	{	1114	{
1120	int seg;	1115	int seg;
1121	size_t size;	1116	size_t size;
@@ -1126,8 +1121,6 @@ __blockdev_direct_IO(int rw, struct kiocb iocb, struct inode inode,
1126	ssize_t retval = -EINVAL;	1121	ssize_t retval = -EINVAL;
1127	loff_t end = offset;	1122	loff_t end = offset;
1128	struct dio *dio;	1123	struct dio *dio;
1129	int release_i_mutex = 0;
1130	int acquire_i_mutex = 0;
1131		1124
1132	if (rw & WRITE)	1125	if (rw & WRITE)
1133	rw = WRITE_ODIRECT_PLUG;	1126	rw = WRITE_ODIRECT_PLUG;
@@ -1168,43 +1161,30 @@ __blockdev_direct_IO(int rw, struct kiocb iocb, struct inode inode,
1168	*/	1161	*/
1169	memset(dio, 0, offsetof(struct dio, pages));	1162	memset(dio, 0, offsetof(struct dio, pages));
1170		1163
1171	/*	1164	dio->flags = flags;
1172	* For block device access DIO_NO_LOCKING is used,	1165	if (dio->flags & DIO_LOCKING) {
1173	* neither readers nor writers do any locking at all
1174	* For regular files using DIO_LOCKING,
1175	* readers need to grab i_mutex and i_alloc_sem
1176	* writers need to grab i_alloc_sem only (i_mutex is already held)
1177	* For regular files using DIO_OWN_LOCKING,
1178	* neither readers nor writers take any locks here
1179	*/
1180	dio->lock_type = dio_lock_type;
1181	if (dio_lock_type != DIO_NO_LOCKING) {
1182	/* watch out for a 0 len io from a tricksy fs */	1166	/* watch out for a 0 len io from a tricksy fs */
1183	if (rw == READ && end > offset) {	1167	if (rw == READ && end > offset) {
1184	struct address_space *mapping;	1168	struct address_space *mapping =
		1169	iocb->ki_filp->f_mapping;
1185		1170
1186	mapping = iocb->ki_filp->f_mapping;	1171	/* will be released by direct_io_worker */
1187	if (dio_lock_type != DIO_OWN_LOCKING) {	1172	mutex_lock(&inode->i_mutex);
1188	mutex_lock(&inode->i_mutex);
1189	release_i_mutex = 1;
1190	}
1191		1173
1192	retval = filemap_write_and_wait_range(mapping, offset,	1174	retval = filemap_write_and_wait_range(mapping, offset,
1193	end - 1);	1175	end - 1);
1194	if (retval) {	1176	if (retval) {
		1177	mutex_unlock(&inode->i_mutex);
1195	kfree(dio);	1178	kfree(dio);
1196	goto out;	1179	goto out;
1197	}	1180	}
1198
1199	if (dio_lock_type == DIO_OWN_LOCKING) {
1200	mutex_unlock(&inode->i_mutex);
1201	acquire_i_mutex = 1;
1202	}
1203	}	1181	}
1204		1182
1205	if (dio_lock_type == DIO_LOCKING)	1183	/*
1206	/* lockdep: not the owner will release it */	1184	* Will be released at I/O completion, possibly in a
1207	down_read_non_owner(&inode->i_alloc_sem);	1185	* different thread.
		1186	*/
		1187	down_read_non_owner(&inode->i_alloc_sem);
1208	}	1188	}
1209		1189
1210	/*	1190	/*
@@ -1222,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb iocb, struct inode inode,
1222	/*	1202	/*
1223	* In case of error extending write may have instantiated a few	1203	* In case of error extending write may have instantiated a few
1224	* blocks outside i_size. Trim these off again for DIO_LOCKING.	1204	* blocks outside i_size. Trim these off again for DIO_LOCKING.
1225	* NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by	1205	*
1226	* it's own meaner.	1206	* NOTE: filesystems with their own locking have to handle this
		1207	* on their own.
1227	*/	1208	*/
1228	if (unlikely(retval < 0 && (rw & WRITE))) {	1209	if (dio->flags & DIO_LOCKING) {
1229	loff_t isize = i_size_read(inode);	1210	if (unlikely((rw & WRITE) && retval < 0)) {
1230		1211	loff_t isize = i_size_read(inode);
1231	if (end > isize && dio_lock_type == DIO_LOCKING)	1212	if (end > isize)
1232	vmtruncate(inode, isize);	1213	vmtruncate(inode, isize);
		1214	}
1233	}	1215	}
1234		1216
1235	if (rw == READ && dio_lock_type == DIO_LOCKING)
1236	release_i_mutex = 0;
1237
1238	out:	1217	out:
1239	if (release_i_mutex)
1240	mutex_unlock(&inode->i_mutex);
1241	else if (acquire_i_mutex)
1242	mutex_lock(&inode->i_mutex);
1243	return retval;	1218	return retval;
1244	}	1219	}
1245	EXPORT_SYMBOL(__blockdev_direct_IO);	1220	EXPORT_SYMBOL(__blockdev_direct_IO);


diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index deb2b132ae5e..3dae4a13f6e4 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c
@@ -547,6 +547,9 @@ bail:
547	*	547	*
548	* called like this: dio->get_blocks(dio->inode, fs_startblk,	548	* called like this: dio->get_blocks(dio->inode, fs_startblk,
549	* fs_count, map_bh, dio->rw == WRITE);	549	* fs_count, map_bh, dio->rw == WRITE);
		550	*
		551	* Note that we never bother to allocate blocks here, and thus ignore the
		552	* create argument.
550	*/	553	*/
551	static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,	554	static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
552	struct buffer_head *bh_result, int create)	555	struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
563		566
564	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));	567	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
565		568
566	/*
567	* Any write past EOF is not allowed because we'd be extending.
568	*/
569	if (create && (iblock + max_blocks) > inode_blocks) {
570	ret = -EIO;
571	goto bail;
572	}
573
574	/* This figures out the size of the next contiguous block, and	569	/* This figures out the size of the next contiguous block, and
575	* our logical offset */	570	* our logical offset */
576	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,	571	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
582	goto bail;	577	goto bail;
583	}	578	}
584		579
585	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
586	ocfs2_error(inode->i_sb,
587	"Inode %llu has a hole at block %llu\n",
588	(unsigned long long)OCFS2_I(inode)->ip_blkno,
589	(unsigned long long)iblock);
590	ret = -EROFS;
591	goto bail;
592	}
593
594	/* We should already CoW the refcounted extent. */	580	/* We should already CoW the refcounted extent. */
595	BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);	581	BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
596	/*	582	/*
@@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
601	*/	587	*/
602	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))	588	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
603	map_bh(bh_result, inode->i_sb, p_blkno);	589	map_bh(bh_result, inode->i_sb, p_blkno);
604	else {	590	else
605	/*
606	* ocfs2_prepare_inode_for_write() should have caught
607	* the case where we'd be filling a hole and triggered
608	* a buffered write instead.
609	*/
610	if (create) {
611	ret = -EIO;
612	mlog_errno(ret);
613	goto bail;
614	}
615
616	clear_buffer_mapped(bh_result);	591	clear_buffer_mapped(bh_result);
617	}
618		592
619	/* make sure we don't map more than max_blocks blocks here as	593	/* make sure we don't map more than max_blocks blocks here as
620	that's all the kernel will handle at this point. */	594	that's all the kernel will handle at this point. */


diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index d798c54296eb..66abe36c1213 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1474,19 +1474,13 @@ xfs_vm_direct_IO(
1474		1474
1475	bdev = xfs_find_bdev_for_inode(XFS_I(inode));	1475	bdev = xfs_find_bdev_for_inode(XFS_I(inode));
1476		1476
1477	if (rw == WRITE) {	1477	iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
1478	iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);	1478	IOMAP_UNWRITTEN : IOMAP_READ);
1479	ret = blockdev_direct_IO_own_locking(rw, iocb, inode,	1479
1480	bdev, iov, offset, nr_segs,	1480	ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1481	xfs_get_blocks_direct,	1481	offset, nr_segs,
1482	xfs_end_io_direct);	1482	xfs_get_blocks_direct,
1483	} else {	1483	xfs_end_io_direct);
1484	iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
1485	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
1486	bdev, iov, offset, nr_segs,
1487	xfs_get_blocks_direct,
1488	xfs_end_io_direct);
1489	}
1490		1484
1491	if (unlikely(ret != -EIOCBQUEUED && iocb->private))	1485	if (unlikely(ret != -EIOCBQUEUED && iocb->private))
1492	xfs_destroy_ioend(iocb->private);	1486	xfs_destroy_ioend(iocb->private);


diff --git a/include/linux/fs.h b/include/linux/fs.h index a057f48eb156..b23a7018eb90 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h
@@ -2264,9 +2264,11 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb iocb, struct inode inode,
2264	int lock_type);	2264	int lock_type);
2265		2265
2266	enum {	2266	enum {
2267	DIO_LOCKING = 1, /* need locking between buffered and direct access */	2267	/* need locking between buffered and direct access */
2268	DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */	2268	DIO_LOCKING = 0x01,
2269	DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */	2269
		2270	/* filesystem does not support filling holes */
		2271	DIO_SKIP_HOLES = 0x02,
2270	};	2272	};
2271		2273
2272	static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,	2274	static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
@@ -2275,7 +2277,8 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
2275	dio_iodone_t end_io)	2277	dio_iodone_t end_io)
2276	{	2278	{
2277	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,	2279	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2278	nr_segs, get_block, end_io, DIO_LOCKING);	2280	nr_segs, get_block, end_io,
		2281	DIO_LOCKING \| DIO_SKIP_HOLES);
2279	}	2282	}
2280		2283
2281	static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,	2284	static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
@@ -2284,16 +2287,7 @@ static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
2284	dio_iodone_t end_io)	2287	dio_iodone_t end_io)
2285	{	2288	{
2286	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,	2289	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2287	nr_segs, get_block, end_io, DIO_NO_LOCKING);	2290	nr_segs, get_block, end_io, 0);
2288	}
2289
2290	static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
2291	struct inode inode, struct block_device bdev, const struct iovec *iov,
2292	loff_t offset, unsigned long nr_segs, get_block_t get_block,
2293	dio_iodone_t end_io)
2294	{
2295	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2296	nr_segs, get_block, end_io, DIO_OWN_LOCKING);
2297	}	2291	}
2298	#endif	2292	#endif
2299		2293