author    Dave Chinner <david@fromorbit.com>  2015-04-16 08:13:18 -0400
committer Dave Chinner <david@fromorbit.com>  2015-04-16 08:13:18 -0400
commit    542c311813d5cb2e6f0dfa9557f41c829b8fb6a0 (patch)
tree      573c5644eb966e44112016c9ae86e80251326223 /fs
parent    6a63ef064b2444883ce8b68b0779d0c739d27204 (diff)
parent    0cefb29e6a63727bc7606c47fc538467594ef112 (diff)
Merge branch 'xfs-dio-extend-fix' into for-next

Conflicts:
	fs/xfs/xfs_file.c
Diffstat (limited to 'fs')
-rw-r--r--	fs/xfs/xfs_aops.c	270
-rw-r--r--	fs/xfs/xfs_file.c	46
-rw-r--r--	fs/xfs/xfs_trace.h	5
3 files changed, 239 insertions(+), 82 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3a9b7a1b8704..598b259fda04 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1233,6 +1233,117 @@ xfs_vm_releasepage(
 	return try_to_free_buffers(page);
 }
 
+/*
+ * When we map a DIO buffer, we may need to attach an ioend that describes the
+ * type of write IO we are doing. This passes to the completion function the
+ * operations it needs to perform. If the mapping is for an overwrite wholly
+ * within the EOF then we don't need an ioend and so we don't allocate one.
+ * This avoids the unnecessary overhead of allocating and freeing ioends for
+ * workloads that don't require transactions on IO completion.
+ *
+ * If we get multiple mappings in a single IO, we might be mapping different
+ * types. But because the direct IO can only have a single private pointer, we
+ * need to ensure that:
+ *
+ * a) i) the ioend spans the entire region of unwritten mappings; or
+ *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
+ * b) if it contains unwritten extents, it is *permanently* marked as such
+ *
+ * We could do this by chaining ioends like buffered IO does, but we only
+ * actually get one IO completion callback from the direct IO, and that spans
+ * the entire IO regardless of how many mappings and IOs are needed to complete
+ * the DIO. There is only going to be one reference to the ioend and its life
+ * cycle is constrained by the DIO completion code. hence we don't need
+ * reference counting here.
+ */
+static void
+xfs_map_direct(
+	struct inode		*inode,
+	struct buffer_head	*bh_result,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
+{
+	struct xfs_ioend	*ioend;
+	xfs_off_t		size = bh_result->b_size;
+	int			type;
+
+	if (ISUNWRITTEN(imap))
+		type = XFS_IO_UNWRITTEN;
+	else
+		type = XFS_IO_OVERWRITE;
+
+	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
+
+	if (bh_result->b_private) {
+		ioend = bh_result->b_private;
+		ASSERT(ioend->io_size > 0);
+		ASSERT(offset >= ioend->io_offset);
+		if (offset + size > ioend->io_offset + ioend->io_size)
+			ioend->io_size = offset - ioend->io_offset + size;
+
+		if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
+			ioend->io_type = XFS_IO_UNWRITTEN;
+
+		trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
+					      ioend->io_size, ioend->io_type,
+					      imap);
+	} else if (type == XFS_IO_UNWRITTEN ||
+		   offset + size > i_size_read(inode)) {
+		ioend = xfs_alloc_ioend(inode, type);
+		ioend->io_offset = offset;
+		ioend->io_size = size;
+
+		bh_result->b_private = ioend;
+		set_buffer_defer_completion(bh_result);
+
+		trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
+					   imap);
+	} else {
+		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+					    imap);
+	}
+}
+
+/*
+ * If this is O_DIRECT or the mpage code calling tell them how large the mapping
+ * is, so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the mapping
+ * for blocks beyond EOF must be marked new so that sub block regions can be
+ * correctly zeroed. We can't do this for mappings within EOF unless the mapping
+ * was just allocated or is unwritten, otherwise the callers would overwrite
+ * existing data with zeros. Hence we have to split the mapping into a range up
+ * to and including EOF, and a second mapping for beyond EOF.
+ */
+static void
+xfs_map_trim_size(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset,
+	ssize_t			size)
+{
+	xfs_off_t		mapping_size;
+
+	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
+	mapping_size <<= inode->i_blkbits;
+
+	ASSERT(mapping_size > 0);
+	if (mapping_size > size)
+		mapping_size = size;
+	if (offset < i_size_read(inode) &&
+	    offset + mapping_size >= i_size_read(inode)) {
+		/* limit mapping to block that spans EOF */
+		mapping_size = roundup_64(i_size_read(inode) - offset,
+					  1 << inode->i_blkbits);
+	}
+	if (mapping_size > LONG_MAX)
+		mapping_size = LONG_MAX;
+
+	bh_result->b_size = mapping_size;
+}
+
 STATIC int
 __xfs_get_blocks(
 	struct inode		*inode,
@@ -1321,31 +1432,37 @@ __xfs_get_blocks(
 
 			xfs_iunlock(ip, lockmode);
 		}
-
-		trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+		trace_xfs_get_blocks_alloc(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_DELALLOC, &imap);
 	} else if (nimaps) {
-		trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+		trace_xfs_get_blocks_found(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_OVERWRITE, &imap);
 		xfs_iunlock(ip, lockmode);
 	} else {
 		trace_xfs_get_blocks_notfound(ip, offset, size);
 		goto out_unlock;
 	}
 
+	/* trim mapping down to size requested */
+	if (direct || size > (1 << inode->i_blkbits))
+		xfs_map_trim_size(inode, iblock, bh_result,
+				  &imap, offset, size);
+
+	/*
+	 * For unwritten extents do not report a disk address in the buffered
+	 * read case (treat as if we're reading into a hole).
+	 */
 	if (imap.br_startblock != HOLESTARTBLOCK &&
-	    imap.br_startblock != DELAYSTARTBLOCK) {
-		/*
-		 * For unwritten extents do not report a disk address on
-		 * the read case (treat as if we're reading into a hole).
-		 */
-		if (create || !ISUNWRITTEN(&imap))
-			xfs_map_buffer(inode, bh_result, &imap, offset);
-		if (create && ISUNWRITTEN(&imap)) {
-			if (direct) {
-				bh_result->b_private = inode;
-				set_buffer_defer_completion(bh_result);
-			}
-			set_buffer_unwritten(bh_result);
-		}
+	    imap.br_startblock != DELAYSTARTBLOCK &&
+	    (create || !ISUNWRITTEN(&imap))) {
+		xfs_map_buffer(inode, bh_result, &imap, offset);
+		if (ISUNWRITTEN(&imap))
+			set_buffer_unwritten(bh_result);
+		/* direct IO needs special help */
+		if (create && direct)
+			xfs_map_direct(inode, bh_result, &imap, offset);
 	}
 
 	/*
@@ -1378,39 +1495,6 @@ __xfs_get_blocks(
 		}
 	}
 
-	/*
-	 * If this is O_DIRECT or the mpage code calling tell them how large
-	 * the mapping is, so that we can avoid repeated get_blocks calls.
-	 *
-	 * If the mapping spans EOF, then we have to break the mapping up as the
-	 * mapping for blocks beyond EOF must be marked new so that sub block
-	 * regions can be correctly zeroed. We can't do this for mappings within
-	 * EOF unless the mapping was just allocated or is unwritten, otherwise
-	 * the callers would overwrite existing data with zeros. Hence we have
-	 * to split the mapping into a range up to and including EOF, and a
-	 * second mapping for beyond EOF.
-	 */
-	if (direct || size > (1 << inode->i_blkbits)) {
-		xfs_off_t		mapping_size;
-
-		mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
-		mapping_size <<= inode->i_blkbits;
-
-		ASSERT(mapping_size > 0);
-		if (mapping_size > size)
-			mapping_size = size;
-		if (offset < i_size_read(inode) &&
-		    offset + mapping_size >= i_size_read(inode)) {
-			/* limit mapping to block that spans EOF */
-			mapping_size = roundup_64(i_size_read(inode) - offset,
-						  1 << inode->i_blkbits);
-		}
-		if (mapping_size > LONG_MAX)
-			mapping_size = LONG_MAX;
-
-		bh_result->b_size = mapping_size;
-	}
-
 	return 0;
 
 out_unlock:
@@ -1441,9 +1525,11 @@ xfs_get_blocks_direct(
 /*
  * Complete a direct I/O write request.
  *
- * If the private argument is non-NULL __xfs_get_blocks signals us that we
- * need to issue a transaction to convert the range from unwritten to written
- * extents.
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
  */
 STATIC void
 xfs_end_io_direct_write(
@@ -1455,43 +1541,71 @@ xfs_end_io_direct_write(
 	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ioend	*ioend = private;
 
-	if (XFS_FORCED_SHUTDOWN(mp))
+	trace_xfs_gbmap_direct_endio(ip, offset, size,
+				     ioend ? ioend->io_type : 0, NULL);
+
+	if (!ioend) {
+		ASSERT(offset + size <= i_size_read(inode));
 		return;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		goto out_end_io;
 
 	/*
-	 * While the generic direct I/O code updates the inode size, it does
-	 * so only after the end_io handler is called, which means our
-	 * end_io handler thinks the on-disk size is outside the in-core
-	 * size. To prevent this just update it a little bit earlier here.
+	 * dio completion end_io functions are only called on writes if more
+	 * than 0 bytes was written.
 	 */
+	ASSERT(size > 0);
+
+	/*
+	 * The ioend only maps whole blocks, while the IO may be sector aligned.
+	 * Hence the ioend offset/size may not match the IO offset/size exactly.
+	 * Because we don't map overwrites within EOF into the ioend, the offset
+	 * may not match, but only if the endio spans EOF. Either way, write
+	 * the IO sizes into the ioend so that completion processing does the
+	 * right thing.
+	 */
+	ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
+	ioend->io_size = size;
+	ioend->io_offset = offset;
+
+	/*
+	 * The ioend tells us whether we are doing unwritten extent conversion
+	 * or an append transaction that updates the on-disk file size. These
+	 * cases are the only cases where we should *potentially* be needing
+	 * to update the VFS inode size.
+	 *
+	 * We need to update the in-core inode size here so that we don't end up
+	 * with the on-disk inode size being outside the in-core inode size. We
+	 * have no other method of updating EOF for AIO, so always do it here
+	 * if necessary.
+	 *
+	 * We need to lock the test/set EOF update as we can be racing with
+	 * other IO completions here to update the EOF. Failing to serialise
+	 * here can result in EOF moving backwards and Bad Things Happen when
+	 * that occurs.
+	 */
+	spin_lock(&ip->i_flags_lock);
 	if (offset + size > i_size_read(inode))
 		i_size_write(inode, offset + size);
+	spin_unlock(&ip->i_flags_lock);
 
 	/*
-	 * For direct I/O we do not know if we need to allocate blocks or not,
-	 * so we can't preallocate an append transaction, as that results in
-	 * nested reservations and log space deadlocks. Hence allocate the
-	 * transaction here. While this is sub-optimal and can block IO
-	 * completion for some time, we're stuck with doing it this way until
-	 * we can pass the ioend to the direct IO allocation callbacks and
-	 * avoid nesting that way.
+	 * If we are doing an append IO that needs to update the EOF on disk,
+	 * do the transaction reserve now so we can use common end io
+	 * processing. Stashing the error (if there is one) in the ioend will
+	 * result in the ioend processing passing on the error if it is
+	 * possible as we can't return it from here.
 	 */
-	if (private && size > 0) {
-		xfs_iomap_write_unwritten(ip, offset, size);
-	} else if (offset + size > ip->i_d.di_size) {
-		struct xfs_trans *tp;
-		int error;
-
-		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-		if (error) {
-			xfs_trans_cancel(tp, 0);
-			return;
-		}
+	if (ioend->io_type == XFS_IO_OVERWRITE)
+		ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
 
-		xfs_setfilesize(ip, tp, offset, size);
-	}
+out_end_io:
+	xfs_end_io(&ioend->io_work);
+	return;
 }
 
 STATIC ssize_t
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c203839cd5be..3a5d305e60c9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -569,20 +569,41 @@ restart:
 	 * write. If zeroing is needed and we are currently holding the
 	 * iolock shared, we need to update it to exclusive which implies
 	 * having to redo all checks before.
+	 *
+	 * We need to serialise against EOF updates that occur in IO
+	 * completions here. We want to make sure that nobody is changing the
+	 * size while we do this check until we have placed an IO barrier (i.e.
+	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
+	 * The spinlock effectively forms a memory barrier once we have the
+	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
+	 * and hence be able to correctly determine if we need to run zeroing.
 	 */
+	spin_lock(&ip->i_flags_lock);
 	if (*pos > i_size_read(inode)) {
 		bool	zero = false;
 
+		spin_unlock(&ip->i_flags_lock);
 		if (*iolock == XFS_IOLOCK_SHARED) {
 			xfs_rw_iunlock(ip, *iolock);
 			*iolock = XFS_IOLOCK_EXCL;
 			xfs_rw_ilock(ip, *iolock);
+
+			/*
+			 * We now have an IO submission barrier in place, but
+			 * AIO can do EOF updates during IO completion and hence
+			 * we now need to wait for all of them to drain. Non-AIO
+			 * DIO will have drained before we are given the
+			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
+			 * no-op.
+			 */
+			inode_dio_wait(inode);
 			goto restart;
 		}
 		error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero);
 		if (error)
 			return error;
-	}
+	} else
+		spin_unlock(&ip->i_flags_lock);
 
 	/*
 	 * Updating the timestamps will grab the ilock again from
@@ -644,6 +665,8 @@ xfs_file_dio_aio_write(
 	int			iolock;
 	size_t			count = iov_iter_count(from);
 	loff_t			pos = iocb->ki_pos;
+	loff_t			end;
+	struct iov_iter		data;
 	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
@@ -683,10 +706,11 @@ xfs_file_dio_aio_write(
 	if (ret)
 		goto out;
 	iov_iter_truncate(from, count);
+	end = pos + count - 1;
 
 	if (mapping->nrpages) {
 		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-						    pos, pos + count - 1);
+						    pos, end);
 		if (ret)
 			goto out;
 		/*
@@ -696,7 +720,7 @@ xfs_file_dio_aio_write(
 		 */
 		ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
 					pos >> PAGE_CACHE_SHIFT,
-					(pos + count - 1) >> PAGE_CACHE_SHIFT);
+					end >> PAGE_CACHE_SHIFT);
 		WARN_ON_ONCE(ret);
 		ret = 0;
 	}
@@ -713,8 +737,22 @@ xfs_file_dio_aio_write(
 	}
 
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
-	ret = generic_file_direct_write(iocb, from, pos);
 
+	data = *from;
+	ret = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
+
+	/* see generic_file_direct_write() for why this is necessary */
+	if (mapping->nrpages) {
+		invalidate_inode_pages2_range(mapping,
+					      pos >> PAGE_CACHE_SHIFT,
+					      end >> PAGE_CACHE_SHIFT);
+	}
+
+	if (ret > 0) {
+		pos += ret;
+		iov_iter_advance(from, ret);
+		iocb->ki_pos = pos;
+	}
 out:
 	xfs_rw_iunlock(ip, iolock);
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index b2a45cc9eceb..615781bf4ee5 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1221,6 +1221,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),