diff options
Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r--  fs/xfs/xfs_file.c | 161
1 file changed, 137 insertions(+), 24 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f12ad0a8585..8121e75352ee 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -559,7 +559,7 @@ restart: | |||
559 | if (error <= 0) | 559 | if (error <= 0) |
560 | return error; | 560 | return error; |
561 | 561 | ||
562 | error = xfs_break_layouts(inode, iolock); | 562 | error = xfs_break_layouts(inode, iolock, true); |
563 | if (error) | 563 | if (error) |
564 | return error; | 564 | return error; |
565 | 565 | ||
@@ -569,21 +569,42 @@ restart: | |||
569 | * write. If zeroing is needed and we are currently holding the | 569 | * write. If zeroing is needed and we are currently holding the |
570 | * iolock shared, we need to update it to exclusive which implies | 570 | * iolock shared, we need to update it to exclusive which implies |
571 | * having to redo all checks before. | 571 | * having to redo all checks before. |
572 | * | ||
573 | * We need to serialise against EOF updates that occur in IO | ||
574 | * completions here. We want to make sure that nobody is changing the | ||
575 | * size while we do this check until we have placed an IO barrier (i.e. | ||
576 | * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. | ||
577 | * The spinlock effectively forms a memory barrier once we have the | ||
578 | * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value | ||
579 | * and hence be able to correctly determine if we need to run zeroing. | ||
572 | */ | 580 | */ |
581 | spin_lock(&ip->i_flags_lock); | ||
573 | if (iocb->ki_pos > i_size_read(inode)) { | 582 | if (iocb->ki_pos > i_size_read(inode)) { |
574 | bool zero = false; | 583 | bool zero = false; |
575 | 584 | ||
585 | spin_unlock(&ip->i_flags_lock); | ||
576 | if (*iolock == XFS_IOLOCK_SHARED) { | 586 | if (*iolock == XFS_IOLOCK_SHARED) { |
577 | xfs_rw_iunlock(ip, *iolock); | 587 | xfs_rw_iunlock(ip, *iolock); |
578 | *iolock = XFS_IOLOCK_EXCL; | 588 | *iolock = XFS_IOLOCK_EXCL; |
579 | xfs_rw_ilock(ip, *iolock); | 589 | xfs_rw_ilock(ip, *iolock); |
580 | iov_iter_reexpand(from, count); | 590 | iov_iter_reexpand(from, count); |
591 | |||
592 | /* | ||
593 | * We now have an IO submission barrier in place, but | ||
594 | * AIO can do EOF updates during IO completion and hence | ||
595 | * we now need to wait for all of them to drain. Non-AIO | ||
596 | * DIO will have drained before we are given the | ||
597 | * XFS_IOLOCK_EXCL, and so for most cases this wait is a | ||
598 | * no-op. | ||
599 | */ | ||
600 | inode_dio_wait(inode); | ||
581 | goto restart; | 601 | goto restart; |
582 | } | 602 | } |
583 | error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); | 603 | error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); |
584 | if (error) | 604 | if (error) |
585 | return error; | 605 | return error; |
586 | } | 606 | } else |
607 | spin_unlock(&ip->i_flags_lock); | ||
587 | 608 | ||
588 | /* | 609 | /* |
589 | * Updating the timestamps will grab the ilock again from | 610 | * Updating the timestamps will grab the ilock again from |
@@ -645,6 +666,8 @@ xfs_file_dio_aio_write( | |||
645 | int iolock; | 666 | int iolock; |
646 | size_t count = iov_iter_count(from); | 667 | size_t count = iov_iter_count(from); |
647 | loff_t pos = iocb->ki_pos; | 668 | loff_t pos = iocb->ki_pos; |
669 | loff_t end; | ||
670 | struct iov_iter data; | ||
648 | struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? | 671 | struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? |
649 | mp->m_rtdev_targp : mp->m_ddev_targp; | 672 | mp->m_rtdev_targp : mp->m_ddev_targp; |
650 | 673 | ||
@@ -685,10 +708,11 @@ xfs_file_dio_aio_write( | |||
685 | goto out; | 708 | goto out; |
686 | count = iov_iter_count(from); | 709 | count = iov_iter_count(from); |
687 | pos = iocb->ki_pos; | 710 | pos = iocb->ki_pos; |
711 | end = pos + count - 1; | ||
688 | 712 | ||
689 | if (mapping->nrpages) { | 713 | if (mapping->nrpages) { |
690 | ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, | 714 | ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, |
691 | pos, pos + count - 1); | 715 | pos, end); |
692 | if (ret) | 716 | if (ret) |
693 | goto out; | 717 | goto out; |
694 | /* | 718 | /* |
@@ -698,7 +722,7 @@ xfs_file_dio_aio_write( | |||
698 | */ | 722 | */ |
699 | ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, | 723 | ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, |
700 | pos >> PAGE_CACHE_SHIFT, | 724 | pos >> PAGE_CACHE_SHIFT, |
701 | (pos + count - 1) >> PAGE_CACHE_SHIFT); | 725 | end >> PAGE_CACHE_SHIFT); |
702 | WARN_ON_ONCE(ret); | 726 | WARN_ON_ONCE(ret); |
703 | ret = 0; | 727 | ret = 0; |
704 | } | 728 | } |
@@ -715,8 +739,22 @@ xfs_file_dio_aio_write( | |||
715 | } | 739 | } |
716 | 740 | ||
717 | trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); | 741 | trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); |
718 | ret = generic_file_direct_write(iocb, from, pos); | ||
719 | 742 | ||
743 | data = *from; | ||
744 | ret = mapping->a_ops->direct_IO(iocb, &data, pos); | ||
745 | |||
746 | /* see generic_file_direct_write() for why this is necessary */ | ||
747 | if (mapping->nrpages) { | ||
748 | invalidate_inode_pages2_range(mapping, | ||
749 | pos >> PAGE_CACHE_SHIFT, | ||
750 | end >> PAGE_CACHE_SHIFT); | ||
751 | } | ||
752 | |||
753 | if (ret > 0) { | ||
754 | pos += ret; | ||
755 | iov_iter_advance(from, ret); | ||
756 | iocb->ki_pos = pos; | ||
757 | } | ||
720 | out: | 758 | out: |
721 | xfs_rw_iunlock(ip, iolock); | 759 | xfs_rw_iunlock(ip, iolock); |
722 | 760 | ||
@@ -822,6 +860,11 @@ xfs_file_write_iter( | |||
822 | return ret; | 860 | return ret; |
823 | } | 861 | } |
824 | 862 | ||
863 | #define XFS_FALLOC_FL_SUPPORTED \ | ||
864 | (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ | ||
865 | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ | ||
866 | FALLOC_FL_INSERT_RANGE) | ||
867 | |||
825 | STATIC long | 868 | STATIC long |
826 | xfs_file_fallocate( | 869 | xfs_file_fallocate( |
827 | struct file *file, | 870 | struct file *file, |
@@ -835,18 +878,21 @@ xfs_file_fallocate( | |||
835 | enum xfs_prealloc_flags flags = 0; | 878 | enum xfs_prealloc_flags flags = 0; |
836 | uint iolock = XFS_IOLOCK_EXCL; | 879 | uint iolock = XFS_IOLOCK_EXCL; |
837 | loff_t new_size = 0; | 880 | loff_t new_size = 0; |
881 | bool do_file_insert = 0; | ||
838 | 882 | ||
839 | if (!S_ISREG(inode->i_mode)) | 883 | if (!S_ISREG(inode->i_mode)) |
840 | return -EINVAL; | 884 | return -EINVAL; |
841 | if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | | 885 | if (mode & ~XFS_FALLOC_FL_SUPPORTED) |
842 | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) | ||
843 | return -EOPNOTSUPP; | 886 | return -EOPNOTSUPP; |
844 | 887 | ||
845 | xfs_ilock(ip, iolock); | 888 | xfs_ilock(ip, iolock); |
846 | error = xfs_break_layouts(inode, &iolock); | 889 | error = xfs_break_layouts(inode, &iolock, false); |
847 | if (error) | 890 | if (error) |
848 | goto out_unlock; | 891 | goto out_unlock; |
849 | 892 | ||
893 | xfs_ilock(ip, XFS_MMAPLOCK_EXCL); | ||
894 | iolock |= XFS_MMAPLOCK_EXCL; | ||
895 | |||
850 | if (mode & FALLOC_FL_PUNCH_HOLE) { | 896 | if (mode & FALLOC_FL_PUNCH_HOLE) { |
851 | error = xfs_free_file_space(ip, offset, len); | 897 | error = xfs_free_file_space(ip, offset, len); |
852 | if (error) | 898 | if (error) |
@@ -873,6 +919,27 @@ xfs_file_fallocate( | |||
873 | error = xfs_collapse_file_space(ip, offset, len); | 919 | error = xfs_collapse_file_space(ip, offset, len); |
874 | if (error) | 920 | if (error) |
875 | goto out_unlock; | 921 | goto out_unlock; |
922 | } else if (mode & FALLOC_FL_INSERT_RANGE) { | ||
923 | unsigned blksize_mask = (1 << inode->i_blkbits) - 1; | ||
924 | |||
925 | new_size = i_size_read(inode) + len; | ||
926 | if (offset & blksize_mask || len & blksize_mask) { | ||
927 | error = -EINVAL; | ||
928 | goto out_unlock; | ||
929 | } | ||
930 | |||
931 | /* check the new inode size does not wrap through zero */ | ||
932 | if (new_size > inode->i_sb->s_maxbytes) { | ||
933 | error = -EFBIG; | ||
934 | goto out_unlock; | ||
935 | } | ||
936 | |||
937 | /* Offset should be less than i_size */ | ||
938 | if (offset >= i_size_read(inode)) { | ||
939 | error = -EINVAL; | ||
940 | goto out_unlock; | ||
941 | } | ||
942 | do_file_insert = 1; | ||
876 | } else { | 943 | } else { |
877 | flags |= XFS_PREALLOC_SET; | 944 | flags |= XFS_PREALLOC_SET; |
878 | 945 | ||
@@ -907,8 +974,19 @@ xfs_file_fallocate( | |||
907 | iattr.ia_valid = ATTR_SIZE; | 974 | iattr.ia_valid = ATTR_SIZE; |
908 | iattr.ia_size = new_size; | 975 | iattr.ia_size = new_size; |
909 | error = xfs_setattr_size(ip, &iattr); | 976 | error = xfs_setattr_size(ip, &iattr); |
977 | if (error) | ||
978 | goto out_unlock; | ||
910 | } | 979 | } |
911 | 980 | ||
981 | /* | ||
982 | * Perform hole insertion now that the file size has been | ||
983 | * updated so that if we crash during the operation we don't | ||
984 | * leave shifted extents past EOF and hence losing access to | ||
985 | * the data that is contained within them. | ||
986 | */ | ||
987 | if (do_file_insert) | ||
988 | error = xfs_insert_file_space(ip, offset, len); | ||
989 | |||
912 | out_unlock: | 990 | out_unlock: |
913 | xfs_iunlock(ip, iolock); | 991 | xfs_iunlock(ip, iolock); |
914 | return error; | 992 | return error; |
@@ -997,20 +1075,6 @@ xfs_file_mmap( | |||
997 | } | 1075 | } |
998 | 1076 | ||
999 | /* | 1077 | /* |
1000 | * mmap()d file has taken write protection fault and is being made | ||
1001 | * writable. We can set the page state up correctly for a writable | ||
1002 | * page, which means we can do correct delalloc accounting (ENOSPC | ||
1003 | * checking!) and unwritten extent mapping. | ||
1004 | */ | ||
1005 | STATIC int | ||
1006 | xfs_vm_page_mkwrite( | ||
1007 | struct vm_area_struct *vma, | ||
1008 | struct vm_fault *vmf) | ||
1009 | { | ||
1010 | return block_page_mkwrite(vma, vmf, xfs_get_blocks); | ||
1011 | } | ||
1012 | |||
1013 | /* | ||
1014 | * This type is designed to indicate the type of offset we would like | 1078 | * This type is designed to indicate the type of offset we would like |
1015 | * to search from page cache for xfs_seek_hole_data(). | 1079 | * to search from page cache for xfs_seek_hole_data(). |
1016 | */ | 1080 | */ |
@@ -1385,6 +1449,55 @@ xfs_file_llseek( | |||
1385 | } | 1449 | } |
1386 | } | 1450 | } |
1387 | 1451 | ||
1452 | /* | ||
1453 | * Locking for serialisation of IO during page faults. This results in a lock | ||
1454 | * ordering of: | ||
1455 | * | ||
1456 | * mmap_sem (MM) | ||
1457 | * i_mmap_lock (XFS - truncate serialisation) | ||
1458 | * page_lock (MM) | ||
1459 | * i_lock (XFS - extent map serialisation) | ||
1460 | */ | ||
1461 | STATIC int | ||
1462 | xfs_filemap_fault( | ||
1463 | struct vm_area_struct *vma, | ||
1464 | struct vm_fault *vmf) | ||
1465 | { | ||
1466 | struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); | ||
1467 | int error; | ||
1468 | |||
1469 | trace_xfs_filemap_fault(ip); | ||
1470 | |||
1471 | xfs_ilock(ip, XFS_MMAPLOCK_SHARED); | ||
1472 | error = filemap_fault(vma, vmf); | ||
1473 | xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); | ||
1474 | |||
1475 | return error; | ||
1476 | } | ||
1477 | |||
1478 | /* | ||
1479 | * mmap()d file has taken write protection fault and is being made writable. We | ||
1480 | * can set the page state up correctly for a writable page, which means we can | ||
1481 | * do correct delalloc accounting (ENOSPC checking!) and unwritten extent | ||
1482 | * mapping. | ||
1483 | */ | ||
1484 | STATIC int | ||
1485 | xfs_filemap_page_mkwrite( | ||
1486 | struct vm_area_struct *vma, | ||
1487 | struct vm_fault *vmf) | ||
1488 | { | ||
1489 | struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); | ||
1490 | int error; | ||
1491 | |||
1492 | trace_xfs_filemap_page_mkwrite(ip); | ||
1493 | |||
1494 | xfs_ilock(ip, XFS_MMAPLOCK_SHARED); | ||
1495 | error = block_page_mkwrite(vma, vmf, xfs_get_blocks); | ||
1496 | xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); | ||
1497 | |||
1498 | return error; | ||
1499 | } | ||
1500 | |||
1388 | const struct file_operations xfs_file_operations = { | 1501 | const struct file_operations xfs_file_operations = { |
1389 | .llseek = xfs_file_llseek, | 1502 | .llseek = xfs_file_llseek, |
1390 | .read_iter = xfs_file_read_iter, | 1503 | .read_iter = xfs_file_read_iter, |
@@ -1415,7 +1528,7 @@ const struct file_operations xfs_dir_file_operations = { | |||
1415 | }; | 1528 | }; |
1416 | 1529 | ||
1417 | static const struct vm_operations_struct xfs_file_vm_ops = { | 1530 | static const struct vm_operations_struct xfs_file_vm_ops = { |
1418 | .fault = filemap_fault, | 1531 | .fault = xfs_filemap_fault, |
1419 | .map_pages = filemap_map_pages, | 1532 | .map_pages = filemap_map_pages, |
1420 | .page_mkwrite = xfs_vm_page_mkwrite, | 1533 | .page_mkwrite = xfs_filemap_page_mkwrite, |
1421 | }; | 1534 | }; |