Diffstat (limited to 'fs/xfs/xfs_aops.c')
-rw-r--r--   fs/xfs/xfs_aops.c   332
1 file changed, 48 insertions(+), 284 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 87d2b215cbbd..7575cfc3ad15 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -87,6 +87,12 @@ xfs_find_bdev_for_inode(
  * We're now finished for good with this page. Update the page state via the
  * associated buffer_heads, paying attention to the start and end offsets that
  * we need to process on the page.
+ *
+ * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+ * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+ * the page at all, as we may be racing with memory reclaim and it can free both
+ * the bufferhead chain and the page as it will see the page as clean and
+ * unused.
  */
 static void
 xfs_finish_page_writeback(
@@ -95,8 +101,9 @@ xfs_finish_page_writeback(
         int error)
 {
         unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
-        struct buffer_head *head, *bh;
+        struct buffer_head *head, *bh, *next;
         unsigned int off = 0;
+        unsigned int bsize;
 
         ASSERT(bvec->bv_offset < PAGE_SIZE);
         ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
@@ -105,15 +112,17 @@ xfs_finish_page_writeback(
 
         bh = head = page_buffers(bvec->bv_page);
 
+        bsize = bh->b_size;
         do {
+                next = bh->b_this_page;
                 if (off < bvec->bv_offset)
                         goto next_bh;
                 if (off > end)
                         break;
                 bh->b_end_io(bh, !error);
 next_bh:
-                off += bh->b_size;
-        } while ((bh = bh->b_this_page) != head);
+                off += bsize;
+        } while ((bh = next) != head);
 }
 
 /*
@@ -1041,6 +1050,20 @@ xfs_vm_releasepage(
 
         trace_xfs_releasepage(page->mapping->host, page, 0, 0);
 
+        /*
+         * mm accommodates an old ext3 case where clean pages might not have had
+         * the dirty bit cleared. Thus, it can send actual dirty pages to
+         * ->releasepage() via shrink_active_list(). Conversely,
+         * block_invalidatepage() can send pages that are still marked dirty
+         * but otherwise have invalidated buffers.
+         *
+         * We've historically freed buffers on the latter. Instead, quietly
+         * filter out all dirty pages to avoid spurious buffer state warnings.
+         * This can likely be removed once shrink_active_list() is fixed.
+         */
+        if (PageDirty(page))
+                return 0;
+
         xfs_count_page_state(page, &delalloc, &unwritten);
 
         if (WARN_ON_ONCE(delalloc))
@@ -1144,6 +1167,8 @@ __xfs_get_blocks(
         ssize_t size;
         int new = 0;
 
+        BUG_ON(create && !direct);
+
         if (XFS_FORCED_SHUTDOWN(mp))
                 return -EIO;
 
@@ -1151,22 +1176,14 @@ __xfs_get_blocks(
         ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
         size = bh_result->b_size;
 
-        if (!create && direct && offset >= i_size_read(inode))
+        if (!create && offset >= i_size_read(inode))
                 return 0;
 
         /*
          * Direct I/O is usually done on preallocated files, so try getting
-         * a block mapping without an exclusive lock first. For buffered
-         * writes we already have the exclusive iolock anyway, so avoiding
-         * a lock roundtrip here by taking the ilock exclusive from the
-         * beginning is a useful micro optimization.
+         * a block mapping without an exclusive lock first.
          */
-        if (create && !direct) {
-                lockmode = XFS_ILOCK_EXCL;
-                xfs_ilock(ip, lockmode);
-        } else {
-                lockmode = xfs_ilock_data_map_shared(ip);
-        }
+        lockmode = xfs_ilock_data_map_shared(ip);
 
         ASSERT(offset <= mp->m_super->s_maxbytes);
         if (offset + size > mp->m_super->s_maxbytes)
@@ -1185,37 +1202,19 @@ __xfs_get_blocks(
              (imap.br_startblock == HOLESTARTBLOCK ||
               imap.br_startblock == DELAYSTARTBLOCK) ||
              (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
-                if (direct || xfs_get_extsz_hint(ip)) {
-                        /*
-                         * xfs_iomap_write_direct() expects the shared lock. It
-                         * is unlocked on return.
-                         */
-                        if (lockmode == XFS_ILOCK_EXCL)
-                                xfs_ilock_demote(ip, lockmode);
-
-                        error = xfs_iomap_write_direct(ip, offset, size,
-                                                        &imap, nimaps);
-                        if (error)
-                                return error;
-                        new = 1;
+                /*
+                 * xfs_iomap_write_direct() expects the shared lock. It
+                 * is unlocked on return.
+                 */
+                if (lockmode == XFS_ILOCK_EXCL)
+                        xfs_ilock_demote(ip, lockmode);
 
-                } else {
-                        /*
-                         * Delalloc reservations do not require a transaction,
-                         * we can go on without dropping the lock here. If we
-                         * are allocating a new delalloc block, make sure that
-                         * we set the new flag so that we mark the buffer new so
-                         * that we know that it is newly allocated if the write
-                         * fails.
-                         */
-                        if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
-                                new = 1;
-                        error = xfs_iomap_write_delay(ip, offset, size, &imap);
-                        if (error)
-                                goto out_unlock;
+                error = xfs_iomap_write_direct(ip, offset, size,
+                                                &imap, nimaps);
+                if (error)
+                        return error;
+                new = 1;
 
-                        xfs_iunlock(ip, lockmode);
-                }
                 trace_xfs_get_blocks_alloc(ip, offset, size,
                                 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
                                 : XFS_IO_DELALLOC, &imap);
@@ -1236,9 +1235,7 @@ __xfs_get_blocks(
         }
 
         /* trim mapping down to size requested */
-        if (direct || size > (1 << inode->i_blkbits))
-                xfs_map_trim_size(inode, iblock, bh_result,
-                                  &imap, offset, size);
+        xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
 
         /*
          * For unwritten extents do not report a disk address in the buffered
@@ -1251,7 +1248,7 @@ __xfs_get_blocks(
         if (ISUNWRITTEN(&imap))
                 set_buffer_unwritten(bh_result);
         /* direct IO needs special help */
-        if (create && direct) {
+        if (create) {
                 if (dax_fault)
                         ASSERT(!ISUNWRITTEN(&imap));
                 else
@@ -1280,14 +1277,7 @@ __xfs_get_blocks(
              (new || ISUNWRITTEN(&imap))))
                 set_buffer_new(bh_result);
 
-        if (imap.br_startblock == DELAYSTARTBLOCK) {
-                BUG_ON(direct);
-                if (create) {
-                        set_buffer_uptodate(bh_result);
-                        set_buffer_mapped(bh_result);
-                        set_buffer_delay(bh_result);
-                }
-        }
+        BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
 
         return 0;
 
@@ -1337,7 +1327,7 @@ xfs_get_blocks_dax_fault(
  * whereas if we have flags set we will always be called in task context
  * (i.e. from a workqueue).
  */
-STATIC int
+int
 xfs_end_io_direct_write(
         struct kiocb *iocb,
         loff_t offset,
@@ -1408,234 +1398,10 @@ xfs_vm_direct_IO(
         struct kiocb *iocb,
         struct iov_iter *iter)
 {
-        struct inode *inode = iocb->ki_filp->f_mapping->host;
-        dio_iodone_t *endio = NULL;
-        int flags = 0;
-        struct block_device *bdev;
-
-        if (iov_iter_rw(iter) == WRITE) {
-                endio = xfs_end_io_direct_write;
-                flags = DIO_ASYNC_EXTEND;
-        }
-
-        if (IS_DAX(inode)) {
-                return dax_do_io(iocb, inode, iter,
-                                xfs_get_blocks_direct, endio, 0);
-        }
-
-        bdev = xfs_find_bdev_for_inode(inode);
-        return __blockdev_direct_IO(iocb, inode, bdev, iter,
-                        xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-/*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
- * as the page is still locked at this point.
- */
-STATIC void
-xfs_vm_kill_delalloc_range(
-        struct inode *inode,
-        loff_t start,
-        loff_t end)
-{
-        struct xfs_inode *ip = XFS_I(inode);
-        xfs_fileoff_t start_fsb;
-        xfs_fileoff_t end_fsb;
-        int error;
-
-        start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
-        end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
-        if (end_fsb <= start_fsb)
-                return;
-
-        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-                                                end_fsb - start_fsb);
-        if (error) {
-                /* something screwed, just bail */
-                if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                        xfs_alert(ip->i_mount,
-                "xfs_vm_write_failed: unable to clean up ino %lld",
-                                        ip->i_ino);
-                }
-        }
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-STATIC void
-xfs_vm_write_failed(
-        struct inode *inode,
-        struct page *page,
-        loff_t pos,
-        unsigned len)
-{
-        loff_t block_offset;
-        loff_t block_start;
-        loff_t block_end;
-        loff_t from = pos & (PAGE_SIZE - 1);
-        loff_t to = from + len;
-        struct buffer_head *bh, *head;
-        struct xfs_mount *mp = XFS_I(inode)->i_mount;
-
-        /*
-         * The request pos offset might be 32 or 64 bit, this is all fine
-         * on 64-bit platform. However, for 64-bit pos request on 32-bit
-         * platform, the high 32-bit will be masked off if we evaluate the
-         * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
-         * 0xfffff000 as an unsigned long, hence the result is incorrect
-         * which could cause the following ASSERT failed in most cases.
-         * In order to avoid this, we can evaluate the block_offset of the
-         * start of the page by using shifts rather than masks the mismatch
-         * problem.
-         */
-        block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
-
-        ASSERT(block_offset + from == pos);
-
-        head = page_buffers(page);
-        block_start = 0;
-        for (bh = head; bh != head || !block_start;
-             bh = bh->b_this_page, block_start = block_end,
-             block_offset += bh->b_size) {
-                block_end = block_start + bh->b_size;
-
-                /* skip buffers before the write */
-                if (block_end <= from)
-                        continue;
-
-                /* if the buffer is after the write, we're done */
-                if (block_start >= to)
-                        break;
-
-                /*
-                 * Process delalloc and unwritten buffers beyond EOF. We can
-                 * encounter unwritten buffers in the event that a file has
-                 * post-EOF unwritten extents and an extending write happens to
-                 * fail (e.g., an unaligned write that also involves a delalloc
-                 * to the same page).
-                 */
-                if (!buffer_delay(bh) && !buffer_unwritten(bh))
-                        continue;
-
-                if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
-                    block_offset < i_size_read(inode))
-                        continue;
-
-                if (buffer_delay(bh))
-                        xfs_vm_kill_delalloc_range(inode, block_offset,
-                                                   block_offset + bh->b_size);
-
-                /*
-                 * This buffer does not contain data anymore. make sure anyone
-                 * who finds it knows that for certain.
-                 */
-                clear_buffer_delay(bh);
-                clear_buffer_uptodate(bh);
-                clear_buffer_mapped(bh);
-                clear_buffer_new(bh);
-                clear_buffer_dirty(bh);
-                clear_buffer_unwritten(bh);
-        }
-
-}
-
-/*
- * This used to call block_write_begin(), but it unlocks and releases the page
- * on error, and we need that page to be able to punch stale delalloc blocks out
- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
- * the appropriate point.
- */
-STATIC int
-xfs_vm_write_begin(
-        struct file *file,
-        struct address_space *mapping,
-        loff_t pos,
-        unsigned len,
-        unsigned flags,
-        struct page **pagep,
-        void **fsdata)
-{
-        pgoff_t index = pos >> PAGE_SHIFT;
-        struct page *page;
-        int status;
-        struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
-
-        ASSERT(len <= PAGE_SIZE);
-
-        page = grab_cache_page_write_begin(mapping, index, flags);
-        if (!page)
-                return -ENOMEM;
-
-        status = __block_write_begin(page, pos, len, xfs_get_blocks);
-        if (xfs_mp_fail_writes(mp))
-                status = -EIO;
-        if (unlikely(status)) {
-                struct inode *inode = mapping->host;
-                size_t isize = i_size_read(inode);
-
-                xfs_vm_write_failed(inode, page, pos, len);
-                unlock_page(page);
-
-                /*
-                 * If the write is beyond EOF, we only want to kill blocks
-                 * allocated in this write, not blocks that were previously
-                 * written successfully.
-                 */
-                if (xfs_mp_fail_writes(mp))
-                        isize = 0;
-                if (pos + len > isize) {
-                        ssize_t start = max_t(ssize_t, pos, isize);
-
-                        truncate_pagecache_range(inode, start, pos + len);
-                }
-
-                put_page(page);
-                page = NULL;
-        }
-
-        *pagep = page;
-        return status;
-}
-
-/*
- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
- * this specific write because they will never be written. Previous writes
- * beyond EOF where block allocation succeeded do not need to be trashed, so
- * only new blocks from this write should be trashed. For blocks within
- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
- * written with all the other valid data.
- */
-STATIC int
-xfs_vm_write_end(
-        struct file *file,
-        struct address_space *mapping,
-        loff_t pos,
-        unsigned len,
-        unsigned copied,
-        struct page *page,
-        void *fsdata)
-{
-        int ret;
-
-        ASSERT(len <= PAGE_SIZE);
-
-        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-        if (unlikely(ret < len)) {
-                struct inode *inode = mapping->host;
-                size_t isize = i_size_read(inode);
-                loff_t to = pos + len;
-
-                if (to > isize) {
-                        /* only kill blocks in this write beyond EOF */
-                        if (pos > isize)
-                                isize = pos;
-                        xfs_vm_kill_delalloc_range(inode, isize, to);
-                        truncate_pagecache_range(inode, isize, to);
-                }
-        }
-        return ret;
-}
+        /*
+         * We just need the method present so that open/fcntl allow direct I/O.
+         */
+        return -EINVAL;
+}
 
 STATIC sector_t
@@ -1748,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = {
         .set_page_dirty = xfs_vm_set_page_dirty,
         .releasepage = xfs_vm_releasepage,
         .invalidatepage = xfs_vm_invalidatepage,
-        .write_begin = xfs_vm_write_begin,
-        .write_end = xfs_vm_write_end,
         .bmap = xfs_vm_bmap,
         .direct_IO = xfs_vm_direct_IO,
         .migratepage = buffer_migrate_page,
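
For reference, the loop change in the xfs_finish_page_writeback() hunk above boils down to one pattern: read bh->b_this_page (and the block size) into locals before invoking a completion callback that may free the object you are iterating over, and make every decision after the callback from those cached values only. Below is a minimal, self-contained userspace sketch of that pattern, not kernel code: the struct demo_buf type and the demo_end_io()/demo_finish_range() names are invented for illustration, and the demo frees each buffer from its own completion handler rather than modelling the page-level reclaim race described in the commit comment.

#include <stdio.h>
#include <stdlib.h>

struct demo_buf {
        struct demo_buf *b_this_page;   /* circular list, like buffer_heads on a page */
        unsigned int b_size;
        void (*b_end_io)(struct demo_buf *bh, int uptodate);
};

/* Completion handler that releases the buffer; the buffer must not be
 * touched after this returns -- the analogue of the "landmine" above. */
static void demo_end_io(struct demo_buf *bh, int uptodate)
{
        printf("completed buffer %p (uptodate=%d)\n", (void *)bh, uptodate);
        free(bh);
}

/* Complete every buffer covering the byte range [bv_offset, end].  'next'
 * and 'bsize' are read before b_end_io() runs, so the loop never
 * dereferences a buffer that the callback may already have freed. */
static void demo_finish_range(struct demo_buf *head, unsigned int bv_offset,
                              unsigned int end, int error)
{
        struct demo_buf *bh = head, *next;
        unsigned int off = 0;
        unsigned int bsize = bh->b_size;        /* cached once, up front */

        do {
                next = bh->b_this_page;         /* cached before the callback */
                if (off < bv_offset)
                        goto next_bh;
                if (off > end)
                        break;
                bh->b_end_io(bh, !error);       /* bh may be gone after this */
next_bh:
                off += bsize;
        } while ((bh = next) != head);
}

int main(void)
{
        enum { NBUFS = 4, BSIZE = 1024 };
        struct demo_buf *bufs[NBUFS];
        int i;

        for (i = 0; i < NBUFS; i++) {
                bufs[i] = malloc(sizeof(*bufs[i]));
                if (!bufs[i])
                        return 1;
                bufs[i]->b_size = BSIZE;
                bufs[i]->b_end_io = demo_end_io;
        }
        for (i = 0; i < NBUFS; i++)
                bufs[i]->b_this_page = bufs[(i + 1) % NBUFS];

        /* Complete buffers 1..3; buffer 0 sits before the range, stays
         * allocated, and keeps the 'head' pointer valid for the loop exit. */
        demo_finish_range(bufs[0], BSIZE, NBUFS * BSIZE - 1, 0);
        free(bufs[0]);
        return 0;
}

Writing the loop with "bh = bh->b_this_page" and "off += bh->b_size" instead (as the pre-patch code did) would read from freed memory once a completion handler has released the buffer, which is exactly what the cached next/bsize variables in the hunk avoid.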