diff options
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_aops.c')
| -rw-r--r-- | fs/xfs/linux-2.6/xfs_aops.c | 221 |
1 files changed, 172 insertions, 49 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 66abe36c1213..9083357f9e44 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c | |||
| @@ -39,6 +39,7 @@ | |||
| 39 | #include "xfs_iomap.h" | 39 | #include "xfs_iomap.h" |
| 40 | #include "xfs_vnodeops.h" | 40 | #include "xfs_vnodeops.h" |
| 41 | #include "xfs_trace.h" | 41 | #include "xfs_trace.h" |
| 42 | #include "xfs_bmap.h" | ||
| 42 | #include <linux/mpage.h> | 43 | #include <linux/mpage.h> |
| 43 | #include <linux/pagevec.h> | 44 | #include <linux/pagevec.h> |
| 44 | #include <linux/writeback.h> | 45 | #include <linux/writeback.h> |
| @@ -163,14 +164,17 @@ xfs_ioend_new_eof( | |||
| 163 | } | 164 | } |
| 164 | 165 | ||
| 165 | /* | 166 | /* |
| 166 | * Update on-disk file size now that data has been written to disk. | 167 | * Update on-disk file size now that data has been written to disk. The |
| 167 | * The current in-memory file size is i_size. If a write is beyond | 168 | * current in-memory file size is i_size. If a write is beyond eof i_new_size |
| 168 | * eof i_new_size will be the intended file size until i_size is | 169 | * will be the intended file size until i_size is updated. If this write does |
| 169 | * updated. If this write does not extend all the way to the valid | 170 | * not extend all the way to the valid file size then restrict this update to |
| 170 | * file size then restrict this update to the end of the write. | 171 | * the end of the write. |
| 172 | * | ||
| 173 | * This function does not block as blocking on the inode lock in IO completion | ||
| 174 | * can lead to IO completion order dependency deadlocks. If it can't get the | ||
| 175 | * inode ilock it will return EAGAIN. Callers must handle this. | ||
| 171 | */ | 176 | */ |
| 172 | 177 | STATIC int | |
| 173 | STATIC void | ||
| 174 | xfs_setfilesize( | 178 | xfs_setfilesize( |
| 175 | xfs_ioend_t *ioend) | 179 | xfs_ioend_t *ioend) |
| 176 | { | 180 | { |
| @@ -181,16 +185,40 @@ xfs_setfilesize( | |||
| 181 | ASSERT(ioend->io_type != IOMAP_READ); | 185 | ASSERT(ioend->io_type != IOMAP_READ); |
| 182 | 186 | ||
| 183 | if (unlikely(ioend->io_error)) | 187 | if (unlikely(ioend->io_error)) |
| 184 | return; | 188 | return 0; |
| 189 | |||
| 190 | if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) | ||
| 191 | return EAGAIN; | ||
| 185 | 192 | ||
| 186 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
| 187 | isize = xfs_ioend_new_eof(ioend); | 193 | isize = xfs_ioend_new_eof(ioend); |
| 188 | if (isize) { | 194 | if (isize) { |
| 189 | ip->i_d.di_size = isize; | 195 | ip->i_d.di_size = isize; |
| 190 | xfs_mark_inode_dirty_sync(ip); | 196 | xfs_mark_inode_dirty(ip); |
| 191 | } | 197 | } |
| 192 | 198 | ||
| 193 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 199 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
| 200 | return 0; | ||
| 201 | } | ||
| 202 | |||
| 203 | /* | ||
| 204 | * Schedule IO completion handling on a xfsdatad if this was | ||
| 205 | * the final hold on this ioend. If we are asked to wait, | ||
| 206 | * flush the workqueue. | ||
| 207 | */ | ||
| 208 | STATIC void | ||
| 209 | xfs_finish_ioend( | ||
| 210 | xfs_ioend_t *ioend, | ||
| 211 | int wait) | ||
| 212 | { | ||
| 213 | if (atomic_dec_and_test(&ioend->io_remaining)) { | ||
| 214 | struct workqueue_struct *wq; | ||
| 215 | |||
| 216 | wq = (ioend->io_type == IOMAP_UNWRITTEN) ? | ||
| 217 | xfsconvertd_workqueue : xfsdatad_workqueue; | ||
| 218 | queue_work(wq, &ioend->io_work); | ||
| 219 | if (wait) | ||
| 220 | flush_workqueue(wq); | ||
| 221 | } | ||
| 194 | } | 222 | } |
| 195 | 223 | ||
| 196 | /* | 224 | /* |
| @@ -198,11 +226,11 @@ xfs_setfilesize( | |||
| 198 | */ | 226 | */ |
| 199 | STATIC void | 227 | STATIC void |
| 200 | xfs_end_io( | 228 | xfs_end_io( |
| 201 | struct work_struct *work) | 229 | struct work_struct *work) |
| 202 | { | 230 | { |
| 203 | xfs_ioend_t *ioend = | 231 | xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); |
| 204 | container_of(work, xfs_ioend_t, io_work); | 232 | struct xfs_inode *ip = XFS_I(ioend->io_inode); |
| 205 | struct xfs_inode *ip = XFS_I(ioend->io_inode); | 233 | int error = 0; |
| 206 | 234 | ||
| 207 | /* | 235 | /* |
| 208 | * For unwritten extents we need to issue transactions to convert a | 236 | * For unwritten extents we need to issue transactions to convert a |
| @@ -210,7 +238,6 @@ xfs_end_io( | |||
| 210 | */ | 238 | */ |
| 211 | if (ioend->io_type == IOMAP_UNWRITTEN && | 239 | if (ioend->io_type == IOMAP_UNWRITTEN && |
| 212 | likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { | 240 | likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { |
| 213 | int error; | ||
| 214 | 241 | ||
| 215 | error = xfs_iomap_write_unwritten(ip, ioend->io_offset, | 242 | error = xfs_iomap_write_unwritten(ip, ioend->io_offset, |
| 216 | ioend->io_size); | 243 | ioend->io_size); |
| @@ -222,30 +249,23 @@ xfs_end_io( | |||
| 222 | * We might have to update the on-disk file size after extending | 249 | * We might have to update the on-disk file size after extending |
| 223 | * writes. | 250 | * writes. |
| 224 | */ | 251 | */ |
| 225 | if (ioend->io_type != IOMAP_READ) | 252 | if (ioend->io_type != IOMAP_READ) { |
| 226 | xfs_setfilesize(ioend); | 253 | error = xfs_setfilesize(ioend); |
| 227 | xfs_destroy_ioend(ioend); | 254 | ASSERT(!error || error == EAGAIN); |
| 228 | } | ||
| 229 | |||
| 230 | /* | ||
| 231 | * Schedule IO completion handling on a xfsdatad if this was | ||
| 232 | * the final hold on this ioend. If we are asked to wait, | ||
| 233 | * flush the workqueue. | ||
| 234 | */ | ||
| 235 | STATIC void | ||
| 236 | xfs_finish_ioend( | ||
| 237 | xfs_ioend_t *ioend, | ||
| 238 | int wait) | ||
| 239 | { | ||
| 240 | if (atomic_dec_and_test(&ioend->io_remaining)) { | ||
| 241 | struct workqueue_struct *wq; | ||
| 242 | |||
| 243 | wq = (ioend->io_type == IOMAP_UNWRITTEN) ? | ||
| 244 | xfsconvertd_workqueue : xfsdatad_workqueue; | ||
| 245 | queue_work(wq, &ioend->io_work); | ||
| 246 | if (wait) | ||
| 247 | flush_workqueue(wq); | ||
| 248 | } | 255 | } |
| 256 | |||
| 257 | /* | ||
| 258 | * If we didn't complete processing of the ioend, requeue it to the | ||
| 259 | * tail of the workqueue for another attempt later. Otherwise destroy | ||
| 260 | * it. | ||
| 261 | */ | ||
| 262 | if (error == EAGAIN) { | ||
| 263 | atomic_inc(&ioend->io_remaining); | ||
| 264 | xfs_finish_ioend(ioend, 0); | ||
| 265 | /* ensure we don't spin on blocked ioends */ | ||
| 266 | delay(1); | ||
| 267 | } else | ||
| 268 | xfs_destroy_ioend(ioend); | ||
| 249 | } | 269 | } |
| 250 | 270 | ||
| 251 | /* | 271 | /* |
| @@ -341,7 +361,7 @@ xfs_submit_ioend_bio( | |||
| 341 | * but don't update the inode size until I/O completion. | 361 | * but don't update the inode size until I/O completion. |
| 342 | */ | 362 | */ |
| 343 | if (xfs_ioend_new_eof(ioend)) | 363 | if (xfs_ioend_new_eof(ioend)) |
| 344 | xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); | 364 | xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); |
| 345 | 365 | ||
| 346 | submit_bio(wbc->sync_mode == WB_SYNC_ALL ? | 366 | submit_bio(wbc->sync_mode == WB_SYNC_ALL ? |
| 347 | WRITE_SYNC_PLUG : WRITE, bio); | 367 | WRITE_SYNC_PLUG : WRITE, bio); |
| @@ -874,6 +894,118 @@ xfs_cluster_write( | |||
| 874 | } | 894 | } |
| 875 | } | 895 | } |
| 876 | 896 | ||
| 897 | STATIC void | ||
| 898 | xfs_vm_invalidatepage( | ||
| 899 | struct page *page, | ||
| 900 | unsigned long offset) | ||
| 901 | { | ||
| 902 | trace_xfs_invalidatepage(page->mapping->host, page, offset); | ||
| 903 | block_invalidatepage(page, offset); | ||
| 904 | } | ||
| 905 | |||
| 906 | /* | ||
| 907 | * If the page has delalloc buffers on it, we need to punch them out before we | ||
| 908 | * invalidate the page. If we don't, we leave a stale delalloc mapping on the | ||
| 909 | * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read | ||
| 910 | * is done on that same region - the delalloc extent is returned when none is | ||
| 911 | * supposed to be there. | ||
| 912 | * | ||
| 913 | * We prevent this by truncating away the delalloc regions on the page before | ||
| 914 | * invalidating it. Because they are delalloc, we can do this without needing a | ||
| 915 | * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this | ||
| 916 | * truncation without a transaction as there is no space left for block | ||
| 917 | * reservation (typically why we see an ENOSPC in writeback). | ||
| 918 | * | ||
| 919 | * This is not a performance critical path, so for now just do the punching a | ||
| 920 | * buffer head at a time. | ||
| 921 | */ | ||
| 922 | STATIC void | ||
| 923 | xfs_aops_discard_page( | ||
| 924 | struct page *page) | ||
| 925 | { | ||
| 926 | struct inode *inode = page->mapping->host; | ||
| 927 | struct xfs_inode *ip = XFS_I(inode); | ||
| 928 | struct buffer_head *bh, *head; | ||
| 929 | loff_t offset = page_offset(page); | ||
| 930 | ssize_t len = 1 << inode->i_blkbits; | ||
| 931 | |||
| 932 | if (!xfs_is_delayed_page(page, IOMAP_DELAY)) | ||
| 933 | goto out_invalidate; | ||
| 934 | |||
| 935 | xfs_fs_cmn_err(CE_ALERT, ip->i_mount, | ||
| 936 | "page discard on page %p, inode 0x%llx, offset %llu.", | ||
| 937 | page, ip->i_ino, offset); | ||
| 938 | |||
| 939 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
| 940 | bh = head = page_buffers(page); | ||
| 941 | do { | ||
| 942 | int done; | ||
| 943 | xfs_fileoff_t offset_fsb; | ||
| 944 | xfs_bmbt_irec_t imap; | ||
| 945 | int nimaps = 1; | ||
| 946 | int error; | ||
| 947 | xfs_fsblock_t firstblock; | ||
| 948 | xfs_bmap_free_t flist; | ||
| 949 | |||
| 950 | if (!buffer_delay(bh)) | ||
| 951 | goto next_buffer; | ||
| 952 | |||
| 953 | offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); | ||
| 954 | |||
| 955 | /* | ||
| 956 | * Map the range first and check that it is a delalloc extent | ||
| 957 | * before trying to unmap the range. Otherwise we will be | ||
| 958 | * trying to remove a real extent (which requires a | ||
| 959 | * transaction) or a hole, which is probably a bad idea... | ||
| 960 | */ | ||
| 961 | error = xfs_bmapi(NULL, ip, offset_fsb, 1, | ||
| 962 | XFS_BMAPI_ENTIRE, NULL, 0, &imap, | ||
| 963 | &nimaps, NULL, NULL); | ||
| 964 | |||
| 965 | if (error) { | ||
| 966 | /* something screwed, just bail */ | ||
| 967 | xfs_fs_cmn_err(CE_ALERT, ip->i_mount, | ||
| 968 | "page discard failed delalloc mapping lookup."); | ||
| 969 | break; | ||
| 970 | } | ||
| 971 | if (!nimaps) { | ||
| 972 | /* nothing there */ | ||
| 973 | goto next_buffer; | ||
| 974 | } | ||
| 975 | if (imap.br_startblock != DELAYSTARTBLOCK) { | ||
| 976 | /* been converted, ignore */ | ||
| 977 | goto next_buffer; | ||
| 978 | } | ||
| 979 | WARN_ON(imap.br_blockcount == 0); | ||
| 980 | |||
| 981 | /* | ||
| 982 | * Note: while we initialise the firstblock/flist pair, they | ||
| 983 | * should never be used because blocks should never be | ||
| 984 | * allocated or freed for a delalloc extent and hence we don't | ||
| 985 | * need to cancel or finish them after the xfs_bunmapi() call. | ||
| 986 | */ | ||
| 987 | xfs_bmap_init(&flist, &firstblock); | ||
| 988 | error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, | ||
| 989 | &flist, NULL, &done); | ||
| 990 | |||
| 991 | ASSERT(!flist.xbf_count && !flist.xbf_first); | ||
| 992 | if (error) { | ||
| 993 | /* something screwed, just bail */ | ||
| 994 | xfs_fs_cmn_err(CE_ALERT, ip->i_mount, | ||
| 995 | "page discard unable to remove delalloc mapping."); | ||
| 996 | break; | ||
| 997 | } | ||
| 998 | next_buffer: | ||
| 999 | offset += len; | ||
| 1000 | |||
| 1001 | } while ((bh = bh->b_this_page) != head); | ||
| 1002 | |||
| 1003 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
| 1004 | out_invalidate: | ||
| 1005 | xfs_vm_invalidatepage(page, 0); | ||
| 1006 | return; | ||
| 1007 | } | ||
| 1008 | |||
| 877 | /* | 1009 | /* |
| 878 | * Calling this without startio set means we are being asked to make a dirty | 1010 | * Calling this without startio set means we are being asked to make a dirty |
| 879 | * page ready for freeing it's buffers. When called with startio set then | 1011 | * page ready for freeing it's buffers. When called with startio set then |
| @@ -1125,7 +1257,7 @@ error: | |||
| 1125 | */ | 1257 | */ |
| 1126 | if (err != -EAGAIN) { | 1258 | if (err != -EAGAIN) { |
| 1127 | if (!unmapped) | 1259 | if (!unmapped) |
| 1128 | block_invalidatepage(page, 0); | 1260 | xfs_aops_discard_page(page); |
| 1129 | ClearPageUptodate(page); | 1261 | ClearPageUptodate(page); |
| 1130 | } | 1262 | } |
| 1131 | return err; | 1263 | return err; |
| @@ -1535,15 +1667,6 @@ xfs_vm_readpages( | |||
| 1535 | return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); | 1667 | return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); |
| 1536 | } | 1668 | } |
| 1537 | 1669 | ||
| 1538 | STATIC void | ||
| 1539 | xfs_vm_invalidatepage( | ||
| 1540 | struct page *page, | ||
| 1541 | unsigned long offset) | ||
| 1542 | { | ||
| 1543 | trace_xfs_invalidatepage(page->mapping->host, page, offset); | ||
| 1544 | block_invalidatepage(page, offset); | ||
| 1545 | } | ||
| 1546 | |||
| 1547 | const struct address_space_operations xfs_address_space_operations = { | 1670 | const struct address_space_operations xfs_address_space_operations = { |
| 1548 | .readpage = xfs_vm_readpage, | 1671 | .readpage = xfs_vm_readpage, |
| 1549 | .readpages = xfs_vm_readpages, | 1672 | .readpages = xfs_vm_readpages, |
