[GFS2] Fix DIO deadlock

This patch fixes Red Hat bugzilla #212627 in which a deadlock occurs due to trying to take the i_mutex while holding a glock. The correct locking order is defined as i_mutex -> glock in all cases. I've left dealing with allocating writes for a later patch. I know that we need to do that, but for now this should do the trick. We don't need to take the i_mutex on write, because the VFS has already taken it for us. On read we don't need it since the glock is enough protection. The reason that I've made some of the checks into a separate function is that we'll need to do the checks again in the allocating write case eventually, so this is partly in preparation for this. Likewise the return value test of != 1 might look a bit odd, and that's because we'll need a third return value in case of requiring an allocation. I've changed the glock to deferred mode to ensure that read caches on other nodes are flushed. I notice that (using blktrace to look at what's going on) we appear to do a better job of large I/Os than ext3 after this patch (in terms of not splitting up the I/Os). Signed-off-by: Steven Whitehouse <swhiteho@redhat.com> Cc: Wendy Cheng <wcheng@redhat.com>
author: Steven Whitehouse <swhiteho@redhat.com> 2006-12-14 13:24:26 -0500
committer: Steven Whitehouse <swhiteho@redhat.com> 2007-02-05 13:36:09 -0500
commit: c7b3383437ff41781964d1bf7f40ff8d7dd5bc47 (patch)
tree: dbe0e6cb267bfe1dc5f52ee6c30529b55ac58a16 /fs/gfs2
parent: 927255f0383342f5d49b82adb6689b9cba52a6f5 (diff)
1 files changed, 45 insertions, 29 deletions
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index d8d69a72a10d..0118aa439c1d 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -594,6 +594,36 @@ static void gfs2_invalidatepage(struct page *page, unsigned long offset)
        return;
 }
+/**
+ * gfs2_ok_for_dio - check that dio is valid on this file
+ * @ip: The inode
+ * @rw: READ or WRITE
+ * @offset: The offset at which we are reading or writing
+ *
+ * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
+ *          1 (to accept the i/o request)
+ */
+static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
+{
+        /*
+         * Should we return an error here? I can't see that O_DIRECT for
+         * a journaled file makes any sense. For now we'll silently fall
+         * back to buffered I/O, likewise we do the same for stuffed
+         * files since they are (a) small and (b) unaligned.
+         */
+        if (gfs2_is_jdata(ip))
+                return 0;
+        if (gfs2_is_stuffed(ip))
+                return 0;
+        if (offset > i_size_read(&ip->i_inode))
+                return 0;
+        return 1;
+}
 static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
                              const struct iovec *iov, loff_t offset,
                              unsigned long nr_segs)
@@ -604,42 +634,28 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
        struct gfs2_holder gh;
        int rv;
-        if (rw == READ)
-                mutex_lock(&inode->i_mutex);
        /*
-         * Shared lock, even if its a write, since we do no allocation
+         * Deferred lock, even if its a write, since we do no allocation
-         * on this path. All we need change is atime.
+         * on this path. All we need change is atime, and this lock mode
+         * ensures that other nodes have flushed their buffered read caches
+         * (i.e. their page cache entries for this inode). We do not,
+         * unfortunately have the option of only flushing a range like
+         * the VFS does.
         */
-        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+        gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh);
        rv = gfs2_glock_nq_atime(&gh);
        if (rv)
-                goto out;
+                return rv;
+        rv = gfs2_ok_for_dio(ip, rw, offset);
-        if (offset > i_size_read(inode))
+        if (rv != 1)
-                goto out;
+                goto out; /* dio not valid, fall back to buffered i/o */
-        /*
+        rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev,
-         * Should we return an error here? I can't see that O_DIRECT for
+                                           iov, offset, nr_segs,
-         * a journaled file makes any sense. For now we'll silently fall
+                                           gfs2_get_block_direct, NULL);
-         * back to buffered I/O, likewise we do the same for stuffed
-         * files since they are (a) small and (b) unaligned.
-         */
-        if (gfs2_is_jdata(ip))
-                goto out;
-        if (gfs2_is_stuffed(ip))
-                goto out;
-        rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
-                                            inode->i_sb->s_bdev,
-                                            iov, offset, nr_segs,
-                                            gfs2_get_block_direct, NULL);
 out:
        gfs2_glock_dq_m(1, &gh);
        gfs2_holder_uninit(&gh);
-        if (rw == READ)
-                mutex_unlock(&inode->i_mutex);
        return rv;
 }
author	Steven Whitehouse <swhiteho@redhat.com>	2006-12-14 13:24:26 -0500
committer	Steven Whitehouse <swhiteho@redhat.com>	2007-02-05 13:36:09 -0500
commit	c7b3383437ff41781964d1bf7f40ff8d7dd5bc47 (patch)
tree	dbe0e6cb267bfe1dc5f52ee6c30529b55ac58a16 /fs/gfs2
parent	927255f0383342f5d49b82adb6689b9cba52a6f5 (diff)

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index d8d69a72a10d..0118aa439c1d 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c
@@ -594,6 +594,36 @@ static void gfs2_invalidatepage(struct page *page, unsigned long offset)
594	return;	594	return;
595	}	595	}
596		596
		597	/**
		598	* gfs2_ok_for_dio - check that dio is valid on this file
		599	* @ip: The inode
		600	* @rw: READ or WRITE
		601	* @offset: The offset at which we are reading or writing
		602	*
		603	* Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
		604	* 1 (to accept the i/o request)
		605	*/
		606	static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
		607	{
		608	/*
		609	* Should we return an error here? I can't see that O_DIRECT for
		610	* a journaled file makes any sense. For now we'll silently fall
		611	* back to buffered I/O, likewise we do the same for stuffed
		612	* files since they are (a) small and (b) unaligned.
		613	*/
		614	if (gfs2_is_jdata(ip))
		615	return 0;
		616
		617	if (gfs2_is_stuffed(ip))
		618	return 0;
		619
		620	if (offset > i_size_read(&ip->i_inode))
		621	return 0;
		622	return 1;
		623	}
		624
		625
		626
597	static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,	627	static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
598	const struct iovec *iov, loff_t offset,	628	const struct iovec *iov, loff_t offset,
599	unsigned long nr_segs)	629	unsigned long nr_segs)
@@ -604,42 +634,28 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
604	struct gfs2_holder gh;	634	struct gfs2_holder gh;
605	int rv;	635	int rv;
606		636
607	if (rw == READ)
608	mutex_lock(&inode->i_mutex);
609	/*	637	/*
610	* Shared lock, even if its a write, since we do no allocation	638	* Deferred lock, even if its a write, since we do no allocation
611	* on this path. All we need change is atime.	639	* on this path. All we need change is atime, and this lock mode
		640	* ensures that other nodes have flushed their buffered read caches
		641	* (i.e. their page cache entries for this inode). We do not,
		642	* unfortunately have the option of only flushing a range like
		643	* the VFS does.
612	*/	644	*/
613	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);	645	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh);
614	rv = gfs2_glock_nq_atime(&gh);	646	rv = gfs2_glock_nq_atime(&gh);
615	if (rv)	647	if (rv)
616	goto out;	648	return rv;
617		649	rv = gfs2_ok_for_dio(ip, rw, offset);
618	if (offset > i_size_read(inode))	650	if (rv != 1)
619	goto out;	651	goto out; /* dio not valid, fall back to buffered i/o */
620		652
621	/*	653	rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev,
622	* Should we return an error here? I can't see that O_DIRECT for	654	iov, offset, nr_segs,
623	* a journaled file makes any sense. For now we'll silently fall	655	gfs2_get_block_direct, NULL);
624	* back to buffered I/O, likewise we do the same for stuffed
625	* files since they are (a) small and (b) unaligned.
626	*/
627	if (gfs2_is_jdata(ip))
628	goto out;
629
630	if (gfs2_is_stuffed(ip))
631	goto out;
632
633	rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
634	inode->i_sb->s_bdev,
635	iov, offset, nr_segs,
636	gfs2_get_block_direct, NULL);
637	out:	656	out:
638	gfs2_glock_dq_m(1, &gh);	657	gfs2_glock_dq_m(1, &gh);
639	gfs2_holder_uninit(&gh);	658	gfs2_holder_uninit(&gh);
640	if (rw == READ)
641	mutex_unlock(&inode->i_mutex);
642
643	return rv;	659	return rv;
644	}	660	}
645		661