1 files changed, 72 insertions, 23 deletions
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 9a24adf9be6e..c9037414f4f6 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -100,6 +100,9 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
        mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
                   (unsigned long long)block, nr, flags, inode);
+        BUG_ON((flags & OCFS2_BH_READAHEAD) &&
+               (!inode || !(flags & OCFS2_BH_CACHED)));
        if (osb == NULL || osb->sb == NULL || bhs == NULL) {
                status = -EINVAL;
                mlog_errno(status);
@@ -140,6 +143,30 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
                bh = bhs[i];
                ignore_cache = 0;
+                /* There are three read-ahead cases here which we need to
+                 * be concerned with. All three assume a buffer has
+                 * previously been submitted with OCFS2_BH_READAHEAD
+                 * and it hasn't yet completed I/O.
+                 *
+                 * 1) The current request is sync to disk. This rarely
+                 *    happens these days, and never when performance
+                 *    matters - the code can just wait on the buffer
+                 *    lock and re-submit.
+                 *
+                 * 2) The current request is cached, but not
+                 *    readahead. ocfs2_buffer_uptodate() will return
+                 *    false anyway, so we'll wind up waiting on the
+                 *    buffer lock to do I/O. We re-check the request
+                 *    after getting the lock to avoid a re-submit.
+                 *
+                 * 3) The current request is readahead (and so must
+                 *    also be a caching one). We short circuit if the
+                 *    buffer is locked (under I/O) and if it's in the
+                 *    uptodate cache. The re-check from #2 catches the
+                 *    case that the previous read-ahead completes just
+                 *    before our is-it-in-flight check.
+                 */
                if (flags & OCFS2_BH_CACHED &&
                    !ocfs2_buffer_uptodate(inode, bh)) {
                        mlog(ML_UPTODATE,
@@ -169,6 +196,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
                                continue;
                        }
+                        /* A read-ahead request was made - if the
+                         * buffer is already under read-ahead from a
+                         * previously submitted request then we are
+                         * done here. */
+                        if ((flags & OCFS2_BH_READAHEAD)
+                            && ocfs2_buffer_read_ahead(inode, bh))
+                                continue;
                        lock_buffer(bh);
                        if (buffer_jbd(bh)) {
 #ifdef CATCH_BH_JBD_RACES
@@ -181,13 +216,22 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
                                continue;
 #endif
                        }
+                        /* Re-check ocfs2_buffer_uptodate() as a
+                         * previously read-ahead buffer may have
+                         * completed I/O while we were waiting for the
+                         * buffer lock. */
+                        if ((flags & OCFS2_BH_CACHED)
+                            && !(flags & OCFS2_BH_READAHEAD)
+                            && ocfs2_buffer_uptodate(inode, bh)) {
+                                unlock_buffer(bh);
+                                continue;
+                        }
                        clear_buffer_uptodate(bh);
                        get_bh(bh); /* for end_buffer_read_sync() */
                        bh->b_end_io = end_buffer_read_sync;
-                        if (flags & OCFS2_BH_READAHEAD)
+                        submit_bh(READ, bh);
-                                submit_bh(READA, bh);
-                        else
-                                submit_bh(READ, bh);
                        continue;
                }
        }
@@ -197,34 +241,39 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
        for (i = (nr - 1); i >= 0; i--) {
                bh = bhs[i];
-                /* We know this can't have changed as we hold the
+                if (!(flags & OCFS2_BH_READAHEAD)) {
-                 * inode sem. Avoid doing any work on the bh if the
+                        /* We know this can't have changed as we hold the
-                 * journal has it. */
+                         * inode sem. Avoid doing any work on the bh if the
-                if (!buffer_jbd(bh))
+                         * journal has it. */
-                        wait_on_buffer(bh);
+                        if (!buffer_jbd(bh))
+                                wait_on_buffer(bh);
-                if (!buffer_uptodate(bh)) {
-                        /* Status won't be cleared from here on out,
+                        if (!buffer_uptodate(bh)) {
-                         * so we can safely record this and loop back
+                                /* Status won't be cleared from here on out,
-                         * to cleanup the other buffers. Don't need to
+                                 * so we can safely record this and loop back
-                         * remove the clustered uptodate information
+                                 * to cleanup the other buffers. Don't need to
-                         * for this bh as it's not marked locally
+                                 * remove the clustered uptodate information
-                         * uptodate. */
+                                 * for this bh as it's not marked locally
-                        status = -EIO;
+                                 * uptodate. */
-                        brelse(bh);
+                                status = -EIO;
-                        bhs[i] = NULL;
+                                brelse(bh);
-                        continue;
+                                bhs[i] = NULL;
+                                continue;
+                        }
                }
+                /* Always set the buffer in the cache, even if it was
+                 * a forced read, or read-ahead which hasn't yet
+                 * completed. */
                if (inode)
                        ocfs2_set_buffer_uptodate(inode, bh);
        }
        if (inode)
                mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
-        mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s\n", 
+        mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 
             (unsigned long long)block, nr,
-             (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
+             (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags);
 bail:

diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 9a24adf9be6e..c9037414f4f6 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c
@@ -100,6 +100,9 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
100	mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",	100	mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
101	(unsigned long long)block, nr, flags, inode);	101	(unsigned long long)block, nr, flags, inode);
102		102
		103	BUG_ON((flags & OCFS2_BH_READAHEAD) &&
		104	(!inode \|\| !(flags & OCFS2_BH_CACHED)));
		105
103	if (osb == NULL \|\| osb->sb == NULL \|\| bhs == NULL) {	106	if (osb == NULL \|\| osb->sb == NULL \|\| bhs == NULL) {
104	status = -EINVAL;	107	status = -EINVAL;
105	mlog_errno(status);	108	mlog_errno(status);
@@ -140,6 +143,30 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
140	bh = bhs[i];	143	bh = bhs[i];
141	ignore_cache = 0;	144	ignore_cache = 0;
142		145
		146	/* There are three read-ahead cases here which we need to
		147	* be concerned with. All three assume a buffer has
		148	* previously been submitted with OCFS2_BH_READAHEAD
		149	* and it hasn't yet completed I/O.
		150	*
		151	* 1) The current request is sync to disk. This rarely
		152	* happens these days, and never when performance
		153	* matters - the code can just wait on the buffer
		154	* lock and re-submit.
		155	*
		156	* 2) The current request is cached, but not
		157	* readahead. ocfs2_buffer_uptodate() will return
		158	* false anyway, so we'll wind up waiting on the
		159	* buffer lock to do I/O. We re-check the request
		160	* after getting the lock to avoid a re-submit.
		161	*
		162	* 3) The current request is readahead (and so must
		163	* also be a caching one). We short circuit if the
		164	* buffer is locked (under I/O) and if it's in the
		165	* uptodate cache. The re-check from #2 catches the
		166	* case that the previous read-ahead completes just
		167	* before our is-it-in-flight check.
		168	*/
		169
143	if (flags & OCFS2_BH_CACHED &&	170	if (flags & OCFS2_BH_CACHED &&
144	!ocfs2_buffer_uptodate(inode, bh)) {	171	!ocfs2_buffer_uptodate(inode, bh)) {
145	mlog(ML_UPTODATE,	172	mlog(ML_UPTODATE,
@@ -169,6 +196,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
169	continue;	196	continue;
170	}	197	}
171		198
		199	/* A read-ahead request was made - if the
		200	* buffer is already under read-ahead from a
		201	* previously submitted request then we are
		202	* done here. */
		203	if ((flags & OCFS2_BH_READAHEAD)
		204	&& ocfs2_buffer_read_ahead(inode, bh))
		205	continue;
		206
172	lock_buffer(bh);	207	lock_buffer(bh);
173	if (buffer_jbd(bh)) {	208	if (buffer_jbd(bh)) {
174	#ifdef CATCH_BH_JBD_RACES	209	#ifdef CATCH_BH_JBD_RACES
@@ -181,13 +216,22 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
181	continue;	216	continue;
182	#endif	217	#endif
183	}	218	}
		219
		220	/* Re-check ocfs2_buffer_uptodate() as a
		221	* previously read-ahead buffer may have
		222	* completed I/O while we were waiting for the
		223	* buffer lock. */
		224	if ((flags & OCFS2_BH_CACHED)
		225	&& !(flags & OCFS2_BH_READAHEAD)
		226	&& ocfs2_buffer_uptodate(inode, bh)) {
		227	unlock_buffer(bh);
		228	continue;
		229	}
		230
184	clear_buffer_uptodate(bh);	231	clear_buffer_uptodate(bh);
185	get_bh(bh); /* for end_buffer_read_sync() */	232	get_bh(bh); /* for end_buffer_read_sync() */
186	bh->b_end_io = end_buffer_read_sync;	233	bh->b_end_io = end_buffer_read_sync;
187	if (flags & OCFS2_BH_READAHEAD)	234	submit_bh(READ, bh);
188	submit_bh(READA, bh);
189	else
190	submit_bh(READ, bh);
191	continue;	235	continue;
192	}	236	}
193	}	237	}
@@ -197,34 +241,39 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
197	for (i = (nr - 1); i >= 0; i--) {	241	for (i = (nr - 1); i >= 0; i--) {
198	bh = bhs[i];	242	bh = bhs[i];
199		243
200	/* We know this can't have changed as we hold the	244	if (!(flags & OCFS2_BH_READAHEAD)) {
201	* inode sem. Avoid doing any work on the bh if the	245	/* We know this can't have changed as we hold the
202	* journal has it. */	246	* inode sem. Avoid doing any work on the bh if the
203	if (!buffer_jbd(bh))	247	* journal has it. */
204	wait_on_buffer(bh);	248	if (!buffer_jbd(bh))
205		249	wait_on_buffer(bh);
206	if (!buffer_uptodate(bh)) {	250
207	/* Status won't be cleared from here on out,	251	if (!buffer_uptodate(bh)) {
208	* so we can safely record this and loop back	252	/* Status won't be cleared from here on out,
209	* to cleanup the other buffers. Don't need to	253	* so we can safely record this and loop back
210	* remove the clustered uptodate information	254	* to cleanup the other buffers. Don't need to
211	* for this bh as it's not marked locally	255	* remove the clustered uptodate information
212	* uptodate. */	256	* for this bh as it's not marked locally
213	status = -EIO;	257	* uptodate. */
214	brelse(bh);	258	status = -EIO;
215	bhs[i] = NULL;	259	brelse(bh);
216	continue;	260	bhs[i] = NULL;
		261	continue;
		262	}
217	}	263	}
218		264
		265	/* Always set the buffer in the cache, even if it was
		266	* a forced read, or read-ahead which hasn't yet
		267	* completed. */
219	if (inode)	268	if (inode)
220	ocfs2_set_buffer_uptodate(inode, bh);	269	ocfs2_set_buffer_uptodate(inode, bh);
221	}	270	}
222	if (inode)	271	if (inode)
223	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);	272	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
224		273
225	mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s\n",	274	mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
226	(unsigned long long)block, nr,	275	(unsigned long long)block, nr,
227	(!(flags & OCFS2_BH_CACHED) \|\| ignore_cache) ? "no" : "yes");	276	(!(flags & OCFS2_BH_CACHED) \|\| ignore_cache) ? "no" : "yes", flags);
228		277
229	bail:	278	bail:
230		279