aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMark Fasheh <mark.fasheh@oracle.com>2006-04-21 16:49:02 -0400
committerMark Fasheh <mark.fasheh@oracle.com>2006-09-20 18:53:40 -0400
commitaa9588741db907785e4d92c8b768dd6c9077e6f0 (patch)
treed34da288a9d296a8a2ba19dfa0f1df8429bd3e33
parente0b4096d34fbd6b30838c417100c9d0ef73c71f2 (diff)
ocfs2: implement directory read-ahead
Uptodate.c now knows about read-ahead buffers. Use some more aggressive logic in ocfs2_readdir(). The two functions which currently use directory read-ahead are ocfs2_find_entry() and ocfs2_readdir(). Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
-rw-r--r--fs/ocfs2/buffer_head_io.c95
-rw-r--r--fs/ocfs2/buffer_head_io.h2
-rw-r--r--fs/ocfs2/dir.c28
-rw-r--r--fs/ocfs2/inode.c4
-rw-r--r--fs/ocfs2/namei.c10
-rw-r--r--fs/ocfs2/uptodate.c21
-rw-r--r--fs/ocfs2/uptodate.h2
7 files changed, 115 insertions, 47 deletions
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 9a24adf9be6e..c9037414f4f6 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -100,6 +100,9 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
100 mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n", 100 mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
101 (unsigned long long)block, nr, flags, inode); 101 (unsigned long long)block, nr, flags, inode);
102 102
103 BUG_ON((flags & OCFS2_BH_READAHEAD) &&
104 (!inode || !(flags & OCFS2_BH_CACHED)));
105
103 if (osb == NULL || osb->sb == NULL || bhs == NULL) { 106 if (osb == NULL || osb->sb == NULL || bhs == NULL) {
104 status = -EINVAL; 107 status = -EINVAL;
105 mlog_errno(status); 108 mlog_errno(status);
@@ -140,6 +143,30 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
140 bh = bhs[i]; 143 bh = bhs[i];
141 ignore_cache = 0; 144 ignore_cache = 0;
142 145
146 /* There are three read-ahead cases here which we need to
147 * be concerned with. All three assume a buffer has
148 * previously been submitted with OCFS2_BH_READAHEAD
149 * and it hasn't yet completed I/O.
150 *
151 * 1) The current request is sync to disk. This rarely
152 * happens these days, and never when performance
153 * matters - the code can just wait on the buffer
154 * lock and re-submit.
155 *
156 * 2) The current request is cached, but not
157 * readahead. ocfs2_buffer_uptodate() will return
158 * false anyway, so we'll wind up waiting on the
159 * buffer lock to do I/O. We re-check the request
160 * after getting the lock to avoid a re-submit.
161 *
162 * 3) The current request is readahead (and so must
163 * also be a caching one). We short circuit if the
164 * buffer is locked (under I/O) and if it's in the
165 * uptodate cache. The re-check from #2 catches the
166 * case that the previous read-ahead completes just
167 * before our is-it-in-flight check.
168 */
169
143 if (flags & OCFS2_BH_CACHED && 170 if (flags & OCFS2_BH_CACHED &&
144 !ocfs2_buffer_uptodate(inode, bh)) { 171 !ocfs2_buffer_uptodate(inode, bh)) {
145 mlog(ML_UPTODATE, 172 mlog(ML_UPTODATE,
@@ -169,6 +196,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
169 continue; 196 continue;
170 } 197 }
171 198
199 /* A read-ahead request was made - if the
200 * buffer is already under read-ahead from a
201 * previously submitted request then we are
202 * done here. */
203 if ((flags & OCFS2_BH_READAHEAD)
204 && ocfs2_buffer_read_ahead(inode, bh))
205 continue;
206
172 lock_buffer(bh); 207 lock_buffer(bh);
173 if (buffer_jbd(bh)) { 208 if (buffer_jbd(bh)) {
174#ifdef CATCH_BH_JBD_RACES 209#ifdef CATCH_BH_JBD_RACES
@@ -181,13 +216,22 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
181 continue; 216 continue;
182#endif 217#endif
183 } 218 }
219
220 /* Re-check ocfs2_buffer_uptodate() as a
221 * previously read-ahead buffer may have
222 * completed I/O while we were waiting for the
223 * buffer lock. */
224 if ((flags & OCFS2_BH_CACHED)
225 && !(flags & OCFS2_BH_READAHEAD)
226 && ocfs2_buffer_uptodate(inode, bh)) {
227 unlock_buffer(bh);
228 continue;
229 }
230
184 clear_buffer_uptodate(bh); 231 clear_buffer_uptodate(bh);
185 get_bh(bh); /* for end_buffer_read_sync() */ 232 get_bh(bh); /* for end_buffer_read_sync() */
186 bh->b_end_io = end_buffer_read_sync; 233 bh->b_end_io = end_buffer_read_sync;
187 if (flags & OCFS2_BH_READAHEAD) 234 submit_bh(READ, bh);
188 submit_bh(READA, bh);
189 else
190 submit_bh(READ, bh);
191 continue; 235 continue;
192 } 236 }
193 } 237 }
@@ -197,34 +241,39 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
197 for (i = (nr - 1); i >= 0; i--) { 241 for (i = (nr - 1); i >= 0; i--) {
198 bh = bhs[i]; 242 bh = bhs[i];
199 243
200 /* We know this can't have changed as we hold the 244 if (!(flags & OCFS2_BH_READAHEAD)) {
201 * inode sem. Avoid doing any work on the bh if the 245 /* We know this can't have changed as we hold the
202 * journal has it. */ 246 * inode sem. Avoid doing any work on the bh if the
203 if (!buffer_jbd(bh)) 247 * journal has it. */
204 wait_on_buffer(bh); 248 if (!buffer_jbd(bh))
205 249 wait_on_buffer(bh);
206 if (!buffer_uptodate(bh)) { 250
207 /* Status won't be cleared from here on out, 251 if (!buffer_uptodate(bh)) {
208 * so we can safely record this and loop back 252 /* Status won't be cleared from here on out,
209 * to cleanup the other buffers. Don't need to 253 * so we can safely record this and loop back
210 * remove the clustered uptodate information 254 * to cleanup the other buffers. Don't need to
211 * for this bh as it's not marked locally 255 * remove the clustered uptodate information
212 * uptodate. */ 256 * for this bh as it's not marked locally
213 status = -EIO; 257 * uptodate. */
214 brelse(bh); 258 status = -EIO;
215 bhs[i] = NULL; 259 brelse(bh);
216 continue; 260 bhs[i] = NULL;
261 continue;
262 }
217 } 263 }
218 264
265 /* Always set the buffer in the cache, even if it was
266 * a forced read, or read-ahead which hasn't yet
267 * completed. */
219 if (inode) 268 if (inode)
220 ocfs2_set_buffer_uptodate(inode, bh); 269 ocfs2_set_buffer_uptodate(inode, bh);
221 } 270 }
222 if (inode) 271 if (inode)
223 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 272 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
224 273
225 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s\n", 274 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
226 (unsigned long long)block, nr, 275 (unsigned long long)block, nr,
227 (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes"); 276 (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags);
228 277
229bail: 278bail:
230 279
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 6ecb90937b68..6cc20930fac3 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -49,7 +49,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb,
49 49
50 50
51#define OCFS2_BH_CACHED 1 51#define OCFS2_BH_CACHED 1
52#define OCFS2_BH_READAHEAD 8 /* use this to pass READA down to submit_bh */ 52#define OCFS2_BH_READAHEAD 8
53 53
54static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, 54static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
55 struct buffer_head **bh, int flags, 55 struct buffer_head **bh, int flags,
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 3d494d1a5f36..04e01915b86e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -74,14 +74,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
74int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) 74int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
75{ 75{
76 int error = 0; 76 int error = 0;
77 unsigned long offset, blk; 77 unsigned long offset, blk, last_ra_blk = 0;
78 int i, num, stored; 78 int i, stored;
79 struct buffer_head * bh, * tmp; 79 struct buffer_head * bh, * tmp;
80 struct ocfs2_dir_entry * de; 80 struct ocfs2_dir_entry * de;
81 int err; 81 int err;
82 struct inode *inode = filp->f_dentry->d_inode; 82 struct inode *inode = filp->f_dentry->d_inode;
83 struct super_block * sb = inode->i_sb; 83 struct super_block * sb = inode->i_sb;
84 int have_disk_lock = 0; 84 unsigned int ra_sectors = 16;
85 85
86 mlog_entry("dirino=%llu\n", 86 mlog_entry("dirino=%llu\n",
87 (unsigned long long)OCFS2_I(inode)->ip_blkno); 87 (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -95,9 +95,8 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
95 mlog_errno(error); 95 mlog_errno(error);
96 /* we haven't got any yet, so propagate the error. */ 96 /* we haven't got any yet, so propagate the error. */
97 stored = error; 97 stored = error;
98 goto bail; 98 goto bail_nolock;
99 } 99 }
100 have_disk_lock = 1;
101 100
102 offset = filp->f_pos & (sb->s_blocksize - 1); 101 offset = filp->f_pos & (sb->s_blocksize - 1);
103 102
@@ -113,16 +112,21 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
113 continue; 112 continue;
114 } 113 }
115 114
116 /* 115 /* The idea here is to begin with 8k read-ahead and to stay
117 * Do the readahead (8k) 116 * 4k ahead of our current position.
118 */ 117 *
119 if (!offset) { 118 * TODO: Use the pagecache for this. We just need to
120 for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0; 119 * make sure it's cluster-safe... */
120 if (!last_ra_blk
121 || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
122 for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
121 i > 0; i--) { 123 i > 0; i--) {
122 tmp = ocfs2_bread(inode, ++blk, &err, 1); 124 tmp = ocfs2_bread(inode, ++blk, &err, 1);
123 if (tmp) 125 if (tmp)
124 brelse(tmp); 126 brelse(tmp);
125 } 127 }
128 last_ra_blk = blk;
129 ra_sectors = 8;
126 } 130 }
127 131
128revalidate: 132revalidate:
@@ -194,9 +198,9 @@ revalidate:
194 198
195 stored = 0; 199 stored = 0;
196bail: 200bail:
197 if (have_disk_lock) 201 ocfs2_meta_unlock(inode, 0);
198 ocfs2_meta_unlock(inode, 0);
199 202
203bail_nolock:
200 mlog_exit(stored); 204 mlog_exit(stored);
201 205
202 return stored; 206 return stored;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 3f496c41fea8..7bcf69154592 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1050,12 +1050,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
1050 u64 p_blkno; 1050 u64 p_blkno;
1051 int readflags = OCFS2_BH_CACHED; 1051 int readflags = OCFS2_BH_CACHED;
1052 1052
1053#if 0
1054 /* only turn this on if we know we can deal with read_block
1055 * returning nothing */
1056 if (reada) 1053 if (reada)
1057 readflags |= OCFS2_BH_READAHEAD; 1054 readflags |= OCFS2_BH_READAHEAD;
1058#endif
1059 1055
1060 if (((u64)block << inode->i_sb->s_blocksize_bits) >= 1056 if (((u64)block << inode->i_sb->s_blocksize_bits) >=
1061 i_size_read(inode)) { 1057 i_size_read(inode)) {
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 24126476a8cc..0d3e939b1f56 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -56,6 +56,7 @@
56#include "journal.h" 56#include "journal.h"
57#include "namei.h" 57#include "namei.h"
58#include "suballoc.h" 58#include "suballoc.h"
59#include "super.h"
59#include "symlink.h" 60#include "symlink.h"
60#include "sysfile.h" 61#include "sysfile.h"
61#include "uptodate.h" 62#include "uptodate.h"
@@ -1962,13 +1963,8 @@ restart:
1962 } 1963 }
1963 num++; 1964 num++;
1964 1965
1965 /* XXX: questionable readahead stuff here */
1966 bh = ocfs2_bread(dir, b++, &err, 1); 1966 bh = ocfs2_bread(dir, b++, &err, 1);
1967 bh_use[ra_max] = bh; 1967 bh_use[ra_max] = bh;
1968#if 0 // ???
1969 if (bh)
1970 ll_rw_block(READ, 1, &bh);
1971#endif
1972 } 1968 }
1973 } 1969 }
1974 if ((bh = bh_use[ra_ptr++]) == NULL) 1970 if ((bh = bh_use[ra_ptr++]) == NULL)
@@ -1976,6 +1972,10 @@ restart:
1976 wait_on_buffer(bh); 1972 wait_on_buffer(bh);
1977 if (!buffer_uptodate(bh)) { 1973 if (!buffer_uptodate(bh)) {
1978 /* read error, skip block & hope for the best */ 1974 /* read error, skip block & hope for the best */
1975 ocfs2_error(dir->i_sb, "reading directory %llu, "
1976 "offset %lu\n",
1977 (unsigned long long)OCFS2_I(dir)->ip_blkno,
1978 block);
1979 brelse(bh); 1979 brelse(bh);
1980 goto next; 1980 goto next;
1981 } 1981 }
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index b8a00a793326..9707ed7a3206 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -206,7 +206,10 @@ static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
206} 206}
207 207
208/* Warning: even if it returns true, this does *not* guarantee that 208/* Warning: even if it returns true, this does *not* guarantee that
209 * the block is stored in our inode metadata cache. */ 209 * the block is stored in our inode metadata cache.
210 *
211 * This can be called under lock_buffer()
212 */
210int ocfs2_buffer_uptodate(struct inode *inode, 213int ocfs2_buffer_uptodate(struct inode *inode,
211 struct buffer_head *bh) 214 struct buffer_head *bh)
212{ 215{
@@ -226,6 +229,16 @@ int ocfs2_buffer_uptodate(struct inode *inode,
226 return ocfs2_buffer_cached(OCFS2_I(inode), bh); 229 return ocfs2_buffer_cached(OCFS2_I(inode), bh);
227} 230}
228 231
232/*
233 * Determine whether a buffer is currently out on a read-ahead request.
234 * ip_io_mutex should be held to serialize submitters with the logic here.
235 */
236int ocfs2_buffer_read_ahead(struct inode *inode,
237 struct buffer_head *bh)
238{
239 return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh);
240}
241
229/* Requires ip_lock */ 242/* Requires ip_lock */
230static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, 243static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
231 sector_t block) 244 sector_t block)
@@ -403,7 +416,11 @@ out_free:
403 * 416 *
404 * Note that this function may actually fail to insert the block if 417 * Note that this function may actually fail to insert the block if
405 * memory cannot be allocated. This is not fatal however (but may 418 * memory cannot be allocated. This is not fatal however (but may
406 * result in a performance penalty) */ 419 * result in a performance penalty)
420 *
421 * Readahead buffers can be passed in here before the I/O request is
422 * completed.
423 */
407void ocfs2_set_buffer_uptodate(struct inode *inode, 424void ocfs2_set_buffer_uptodate(struct inode *inode,
408 struct buffer_head *bh) 425 struct buffer_head *bh)
409{ 426{
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 01cd32d26b06..2e73206059a8 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -40,5 +40,7 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
40 struct buffer_head *bh); 40 struct buffer_head *bh);
41void ocfs2_remove_from_cache(struct inode *inode, 41void ocfs2_remove_from_cache(struct inode *inode,
42 struct buffer_head *bh); 42 struct buffer_head *bh);
43int ocfs2_buffer_read_ahead(struct inode *inode,
44 struct buffer_head *bh);
43 45
44#endif /* OCFS2_UPTODATE_H */ 46#endif /* OCFS2_UPTODATE_H */