diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2006-04-21 16:49:02 -0400 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2006-09-20 18:53:40 -0400 |
commit | aa9588741db907785e4d92c8b768dd6c9077e6f0 (patch) | |
tree | d34da288a9d296a8a2ba19dfa0f1df8429bd3e33 | |
parent | e0b4096d34fbd6b30838c417100c9d0ef73c71f2 (diff) |
ocfs2: implement directory read-ahead
Uptodate.c now knows about read-ahead buffers. Use some more aggressive
logic in ocfs2_readdir().
The two functions which currently use directory read-ahead are
ocfs2_find_entry() and ocfs2_readdir().
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
-rw-r--r-- | fs/ocfs2/buffer_head_io.c | 95 | ||||
-rw-r--r-- | fs/ocfs2/buffer_head_io.h | 2 | ||||
-rw-r--r-- | fs/ocfs2/dir.c | 28 | ||||
-rw-r--r-- | fs/ocfs2/inode.c | 4 | ||||
-rw-r--r-- | fs/ocfs2/namei.c | 10 | ||||
-rw-r--r-- | fs/ocfs2/uptodate.c | 21 | ||||
-rw-r--r-- | fs/ocfs2/uptodate.h | 2 |
7 files changed, 115 insertions, 47 deletions
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 9a24adf9be6e..c9037414f4f6 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c | |||
@@ -100,6 +100,9 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, | |||
100 | mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n", | 100 | mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n", |
101 | (unsigned long long)block, nr, flags, inode); | 101 | (unsigned long long)block, nr, flags, inode); |
102 | 102 | ||
103 | BUG_ON((flags & OCFS2_BH_READAHEAD) && | ||
104 | (!inode || !(flags & OCFS2_BH_CACHED))); | ||
105 | |||
103 | if (osb == NULL || osb->sb == NULL || bhs == NULL) { | 106 | if (osb == NULL || osb->sb == NULL || bhs == NULL) { |
104 | status = -EINVAL; | 107 | status = -EINVAL; |
105 | mlog_errno(status); | 108 | mlog_errno(status); |
@@ -140,6 +143,30 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, | |||
140 | bh = bhs[i]; | 143 | bh = bhs[i]; |
141 | ignore_cache = 0; | 144 | ignore_cache = 0; |
142 | 145 | ||
146 | /* There are three read-ahead cases here which we need to | ||
147 | * be concerned with. All three assume a buffer has | ||
148 | * previously been submitted with OCFS2_BH_READAHEAD | ||
149 | * and it hasn't yet completed I/O. | ||
150 | * | ||
151 | * 1) The current request is sync to disk. This rarely | ||
152 | * happens these days, and never when performance | ||
153 | * matters - the code can just wait on the buffer | ||
154 | * lock and re-submit. | ||
155 | * | ||
156 | * 2) The current request is cached, but not | ||
157 | * readahead. ocfs2_buffer_uptodate() will return | ||
158 | * false anyway, so we'll wind up waiting on the | ||
159 | * buffer lock to do I/O. We re-check the request | ||
160 | | | * after getting the lock to avoid a re-submit. | ||
161 | * | ||
162 | * 3) The current request is readahead (and so must | ||
163 | * also be a caching one). We short circuit if the | ||
164 | * buffer is locked (under I/O) and if it's in the | ||
165 | * uptodate cache. The re-check from #2 catches the | ||
166 | * case that the previous read-ahead completes just | ||
167 | * before our is-it-in-flight check. | ||
168 | */ | ||
169 | |||
143 | if (flags & OCFS2_BH_CACHED && | 170 | if (flags & OCFS2_BH_CACHED && |
144 | !ocfs2_buffer_uptodate(inode, bh)) { | 171 | !ocfs2_buffer_uptodate(inode, bh)) { |
145 | mlog(ML_UPTODATE, | 172 | mlog(ML_UPTODATE, |
@@ -169,6 +196,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, | |||
169 | continue; | 196 | continue; |
170 | } | 197 | } |
171 | 198 | ||
199 | /* A read-ahead request was made - if the | ||
200 | * buffer is already under read-ahead from a | ||
201 | | | * previously submitted request then we are | ||
202 | * done here. */ | ||
203 | if ((flags & OCFS2_BH_READAHEAD) | ||
204 | && ocfs2_buffer_read_ahead(inode, bh)) | ||
205 | continue; | ||
206 | |||
172 | lock_buffer(bh); | 207 | lock_buffer(bh); |
173 | if (buffer_jbd(bh)) { | 208 | if (buffer_jbd(bh)) { |
174 | #ifdef CATCH_BH_JBD_RACES | 209 | #ifdef CATCH_BH_JBD_RACES |
@@ -181,13 +216,22 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, | |||
181 | continue; | 216 | continue; |
182 | #endif | 217 | #endif |
183 | } | 218 | } |
219 | |||
220 | /* Re-check ocfs2_buffer_uptodate() as a | ||
221 | * previously read-ahead buffer may have | ||
222 | * completed I/O while we were waiting for the | ||
223 | * buffer lock. */ | ||
224 | if ((flags & OCFS2_BH_CACHED) | ||
225 | && !(flags & OCFS2_BH_READAHEAD) | ||
226 | && ocfs2_buffer_uptodate(inode, bh)) { | ||
227 | unlock_buffer(bh); | ||
228 | continue; | ||
229 | } | ||
230 | |||
184 | clear_buffer_uptodate(bh); | 231 | clear_buffer_uptodate(bh); |
185 | get_bh(bh); /* for end_buffer_read_sync() */ | 232 | get_bh(bh); /* for end_buffer_read_sync() */ |
186 | bh->b_end_io = end_buffer_read_sync; | 233 | bh->b_end_io = end_buffer_read_sync; |
187 | if (flags & OCFS2_BH_READAHEAD) | 234 | submit_bh(READ, bh); |
188 | submit_bh(READA, bh); | ||
189 | else | ||
190 | submit_bh(READ, bh); | ||
191 | continue; | 235 | continue; |
192 | } | 236 | } |
193 | } | 237 | } |
@@ -197,34 +241,39 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, | |||
197 | for (i = (nr - 1); i >= 0; i--) { | 241 | for (i = (nr - 1); i >= 0; i--) { |
198 | bh = bhs[i]; | 242 | bh = bhs[i]; |
199 | 243 | ||
200 | /* We know this can't have changed as we hold the | 244 | if (!(flags & OCFS2_BH_READAHEAD)) { |
201 | * inode sem. Avoid doing any work on the bh if the | 245 | /* We know this can't have changed as we hold the |
202 | * journal has it. */ | 246 | * inode sem. Avoid doing any work on the bh if the |
203 | if (!buffer_jbd(bh)) | 247 | * journal has it. */ |
204 | wait_on_buffer(bh); | 248 | if (!buffer_jbd(bh)) |
205 | 249 | wait_on_buffer(bh); | |
206 | if (!buffer_uptodate(bh)) { | 250 | |
207 | /* Status won't be cleared from here on out, | 251 | if (!buffer_uptodate(bh)) { |
208 | * so we can safely record this and loop back | 252 | /* Status won't be cleared from here on out, |
209 | * to cleanup the other buffers. Don't need to | 253 | * so we can safely record this and loop back |
210 | * remove the clustered uptodate information | 254 | * to cleanup the other buffers. Don't need to |
211 | * for this bh as it's not marked locally | 255 | * remove the clustered uptodate information |
212 | * uptodate. */ | 256 | * for this bh as it's not marked locally |
213 | status = -EIO; | 257 | * uptodate. */ |
214 | brelse(bh); | 258 | status = -EIO; |
215 | bhs[i] = NULL; | 259 | brelse(bh); |
216 | continue; | 260 | bhs[i] = NULL; |
261 | continue; | ||
262 | } | ||
217 | } | 263 | } |
218 | 264 | ||
265 | /* Always set the buffer in the cache, even if it was | ||
266 | * a forced read, or read-ahead which hasn't yet | ||
267 | * completed. */ | ||
219 | if (inode) | 268 | if (inode) |
220 | ocfs2_set_buffer_uptodate(inode, bh); | 269 | ocfs2_set_buffer_uptodate(inode, bh); |
221 | } | 270 | } |
222 | if (inode) | 271 | if (inode) |
223 | mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); | 272 | mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); |
224 | 273 | ||
225 | mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s\n", | 274 | mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", |
226 | (unsigned long long)block, nr, | 275 | (unsigned long long)block, nr, |
227 | (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes"); | 276 | (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags); |
228 | 277 | ||
229 | bail: | 278 | bail: |
230 | 279 | ||
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index 6ecb90937b68..6cc20930fac3 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h | |||
@@ -49,7 +49,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, | |||
49 | 49 | ||
50 | 50 | ||
51 | #define OCFS2_BH_CACHED 1 | 51 | #define OCFS2_BH_CACHED 1 |
52 | #define OCFS2_BH_READAHEAD 8 /* use this to pass READA down to submit_bh */ | 52 | #define OCFS2_BH_READAHEAD 8 |
53 | 53 | ||
54 | static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, | 54 | static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, |
55 | struct buffer_head **bh, int flags, | 55 | struct buffer_head **bh, int flags, |
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 3d494d1a5f36..04e01915b86e 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
@@ -74,14 +74,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, | |||
74 | int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) | 74 | int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) |
75 | { | 75 | { |
76 | int error = 0; | 76 | int error = 0; |
77 | unsigned long offset, blk; | 77 | unsigned long offset, blk, last_ra_blk = 0; |
78 | int i, num, stored; | 78 | int i, stored; |
79 | struct buffer_head * bh, * tmp; | 79 | struct buffer_head * bh, * tmp; |
80 | struct ocfs2_dir_entry * de; | 80 | struct ocfs2_dir_entry * de; |
81 | int err; | 81 | int err; |
82 | struct inode *inode = filp->f_dentry->d_inode; | 82 | struct inode *inode = filp->f_dentry->d_inode; |
83 | struct super_block * sb = inode->i_sb; | 83 | struct super_block * sb = inode->i_sb; |
84 | int have_disk_lock = 0; | 84 | unsigned int ra_sectors = 16; |
85 | 85 | ||
86 | mlog_entry("dirino=%llu\n", | 86 | mlog_entry("dirino=%llu\n", |
87 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 87 | (unsigned long long)OCFS2_I(inode)->ip_blkno); |
@@ -95,9 +95,8 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) | |||
95 | mlog_errno(error); | 95 | mlog_errno(error); |
96 | /* we haven't got any yet, so propagate the error. */ | 96 | /* we haven't got any yet, so propagate the error. */ |
97 | stored = error; | 97 | stored = error; |
98 | goto bail; | 98 | goto bail_nolock; |
99 | } | 99 | } |
100 | have_disk_lock = 1; | ||
101 | 100 | ||
102 | offset = filp->f_pos & (sb->s_blocksize - 1); | 101 | offset = filp->f_pos & (sb->s_blocksize - 1); |
103 | 102 | ||
@@ -113,16 +112,21 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) | |||
113 | continue; | 112 | continue; |
114 | } | 113 | } |
115 | 114 | ||
116 | /* | 115 | /* The idea here is to begin with 8k read-ahead and to stay |
117 | * Do the readahead (8k) | 116 | * 4k ahead of our current position. |
118 | */ | 117 | * |
119 | if (!offset) { | 118 | * TODO: Use the pagecache for this. We just need to |
120 | for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0; | 119 | * make sure it's cluster-safe... */ |
120 | if (!last_ra_blk | ||
121 | || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) { | ||
122 | for (i = ra_sectors >> (sb->s_blocksize_bits - 9); | ||
121 | i > 0; i--) { | 123 | i > 0; i--) { |
122 | tmp = ocfs2_bread(inode, ++blk, &err, 1); | 124 | tmp = ocfs2_bread(inode, ++blk, &err, 1); |
123 | if (tmp) | 125 | if (tmp) |
124 | brelse(tmp); | 126 | brelse(tmp); |
125 | } | 127 | } |
128 | last_ra_blk = blk; | ||
129 | ra_sectors = 8; | ||
126 | } | 130 | } |
127 | 131 | ||
128 | revalidate: | 132 | revalidate: |
@@ -194,9 +198,9 @@ revalidate: | |||
194 | 198 | ||
195 | stored = 0; | 199 | stored = 0; |
196 | bail: | 200 | bail: |
197 | if (have_disk_lock) | 201 | ocfs2_meta_unlock(inode, 0); |
198 | ocfs2_meta_unlock(inode, 0); | ||
199 | 202 | ||
203 | bail_nolock: | ||
200 | mlog_exit(stored); | 204 | mlog_exit(stored); |
201 | 205 | ||
202 | return stored; | 206 | return stored; |
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 3f496c41fea8..7bcf69154592 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -1050,12 +1050,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode, | |||
1050 | u64 p_blkno; | 1050 | u64 p_blkno; |
1051 | int readflags = OCFS2_BH_CACHED; | 1051 | int readflags = OCFS2_BH_CACHED; |
1052 | 1052 | ||
1053 | #if 0 | ||
1054 | /* only turn this on if we know we can deal with read_block | ||
1055 | * returning nothing */ | ||
1056 | if (reada) | 1053 | if (reada) |
1057 | readflags |= OCFS2_BH_READAHEAD; | 1054 | readflags |= OCFS2_BH_READAHEAD; |
1058 | #endif | ||
1059 | 1055 | ||
1060 | if (((u64)block << inode->i_sb->s_blocksize_bits) >= | 1056 | if (((u64)block << inode->i_sb->s_blocksize_bits) >= |
1061 | i_size_read(inode)) { | 1057 | i_size_read(inode)) { |
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 24126476a8cc..0d3e939b1f56 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
@@ -56,6 +56,7 @@ | |||
56 | #include "journal.h" | 56 | #include "journal.h" |
57 | #include "namei.h" | 57 | #include "namei.h" |
58 | #include "suballoc.h" | 58 | #include "suballoc.h" |
59 | #include "super.h" | ||
59 | #include "symlink.h" | 60 | #include "symlink.h" |
60 | #include "sysfile.h" | 61 | #include "sysfile.h" |
61 | #include "uptodate.h" | 62 | #include "uptodate.h" |
@@ -1962,13 +1963,8 @@ restart: | |||
1962 | } | 1963 | } |
1963 | num++; | 1964 | num++; |
1964 | 1965 | ||
1965 | /* XXX: questionable readahead stuff here */ | ||
1966 | bh = ocfs2_bread(dir, b++, &err, 1); | 1966 | bh = ocfs2_bread(dir, b++, &err, 1); |
1967 | bh_use[ra_max] = bh; | 1967 | bh_use[ra_max] = bh; |
1968 | #if 0 // ??? | ||
1969 | if (bh) | ||
1970 | ll_rw_block(READ, 1, &bh); | ||
1971 | #endif | ||
1972 | } | 1968 | } |
1973 | } | 1969 | } |
1974 | if ((bh = bh_use[ra_ptr++]) == NULL) | 1970 | if ((bh = bh_use[ra_ptr++]) == NULL) |
@@ -1976,6 +1972,10 @@ restart: | |||
1976 | wait_on_buffer(bh); | 1972 | wait_on_buffer(bh); |
1977 | if (!buffer_uptodate(bh)) { | 1973 | if (!buffer_uptodate(bh)) { |
1978 | /* read error, skip block & hope for the best */ | 1974 | /* read error, skip block & hope for the best */ |
1975 | ocfs2_error(dir->i_sb, "reading directory %llu, " | ||
1976 | "offset %lu\n", | ||
1977 | (unsigned long long)OCFS2_I(dir)->ip_blkno, | ||
1978 | block); | ||
1979 | brelse(bh); | 1979 | brelse(bh); |
1980 | goto next; | 1980 | goto next; |
1981 | } | 1981 | } |
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index b8a00a793326..9707ed7a3206 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c | |||
@@ -206,7 +206,10 @@ static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi, | |||
206 | } | 206 | } |
207 | 207 | ||
208 | /* Warning: even if it returns true, this does *not* guarantee that | 208 | /* Warning: even if it returns true, this does *not* guarantee that |
209 | * the block is stored in our inode metadata cache. */ | 209 | * the block is stored in our inode metadata cache. |
210 | * | ||
211 | * This can be called under lock_buffer() | ||
212 | */ | ||
210 | int ocfs2_buffer_uptodate(struct inode *inode, | 213 | int ocfs2_buffer_uptodate(struct inode *inode, |
211 | struct buffer_head *bh) | 214 | struct buffer_head *bh) |
212 | { | 215 | { |
@@ -226,6 +229,16 @@ int ocfs2_buffer_uptodate(struct inode *inode, | |||
226 | return ocfs2_buffer_cached(OCFS2_I(inode), bh); | 229 | return ocfs2_buffer_cached(OCFS2_I(inode), bh); |
227 | } | 230 | } |
228 | 231 | ||
232 | /* | ||
233 | * Determine whether a buffer is currently out on a read-ahead request. | ||
234 | * ip_io_sem should be held to serialize submitters with the logic here. | ||
235 | */ | ||
236 | int ocfs2_buffer_read_ahead(struct inode *inode, | ||
237 | struct buffer_head *bh) | ||
238 | { | ||
239 | return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh); | ||
240 | } | ||
241 | |||
229 | /* Requires ip_lock */ | 242 | /* Requires ip_lock */ |
230 | static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, | 243 | static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, |
231 | sector_t block) | 244 | sector_t block) |
@@ -403,7 +416,11 @@ out_free: | |||
403 | * | 416 | * |
404 | * Note that this function may actually fail to insert the block if | 417 | * Note that this function may actually fail to insert the block if |
405 | * memory cannot be allocated. This is not fatal however (but may | 418 | * memory cannot be allocated. This is not fatal however (but may |
406 | * result in a performance penalty) */ | 419 | * result in a performance penalty) |
420 | * | ||
421 | * Readahead buffers can be passed in here before the I/O request is | ||
422 | * completed. | ||
423 | */ | ||
407 | void ocfs2_set_buffer_uptodate(struct inode *inode, | 424 | void ocfs2_set_buffer_uptodate(struct inode *inode, |
408 | struct buffer_head *bh) | 425 | struct buffer_head *bh) |
409 | { | 426 | { |
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h index 01cd32d26b06..2e73206059a8 100644 --- a/fs/ocfs2/uptodate.h +++ b/fs/ocfs2/uptodate.h | |||
@@ -40,5 +40,7 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode, | |||
40 | struct buffer_head *bh); | 40 | struct buffer_head *bh); |
41 | void ocfs2_remove_from_cache(struct inode *inode, | 41 | void ocfs2_remove_from_cache(struct inode *inode, |
42 | struct buffer_head *bh); | 42 | struct buffer_head *bh); |
43 | int ocfs2_buffer_read_ahead(struct inode *inode, | ||
44 | struct buffer_head *bh); | ||
43 | 45 | ||
44 | #endif /* OCFS2_UPTODATE_H */ | 46 | #endif /* OCFS2_UPTODATE_H */ |