Diffstat (limited to 'mm')
-rw-r--r--	mm/readahead.c	373
1 file changed, 25 insertions, 348 deletions
diff --git a/mm/readahead.c b/mm/readahead.c
index c094e4f5a25..5b3c9b7d70f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -49,82 +49,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 }
 EXPORT_SYMBOL_GPL(file_ra_state_init);
 
-/*
- * Return max readahead size for this inode in number-of-pages.
- */
-static inline unsigned long get_max_readahead(struct file_ra_state *ra)
-{
-	return ra->ra_pages;
-}
-
-static inline unsigned long get_min_readahead(struct file_ra_state *ra)
-{
-	return MIN_RA_PAGES;
-}
-
-static inline void reset_ahead_window(struct file_ra_state *ra)
-{
-	/*
-	 * ... but preserve ahead_start + ahead_size value,
-	 * see 'recheck:' label in page_cache_readahead().
-	 * Note: We never use ->ahead_size as rvalue without
-	 * checking ->ahead_start != 0 first.
-	 */
-	ra->ahead_size += ra->ahead_start;
-	ra->ahead_start = 0;
-}
-
-static inline void ra_off(struct file_ra_state *ra)
-{
-	ra->start = 0;
-	ra->flags = 0;
-	ra->size = 0;
-	reset_ahead_window(ra);
-	return;
-}
-
-/*
- * Set the initial window size, round to next power of 2 and square
- * for small size, x 4 for medium, and x 2 for large
- * for 128k (32 page) max ra
- * 1-8 page = 32k initial, > 8 page = 128k initial
- */
-static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
-{
-	unsigned long newsize = roundup_pow_of_two(size);
-
-	if (newsize <= max / 32)
-		newsize = newsize * 4;
-	else if (newsize <= max / 4)
-		newsize = newsize * 2;
-	else
-		newsize = max;
-	return newsize;
-}
-
-/*
- * Set the new window size; this is called only when I/O is to be submitted,
- * not for each call to readahead.  If a cache miss occurred, reduce the next
- * I/O size, else increase it depending on how close to max we are.
- */
-static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
-{
-	unsigned long max = get_max_readahead(ra);
-	unsigned long min = get_min_readahead(ra);
-	unsigned long cur = ra->size;
-	unsigned long newsize;
-
-	if (ra->flags & RA_FLAG_MISS) {
-		ra->flags &= ~RA_FLAG_MISS;
-		newsize = max((cur - 2), min);
-	} else if (cur < max / 16) {
-		newsize = 4 * cur;
-	} else {
-		newsize = 2 * cur;
-	}
-	return min(newsize, max);
-}
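
For a feel of the policy being removed here: a minimal user-space sketch of the same arithmetic (RA_FLAG_MISS and the minimum are mocked stand-ins; only the growth/shrink rules mirror the get_next_ra_size() above):

#include <stdio.h>

#define RA_FLAG_MISS	0x01
#define MIN_RA_PAGES	2	/* illustrative stand-in for the kernel constant */

static unsigned long toy_next_ra_size(unsigned long cur, unsigned long max,
				      unsigned int *flags)
{
	unsigned long newsize;

	if (*flags & RA_FLAG_MISS) {
		*flags &= ~RA_FLAG_MISS;
		/* shrink by two pages, but never below the minimum */
		newsize = cur > MIN_RA_PAGES + 2 ? cur - 2 : MIN_RA_PAGES;
	} else if (cur < max / 16) {
		newsize = 4 * cur;
	} else {
		newsize = 2 * cur;
	}
	return newsize < max ? newsize : max;
}

int main(void)
{
	unsigned int flags = 0;
	unsigned long size = 4, max = 32;
	int i;

	/* sequential hits double a 4-page window: prints 8 16 32 (capped) */
	for (i = 0; i < 3; i++) {
		size = toy_next_ra_size(size, max, &flags);
		printf("%lu ", size);
	}
	/* a readahead miss then shrinks it by two pages: prints 30 */
	flags |= RA_FLAG_MISS;
	printf("%lu\n", toy_next_ra_size(size, max, &flags));
	return 0;
}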
-
 #define list_to_page(head) (list_entry((head)->prev, struct page, lru))
 
 /**
@@ -201,66 +125,6 @@ out:
 }
 
 /*
- * Readahead design.
- *
- * The fields in struct file_ra_state represent the most-recently-executed
- * readahead attempt:
- *
- * start:	Page index at which we started the readahead
- * size:	Number of pages in that read
- *		Together, start and size form the "current window",
- *		i.e. the readahead window.
- * prev_index:	The page which the readahead algorithm most-recently inspected.
- *		It is mainly used to detect sequential file reading.
- *		If page_cache_readahead sees that it is again being called for
- *		a page which it just looked at, it can return immediately without
- *		making any state changes.
- * offset:	Offset within prev_index at which the last read ended - used
- *		for detection of sequential file reading.
- * ahead_start,
- * ahead_size:	Together, these form the "ahead window".
- * ra_pages:	The externally controlled max readahead for this fd.
- *
- * When readahead is in the off state (size == 0), readahead is disabled.
- * In this state, prev_index is used to detect the resumption of sequential I/O.
- *
- * The readahead code manages two windows - the "current" and the "ahead"
- * windows.  The intent is that while the application is walking the pages
- * in the current window, I/O is underway on the ahead window.  When the
- * current window is fully traversed, it is replaced by the ahead window
- * and the ahead window is invalidated.  When this copying happens, the
- * new current window's pages are probably still locked.  So we submit a new
- * batch of I/O immediately, creating a new ahead window.
- *
- * So:
- *
- *   ----|----------------|----------------|-----
- *       ^start           ^start+size
- *                        ^ahead_start     ^ahead_start+ahead_size
- *
- *                        ^ When this page is read, we submit I/O for the
- *                          ahead window.
- *
- * A "readahead hit" occurs when a read request is made against a page which is
- * the next sequential page.  Ahead window calculations are done only when it
- * is time to submit a new I/O.  The code ramps up the size aggressively at
- * first, but slows down as it approaches max_readahead.
- *
- * Any seek/random I/O will result in readahead being turned off.  It will
- * resume at the first sequential access.
- *
- * There is a special-case: if the first page which the application tries to
- * read happens to be the first page of the file, it is assumed that a linear
- * read is about to happen and the window is immediately set to the initial
- * size based on the I/O request size and max_readahead.
- *
- * This function is to be called for every read request, rather than when
- * it is time to perform readahead.  It is called only once for the entire I/O
- * regardless of size unless readahead is unable to start enough I/O to satisfy
- * the request (I/O request > max_readahead).
- */
-
-/*
  * do_page_cache_readahead actually reads a chunk of disk.  It allocates all
  * the pages first, then submits them all for I/O.  This avoids the very bad
  * behaviour which would occur if page allocations are causing VM writeback.
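
To make the window hand-off described in the removed design comment concrete, here is a minimal user-space sketch (the field names follow struct file_ra_state; the numbers and the new ahead size are made up for illustration):

#include <stdio.h>

struct toy_ra {
	unsigned long start, size;		/* current window */
	unsigned long ahead_start, ahead_size;	/* ahead window */
};

int main(void)
{
	struct toy_ra ra = {
		.start = 0, .size = 16,
		.ahead_start = 16, .ahead_size = 32,
	};
	unsigned long index = 16;	/* reader crosses into the ahead window */

	if (index >= ra.ahead_start) {
		/* the ahead window becomes the current window ... */
		ra.start = ra.ahead_start;
		ra.size = ra.ahead_size;
		/* ... and a new ahead window is opened right behind it */
		ra.ahead_start = ra.start + ra.size;
		ra.ahead_size = 64;	/* whatever the ramp-up logic picks */
	}
	printf("current [%lu,%lu) ahead [%lu,%lu)\n",
	       ra.start, ra.start + ra.size,
	       ra.ahead_start, ra.ahead_start + ra.ahead_size);
	return 0;
}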
@@ -295,7 +159,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	read_lock_irq(&mapping->tree_lock);
 	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
 		pgoff_t page_offset = offset + page_idx;
-		
+
 		if (page_offset > end_index)
 			break;
 
@@ -361,28 +225,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 }
 
 /*
- * Check how effective readahead is being.  If the amount of started IO is
- * less than expected then the file is partly or fully in pagecache and
- * readahead isn't helping.
- *
- */
-static inline int check_ra_success(struct file_ra_state *ra,
-			unsigned long nr_to_read, unsigned long actual)
-{
-	if (actual == 0) {
-		ra->cache_hit += nr_to_read;
-		if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
-			ra_off(ra);
-			ra->flags |= RA_FLAG_INCACHE;
-			return 0;
-		}
-	} else {
-		ra->cache_hit = 0;
-	}
-	return 1;
-}
-
-/*
  * This version skips the IO if the queue is read-congested, and will tell the
  * block layer to abandon the readahead if request allocation would block.
  *
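
The check_ra_success() accounting deleted above is easy to see in isolation. A hedged mock (VM_MAX_CACHE_HIT was 256 pages in kernels of this vintage, but treat the constant as an assumption):

#include <stdio.h>

#define VM_MAX_CACHE_HIT 256	/* pages found in cache before giving up */

int main(void)
{
	unsigned long cache_hit = 0;
	int call;

	/* nine 32-page requests that all start zero I/O (fully cached) */
	for (call = 1; call <= 9; call++) {
		unsigned long nr_to_read = 32;

		cache_hit += nr_to_read;	/* actual == 0: nothing submitted */
		if (cache_hit >= VM_MAX_CACHE_HIT) {
			/* the real code does ra_off() and sets RA_FLAG_INCACHE */
			printf("readahead switched off after call %d\n", call);
			break;
		}
	}
	/* any call that actually starts I/O would reset cache_hit to 0 */
	return 0;
}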
@@ -399,191 +241,6 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 }
 
 /*
- * Read 'nr_to_read' pages starting at page 'offset'.  If the flag 'block'
- * is set, wait until the read completes.  Otherwise attempt to read without
- * blocking.
- * Returns 1 ('success') if the read was issued without switching off
- * readahead mode.  Otherwise returns 0.
- */
-static int
-blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
-			pgoff_t offset, unsigned long nr_to_read,
-			struct file_ra_state *ra, int block)
-{
-	int actual;
-
-	if (!block && bdi_read_congested(mapping->backing_dev_info))
-		return 0;
-
-	actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
-
-	return check_ra_success(ra, nr_to_read, actual);
-}
-
-static int make_ahead_window(struct address_space *mapping, struct file *filp,
-				struct file_ra_state *ra, int force)
-{
-	int block, ret;
-
-	ra->ahead_size = get_next_ra_size(ra);
-	ra->ahead_start = ra->start + ra->size;
-
-	block = force || (ra->prev_index >= ra->ahead_start);
-	ret = blockable_page_cache_readahead(mapping, filp,
-			ra->ahead_start, ra->ahead_size, ra, block);
-
-	if (!ret && !force) {
-		/* A read failure in blocking mode implies the pages are
-		 * all cached, so we can safely assume we have taken
-		 * care of all the pages requested in this call.
-		 * A read failure in non-blocking mode implies we are
-		 * reading more pages than requested in this call, so
-		 * we can safely assume we have taken care of all the
-		 * pages requested in this call.
-		 *
-		 * Just reset the ahead window in case we failed due to
-		 * congestion.  The ahead window will be closed anyway
-		 * in case we failed due to excessive page cache hits.
-		 */
-		reset_ahead_window(ra);
-	}
-
-	return ret;
-}
-
-/**
- * page_cache_readahead - generic adaptive readahead
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
- * @req_size: hint: total size of the read which the caller is performing in
- *	PAGE_CACHE_SIZE units
- *
- * page_cache_readahead() is the main function.  It performs the adaptive
- * readahead window size management and submits the readahead I/O.
- *
- * Note that @filp is purely used for passing on to the ->readpage[s]()
- * handler: it may refer to a different file from @mapping (so we may not use
- * @filp->f_mapping or @filp->f_path.dentry->d_inode here).
- * Also, @ra may not be equal to &@filp->f_ra.
- *
- */
-unsigned long
-page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
-		     struct file *filp, pgoff_t offset, unsigned long req_size)
-{
-	unsigned long max, newsize;
-	int sequential;
-
-	/*
-	 * We avoid doing extra work and bogusly perturbing the readahead
-	 * window expansion logic.
-	 */
-	if (offset == ra->prev_index && --req_size)
-		++offset;
-
-	/* Note that prev_index == -1 on the first read */
-	sequential = (offset == ra->prev_index + 1);
-	ra->prev_index = offset;
-	ra->prev_offset = 0;
-
-	max = get_max_readahead(ra);
-	newsize = min(req_size, max);
-
-	/* No readahead, or sub-page sized read, or file already in cache */
-	if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
-		goto out;
-
-	ra->prev_index += newsize - 1;
-
-	/*
-	 * Special case - first read at start of file.  We'll assume it's
-	 * a whole-file read and grow the window fast.  Or detect first
-	 * sequential access.
-	 */
-	if (sequential && ra->size == 0) {
-		ra->size = get_init_ra_size(newsize, max);
-		ra->start = offset;
-		if (!blockable_page_cache_readahead(mapping, filp, offset,
-							 ra->size, ra, 1))
-			goto out;
-
-		/*
-		 * If the request size is larger than our max readahead, we
-		 * at least want to be sure that we get 2 IOs in flight and
-		 * we know that we will definitely need the new I/O.
-		 * Once we do this, subsequent calls should be able to overlap
-		 * IOs, thus preventing stalls.  So issue the ahead window
-		 * immediately.
-		 */
-		if (req_size >= max)
-			make_ahead_window(mapping, filp, ra, 1);
-
-		goto out;
-	}
-
-	/*
-	 * Now handle the random case:
-	 * partial page reads and first access were handled above,
-	 * so this must be the next page, otherwise it is random.
-	 */
-	if (!sequential) {
-		ra_off(ra);
-		blockable_page_cache_readahead(mapping, filp, offset,
-						 newsize, ra, 1);
-		goto out;
-	}
-
-	/*
-	 * If we get here we are doing sequential IO and this was not the first
-	 * occurrence (i.e. we have an existing window).
-	 */
-	if (ra->ahead_start == 0) {	 /* no ahead window yet */
-		if (!make_ahead_window(mapping, filp, ra, 0))
-			goto recheck;
-	}
-
-	/*
-	 * Already have an ahead window, check if we crossed into it.
-	 * If so, shift windows and issue a new ahead window.
-	 * Only return the #pages that are in the current window, so that
-	 * we get called back on the first page of the ahead window which
-	 * will allow us to submit more IO.
-	 */
-	if (ra->prev_index >= ra->ahead_start) {
-		ra->start = ra->ahead_start;
-		ra->size = ra->ahead_size;
-		make_ahead_window(mapping, filp, ra, 0);
-recheck:
-		/* prev_index shouldn't overrun the ahead window */
-		ra->prev_index = min(ra->prev_index,
-			ra->ahead_start + ra->ahead_size - 1);
-	}
-
-out:
-	return ra->prev_index + 1;
-}
-EXPORT_SYMBOL_GPL(page_cache_readahead);
-
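
For orientation, this is roughly how a read path of this era drove the function. The caller below is a hypothetical sketch, not the real filemap.c code, but the convention (one call per read request, with the offset and size in pages) follows the kernel-doc above:

/* hypothetical caller -- kernel-style pseudocode, not from this tree */
static void toy_file_read(struct address_space *mapping, struct file *filp,
			  loff_t pos, size_t count)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	pgoff_t last = (pos + count - 1) >> PAGE_CACHE_SHIFT;

	/* one call for the whole request; the window logic does the rest */
	page_cache_readahead(mapping, &filp->f_ra, filp, index,
			     last - index + 1);

	/*
	 * ... then find each page in the page cache and copy it out;
	 * if a page we just read ahead has already been evicted, call
	 * handle_ra_miss(mapping, &filp->f_ra, index) so the window
	 * shrinks on the next pass.
	 */
}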
-/*
- * handle_ra_miss() is called when it is known that a page which should have
- * been present in the pagecache (we just did some readahead there) was in fact
- * not found.  This will happen if it was evicted by the VM (readahead
- * thrashing).
- *
- * Turn on the cache miss flag in the RA struct; this will cause the RA code
- * to reduce the RA size on the next read.
- */
-void handle_ra_miss(struct address_space *mapping,
-		struct file_ra_state *ra, pgoff_t offset)
-{
-	ra->flags |= RA_FLAG_MISS;
-	ra->flags &= ~RA_FLAG_INCACHE;
-	ra->cache_hit = 0;
-}
-
-/*
  * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
  * sensible upper limit.
  */
@@ -613,19 +270,39 @@ unsigned long ra_submit(struct file_ra_state *ra,
 EXPORT_SYMBOL_GPL(ra_submit);
 
 /*
+ * Set the initial window size, round to next power of 2 and square
+ * for small size, x 4 for medium, and x 2 for large
+ * for 128k (32 page) max ra
+ * 1-8 page = 32k initial, > 8 page = 128k initial
+ */
+static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
+{
+	unsigned long newsize = roundup_pow_of_two(size);
+
+	if (newsize <= max / 32)
+		newsize = newsize * 4;
+	else if (newsize <= max / 4)
+		newsize = newsize * 2;
+	else
+		newsize = max;
+
+	return newsize;
+}
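
A user-space spot check of the values this produces for the default 32-page (128 KB) maximum; roundup_pow_of_two() is re-implemented here only so the sketch compiles on its own:

#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

static unsigned long init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;
	return newsize;
}

int main(void)
{
	unsigned long size, max = 32;	/* 32 pages == 128 KB with 4 KB pages */

	/* request sizes 1 2 4 8 16 pages -> initial windows 4 4 8 16 32 */
	for (size = 1; size <= 16; size <<= 1)
		printf("%lu -> %lu\n", size, init_ra_size(size, max));
	return 0;
}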
+
+/*
  * Get the previous window size, ramp it up, and
  * return it as the new window size.
  */
-static unsigned long get_next_ra_size2(struct file_ra_state *ra,
+static unsigned long get_next_ra_size(struct file_ra_state *ra,
 						unsigned long max)
 {
 	unsigned long cur = ra->readahead_index - ra->ra_index;
 	unsigned long newsize;
 
 	if (cur < max / 16)
-		newsize = cur * 4;
+		newsize = 4 * cur;
 	else
-		newsize = cur * 2;
+		newsize = 2 * cur;
 
 	return min(newsize, max);
 }
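
And the ramp-up this new get_next_ra_size() produces: each submission grows the last window fourfold while tiny, then doubles it, capped at max. A self-contained mock of the two fields it reads:

#include <stdio.h>

struct toy_ra {
	unsigned long ra_index;		/* start of the last readahead */
	unsigned long readahead_index;	/* end of the last readahead */
};

static unsigned long next_ra_size(struct toy_ra *ra, unsigned long max)
{
	unsigned long cur = ra->readahead_index - ra->ra_index;
	unsigned long newsize;

	if (cur < max / 16)
		newsize = 4 * cur;
	else
		newsize = 2 * cur;
	return newsize < max ? newsize : max;
}

int main(void)
{
	struct toy_ra ra = { .ra_index = 0, .readahead_index = 4 };
	unsigned long max = 32;
	int i;

	/* starting from a 4-page window: prints 8 16 32 32 (capped at max) */
	for (i = 0; i < 4; i++) {
		unsigned long next = next_ra_size(&ra, max);

		printf("%lu ", next);
		ra.ra_index = ra.readahead_index;
		ra.readahead_index += next;
	}
	printf("\n");
	return 0;
}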
@@ -701,7 +378,7 @@ ondemand_readahead(struct address_space *mapping,
 	if (offset && (offset == ra->lookahead_index ||
 			offset == ra->readahead_index)) {
 		ra_index = ra->readahead_index;
-		ra_size = get_next_ra_size2(ra, max);
+		ra_size = get_next_ra_size(ra, max);
 		la_size = ra_size;
 		goto fill_ra;
 	}