Diffstat (limited to 'mm/readahead.c')
-rw-r--r--  mm/readahead.c | 145
1 file changed, 101 insertions(+), 44 deletions(-)
diff --git a/mm/readahead.c b/mm/readahead.c
index 133b6d525513..aa1aa2345235 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -133,15 +133,12 @@ out:
 }
 
 /*
- * do_page_cache_readahead actually reads a chunk of disk. It allocates all
+ * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
  * the pages first, then submits them all for I/O. This avoids the very bad
  * behaviour which would occur if page allocations are causing VM writeback.
  * We really don't want to intermingle reads and writes like that.
  *
  * Returns the number of pages requested, or the maximum amount of I/O allowed.
- *
- * do_page_cache_readahead() returns -1 if it encountered request queue
- * congestion.
  */
 static int
 __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
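The comment above describes a deliberate two-phase structure: every page is allocated up front, and only then is the whole batch submitted for I/O, so reclaim triggered by the allocations never interleaves writes with the readahead reads. A minimal userspace sketch of that shape, with an array standing in for the page cache and printf standing in for the block I/O (none of this is kernel code):

#include <stdio.h>

#define CACHE_PAGES 64

static int page_cached[CACHE_PAGES];   /* toy page cache */

int main(void)
{
        unsigned long batch[CACHE_PAGES];
        unsigned long offset = 8, nr_to_read = 16, i;
        int nr = 0;

        /* Phase 1: reserve every missing page before doing any I/O. */
        for (i = offset; i < offset + nr_to_read && i < CACHE_PAGES; i++) {
                if (page_cached[i])
                        continue;               /* already resident */
                page_cached[i] = 1;             /* "allocate" and insert the page */
                batch[nr++] = i;
        }

        /* Phase 2: only now submit the whole batch in one go. */
        for (i = 0; i < (unsigned long)nr; i++)
                printf("read page %lu\n", batch[i]);

        return 0;
}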
@@ -210,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
 		return -EINVAL;
 
+	nr_to_read = max_sane_readahead(nr_to_read);
 	while (nr_to_read) {
 		int err;
 
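With the clamp folded into force_page_cache_readahead() itself, the function's shape is: limit the request once up front, then hand it to the low-level routine in bounded chunks until it is consumed. A sketch of that clamp-then-chunk shape, assuming an illustrative limit and chunk size rather than the kernel's values:

#include <stdio.h>

/* Illustrative stand-in for max_sane_readahead(): pretend memory allows 4096 pages. */
static unsigned long sane_limit(unsigned long nr)
{
        return nr < 4096 ? nr : 4096;
}

int main(void)
{
        unsigned long offset = 0, nr_to_read = 10000;

        nr_to_read = sane_limit(nr_to_read);    /* clamp once, up front */

        while (nr_to_read) {
                unsigned long this_chunk = 512; /* bounded per-call batch (illustrative) */

                if (this_chunk > nr_to_read)
                        this_chunk = nr_to_read;
                printf("readahead %lu pages at %lu\n", this_chunk, offset);
                offset     += this_chunk;
                nr_to_read -= this_chunk;
        }
        return 0;
}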
@@ -231,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 }
 
 /*
- * This version skips the IO if the queue is read-congested, and will tell the
- * block layer to abandon the readahead if request allocation would block.
- *
- * force_page_cache_readahead() will ignore queue congestion and will block on
- * request queues.
- */
-int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
-			pgoff_t offset, unsigned long nr_to_read)
-{
-	if (bdi_read_congested(mapping->backing_dev_info))
-		return -1;
-
-	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
-}
-
-/*
  * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
  * sensible upper limit.
  */
@@ -259,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr)
 /*
  * Submit IO for the read-ahead request in file_ra_state.
  */
-static unsigned long ra_submit(struct file_ra_state *ra,
+unsigned long ra_submit(struct file_ra_state *ra,
 		       struct address_space *mapping, struct file *filp)
 {
 	int actual;
@@ -348,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
  */
 
 /*
+ * Count contiguously cached pages from @offset-1 to @offset-@max,
+ * this count is a conservative estimation of
+ *	- length of the sequential read sequence, or
+ *	- thrashing threshold in memory tight systems
+ */
+static pgoff_t count_history_pages(struct address_space *mapping,
+				   struct file_ra_state *ra,
+				   pgoff_t offset, unsigned long max)
+{
+	pgoff_t head;
+
+	rcu_read_lock();
+	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
+	rcu_read_unlock();
+
+	return offset - 1 - head;
+}
+
+/*
+ * page cache context based read-ahead
+ */
+static int try_context_readahead(struct address_space *mapping,
+				 struct file_ra_state *ra,
+				 pgoff_t offset,
+				 unsigned long req_size,
+				 unsigned long max)
+{
+	pgoff_t size;
+
+	size = count_history_pages(mapping, ra, offset, max);
+
+	/*
+	 * no history pages:
+	 * it could be a random read
+	 */
+	if (!size)
+		return 0;
+
+	/*
+	 * starts from beginning of file:
+	 * it is a strong indication of long-run stream (or whole-file-read)
+	 */
+	if (size >= offset)
+		size *= 2;
+
+	ra->start = offset;
+	ra->size = get_init_ra_size(size + req_size, max);
+	ra->async_size = ra->size;
+
+	return 1;
+}
+
+/*
  * A minimal readahead algorithm for trivial sequential/random reads.
  */
 static unsigned long
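count_history_pages() and try_context_readahead() decide purely from the page cache contents behind the read: a run of cached pages ending at offset-1 is taken as evidence of a sequential stream (and as a rough thrashing threshold), and if the run reaches all the way back to the start of the file the estimate is doubled. A userspace sketch of the same heuristic, where prev_hole() stands in for radix_tree_prev_hole() and init_ra_size() is a simplified stand-in for get_init_ra_size() that only clamps to max:

#include <stdio.h>

#define FILE_PAGES 128

static int cached[FILE_PAGES];  /* toy page cache */

/* Scan back from index for up to max pages; return the first hole, the
 * index where the scan gave up, or -1 if we ran past the start of file. */
static long prev_hole(long index, unsigned long max)
{
        unsigned long i;

        for (i = 0; i < max; i++) {
                if (index < 0 || !cached[index])
                        break;
                index--;
        }
        return index;
}

static unsigned long init_ra_size(unsigned long size, unsigned long max)
{
        return size < max ? size : max; /* stand-in for get_init_ra_size() */
}

/* Suggested readahead size for a read at offset, or 0 for "looks random". */
static unsigned long context_readahead(unsigned long offset,
                                       unsigned long req_size,
                                       unsigned long max)
{
        unsigned long history = offset - 1 - prev_hole(offset - 1, max);

        if (!history)
                return 0;               /* no cached history: likely random */
        if (history >= offset)
                history *= 2;           /* stream starts at file head: go big */
        return init_ra_size(history + req_size, max);
}

int main(void)
{
        unsigned long i;

        for (i = 0; i < 40; i++)        /* pages 0..39 were read by someone */
                cached[i] = 1;

        printf("window at page 40: %lu pages\n", context_readahead(40, 8, 32));
        printf("window at page 100: %lu pages\n", context_readahead(100, 8, 32));
        return 0;
}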
@@ -356,34 +391,26 @@ ondemand_readahead(struct address_space *mapping,
 		   bool hit_readahead_marker, pgoff_t offset,
 		   unsigned long req_size)
 {
-	int max = ra->ra_pages;		/* max readahead pages */
-	pgoff_t prev_offset;
-	int sequential;
+	unsigned long max = max_sane_readahead(ra->ra_pages);
+
+	/*
+	 * start of file
+	 */
+	if (!offset)
+		goto initial_readahead;
 
 	/*
 	 * It's the expected callback offset, assume sequential access.
 	 * Ramp up sizes, and push forward the readahead window.
 	 */
-	if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
+	if ((offset == (ra->start + ra->size - ra->async_size) ||
 			offset == (ra->start + ra->size))) {
 		ra->start += ra->size;
 		ra->size = get_next_ra_size(ra, max);
 		ra->async_size = ra->size;
 		goto readit;
 	}
 
-	prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
-	sequential = offset - prev_offset <= 1UL || req_size > max;
-
-	/*
-	 * Standalone, small read.
-	 * Read as is, and do not pollute the readahead state.
-	 */
-	if (!hit_readahead_marker && !sequential) {
-		return __do_page_cache_readahead(mapping, filp,
-						offset, req_size, 0);
-	}
-
 	/*
 	 * Hit a marked page without valid readahead state.
 	 * E.g. interleaved reads.
@@ -394,7 +421,7 @@ ondemand_readahead(struct address_space *mapping,
 		pgoff_t start;
 
 		rcu_read_lock();
-		start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
+		start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
 		rcu_read_unlock();
 
 		if (!start || start - offset > max)
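The interleaved-read branch above rebuilds lost readahead state from the cache alone: the marker page at offset is known to be present, so the scan for the next hole can start at offset+1 and needs to cover at most max pages, and whatever cached run lies ahead of offset is treated as the remains of the previous async window. A sketch with a plain array in place of the radix tree, and next_ra_size() as a simplified doubling stand-in for get_next_ra_size():

#include <stdio.h>

#define FILE_PAGES 128

static int cached[FILE_PAGES];  /* toy page cache */

static unsigned long next_hole(unsigned long index, unsigned long max)
{
        unsigned long i;

        for (i = 0; i < max && index + i < FILE_PAGES; i++)
                if (!cached[index + i])
                        return index + i;
        return index + i;       /* gave up: no hole within range */
}

static unsigned long next_ra_size(unsigned long cur, unsigned long max)
{
        return 2 * cur < max ? 2 * cur : max;   /* stand-in for get_next_ra_size() */
}

int main(void)
{
        unsigned long offset = 50, req_size = 8, max = 32;
        unsigned long start, size, i;

        for (i = 40; i < 60; i++)       /* a window someone else already read */
                cached[i] = 1;

        start = next_hole(offset + 1, max);     /* first missing page ahead */
        size  = start - offset;                 /* what is left of the old async window */
        size += req_size;
        size  = next_ra_size(size, max);

        printf("new window: start=%lu size=%lu\n", start, size);
        return 0;
}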
@@ -402,23 +429,53 @@ ondemand_readahead(struct address_space *mapping,
 
 		ra->start = start;
 		ra->size = start - offset;	/* old async_size */
+		ra->size += req_size;
 		ra->size = get_next_ra_size(ra, max);
 		ra->async_size = ra->size;
 		goto readit;
 	}
 
 	/*
-	 * It may be one of
-	 *   - first read on start of file
-	 *   - sequential cache miss
-	 *   - oversize random read
-	 * Start readahead for it.
+	 * oversize read
+	 */
+	if (req_size > max)
+		goto initial_readahead;
+
+	/*
+	 * sequential cache miss
+	 */
+	if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
+		goto initial_readahead;
+
+	/*
+	 * Query the page cache and look for the traces(cached history pages)
+	 * that a sequential stream would leave behind.
+	 */
+	if (try_context_readahead(mapping, ra, offset, req_size, max))
+		goto readit;
+
+	/*
+	 * standalone, small random read
+	 * Read as is, and do not pollute the readahead state.
 	 */
+	return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
+
+initial_readahead:
 	ra->start = offset;
 	ra->size = get_init_ra_size(req_size, max);
 	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
 
 readit:
+	/*
+	 * Will this read hit the readahead marker made by itself?
+	 * If so, trigger the readahead marker hit now, and merge
+	 * the resulted next readahead window into the current one.
+	 */
+	if (offset == ra->start && ra->size == ra->async_size) {
+		ra->async_size = get_next_ra_size(ra, max);
+		ra->size += ra->async_size;
+	}
+
 	return ra_submit(ra, mapping, filp);
 }
 
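Finally, the block added under the readit: label handles the case where the window just computed starts exactly at the current read and is entirely asynchronous: the read would immediately trip over its own readahead marker, so the follow-on window is sized now and merged, and a single submission covers both. A small sketch of that merge, with next_ra_size() again as a simplified doubling stand-in for get_next_ra_size():

#include <stdio.h>

struct ra_state {
        unsigned long start;
        unsigned long size;
        unsigned long async_size;
};

static unsigned long next_ra_size(unsigned long cur, unsigned long max)
{
        return 2 * cur < max ? 2 * cur : max;   /* stand-in for get_next_ra_size() */
}

int main(void)
{
        unsigned long offset = 16, max = 64;
        struct ra_state ra = { .start = 16, .size = 8, .async_size = 8 };

        /* The read at offset would land exactly on its own marker: merge now. */
        if (offset == ra.start && ra.size == ra.async_size) {
                ra.async_size = next_ra_size(ra.size, max);
                ra.size += ra.async_size;
        }

        printf("submit %lu pages from %lu, marker %lu pages before the end\n",
               ra.size, ra.start, ra.async_size);
        return 0;
}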