 include/linux/mm.h |   6 +
 mm/readahead.c     | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 180 insertions(+), 0 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f8e12b3b6110..619c0e80cf0c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1138,6 +1138,12 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			pgoff_t offset, unsigned long nr_to_read);
 int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			pgoff_t offset, unsigned long nr_to_read);
+unsigned long page_cache_readahead_ondemand(struct address_space *mapping,
+			struct file_ra_state *ra,
+			struct file *filp,
+			struct page *page,
+			pgoff_t offset,
+			unsigned long size);
 unsigned long page_cache_readahead(struct address_space *mapping,
 			struct file_ra_state *ra,
 			struct file *filp,
diff --git a/mm/readahead.c b/mm/readahead.c
index 072ce8f8357d..c094e4f5a250 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -611,3 +611,177 @@ unsigned long ra_submit(struct file_ra_state *ra,
 	return actual;
 }
 EXPORT_SYMBOL_GPL(ra_submit);
+
+/*
+ * Get the previous window size, ramp it up, and
+ * return it as the new window size.
+ */
+static unsigned long get_next_ra_size2(struct file_ra_state *ra,
+				       unsigned long max)
+{
+	unsigned long cur = ra->readahead_index - ra->ra_index;
+	unsigned long newsize;
+
+	if (cur < max / 16)
+		newsize = cur * 4;
+	else
+		newsize = cur * 2;
+
+	return min(newsize, max);
+}
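To illustrate the ramp-up: with a 128-page max window, the window grows 4x
while it is still below max/16 = 8 pages, then 2x, capped at max, i.e.
4 -> 16 -> 32 -> 64 -> 128. A stand-alone userspace sketch of the same
arithmetic (next_window() is an illustrative helper, not part of the patch):

	#include <stdio.h>

	/* Stand-alone model of get_next_ra_size2(): 4x growth while the
	 * window is small, 2x once it passes max/16, capped at max. */
	static unsigned long next_window(unsigned long cur, unsigned long max)
	{
		unsigned long newsize = (cur < max / 16) ? cur * 4 : cur * 2;
		return newsize < max ? newsize : max;
	}

	int main(void)
	{
		unsigned long max = 128, w = 4;	/* 128-page max window */

		while (w < max) {
			printf("%lu -> ", w);
			w = next_window(w, max);
		}
		printf("%lu\n", w);	/* prints: 4 -> 16 -> 32 -> 64 -> 128 */
		return 0;
	}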
+
+/*
+ * On-demand readahead design.
+ *
+ * The fields in struct file_ra_state represent the most-recently-executed
+ * readahead attempt:
+ *
+ *                    |-------- last readahead window -------->|
+ *       |-- application walking here -->|
+ * ======#============|==================#=====================|
+ *       ^la_index    ^ra_index     ^lookahead_index    ^readahead_index
+ *
+ * [ra_index, readahead_index) represents the last readahead window.
+ *
+ * [la_index, lookahead_index] is where the application would be walking (in
+ * the common case of cache-cold sequential reads): the last window was
+ * established when the application was at la_index, and the next window will
+ * be brought in when the application reaches lookahead_index.
+ *
+ * To overlap application thinking time and disk I/O time, we do
+ * `readahead pipelining': do not wait until the application has consumed all
+ * readahead pages and stalls on the missing page at readahead_index;
+ * instead, submit an asynchronous readahead I/O as soon as the application
+ * reads the page at lookahead_index. Normally lookahead_index will be
+ * equal to ra_index, for maximum pipelining.
+ *
+ * In interleaved sequential reads, concurrent streams on the same fd can
+ * invalidate each other's readahead state. So we flag the new readahead
+ * page at lookahead_index with PG_readahead and use it as a readahead
+ * indicator. The flag won't be set on already-cached pages, to avoid the
+ * readahead-for-nothing fuss and save pointless page cache lookups.
+ *
+ * prev_index tracks the last visited page in the _previous_ read request.
+ * It should be maintained by the caller, and will be used for detecting
+ * small random reads. Note that the readahead algorithm checks loosely
+ * for sequential patterns. Hence interleaved reads might be served as
+ * sequential ones.
+ *
+ * There is a special case: if the first page which the application tries to
+ * read happens to be the first page of the file, it is assumed that a linear
+ * read is about to happen and the window is immediately set to the initial
+ * size based on the I/O request size and max_readahead.
+ *
+ * The code ramps up the readahead size aggressively at first, but slows down
+ * as it approaches max_readahead.
+ */
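To make the diagram above concrete, here is a minimal userspace model of one
pipelined window advance across those four fields (struct ra_model and
advance() are illustrative names, not kernel code; sizes are in pages):

	#include <stdio.h>

	/* Userspace model of the four file_ra_state fields. */
	struct ra_model {
		unsigned long la_index;		/* app position at last window */
		unsigned long ra_index;		/* start of last window        */
		unsigned long lookahead_index;	/* page that triggers next I/O */
		unsigned long readahead_index;	/* end of last window          */
	};

	/* App reached lookahead_index: push a new window of `size` pages
	 * starting where the old one ends, lookahead on its first page. */
	static void advance(struct ra_model *ra, unsigned long size)
	{
		ra->la_index = ra->lookahead_index;
		ra->ra_index = ra->readahead_index;
		ra->lookahead_index = ra->ra_index;	/* max pipelining */
		ra->readahead_index = ra->ra_index + size;
	}

	int main(void)
	{
		struct ra_model ra = { 0, 0, 4, 8 };	/* initial 8-page window */

		advance(&ra, 16);
		/* prints: window [8, 24), next trigger at page 8 */
		printf("window [%lu, %lu), next trigger at page %lu\n",
		       ra.ra_index, ra.readahead_index, ra.lookahead_index);
		return 0;
	}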
+
+/*
+ * A minimal readahead algorithm for trivial sequential/random reads.
+ */
+static unsigned long
+ondemand_readahead(struct address_space *mapping,
+		   struct file_ra_state *ra, struct file *filp,
+		   struct page *page, pgoff_t offset,
+		   unsigned long req_size)
+{
+	unsigned long max;	/* max readahead pages */
+	pgoff_t ra_index;	/* readahead index */
+	unsigned long ra_size;	/* readahead size */
+	unsigned long la_size;	/* lookahead size */
+	int sequential;
+
+	max = ra->ra_pages;
+	sequential = (offset - ra->prev_index <= 1UL) || (req_size > max);
+
+	/*
+	 * Lookahead/readahead hit: assume sequential access.
+	 * Ramp up sizes, and push forward the readahead window.
+	 */
+	if (offset && (offset == ra->lookahead_index ||
+			offset == ra->readahead_index)) {
+		ra_index = ra->readahead_index;
+		ra_size = get_next_ra_size2(ra, max);
+		la_size = ra_size;
+		goto fill_ra;
+	}
+
+	/*
+	 * Standalone, small read.
+	 * Read as-is, and do not pollute the readahead state.
+	 */
+	if (!page && !sequential) {
+		return __do_page_cache_readahead(mapping, filp,
+						 offset, req_size, 0);
+	}
+
+	/*
+	 * It may be one of
+	 *	- first read on start of file
+	 *	- sequential cache miss
+	 *	- oversize random read
+	 * Start readahead for it.
+	 */
+	ra_index = offset;
+	ra_size = get_init_ra_size(req_size, max);
+	la_size = ra_size > req_size ? ra_size - req_size : ra_size;
+
+	/*
+	 * Hit on a lookahead page without valid readahead state.
+	 * E.g. interleaved reads.
+	 * Since its readahead pos/size is unknown, bet on the minimal
+	 * possible one.
+	 */
+	if (page) {
+		ra_index++;
+		ra_size = min(4 * ra_size, max);
+	}
+
+fill_ra:
+	ra_set_index(ra, offset, ra_index);
+	ra_set_size(ra, ra_size, la_size);
+
+	return ra_submit(ra, mapping, filp);
+}
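The control flow above reduces to a three-way classification of each read.
A compact userspace reduction of just the branches (classify_read() and the
enum are illustrative; the window-sizing details are elided):

	#include <stdbool.h>
	#include <stdio.h>

	enum ra_action { RA_ASYNC_RAMP, RA_BYPASS, RA_NEW_WINDOW };

	/* Illustrative reduction of ondemand_readahead()'s branches. */
	static enum ra_action classify_read(bool hit_lookahead_or_readahead,
					    bool have_page, bool sequential)
	{
		if (hit_lookahead_or_readahead)
			return RA_ASYNC_RAMP;	/* ramp up, push window forward */
		if (!have_page && !sequential)
			return RA_BYPASS;	/* small random read: no ra state */
		return RA_NEW_WINDOW;		/* file start, cache miss, oversize */
	}

	int main(void)
	{
		/* A cache-cold sequential read that missed: new window. */
		printf("%d\n", classify_read(false, false, true) == RA_NEW_WINDOW);
		return 0;
	}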
+
+/**
+ * page_cache_readahead_ondemand - generic file readahead
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @filp: passed on to ->readpage() and ->readpages()
+ * @page: the page at @offset, or NULL if non-present
+ * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
+ * @req_size: hint: total size of the read which the caller is performing in
+ *	      PAGE_CACHE_SIZE units
+ *
+ * page_cache_readahead_ondemand() is the entry point of the readahead logic.
+ * This function should be called when it is time to perform readahead:
+ * 1) @page == NULL
+ *    A cache miss happened; time for synchronous readahead.
+ * 2) @page != NULL && PageReadahead(@page)
+ *    A look-ahead hit occurred; time for asynchronous readahead.
+ */
+unsigned long
+page_cache_readahead_ondemand(struct address_space *mapping,
+			      struct file_ra_state *ra, struct file *filp,
+			      struct page *page, pgoff_t offset,
+			      unsigned long req_size)
+{
+	/* no read-ahead */
+	if (!ra->ra_pages)
+		return 0;
+
+	if (page) {
+		ClearPageReadahead(page);
+
+		/*
+		 * Defer asynchronous read-ahead on IO congestion.
+		 */
+		if (bdi_read_congested(mapping->backing_dev_info))
+			return 0;
+	}
+
+	/* do read-ahead */
+	return ondemand_readahead(mapping, ra, filp, page,
+				  offset, req_size);
+}
+EXPORT_SYMBOL_GPL(page_cache_readahead_ondemand);
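For context, a sketch of how a read-side caller would drive the two cases
named in the kerneldoc. This is kernel-style pseudocode, not part of the
patch: read_one_page() is an illustrative wrapper, though find_get_page(),
PageReadahead() and page_cache_release() are the real primitives of this era:

	/* Illustrative caller, locking and error handling elided. */
	static void read_one_page(struct address_space *mapping,
				  struct file_ra_state *ra, struct file *filp,
				  pgoff_t index, unsigned long remaining)
	{
		struct page *page = find_get_page(mapping, index);

		if (!page) {
			/* Case 1: cache miss -- synchronous readahead, retry. */
			page_cache_readahead_ondemand(mapping, ra, filp,
						      NULL, index, remaining);
			page = find_get_page(mapping, index);
		} else if (PageReadahead(page)) {
			/* Case 2: look-ahead hit -- asynchronous readahead. */
			page_cache_readahead_ondemand(mapping, ra, filp,
						      page, index, remaining);
		}

		/* ... lock/wait on the page and copy it out as usual ... */
		if (page)
			page_cache_release(page);
	}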