1 files changed, 557 insertions, 0 deletions
diff --git a/mm/readahead.c b/mm/readahead.c
new file mode 100644
index 000000000000..b840e7c6ea74
--- /dev/null
+++ b/mm/readahead.c
@@ -0,0 +1,557 @@
+/*
+ * mm/readahead.c - address_space-level file readahead.
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 09Apr2002    akpm@zip.com.au
+ *              Initial version.
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+/*
+ * default_unplug_io_fn - unplug callback that does nothing.
+ * Both @bdi and @page are ignored.
+ */
+void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+        /* intentionally empty */
+}
+EXPORT_SYMBOL(default_unplug_io_fn);
+/*
+ * Default backing_dev_info: readahead limit of VM_MAX_READAHEAD * 1024 bytes
+ * expressed in pages, and the no-op unplug callback.
+ */
+struct backing_dev_info default_backing_dev_info = {
+        .unplug_io_fn   = default_unplug_io_fn,
+        .capabilities   = BDI_CAP_MAP_COPY,
+        .state          = 0,
+        .ra_pages       = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
+};
+EXPORT_SYMBOL_GPL(default_backing_dev_info);
+/*
+ * Initialise a struct file's readahead state.  Only ra_pages and prev_page
+ * are written here; the caller is expected to have zeroed *ra beforehand.
+ */
+void
+file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
+{
+        struct backing_dev_info *bdi = mapping->backing_dev_info;
+
+        /* -1 means "nothing read yet", so a first read of page 0 looks sequential */
+        ra->prev_page = -1;
+        ra->ra_pages = bdi->ra_pages;
+}
+/*
+ * Upper bound on the readahead window for this file, in pages.
+ */
+static inline unsigned long get_max_readahead(struct file_ra_state *ra)
+{
+        unsigned long limit = ra->ra_pages;
+
+        return limit;
+}
+/*
+ * Lower bound on the readahead window, in pages: VM_MIN_READAHEAD * 1024
+ * bytes divided by the page size.  @ra is accepted but not consulted.
+ */
+static inline unsigned long get_min_readahead(struct file_ra_state *ra)
+{
+        const unsigned long min_bytes = VM_MIN_READAHEAD * 1024;
+
+        return min_bytes / PAGE_CACHE_SIZE;
+}
+/*
+ * Put readahead into the off state: clear all flags and drop both the
+ * current window (start/size) and the ahead window (ahead_start/ahead_size).
+ * No other field of *ra is modified.
+ */
+static inline void ra_off(struct file_ra_state *ra)
+{
+        ra->flags = 0;
+        ra->start = 0;
+        ra->size = 0;
+        ra->ahead_start = 0;
+        ra->ahead_size = 0;
+}
+/*
+ * Pick the initial window size (in pages) for a request of @size pages with
+ * a readahead limit of @max pages.  @size is first rounded up to a power of
+ * two, then:
+ *   - small requests  (<= max/64): the rounded size is squared,
+ *   - medium requests (<= max/4):  the window is max/4,
+ *   - anything larger:             the window is max.
+ * For a 128k (32 page) max ra:
+ * 1-8 page = 32k initial, > 8 page = 128k initial
+ *
+ * The result never exceeds @max.
+ */
+static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
+{
+        unsigned long newsize = roundup_pow_of_two(size);
+
+        if (newsize <= max / 64) {
+                newsize = newsize * newsize;
+                /*
+                 * (max/64)^2 is larger than max once max exceeds 4096 pages,
+                 * so the squared value has to be clamped to honour the limit.
+                 */
+                if (newsize > max)
+                        newsize = max;
+        } else if (newsize <= max / 4) {
+                newsize = max / 4;
+        } else {
+                newsize = max;
+        }
+        return newsize;
+}
+/*
+ * Compute the size of the next readahead window.  This is called only when
+ * I/O is to be submitted, not for each call to readahead.
+ *
+ * If a cache miss occurred (RA_FLAG_MISS, which is cleared here) the window
+ * shrinks by two pages but never below the minimum.  Otherwise it grows:
+ * x4 while the current size is below max/16, x2 after that.  The result is
+ * always capped at the maximum readahead size.
+ */
+static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
+{
+        unsigned long ra_max = get_max_readahead(ra);
+        unsigned long ra_min = get_min_readahead(ra);
+        unsigned long cur = ra->size;
+        unsigned long newsize;
+
+        if (ra->flags & RA_FLAG_MISS) {
+                ra->flags &= ~RA_FLAG_MISS;
+                /*
+                 * cur is unsigned, so "cur - 2" would wrap to a huge value
+                 * for cur < 2 and turn the intended shrink into a jump to
+                 * the maximum.  Compare before subtracting instead.
+                 */
+                newsize = (cur > ra_min + 2) ? cur - 2 : ra_min;
+        } else if (cur < ra_max / 16) {
+                newsize = 4 * cur;
+        } else {
+                newsize = 2 * cur;
+        }
+        return min(newsize, ra_max);
+}
+/* The page to process next is the one at the tail (->prev) of the list. */
+#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
+/**
+ * read_cache_pages - populate an address space with some pages, and
+ *                      start reads against them.
+ * @mapping: the address_space
+ * @pages: The address of a list_head which contains the target pages.  These
+ *   pages have their ->index populated and are otherwise uninitialised.
+ * @filler: callback routine for filling a single page.
+ * @data: private data for the callback routine.
+ *
+ * Pages are taken from the tail of @pages one at a time.  A page that cannot
+ * be inserted into the page cache is released and skipped.  The first
+ * non-zero value returned by @filler stops processing: every page still on
+ * the list is released and that value is returned.  Returns 0 otherwise.
+ */
+int read_cache_pages(struct address_space *mapping, struct list_head *pages,
+                        int (*filler)(void *, struct page *), void *data)
+{
+        struct pagevec lru_pvec;
+        int err = 0;
+
+        pagevec_init(&lru_pvec, 0);
+        while (!err && !list_empty(pages)) {
+                struct page *page = list_to_page(pages);
+
+                list_del(&page->lru);
+                if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
+                        page_cache_release(page);
+                        continue;
+                }
+                err = filler(data, page);
+                if (!pagevec_add(&lru_pvec, page))
+                        __pagevec_lru_add(&lru_pvec);
+        }
+        /* Only non-empty here if filler failed: release what was never submitted */
+        while (!list_empty(pages)) {
+                struct page *victim = list_to_page(pages);
+
+                list_del(&victim->lru);
+                page_cache_release(victim);
+        }
+        pagevec_lru_add(&lru_pvec);
+        return err;
+}
+EXPORT_SYMBOL(read_cache_pages);
+/*
+ * Submit reads for the @nr_pages pages queued on @pages.  If the mapping
+ * supplies ->readpages the whole list is handed to it and its return value
+ * is passed back.  Otherwise each page is inserted into the page cache and
+ * read with ->readpage (whose result is ignored); pages that cannot be
+ * inserted are released.  The fallback path always returns 0.
+ */
+static int read_pages(struct address_space *mapping, struct file *filp,
+                struct list_head *pages, unsigned nr_pages)
+{
+        struct pagevec lru_pvec;
+        unsigned remaining;
+
+        if (mapping->a_ops->readpages)
+                return mapping->a_ops->readpages(filp, mapping,
+                                                 pages, nr_pages);
+        pagevec_init(&lru_pvec, 0);
+        for (remaining = nr_pages; remaining; remaining--) {
+                struct page *page = list_to_page(pages);
+
+                list_del(&page->lru);
+                if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
+                        page_cache_release(page);
+                        continue;
+                }
+                mapping->a_ops->readpage(filp, page);
+                if (!pagevec_add(&lru_pvec, page))
+                        __pagevec_lru_add(&lru_pvec);
+        }
+        pagevec_lru_add(&lru_pvec);
+        return 0;
+}
+/*
+ * Readahead design.
+ *
+ * The fields in struct file_ra_state represent the most-recently-executed
+ * readahead attempt:
+ *
+ * start:       Page index at which we started the readahead
+ * size:        Number of pages in that read
+ *              Together, these form the "current window".
+ *              Together, start and size represent the `readahead window'.
+ * prev_page:   The page which the readahead algorithm most-recently inspected.
+ *              It is mainly used to detect sequential file reading.
+ *              If page_cache_readahead sees that it is again being called for
+ *              a page which it just looked at, it can return immediately without
+ *              making any state changes.
+ * ahead_start,
+ * ahead_size:  Together, these form the "ahead window".
+ * ra_pages:    The externally controlled max readahead for this fd.
+ *
+ * When readahead is in the off state (size == 0), readahead is disabled.
+ * In this state, prev_page is used to detect the resumption of sequential I/O.
+ *
+ * The readahead code manages two windows - the "current" and the "ahead"
+ * windows.  The intent is that while the application is walking the pages
+ * in the current window, I/O is underway on the ahead window.  When the
+ * current window is fully traversed, it is replaced by the ahead window
+ * and the ahead window is invalidated.  When this copying happens, the
+ * new current window's pages are probably still locked.  So
+ * we submit a new batch of I/O immediately, creating a new ahead window.
+ *
+ * So:
+ *
+ *   ----|----------------|----------------|-----
+ *       ^start           ^start+size
+ *                        ^ahead_start     ^ahead_start+ahead_size
+ *
+ *         ^ When this page is read, we submit I/O for the
+ *           ahead window.
+ *
+ * A `readahead hit' occurs when a read request is made against a page which is
+ * the next sequential page. Ahead window calculations are done only when it
+ * is time to submit a new IO.  The code ramps up the size aggressively at first,
+ * but slows down as it approaches max_readahead.
+ *
+ * Any seek/random IO will result in readahead being turned off.  It will resume
+ * at the first sequential access.
+ *
+ * There is a special-case: if the first page which the application tries to
+ * read happens to be the first page of the file, it is assumed that a linear
+ * read is about to happen and the window is immediately set to the initial size
+ * based on I/O request size and the max_readahead.
+ *
+ * This function is to be called for every read request, rather than when
+ * it is time to perform readahead.  It is called only once for the entire I/O
+ * regardless of size unless readahead is unable to start enough I/O to satisfy
+ * the request (I/O request > max_readahead).
+ */
+/*
+ * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates
+ * all the pages first, then submits them all for I/O. This avoids the very bad
+ * behaviour which would occur if page allocations are causing VM writeback.
+ * We really don't want to intermingle reads and writes like that.
+ *
+ * At most @nr_to_read pages starting at page index @offset are considered.
+ * Pages past the end of the file and pages already present in the page cache
+ * are not allocated; an allocation failure ends the scan early.
+ *
+ * Returns the number of pages that were allocated and handed to read_pages().
+ * That is 0 for an empty file or when nothing needed to be read.  This
+ * function never returns a negative value; it is the congestion-aware
+ * wrapper do_page_cache_readahead() that returns -1 on request queue
+ * congestion.
+ */
+static int
+__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+                        unsigned long offset, unsigned long nr_to_read)
+{
+        struct inode *inode = mapping->host;
+        struct page *page;
+        unsigned long end_index;        /* The last page we want to read */
+        LIST_HEAD(page_pool);
+        unsigned long page_idx;         /* same type as nr_to_read and offset */
+        int ret = 0;
+        loff_t isize = i_size_read(inode);
+
+        if (isize == 0)
+                goto out;
+        end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+        /*
+         * Preallocate as many pages as we will need.
+         */
+        read_lock_irq(&mapping->tree_lock);
+        for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
+                unsigned long page_offset = offset + page_idx;
+
+                if (page_offset > end_index)
+                        break;
+                page = radix_tree_lookup(&mapping->page_tree, page_offset);
+                if (page)
+                        continue;
+                /* The lock is dropped around the page allocation */
+                read_unlock_irq(&mapping->tree_lock);
+                page = page_cache_alloc_cold(mapping);
+                read_lock_irq(&mapping->tree_lock);
+                if (!page)
+                        break;
+                page->index = page_offset;
+                list_add(&page->lru, &page_pool);
+                ret++;
+        }
+        read_unlock_irq(&mapping->tree_lock);
+        /*
+         * Now start the IO.  We ignore I/O errors - if the page is not
+         * uptodate then the caller will launch readpage again, and
+         * will then handle the error.
+         */
+        if (ret)
+                read_pages(mapping, filp, &page_pool, ret);
+        BUG_ON(!list_empty(&page_pool));
+out:
+        return ret;
+}
+/*
+ * Read @nr_to_read pages starting at @offset without checking for queue
+ * congestion.  The readahead is chunked into 2 megabyte units, so that we
+ * don't pin too much memory at once.
+ *
+ * Returns -EINVAL if the mapping has neither ->readpage nor ->readpages.
+ * Otherwise returns the sum of the per-chunk results of
+ * __do_page_cache_readahead(), or the first negative result should a chunk
+ * report one.
+ */
+int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
+                unsigned long offset, unsigned long nr_to_read)
+{
+        const unsigned long max_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
+        int total = 0;
+
+        if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
+                return -EINVAL;
+        while (nr_to_read) {
+                unsigned long this_chunk =
+                        nr_to_read < max_chunk ? nr_to_read : max_chunk;
+                int nr = __do_page_cache_readahead(mapping, filp,
+                                                offset, this_chunk);
+
+                if (nr < 0)
+                        return nr;
+                total += nr;
+                offset += this_chunk;
+                nr_to_read -= this_chunk;
+        }
+        return total;
+}
+/*
+ * Check how effective readahead is being.  @actual is the amount of I/O that
+ * was started for a request of @nr_to_read pages.
+ *
+ * If any I/O was started the cache-hit counter is reset.  If none was
+ * started, @nr_to_read is added to the counter; once it reaches
+ * VM_MAX_CACHE_HIT the file is considered to be in pagecache, readahead is
+ * switched off and RA_FLAG_INCACHE is set.
+ *
+ * Returns 0 if readahead was switched off, 1 otherwise.
+ */
+static inline int check_ra_success(struct file_ra_state *ra,
+                        unsigned long nr_to_read, unsigned long actual)
+{
+        if (actual != 0) {
+                ra->cache_hit = 0;
+                return 1;
+        }
+        ra->cache_hit += nr_to_read;
+        if (ra->cache_hit < VM_MAX_CACHE_HIT)
+                return 1;
+        /* ra_off() clears all flags, so INCACHE must be set afterwards */
+        ra_off(ra);
+        ra->flags |= RA_FLAG_INCACHE;
+        return 0;
+}
+/*
+ * Congestion-aware readahead: if the backing device is read-congested the
+ * I/O is skipped and -1 is returned.  Otherwise the result of
+ * __do_page_cache_readahead() is returned.
+ *
+ * force_page_cache_readahead() does not perform this congestion check.
+ */
+int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+                        unsigned long offset, unsigned long nr_to_read)
+{
+        int nr = -1;
+
+        if (!bdi_read_congested(mapping->backing_dev_info))
+                nr = __do_page_cache_readahead(mapping, filp,
+                                               offset, nr_to_read);
+        return nr;
+}
+/*
+ * Read 'nr_to_read' pages starting at page 'offset'.  When 'block' is zero
+ * the read is skipped if the backing device is read-congested; when it is
+ * non-zero the read is issued without looking at congestion.
+ *
+ * Returns 1 meaning 'success' if the read was issued without switching off
+ * readahead mode.  Returns 0 if the read was skipped because of congestion,
+ * or if check_ra_success() switched readahead off.
+ */
+static int
+blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
+                        unsigned long offset, unsigned long nr_to_read,
+                        struct file_ra_state *ra, int block)
+{
+        int started;
+
+        if (!block) {
+                if (bdi_read_congested(mapping->backing_dev_info))
+                        return 0;
+        }
+        started = __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
+        return check_ra_success(ra, nr_to_read, started);
+}
+/*
+ * Set up a new ahead window immediately after the current window, sized by
+ * get_next_ra_size(), and start I/O for it.  The congestion check is bypassed
+ * when @force is set or when prev_page already lies inside the new window.
+ *
+ * Returns the result of blockable_page_cache_readahead().  If that is 0 and
+ * @force is not set, the ahead window is reset before returning.
+ */
+static int make_ahead_window(struct address_space *mapping, struct file *filp,
+                                struct file_ra_state *ra, int force)
+{
+        int block, ret;
+
+        ra->ahead_start = ra->start + ra->size;
+        ra->ahead_size = get_next_ra_size(ra);
+        block = force || (ra->prev_page >= ra->ahead_start);
+        ret = blockable_page_cache_readahead(mapping, filp,
+                        ra->ahead_start, ra->ahead_size, ra, block);
+        if (ret || force)
+                return ret;
+        /*
+         * A read failure in blocking mode implies the pages are all
+         * cached; a read failure in non-blocking mode implies we were
+         * reading more pages than requested in this call.  Either way
+         * we can safely assume that all the pages requested in this
+         * call have been taken care of.
+         *
+         * Just reset the ahead window in case we failed due to
+         * congestion.  The ahead window will anyway be closed in case
+         * we failed due to excessive page cache hits.
+         */
+        ra->ahead_start = 0;
+        ra->ahead_size = 0;
+        return ret;
+}
+/*
+ * page_cache_readahead is the main function.  It performs the adaptive
+ * readahead window size management and submits the readahead I/O.
+ *
+ * @offset is the page index at which the read starts and @req_size is the
+ * length of the read in pages.
+ *
+ * Returns ra->prev_page + 1 as it stands when the function finishes.
+ */
+unsigned long
+page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
+                     struct file *filp, unsigned long offset,
+                     unsigned long req_size)
+{
+        unsigned long max, newsize;
+        int sequential;
+        /*
+         * We avoid doing extra work and bogusly perturbing the readahead
+         * window expansion logic: if the request starts on the page we
+         * looked at last, drop that page from the request and, provided
+         * anything is left, continue from the following page.
+         */
+        if (offset == ra->prev_page && --req_size)
+                ++offset;
+        /* Note that prev_page == -1 if it is a first read */
+        sequential = (offset == ra->prev_page + 1);
+        ra->prev_page = offset;
+        max = get_max_readahead(ra);
+        newsize = min(req_size, max);
+        /* No readahead or sub-page sized read or file already in cache */
+        if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
+                goto out;
+        ra->prev_page += newsize - 1;
+        /*
+         * Special case - first read at start of file. We'll assume it's
+         * a whole-file read and grow the window fast.  Or detect first
+         * sequential access
+         */
+        if (sequential && ra->size == 0) {
+                ra->size = get_init_ra_size(newsize, max);
+                ra->start = offset;
+                if (!blockable_page_cache_readahead(mapping, filp, offset,
+                                                         ra->size, ra, 1))
+                        goto out;
+                /*
+                 * If the request size is larger than our max readahead, we
+                 * at least want to be sure that we get 2 IOs in flight and
+                 * we know that we will definitely need the new I/O.
+                 * Once we do this, subsequent calls should be able to overlap
+                 * IOs, thus preventing stalls, so issue the ahead window
+                 * immediately.
+                 */
+                if (req_size >= max)
+                        make_ahead_window(mapping, filp, ra, 1);
+                goto out;
+        }
+        /*
+         * Now handle the random case:
+         * partial page reads and first access were handled above,
+         * so this must be the next page otherwise it is random.
+         * Readahead is switched off and only the requested pages are read.
+         */
+        if (!sequential) {
+                ra_off(ra);
+                blockable_page_cache_readahead(mapping, filp, offset,
+                                 newsize, ra, 1);
+                goto out;
+        }
+        /*
+         * If we get here we are doing sequential IO and this was not the first
+         * occurrence (ie we have an existing window)
+         */
+        if (ra->ahead_start == 0) {      /* no ahead window yet */
+                if (!make_ahead_window(mapping, filp, ra, 0))
+                        goto out;
+        }
+        /*
+         * Already have an ahead window, check if we crossed into it.
+         * If so, shift windows and issue a new ahead window.
+         * Only return the #pages that are in the current window, so that
+         * we get called back on the first page of the ahead window which
+         * will allow us to submit more IO.
+         */
+        if (ra->prev_page >= ra->ahead_start) {
+                ra->start = ra->ahead_start;
+                ra->size = ra->ahead_size;
+                make_ahead_window(mapping, filp, ra, 0);
+        }
+out:
+        return ra->prev_page + 1;
+}
+/*
+ * handle_ra_miss() is called when it is known that a page which should have
+ * been present in the pagecache (we just did some readahead there) was in fact
+ * not found.  This will happen if it was evicted by the VM (readahead
+ * thrashing).
+ *
+ * RA_FLAG_MISS is set so that get_next_ra_size() reduces the window on the
+ * next read, and RA_FLAG_INCACHE is cleared.  @mapping and @offset are not
+ * used.
+ */
+void handle_ra_miss(struct address_space *mapping,
+                struct file_ra_state *ra, pgoff_t offset)
+{
+        ra->flags = (ra->flags | RA_FLAG_MISS) & ~RA_FLAG_INCACHE;
+}
+/*
+ * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
+ * sensible upper limit: @nr itself, or half of the inactive plus free counts
+ * reported by __get_zone_counts() for the current node, whichever is smaller.
+ */
+unsigned long max_sane_readahead(unsigned long nr)
+{
+        unsigned long active, inactive, free;
+        unsigned long limit;
+
+        __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id()));
+        limit = (inactive + free) / 2;
+        return nr < limit ? nr : limit;
+}

diff --git a/mm/readahead.c b/mm/readahead.c new file mode 100644 index 000000000000..b840e7c6ea74 --- /dev/null +++ b/mm/readahead.c
@@ -0,0 +1,557 @@
	1	/*
	2	* mm/readahead.c - address_space-level file readahead.
	3	*
	4	* Copyright (C) 2002, Linus Torvalds
	5	*
	6	* 09Apr2002 akpm@zip.com.au
	7	* Initial version.
	8	*/
	9
	10	#include <linux/kernel.h>
	11	#include <linux/fs.h>
	12	#include <linux/mm.h>
	13	#include <linux/module.h>
	14	#include <linux/blkdev.h>
	15	#include <linux/backing-dev.h>
	16	#include <linux/pagevec.h>
	17
	18	void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
	19	{
	20	}
	21	EXPORT_SYMBOL(default_unplug_io_fn);
	22
	23	struct backing_dev_info default_backing_dev_info = {
	24	.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
	25	.state = 0,
	26	.capabilities = BDI_CAP_MAP_COPY,
	27	.unplug_io_fn = default_unplug_io_fn,
	28	};
	29	EXPORT_SYMBOL_GPL(default_backing_dev_info);
	30
	31	/*
	32	* Initialise a struct file's readahead state. Assumes that the caller has
	33	* memset *ra to zero.
	34	*/
	35	void
	36	file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
	37	{
	38	ra->ra_pages = mapping->backing_dev_info->ra_pages;
	39	ra->prev_page = -1;
	40	}
	41
	42	/*
	43	* Return max readahead size for this inode in number-of-pages.
	44	*/
	45	static inline unsigned long get_max_readahead(struct file_ra_state *ra)
	46	{
	47	return ra->ra_pages;
	48	}
	49
	50	static inline unsigned long get_min_readahead(struct file_ra_state *ra)
	51	{
	52	return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
	53	}
	54
	55	static inline void ra_off(struct file_ra_state *ra)
	56	{
	57	ra->start = 0;
	58	ra->flags = 0;
	59	ra->size = 0;
	60	ra->ahead_start = 0;
	61	ra->ahead_size = 0;
	62	return;
	63	}
	64
	65	/*
	66	* Set the initial window size, round to next power of 2 and square
	67	* for small size, x 4 for medium, and x 2 for large
	68	* for 128k (32 page) max ra
	69	* 1-8 page = 32k initial, > 8 page = 128k initial
	70	*/
	71	static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
	72	{
	73	unsigned long newsize = roundup_pow_of_two(size);
	74
	75	if (newsize <= max / 64)
	76	newsize = newsize * newsize;
	77	else if (newsize <= max / 4)
	78	newsize = max / 4;
	79	else
	80	newsize = max;
	81	return newsize;
	82	}
	83
	84	/*
	85	* Set the new window size, this is called only when I/O is to be submitted,
	86	* not for each call to readahead. If a cache miss occured, reduce next I/O
	87	* size, else increase depending on how close to max we are.
	88	*/
	89	static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
	90	{
	91	unsigned long max = get_max_readahead(ra);
	92	unsigned long min = get_min_readahead(ra);
	93	unsigned long cur = ra->size;
	94	unsigned long newsize;
	95
	96	if (ra->flags & RA_FLAG_MISS) {
	97	ra->flags &= ~RA_FLAG_MISS;
	98	newsize = max((cur - 2), min);
	99	} else if (cur < max / 16) {
	100	newsize = 4 * cur;
	101	} else {
	102	newsize = 2 * cur;
	103	}
	104	return min(newsize, max);
	105	}
	106
	107	#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
	108
	109	/**
	110	* read_cache_pages - populate an address space with some pages, and
	111	* start reads against them.
	112	* @mapping: the address_space
	113	* @pages: The address of a list_head which contains the target pages. These
	114	* pages have their ->index populated and are otherwise uninitialised.
	115	* @filler: callback routine for filling a single page.
	116	* @data: private data for the callback routine.
	117	*
	118	* Hides the details of the LRU cache etc from the filesystems.
	119	*/
	120	int read_cache_pages(struct address_space *mapping, struct list_head *pages,
	121	int (*filler)(void *, struct page *), void *data)
	122	{
	123	struct page *page;
	124	struct pagevec lru_pvec;
	125	int ret = 0;
	126
	127	pagevec_init(&lru_pvec, 0);
	128
	129	while (!list_empty(pages)) {
	130	page = list_to_page(pages);
	131	list_del(&page->lru);
	132	if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
	133	page_cache_release(page);
	134	continue;
	135	}
	136	ret = filler(data, page);
	137	if (!pagevec_add(&lru_pvec, page))
	138	__pagevec_lru_add(&lru_pvec);
	139	if (ret) {
	140	while (!list_empty(pages)) {
	141	struct page *victim;
	142
	143	victim = list_to_page(pages);
	144	list_del(&victim->lru);
	145	page_cache_release(victim);
	146	}
	147	break;
	148	}
	149	}
	150	pagevec_lru_add(&lru_pvec);
	151	return ret;
	152	}
	153
	154	EXPORT_SYMBOL(read_cache_pages);
	155
	156	static int read_pages(struct address_space *mapping, struct file *filp,
	157	struct list_head *pages, unsigned nr_pages)
	158	{
	159	unsigned page_idx;
	160	struct pagevec lru_pvec;
	161	int ret = 0;
	162
	163	if (mapping->a_ops->readpages) {
	164	ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
	165	goto out;
	166	}
	167
	168	pagevec_init(&lru_pvec, 0);
	169	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
	170	struct page *page = list_to_page(pages);
	171	list_del(&page->lru);
	172	if (!add_to_page_cache(page, mapping,
	173	page->index, GFP_KERNEL)) {
	174	mapping->a_ops->readpage(filp, page);
	175	if (!pagevec_add(&lru_pvec, page))
	176	__pagevec_lru_add(&lru_pvec);
	177	} else {
	178	page_cache_release(page);
	179	}
	180	}
	181	pagevec_lru_add(&lru_pvec);
	182	out:
	183	return ret;
	184	}
	185
	186	/*
	187	* Readahead design.
	188	*
	189	* The fields in struct file_ra_state represent the most-recently-executed
	190	* readahead attempt:
	191	*
	192	* start: Page index at which we started the readahead
	193	* size: Number of pages in that read
	194	* Together, these form the "current window".
	195	* Together, start and size represent the `readahead window'.
	196	* prev_page: The page which the readahead algorithm most-recently inspected.
	197	* It is mainly used to detect sequential file reading.
	198	* If page_cache_readahead sees that it is again being called for
	199	* a page which it just looked at, it can return immediately without
	200	* making any state changes.
	201	* ahead_start,
	202	* ahead_size: Together, these form the "ahead window".
	203	* ra_pages: The externally controlled max readahead for this fd.
	204	*
	205	* When readahead is in the off state (size == 0), readahead is disabled.
	206	* In this state, prev_page is used to detect the resumption of sequential I/O.
	207	*
	208	* The readahead code manages two windows - the "current" and the "ahead"
	209	* windows. The intent is that while the application is walking the pages
	210	* in the current window, I/O is underway on the ahead window. When the
	211	* current window is fully traversed, it is replaced by the ahead window
	212	* and the ahead window is invalidated. When this copying happens, the
	213	* new current window's pages are probably still locked. So
	214	* we submit a new batch of I/O immediately, creating a new ahead window.
	215	*
	216	* So:
	217	*
	218	* ----|----------------|----------------|-----
	219	* ^start ^start+size
	220	* ^ahead_start ^ahead_start+ahead_size
	221	*
	222	* ^ When this page is read, we submit I/O for the
	223	* ahead window.
	224	*
	225	* A `readahead hit' occurs when a read request is made against a page which is
	226	* the next sequential page. Ahead window calculations are done only when it
	227	* is time to submit a new IO. The code ramps up the size agressively at first,
	228	* but slow down as it approaches max_readhead.
	229	*
	230	* Any seek/ramdom IO will result in readahead being turned off. It will resume
	231	* at the first sequential access.
	232	*
	233	* There is a special-case: if the first page which the application tries to
	234	* read happens to be the first page of the file, it is assumed that a linear
	235	* read is about to happen and the window is immediately set to the initial size
	236	* based on I/O request size and the max_readahead.
	237	*
	238	* This function is to be called for every read request, rather than when
	239	* it is time to perform readahead. It is called only once for the entire I/O
	240	* regardless of size unless readahead is unable to start enough I/O to satisfy
	241	* the request (I/O request > max_readahead).
	242	*/
	243
	244	/*
	245	* do_page_cache_readahead actually reads a chunk of disk. It allocates all
	246	* the pages first, then submits them all for I/O. This avoids the very bad
	247	* behaviour which would occur if page allocations are causing VM writeback.
	248	* We really don't want to intermingle reads and writes like that.
	249	*
	250	* Returns the number of pages requested, or the maximum amount of I/O allowed.
	251	*
	252	* do_page_cache_readahead() returns -1 if it encountered request queue
	253	* congestion.
	254	*/
	255	static int
	256	__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
	257	unsigned long offset, unsigned long nr_to_read)
	258	{
	259	struct inode *inode = mapping->host;
	260	struct page *page;
	261	unsigned long end_index; /* The last page we want to read */
	262	LIST_HEAD(page_pool);
	263	int page_idx;
	264	int ret = 0;
	265	loff_t isize = i_size_read(inode);
	266
	267	if (isize == 0)
	268	goto out;
	269
	270	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
	271
	272	/*
	273	* Preallocate as many pages as we will need.
	274	*/
	275	read_lock_irq(&mapping->tree_lock);
	276	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
	277	unsigned long page_offset = offset + page_idx;
	278
	279	if (page_offset > end_index)
	280	break;
	281
	282	page = radix_tree_lookup(&mapping->page_tree, page_offset);
	283	if (page)
	284	continue;
	285
	286	read_unlock_irq(&mapping->tree_lock);
	287	page = page_cache_alloc_cold(mapping);
	288	read_lock_irq(&mapping->tree_lock);
	289	if (!page)
	290	break;
	291	page->index = page_offset;
	292	list_add(&page->lru, &page_pool);
	293	ret++;
	294	}
	295	read_unlock_irq(&mapping->tree_lock);
	296
	297	/*
	298	* Now start the IO. We ignore I/O errors - if the page is not
	299	* uptodate then the caller will launch readpage again, and
	300	* will then handle the error.
	301	*/
	302	if (ret)
	303	read_pages(mapping, filp, &page_pool, ret);
	304	BUG_ON(!list_empty(&page_pool));
	305	out:
	306	return ret;
	307	}
	308
	309	/*
	310	* Chunk the readahead into 2 megabyte units, so that we don't pin too much
	311	* memory at once.
	312	*/
	313	int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
	314	unsigned long offset, unsigned long nr_to_read)
	315	{
	316	int ret = 0;
	317
	318	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
	319	return -EINVAL;
	320
	321	while (nr_to_read) {
	322	int err;
	323
	324	unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
	325
	326	if (this_chunk > nr_to_read)
	327	this_chunk = nr_to_read;
	328	err = __do_page_cache_readahead(mapping, filp,
	329	offset, this_chunk);
	330	if (err < 0) {
	331	ret = err;
	332	break;
	333	}
	334	ret += err;
	335	offset += this_chunk;
	336	nr_to_read -= this_chunk;
	337	}
	338	return ret;
	339	}
	340
	341	/*
	342	* Check how effective readahead is being. If the amount of started IO is
	343	* less than expected then the file is partly or fully in pagecache and
	344	* readahead isn't helping.
	345	*
	346	*/
	347	static inline int check_ra_success(struct file_ra_state *ra,
	348	unsigned long nr_to_read, unsigned long actual)
	349	{
	350	if (actual == 0) {
	351	ra->cache_hit += nr_to_read;
	352	if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
	353	ra_off(ra);
	354	ra->flags |= RA_FLAG_INCACHE;
	355	return 0;
	356	}
	357	} else {
	358	ra->cache_hit=0;
	359	}
	360	return 1;
	361	}
	362
	363	/*
	364	* This version skips the IO if the queue is read-congested, and will tell the
	365	* block layer to abandon the readahead if request allocation would block.
	366	*
	367	* force_page_cache_readahead() will ignore queue congestion and will block on
	368	* request queues.
	369	*/
	370	int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
	371	unsigned long offset, unsigned long nr_to_read)
	372	{
	373	if (bdi_read_congested(mapping->backing_dev_info))
	374	return -1;
	375
	376	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
	377	}
	378
	379	/*
	380	* Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
	381	* is set wait till the read completes. Otherwise attempt to read without
	382	* blocking.
	383	* Returns 1 meaning 'success' if read is succesfull without switching off
	384	* readhaead mode. Otherwise return failure.
	385	*/
	386	static int
	387	blockable_page_cache_readahead(struct address_space mapping, struct file filp,
	388	unsigned long offset, unsigned long nr_to_read,
	389	struct file_ra_state *ra, int block)
	390	{
	391	int actual;
	392
	393	if (!block && bdi_read_congested(mapping->backing_dev_info))
	394	return 0;
	395
	396	actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
	397
	398	return check_ra_success(ra, nr_to_read, actual);
	399	}
	400
	401	static int make_ahead_window(struct address_space mapping, struct file filp,
	402	struct file_ra_state *ra, int force)
	403	{
	404	int block, ret;
	405
	406	ra->ahead_size = get_next_ra_size(ra);
	407	ra->ahead_start = ra->start + ra->size;
	408
	409	block = force \|\| (ra->prev_page >= ra->ahead_start);
	410	ret = blockable_page_cache_readahead(mapping, filp,
	411	ra->ahead_start, ra->ahead_size, ra, block);
	412
	413	if (!ret && !force) {
	414	/* A read failure in blocking mode, implies pages are
	415	* all cached. So we can safely assume we have taken
	416	* care of all the pages requested in this call.
	417	* A read failure in non-blocking mode, implies we are
	418	* reading more pages than requested in this call. So
	419	* we safely assume we have taken care of all the pages
	420	* requested in this call.
	421	*
	422	* Just reset the ahead window in case we failed due to
	423	* congestion. The ahead window will any way be closed
	424	* in case we failed due to excessive page cache hits.
	425	*/
	426	ra->ahead_start = 0;
	427	ra->ahead_size = 0;
	428	}
	429
	430	return ret;
	431	}
	432
	433	/*
	434	* page_cache_readahead is the main function. If performs the adaptive
	435	* readahead window size management and submits the readahead I/O.
	436	*/
	437	unsigned long
	438	page_cache_readahead(struct address_space mapping, struct file_ra_state ra,
	439	struct file *filp, unsigned long offset,
	440	unsigned long req_size)
	441	{
	442	unsigned long max, newsize;
	443	int sequential;
	444
	445	/*
	446	* We avoid doing extra work and bogusly perturbing the readahead
	447	* window expansion logic.
	448	*/
	449	if (offset == ra->prev_page && --req_size)
	450	++offset;
	451
	452	/* Note that prev_page == -1 if it is a first read */
	453	sequential = (offset == ra->prev_page + 1);
	454	ra->prev_page = offset;
	455
	456	max = get_max_readahead(ra);
	457	newsize = min(req_size, max);
	458
	459	/* No readahead or sub-page sized read or file already in cache */
	460	if (newsize == 0 \|\| (ra->flags & RA_FLAG_INCACHE))
	461	goto out;
	462
	463	ra->prev_page += newsize - 1;
	464
	465	/*
	466	* Special case - first read at start of file. We'll assume it's
	467	* a whole-file read and grow the window fast. Or detect first
	468	* sequential access
	469	*/
	470	if (sequential && ra->size == 0) {
	471	ra->size = get_init_ra_size(newsize, max);
	472	ra->start = offset;
	473	if (!blockable_page_cache_readahead(mapping, filp, offset,
	474	ra->size, ra, 1))
	475	goto out;
	476
	477	/*
	478	* If the request size is larger than our max readahead, we
	479	* at least want to be sure that we get 2 IOs in flight and
	480	* we know that we will definitly need the new I/O.
	481	* once we do this, subsequent calls should be able to overlap
	482	* IOs,* thus preventing stalls. so issue the ahead window
	483	* immediately.
	484	*/
	485	if (req_size >= max)
	486	make_ahead_window(mapping, filp, ra, 1);
	487
	488	goto out;
	489	}
	490
	491	/*
	492	* Now handle the random case:
	493	* partial page reads and first access were handled above,
	494	* so this must be the next page otherwise it is random
	495	*/
	496	if (!sequential) {
	497	ra_off(ra);
	498	blockable_page_cache_readahead(mapping, filp, offset,
	499	newsize, ra, 1);
	500	goto out;
	501	}
	502
	503	/*
	504	* If we get here we are doing sequential IO and this was not the first
	505	* occurence (ie we have an existing window)
	506	*/
	507
	508	if (ra->ahead_start == 0) { /* no ahead window yet */
	509	if (!make_ahead_window(mapping, filp, ra, 0))
	510	goto out;
	511	}
	512	/*
	513	* Already have an ahead window, check if we crossed into it.
	514	* If so, shift windows and issue a new ahead window.
	515	* Only return the #pages that are in the current window, so that
	516	* we get called back on the first page of the ahead window which
	517	* will allow us to submit more IO.
	518	*/
	519	if (ra->prev_page >= ra->ahead_start) {
	520	ra->start = ra->ahead_start;
	521	ra->size = ra->ahead_size;
	522	make_ahead_window(mapping, filp, ra, 0);
	523	}
	524
	525	out:
	526	return ra->prev_page + 1;
	527	}
	528
	529	/*
	530	* handle_ra_miss() is called when it is known that a page which should have
	531	* been present in the pagecache (we just did some readahead there) was in fact
	532	* not found. This will happen if it was evicted by the VM (readahead
	533	* thrashing)
	534	*
	535	* Turn on the cache miss flag in the RA struct, this will cause the RA code
	536	* to reduce the RA size on the next read.
	537	*/
	538	void handle_ra_miss(struct address_space *mapping,
	539	struct file_ra_state *ra, pgoff_t offset)
	540	{
	541	ra->flags \|= RA_FLAG_MISS;
	542	ra->flags &= ~RA_FLAG_INCACHE;
	543	}
	544
	545	/*
	546	* Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
	547	* sensible upper limit.
	548	*/
	549	unsigned long max_sane_readahead(unsigned long nr)
	550	{
	551	unsigned long active;
	552	unsigned long inactive;
	553	unsigned long free;
	554
	555	__get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id()));
	556	return min(nr, (inactive + free) / 2);
	557	}