Diffstat (limited to 'mm/readahead.c')
-rw-r--r--	mm/readahead.c	145
1 file changed, 101 insertions, 44 deletions
diff --git a/mm/readahead.c b/mm/readahead.c
index 133b6d525513..aa1aa2345235 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -133,15 +133,12 @@ out:
 }
 
 /*
- * do_page_cache_readahead actually reads a chunk of disk. It allocates all
+ * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
  * the pages first, then submits them all for I/O. This avoids the very bad
  * behaviour which would occur if page allocations are causing VM writeback.
  * We really don't want to intermingle reads and writes like that.
  *
  * Returns the number of pages requested, or the maximum amount of I/O allowed.
- *
- * do_page_cache_readahead() returns -1 if it encountered request queue
- * congestion.
  */
 static int
 __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
@@ -210,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
 		return -EINVAL;
 
+	nr_to_read = max_sane_readahead(nr_to_read);
 	while (nr_to_read) {
 		int err;
 
@@ -231,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 }
 
 /*
- * This version skips the IO if the queue is read-congested, and will tell the
- * block layer to abandon the readahead if request allocation would block.
- *
- * force_page_cache_readahead() will ignore queue congestion and will block on
- * request queues.
- */
-int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
-			pgoff_t offset, unsigned long nr_to_read)
-{
-	if (bdi_read_congested(mapping->backing_dev_info))
-		return -1;
-
-	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
-}
-
-/*
  * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
  * sensible upper limit.
  */
@@ -259,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr)
 /*
  * Submit IO for the read-ahead request in file_ra_state.
  */
-static unsigned long ra_submit(struct file_ra_state *ra,
+unsigned long ra_submit(struct file_ra_state *ra,
 		       struct address_space *mapping, struct file *filp)
 {
 	int actual;
@@ -348,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
  */
 
 /*
+ * Count contiguously cached pages from @offset-1 to @offset-@max,
+ * this count is a conservative estimation of
+ * 	- length of the sequential read sequence, or
+ * 	- thrashing threshold in memory tight systems
+ */
+static pgoff_t count_history_pages(struct address_space *mapping,
+				   struct file_ra_state *ra,
+				   pgoff_t offset, unsigned long max)
+{
+	pgoff_t head;
+
+	rcu_read_lock();
+	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
+	rcu_read_unlock();
+
+	return offset - 1 - head;
+}
+
+/*
+ * page cache context based read-ahead
+ */
+static int try_context_readahead(struct address_space *mapping,
+				 struct file_ra_state *ra,
+				 pgoff_t offset,
+				 unsigned long req_size,
+				 unsigned long max)
+{
+	pgoff_t size;
+
+	size = count_history_pages(mapping, ra, offset, max);
+
+	/*
+	 * no history pages:
+	 * it could be a random read
+	 */
+	if (!size)
+		return 0;
+
+	/*
+	 * starts from beginning of file:
+	 * it is a strong indication of long-run stream (or whole-file-read)
+	 */
+	if (size >= offset)
+		size *= 2;
+
+	ra->start = offset;
+	ra->size = get_init_ra_size(size + req_size, max);
+	ra->async_size = ra->size;
+
+	return 1;
+}
+
+/*
  * A minimal readahead algorithm for trivial sequential/random reads.
  */
 static unsigned long
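
Note: the two helpers added above are simple enough to model outside the kernel. The sketch below is a hypothetical user-space rendering, not part of the patch: a boolean array stands in for the page-cache radix tree, prev_hole() plays the role of radix_tree_prev_hole(), and init_ra_size() is a trivial clamp standing in for get_init_ra_size(), whose real definition is not shown in this diff. All names and numbers in it are made up for illustration; it only shows how the count of contiguously cached history pages seeds the new readahead window.

/* Hypothetical user-space model of the context readahead helpers above. */
#include <stdbool.h>
#include <stdio.h>

#define NR_PAGES 256

static bool cached[NR_PAGES];		/* stand-in for the page cache */

/* Rough stand-in for radix_tree_prev_hole(): scan back at most max slots. */
static long prev_hole(long index, unsigned long max)
{
	long stop = index - (long)max;

	for (; index > stop && index >= 0; index--)
		if (!cached[index])
			return index;
	return index;	/* no hole in range: index is just before the scan */
}

/* Count pages cached contiguously just below @offset (at most @max). */
static unsigned long count_history_pages(unsigned long offset, unsigned long max)
{
	return offset - 1 - prev_hole((long)offset - 1, max);
}

/* Trivial clamp standing in for get_init_ra_size(), which this diff omits. */
static unsigned long init_ra_size(unsigned long size, unsigned long max)
{
	return size < max ? size : max;
}

int main(void)
{
	unsigned long offset = 100, req_size = 8, max = 32, i;
	unsigned long history, size;

	/* Pretend an earlier stream already read (and cached) pages 80..99. */
	for (i = 80; i < 100; i++)
		cached[i] = true;

	history = count_history_pages(offset, max);
	if (!history) {
		puts("no history pages: likely a random read, no readahead state");
		return 0;
	}
	size = history;
	if (size >= offset)		/* stream apparently runs from file start */
		size *= 2;

	size = init_ra_size(size + req_size, max);
	printf("history=%lu -> ra.start=%lu ra.size=%lu ra.async_size=%lu\n",
	       history, offset, size, size);
	return 0;
}

With 20 cached history pages the sketch prints a window of 28 pages starting at offset 100, entirely asynchronous, which is exactly the shape of state try_context_readahead() leaves behind.
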
@@ -356,34 +391,26 @@ ondemand_readahead(struct address_space *mapping,
 		   bool hit_readahead_marker, pgoff_t offset,
 		   unsigned long req_size)
 {
-	int max = ra->ra_pages;	/* max readahead pages */
-	pgoff_t prev_offset;
-	int sequential;
+	unsigned long max = max_sane_readahead(ra->ra_pages);
+
+	/*
+	 * start of file
+	 */
+	if (!offset)
+		goto initial_readahead;
 
 	/*
 	 * It's the expected callback offset, assume sequential access.
 	 * Ramp up sizes, and push forward the readahead window.
 	 */
-	if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
+	if ((offset == (ra->start + ra->size - ra->async_size) ||
 			offset == (ra->start + ra->size))) {
 		ra->start += ra->size;
 		ra->size = get_next_ra_size(ra, max);
 		ra->async_size = ra->size;
 		goto readit;
 	}
 
-	prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
-	sequential = offset - prev_offset <= 1UL || req_size > max;
-
-	/*
-	 * Standalone, small read.
-	 * Read as is, and do not pollute the readahead state.
-	 */
-	if (!hit_readahead_marker && !sequential) {
-		return __do_page_cache_readahead(mapping, filp,
-						offset, req_size, 0);
-	}
-
 	/*
 	 * Hit a marked page without valid readahead state.
 	 * E.g. interleaved reads.
@@ -394,7 +421,7 @@ ondemand_readahead(struct address_space *mapping,
 		pgoff_t start;
 
 		rcu_read_lock();
-		start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
+		start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
 		rcu_read_unlock();
 
 		if (!start || start - offset > max)
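
Note: this radix_tree_next_hole() change appears to be an optimization rather than a behaviour change. The branch runs when a readahead-marked page was hit, so the slot at offset is already occupied, and starting the hole search at offset+1 with a budget of max slots covers the same candidate range as the old call scanning max+1 slots from offset. A hypothetical check of that equivalence, with a boolean array in place of the radix tree and next_hole() as a made-up stand-in:

/* Hypothetical check that the two calls agree when slot[offset] is present. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_SLOTS 128

static bool present[NR_SLOTS];

/* Rough stand-in for radix_tree_next_hole(): scan at most max_scan slots. */
static unsigned long next_hole(unsigned long index, unsigned long max_scan)
{
	unsigned long i;

	for (i = 0; i < max_scan && index < NR_SLOTS; i++, index++)
		if (!present[index])
			break;
	return index;
}

int main(void)
{
	unsigned long offset = 40, max = 32, i;

	/* Marked page at offset plus a few readahead pages behind it. */
	for (i = offset; i < offset + 10; i++)
		present[i] = true;

	/* The old call also scanned the known-present slot at offset. */
	assert(next_hole(offset, max + 1) == next_hole(offset + 1, max));
	printf("first hole after %lu: %lu\n", offset, next_hole(offset + 1, max));
	return 0;
}
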
@@ -402,23 +429,53 @@ ondemand_readahead(struct address_space *mapping,
 
 		ra->start = start;
 		ra->size = start - offset;	/* old async_size */
+		ra->size += req_size;
 		ra->size = get_next_ra_size(ra, max);
 		ra->async_size = ra->size;
 		goto readit;
 	}
 
 	/*
-	 * It may be one of
-	 * - first read on start of file
-	 * - sequential cache miss
-	 * - oversize random read
-	 * Start readahead for it.
+	 * oversize read
+	 */
+	if (req_size > max)
+		goto initial_readahead;
+
+	/*
+	 * sequential cache miss
+	 */
+	if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
+		goto initial_readahead;
+
+	/*
+	 * Query the page cache and look for the traces(cached history pages)
+	 * that a sequential stream would leave behind.
+	 */
+	if (try_context_readahead(mapping, ra, offset, req_size, max))
+		goto readit;
+
+	/*
+	 * standalone, small random read
+	 * Read as is, and do not pollute the readahead state.
 	 */
+	return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
+
+initial_readahead:
 	ra->start = offset;
 	ra->size = get_init_ra_size(req_size, max);
 	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
 
 readit:
+	/*
+	 * Will this read hit the readahead marker made by itself?
+	 * If so, trigger the readahead marker hit now, and merge
+	 * the resulted next readahead window into the current one.
+	 */
+	if (offset == ra->start && ra->size == ra->async_size) {
+		ra->async_size = get_next_ra_size(ra, max);
+		ra->size += ra->async_size;
+	}
+
 	return ra_submit(ra, mapping, filp);
 }
 
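
Note: the new block at readit: handles windows that are entirely asynchronous, which is exactly what try_context_readahead() produces (it sets ra->async_size = ra->size, so the marker lands on the window's first page, ra->start + ra->size - ra->async_size == offset). Without the merge, the current read would submit the window and then immediately re-enter readahead on touching that marked first page; with it, the follow-up window is computed now and folded into a single submission. A hypothetical model of the arithmetic, assuming for illustration that get_next_ra_size() doubles the window up to max (its real definition is outside this diff; all values below are invented):

#include <stdio.h>

struct file_ra_state { unsigned long start, size, async_size; };

/* Illustrative stand-in; the real get_next_ra_size() is not in this diff. */
static unsigned long get_next_ra_size(struct file_ra_state *ra, unsigned long max)
{
	unsigned long next = ra->size * 2;

	return next < max ? next : max;
}

int main(void)
{
	/* State as left by try_context_readahead(): fully-async window. */
	struct file_ra_state ra = { .start = 100, .size = 16, .async_size = 16 };
	unsigned long offset = 100, max = 32;

	if (offset == ra.start && ra.size == ra.async_size) {
		ra.async_size = get_next_ra_size(&ra, max);
		ra.size += ra.async_size;
	}
	/*
	 * One submission now covers pages 100..147 with the marker moved to
	 * page 116, instead of 100..115 with the marker on page 100 itself,
	 * which the current read would trigger straight away.
	 */
	printf("start=%lu size=%lu async_size=%lu marker=%lu\n",
	       ra.start, ra.size, ra.async_size,
	       ra.start + ra.size - ra.async_size);
	return 0;
}
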