mm: make swapin readahead skip over holes

Ever since abandoning the virtual scan of processes, for scalability reasons, swap space has been a little more fragmented than before. This can lead to the situation where a large memory user is killed, swap space ends up full of "holes" and swapin readahead is totally ineffective.

On my home system, after killing a leaky firefox it took over an hour to page just under 2GB of memory back in, slowing the virtual machines down to a crawl.

This patch makes swapin readahead simply skip over holes, instead of stopping at them. This allows the system to swap things back in at rates of several MB/second, instead of a few hundred kB/second.

The checks done in valid_swaphandles are already done in read_swap_cache_async as well, allowing us to remove a fair amount of code.

[akpm@linux-foundation.org: fix it for page_cluster >= 32]
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Adrian Drzewiecki <z@drze.net>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Rik van Riel <riel@redhat.com> 2012-03-21 19:33:50 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-03-21 20:54:56 -0400
commit: 67f96aa252e606cdf6c3cf1032952ec207ec0cf0 (patch)
tree: a5a4299dd32789831eda558b51c0120272846664 /mm
parent: c38446cc65e1f2b3eb8630c53943b94c4f65f670 (diff)
2 files changed, 11 insertions, 65 deletions
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ea6b32d61873..9d3dd3763cf7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -372,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
                        struct vm_area_struct *vma, unsigned long addr)
 {
-        int nr_pages;
        struct page *page;
-        unsigned long offset;
+        unsigned long offset = swp_offset(entry);
-        unsigned long end_offset;
+        unsigned long start_offset, end_offset;
+        unsigned long mask = (1UL << page_cluster) - 1;
-        /*
+        /* Read a page_cluster sized and aligned cluster around offset. */
-         * Get starting offset for readaround, and number of pages to read.
+        start_offset = offset & ~mask;
-         * Adjust starting address by readbehind (for NUMA interleave case)?
+        end_offset = offset | mask;
-         * No, it's very unlikely that swap layout would follow vma layout,
+        if (!start_offset)      /* First page is swap header. */
-         * more likely that neighbouring swap pages came from the same node:
+                start_offset++;
-         * so use the same "addr" to choose the same node for each swap read.
-         */
+        for (offset = start_offset; offset <= end_offset ; offset++) {
-        nr_pages = valid_swaphandles(entry, &offset);
-        for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
                /* Ok, do the async read-ahead now */
                page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
                                                gfp_mask, vma, addr);
                if (!page)
-                        break;
+                        continue;
                page_cache_release(page);
        }
        lru_add_drain();        /* Push any new pages onto the LRU now */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 44595a373e42..b82c028cfcc6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2288,58 +2288,6 @@ int swapcache_prepare(swp_entry_t entry)
 }
 /*
- * swap_lock prevents swap_map being freed. Don't grab an extra
- * reference on the swaphandle, it doesn't matter if it becomes unused.
- */
-int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
-{
-        struct swap_info_struct *si;
-        int our_page_cluster = page_cluster;
-        pgoff_t target, toff;
-        pgoff_t base, end;
-        int nr_pages = 0;
-        if (!our_page_cluster)  /* no readahead */
-                return 0;
-        si = swap_info[swp_type(entry)];
-        target = swp_offset(entry);
-        base = (target >> our_page_cluster) << our_page_cluster;
-        end = base + (1 << our_page_cluster);
-        if (!base)              /* first page is swap header */
-                base++;
-        spin_lock(&swap_lock);
-        if (end > si->max)      /* don't go beyond end of map */
-                end = si->max;
-        /* Count contiguous allocated slots above our target */
-        for (toff = target; ++toff < end; nr_pages++) {
-                /* Don't read in free or bad pages */
-                if (!si->swap_map[toff])
-                        break;
-                if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
-                        break;
-        }
-        /* Count contiguous allocated slots below our target */
-        for (toff = target; --toff >= base; nr_pages++) {
-                /* Don't read in free or bad pages */
-                if (!si->swap_map[toff])
-                        break;
-                if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
-                        break;
-        }
-        spin_unlock(&swap_lock);
-        /*
-         * Indicate starting offset, and return number of pages to get:
-         * if only 1, say 0, since there's then no readahead to be done.
-         */
-        *offset = ++toff;
-        return nr_pages? ++nr_pages: 0;
-}
-/*
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
 * page of the original vmalloc'ed swap_map, to hold the continuation count
author	Rik van Riel <riel@redhat.com>	2012-03-21 19:33:50 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-03-21 20:54:56 -0400
commit	67f96aa252e606cdf6c3cf1032952ec207ec0cf0 (patch)
tree	a5a4299dd32789831eda558b51c0120272846664 /mm
parent	c38446cc65e1f2b3eb8630c53943b94c4f65f670 (diff)

diff --git a/mm/swap_state.c b/mm/swap_state.c index ea6b32d61873..9d3dd3763cf7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c
@@ -372,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
372	struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,	372	struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
373	struct vm_area_struct *vma, unsigned long addr)	373	struct vm_area_struct *vma, unsigned long addr)
374	{	374	{
375	int nr_pages;
376	struct page *page;	375	struct page *page;
377	unsigned long offset;	376	unsigned long offset = swp_offset(entry);
378	unsigned long end_offset;	377	unsigned long start_offset, end_offset;
		378	unsigned long mask = (1UL << page_cluster) - 1;
379		379
380	/*	380	/* Read a page_cluster sized and aligned cluster around offset. */
381	* Get starting offset for readaround, and number of pages to read.	381	start_offset = offset & ~mask;
382	* Adjust starting address by readbehind (for NUMA interleave case)?	382	end_offset = offset | mask;
383	* No, it's very unlikely that swap layout would follow vma layout,	383	if (!start_offset) /* First page is swap header. */
384	* more likely that neighbouring swap pages came from the same node:	384	start_offset++;
385	* so use the same "addr" to choose the same node for each swap read.	385
386	*/	386	for (offset = start_offset; offset <= end_offset ; offset++) {
387	nr_pages = valid_swaphandles(entry, &offset);
388	for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
389	/* Ok, do the async read-ahead now */	387	/* Ok, do the async read-ahead now */
390	page = read_swap_cache_async(swp_entry(swp_type(entry), offset),	388	page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
391	gfp_mask, vma, addr);	389	gfp_mask, vma, addr);
392	if (!page)	390	if (!page)
393	break;	391	continue;
394	page_cache_release(page);	392	page_cache_release(page);
395	}	393	}
396	lru_add_drain(); /* Push any new pages onto the LRU now */	394	lru_add_drain(); /* Push any new pages onto the LRU now */


diff --git a/mm/swapfile.c b/mm/swapfile.c index 44595a373e42..b82c028cfcc6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c
@@ -2288,58 +2288,6 @@ int swapcache_prepare(swp_entry_t entry)
2288	}	2288	}
2289		2289
2290	/*	2290	/*
2291	* swap_lock prevents swap_map being freed. Don't grab an extra
2292	* reference on the swaphandle, it doesn't matter if it becomes unused.
2293	*/
2294	int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2295	{
2296	struct swap_info_struct *si;
2297	int our_page_cluster = page_cluster;
2298	pgoff_t target, toff;
2299	pgoff_t base, end;
2300	int nr_pages = 0;
2301
2302	if (!our_page_cluster) /* no readahead */
2303	return 0;
2304
2305	si = swap_info[swp_type(entry)];
2306	target = swp_offset(entry);
2307	base = (target >> our_page_cluster) << our_page_cluster;
2308	end = base + (1 << our_page_cluster);
2309	if (!base) /* first page is swap header */
2310	base++;
2311
2312	spin_lock(&swap_lock);
2313	if (end > si->max) /* don't go beyond end of map */
2314	end = si->max;
2315
2316	/* Count contiguous allocated slots above our target */
2317	for (toff = target; ++toff < end; nr_pages++) {
2318	/* Don't read in free or bad pages */
2319	if (!si->swap_map[toff])
2320	break;
2321	if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2322	break;
2323	}
2324	/* Count contiguous allocated slots below our target */
2325	for (toff = target; --toff >= base; nr_pages++) {
2326	/* Don't read in free or bad pages */
2327	if (!si->swap_map[toff])
2328	break;
2329	if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2330	break;
2331	}
2332	spin_unlock(&swap_lock);
2333
2334	/*
2335	* Indicate starting offset, and return number of pages to get:
2336	* if only 1, say 0, since there's then no readahead to be done.
2337	*/
2338	*offset = ++toff;
2339	return nr_pages? ++nr_pages: 0;
2340	}
2341
2342	/*
2343	* add_swap_count_continuation - called when a swap count is duplicated	2291	* add_swap_count_continuation - called when a swap count is duplicated
2344	* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's	2292	* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
2345	* page of the original vmalloc'ed swap_map, to hold the continuation count	2293	* page of the original vmalloc'ed swap_map, to hold the continuation count