author     Huang Ying <ying.huang@intel.com>              2017-09-06 19:24:36 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org> 2017-09-06 20:27:29 -0400
commit     ec560175c0b6fce86994bdf036754d48122c5c87 (patch)
tree       7aacd0beae098c785452a8a8361e13e7ffe2bc73 /mm/swap_state.c
parent     c4fa63092f216737b60c789968371d9960a598e5 (diff)
mm, swap: VMA based swap readahead
The swap readahead is an important mechanism to reduce swap-in latency.
Although a purely sequential memory access pattern isn't very popular for
anonymous memory, space locality is still considered valid.

In the original swap readahead implementation, consecutive blocks in the
swap device are read ahead based on a global estimation of space locality.
But consecutive blocks in the swap device merely reflect the order of page
reclaim; they don't necessarily reflect the access pattern in virtual
memory.  And different tasks in the system may have different access
patterns, which makes the global space locality estimation incorrect.

In this patch, when a page fault occurs, the virtual pages near the fault
address are read ahead instead of the swap slots near the faulting swap
slot in the swap device.  This avoids reading ahead unrelated swap slots.
At the same time, swap readahead is changed to work per-VMA instead of
globally, so that the different access patterns of different VMAs can be
distinguished and different readahead policies applied accordingly.  The
original core readahead detection and scaling algorithm is reused, because
it is an effective algorithm for detecting space locality.

The tests and results are as follows.

Common test condition
=====================

Test machine: Xeon E5 v3 (2 sockets, 72 threads, 32G RAM)
Swap device: NVMe disk

Micro-benchmark with combined access pattern
============================================

vm-scalability, sequential swap test case, 4 processes to eat 50G of
virtual memory space, repeating the sequential memory writes for 300
seconds.  The first round of writing triggers swap-out; the following
rounds trigger sequential swap-in and swap-out.

At the same time, run the vm-scalability random swap test case in the
background: 8 processes to eat 30G of virtual memory space, repeating the
random memory writes for 300 seconds.  This triggers random swap-in in the
background.

This is a combined workload with sequential and random memory accesses at
the same time.  The result (for the sequential workload) is as follows:

                        Base            Optimized
                        ----            ---------
throughput              345413 KB/s     414029 KB/s (+19.9%)
latency.average         97.14 us        61.06 us (-37.1%)
latency.50th            2 us            1 us
latency.60th            2 us            1 us
latency.70th            98 us           2 us
latency.80th            160 us          2 us
latency.90th            260 us          217 us
latency.95th            346 us          369 us
latency.99th            1.34 ms         1.09 ms
ra_hit%                 52.69%          99.98%

The original swap readahead algorithm is confused by the background random
access workload, so its readahead hit rate is lower.  The VMA-based
readahead algorithm works much better.

Linpack
=======

The test memory size is bigger than the RAM to trigger swapping.

                        Base            Optimized
                        ----            ---------
elapsed_time            393.49 s        329.88 s (-16.2%)
ra_hit%                 86.21%          98.82%

The Linpack score shows no visible change between the base and optimized
kernels, but the elapsed time is reduced and the readahead hit rate is
improved, so the optimized kernel does better in the startup and teardown
stages.  The high absolute readahead hit rate also shows that space
locality is still valid in some practical workloads.
Link: http://lkml.kernel.org/r/20170807054038.1843-4-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
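
As an aside (not part of the patch): the difference between the two schemes
can be shown with a small userspace sketch.  All values below (addresses,
swap slot, window size) are made up, and the clamping only mirrors, in
simplified form, what swap_ra_clamp_pfn() does later in this diff.

#include <stdio.h>

#define PAGE_SHIFT 12UL                 /* assumption: 4K pages */

int main(void)
{
        /* Hypothetical fault: the virtual address and the swap slot it maps to. */
        unsigned long fault_addr = 0x7f3a00042000UL;
        unsigned long swap_slot  = 91345;       /* offset in the swap device */
        unsigned long vm_start   = 0x7f3a00000000UL;
        unsigned long vm_end     = 0x7f3a00100000UL;
        unsigned long win = 8;                  /* readahead window in pages */

        unsigned long fpfn = fault_addr >> PAGE_SHIFT;
        unsigned long left = (win - 1) / 2;

        /* Old scheme: consecutive slots in the swap device around the faulting slot. */
        printf("slot-based: read swap offsets [%lu, %lu)\n",
               swap_slot, swap_slot + win);

        /* New scheme: virtual pages around the fault address, clamped to the VMA. */
        unsigned long start = fpfn - left;
        unsigned long end   = fpfn + win - left;
        if (start < (vm_start >> PAGE_SHIFT))
                start = vm_start >> PAGE_SHIFT;
        if (end > (vm_end >> PAGE_SHIFT))
                end = vm_end >> PAGE_SHIFT;
        printf("VMA-based:  read virtual pfns  [%#lx, %#lx)\n", start, end);
        return 0;
}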
Diffstat (limited to 'mm/swap_state.c')
-rw-r--r--  mm/swap_state.c  |  215
1 file changed, 198 insertions(+), 17 deletions(-)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index a901afe9da61..3885fef7bdf5 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -37,6 +37,29 @@ static const struct address_space_operations swap_aops = {
 
 struct address_space *swapper_spaces[MAX_SWAPFILES];
 static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
+bool swap_vma_readahead = true;
+
+#define SWAP_RA_MAX_ORDER_DEFAULT       3
+
+static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT;
+
+#define SWAP_RA_WIN_SHIFT       (PAGE_SHIFT / 2)
+#define SWAP_RA_HITS_MASK       ((1UL << SWAP_RA_WIN_SHIFT) - 1)
+#define SWAP_RA_HITS_MAX        SWAP_RA_HITS_MASK
+#define SWAP_RA_WIN_MASK        (~PAGE_MASK & ~SWAP_RA_HITS_MASK)
+
+#define SWAP_RA_HITS(v)         ((v) & SWAP_RA_HITS_MASK)
+#define SWAP_RA_WIN(v)          (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
+#define SWAP_RA_ADDR(v)         ((v) & PAGE_MASK)
+
+#define SWAP_RA_VAL(addr, win, hits)                            \
+        (((addr) & PAGE_MASK) |                                 \
+         (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |    \
+         ((hits) & SWAP_RA_HITS_MASK))
+
+/* Initial readahead hits is 4 to start up with a small window */
+#define GET_SWAP_RA_VAL(vma)                                    \
+        (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
 
 #define INC_CACHE_INFO(x)       do { swap_cache_info.x++; } while (0)
 #define ADD_CACHE_INFO(x, nr)   do { swap_cache_info.x += (nr); } while (0)
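
A note on the macros just added (this sketch is not part of the patch): the
per-VMA readahead state, i.e. the last faulting address, the readahead window
and the hit count, is packed into a single long so it can be read and updated
with one atomic_long_read()/atomic_long_set().  A minimal userspace
re-creation of the packing, assuming 4K pages (PAGE_SHIFT = 12, which leaves
6 bits each for the window and hit fields):

#include <stdio.h>

/* Assumptions for the sketch: 4K pages, so the low 12 bits of an address are free. */
#define PAGE_SHIFT              12UL
#define PAGE_MASK               (~((1UL << PAGE_SHIFT) - 1))

#define SWAP_RA_WIN_SHIFT       (PAGE_SHIFT / 2)        /* 6 */
#define SWAP_RA_HITS_MASK       ((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_WIN_MASK        (~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)         ((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)          (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)         ((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)                            \
        (((addr) & PAGE_MASK) |                                 \
         (((unsigned long)(win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \
         ((hits) & SWAP_RA_HITS_MASK))

int main(void)
{
        /* Pack a (made-up) fault address, a window of 8 pages and 3 hits ... */
        unsigned long v = SWAP_RA_VAL(0x7f3a00042000UL, 8, 3);

        /* ... and unpack the three fields again. */
        printf("addr=%#lx win=%lu hits=%lu\n",
               SWAP_RA_ADDR(v), SWAP_RA_WIN(v), SWAP_RA_HITS(v));
        return 0;
}

With 4K pages both fields are limited to 63, which is also why
SWAP_RA_HITS_MAX is defined as SWAP_RA_HITS_MASK above.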
@@ -297,21 +320,36 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
  * lock getting page table operations atomic even if we drop the page
  * lock before returning.
  */
-struct page * lookup_swap_cache(swp_entry_t entry)
+struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
+                               unsigned long addr)
 {
         struct page *page;
+        unsigned long ra_info;
+        int win, hits, readahead;
 
         page = find_get_page(swap_address_space(entry), swp_offset(entry));
 
-        if (page && likely(!PageTransCompound(page))) {
+        INC_CACHE_INFO(find_total);
+        if (page) {
                 INC_CACHE_INFO(find_success);
-                if (TestClearPageReadahead(page)) {
-                        atomic_inc(&swapin_readahead_hits);
+                if (unlikely(PageTransCompound(page)))
+                        return page;
+                readahead = TestClearPageReadahead(page);
+                if (vma) {
+                        ra_info = GET_SWAP_RA_VAL(vma);
+                        win = SWAP_RA_WIN(ra_info);
+                        hits = SWAP_RA_HITS(ra_info);
+                        if (readahead)
+                                hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
+                        atomic_long_set(&vma->swap_readahead_info,
+                                        SWAP_RA_VAL(addr, win, hits));
+                }
+                if (readahead) {
                         count_vm_event(SWAP_RA_HIT);
+                        if (!vma)
+                                atomic_inc(&swapin_readahead_hits);
                 }
         }
-
-        INC_CACHE_INFO(find_total);
         return page;
 }
 
@@ -426,22 +464,20 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
         return retpage;
 }
 
-static unsigned long swapin_nr_pages(unsigned long offset)
+static unsigned int __swapin_nr_pages(unsigned long prev_offset,
+                                      unsigned long offset,
+                                      int hits,
+                                      int max_pages,
+                                      int prev_win)
 {
-        static unsigned long prev_offset;
-        unsigned int pages, max_pages, last_ra;
-        static atomic_t last_readahead_pages;
-
-        max_pages = 1 << READ_ONCE(page_cluster);
-        if (max_pages <= 1)
-                return 1;
+        unsigned int pages, last_ra;
 
         /*
          * This heuristic has been found to work well on both sequential and
          * random loads, swapping to hard disk or to SSD: please don't ask
          * what the "+ 2" means, it just happens to work well, that's all.
          */
-        pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
+        pages = hits + 2;
         if (pages == 2) {
                 /*
                  * We can have no readahead hits to judge by: but must not get
@@ -450,7 +486,6 @@ static unsigned long swapin_nr_pages(unsigned long offset)
                  */
                 if (offset != prev_offset + 1 && offset != prev_offset - 1)
                         pages = 1;
-                prev_offset = offset;
         } else {
                 unsigned int roundup = 4;
                 while (roundup < pages)
@@ -462,9 +497,28 @@ static unsigned long swapin_nr_pages(unsigned long offset)
                 pages = max_pages;
 
         /* Don't shrink readahead too fast */
-        last_ra = atomic_read(&last_readahead_pages) / 2;
+        last_ra = prev_win / 2;
         if (pages < last_ra)
                 pages = last_ra;
+
+        return pages;
+}
+
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+        static unsigned long prev_offset;
+        unsigned int hits, pages, max_pages;
+        static atomic_t last_readahead_pages;
+
+        max_pages = 1 << READ_ONCE(page_cluster);
+        if (max_pages <= 1)
+                return 1;
+
+        hits = atomic_xchg(&swapin_readahead_hits, 0);
+        pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
+                                  atomic_read(&last_readahead_pages));
+        if (!hits)
+                prev_offset = offset;
         atomic_set(&last_readahead_pages, pages);
 
         return pages;
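
As an aside (not part of the patch), the scaling heuristic that was just
factored out into __swapin_nr_pages() can be exercised on its own.  The
sketch below copies that logic into a plain userspace function and feeds it
a made-up sequence of hit counts (max_pages = 32 is arbitrary) to show the
window growing on hits and shrinking slowly afterwards:

#include <stdio.h>

/* Same logic as __swapin_nr_pages() above, minus the kernel types. */
static unsigned int swapin_window(unsigned long prev_offset,
                                  unsigned long offset,
                                  int hits, int max_pages, int prev_win)
{
        unsigned int pages, last_ra;

        pages = hits + 2;
        if (pages == 2) {
                /* No hits: stay at one page unless the access is sequential. */
                if (offset != prev_offset + 1 && offset != prev_offset - 1)
                        pages = 1;
        } else {
                /* Round the window up to a power of two. */
                unsigned int roundup = 4;

                while (roundup < pages)
                        roundup <<= 1;
                pages = roundup;
        }
        if (pages > (unsigned int)max_pages)
                pages = max_pages;

        /* Don't shrink readahead too fast. */
        last_ra = prev_win / 2;
        if (pages < last_ra)
                pages = last_ra;
        return pages;
}

int main(void)
{
        int hits[] = { 0, 3, 7, 2, 0, 0 };      /* made-up hit counts */
        unsigned int win = 1;
        unsigned long offset = 100;
        int i;

        /* Simulate sequential faults so the no-hit case keeps a 2-page window. */
        for (i = 0; i < (int)(sizeof(hits) / sizeof(hits[0])); i++) {
                win = swapin_window(offset, offset + 1, hits[i], 32, win);
                offset++;
                printf("hits=%d -> window=%u pages\n", hits[i], win);
        }
        return 0;
}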
@@ -570,3 +624,130 @@ void exit_swap_address_space(unsigned int type)
         synchronize_rcu();
         kvfree(spaces);
 }
+
+static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
+                                     unsigned long faddr,
+                                     unsigned long lpfn,
+                                     unsigned long rpfn,
+                                     unsigned long *start,
+                                     unsigned long *end)
+{
+        *start = max3(lpfn, PFN_DOWN(vma->vm_start),
+                      PFN_DOWN(faddr & PMD_MASK));
+        *end = min3(rpfn, PFN_DOWN(vma->vm_end),
+                    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
+}
+
+struct page *swap_readahead_detect(struct vm_fault *vmf,
+                                   struct vma_swap_readahead *swap_ra)
+{
+        struct vm_area_struct *vma = vmf->vma;
+        unsigned long swap_ra_info;
+        struct page *page;
+        swp_entry_t entry;
+        unsigned long faddr, pfn, fpfn;
+        unsigned long start, end;
+        pte_t *pte;
+        unsigned int max_win, hits, prev_win, win, left;
+#ifndef CONFIG_64BIT
+        pte_t *tpte;
+#endif
+
+        faddr = vmf->address;
+        entry = pte_to_swp_entry(vmf->orig_pte);
+        if ((unlikely(non_swap_entry(entry))))
+                return NULL;
+        page = lookup_swap_cache(entry, vma, faddr);
+        if (page)
+                return page;
+
+        max_win = 1 << READ_ONCE(swap_ra_max_order);
+        if (max_win == 1) {
+                swap_ra->win = 1;
+                return NULL;
+        }
+
+        fpfn = PFN_DOWN(faddr);
+        swap_ra_info = GET_SWAP_RA_VAL(vma);
+        pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
+        prev_win = SWAP_RA_WIN(swap_ra_info);
+        hits = SWAP_RA_HITS(swap_ra_info);
+        swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
+                                               max_win, prev_win);
+        atomic_long_set(&vma->swap_readahead_info,
+                        SWAP_RA_VAL(faddr, win, 0));
+
+        if (win == 1)
+                return NULL;
+
+        /* Copy the PTEs because the page table may be unmapped */
+        if (fpfn == pfn + 1)
+                swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
+        else if (pfn == fpfn + 1)
+                swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
+                                  &start, &end);
+        else {
+                left = (win - 1) / 2;
+                swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
+                                  &start, &end);
+        }
+        swap_ra->nr_pte = end - start;
+        swap_ra->offset = fpfn - start;
+        pte = vmf->pte - swap_ra->offset;
+#ifdef CONFIG_64BIT
+        swap_ra->ptes = pte;
+#else
+        tpte = swap_ra->ptes;
+        for (pfn = start; pfn != end; pfn++)
+                *tpte++ = *pte++;
+#endif
+
+        return NULL;
+}
+
+struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
+                                    struct vm_fault *vmf,
+                                    struct vma_swap_readahead *swap_ra)
+{
+        struct blk_plug plug;
+        struct vm_area_struct *vma = vmf->vma;
+        struct page *page;
+        pte_t *pte, pentry;
+        swp_entry_t entry;
+        unsigned int i;
+        bool page_allocated;
+
+        if (swap_ra->win == 1)
+                goto skip;
+
+        blk_start_plug(&plug);
+        for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte;
+             i++, pte++) {
+                pentry = *pte;
+                if (pte_none(pentry))
+                        continue;
+                if (pte_present(pentry))
+                        continue;
+                entry = pte_to_swp_entry(pentry);
+                if (unlikely(non_swap_entry(entry)))
+                        continue;
+                page = __read_swap_cache_async(entry, gfp_mask, vma,
+                                               vmf->address, &page_allocated);
+                if (!page)
+                        continue;
+                if (page_allocated) {
+                        swap_readpage(page, false);
+                        if (i != swap_ra->offset &&
+                            likely(!PageTransCompound(page))) {
+                                SetPageReadahead(page);
+                                count_vm_event(SWAP_RA);
+                        }
+                }
+                put_page(page);
+        }
+        blk_finish_plug(&plug);
+        lru_add_drain();
+skip:
+        return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
+                                     swap_ra->win == 1);
+}