author     Mel Gorman <mgorman@suse.de>                    2015-09-04 18:47:32 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2015-09-04 19:54:41 -0400
commit     72b252aed506b8f1a03f7abd29caef4cdf6a043b (patch)
tree       a0825c463af7ebca1b172ceb26ab8ea3eb6ff602
parent     5b74283ab251b9db55cbbe31d19ca72482103290 (diff)
mm: send one IPI per CPU to TLB flush all entries after unmapping pages
An IPI is sent to flush remote TLBs when a page is unmapped that was
potentially accessed by other CPUs. There are many circumstances where
this happens, but the obvious one is kswapd reclaiming pages belonging to
a running process, as kswapd and the task are likely running on separate
CPUs.

On small machines this is not a significant problem, but as machines get
larger with more cores and more memory, the cost of these IPIs can be
high. This patch uses a simple structure that tracks CPUs that
potentially have TLB entries for pages being unmapped. When the unmapping
is complete, the full TLB is flushed on the assumption that a refill cost
is lower than flushing individual entries.

Architectures wishing to do this must give the following guarantee:

        If a clean page is unmapped and not immediately flushed, the
        architecture must guarantee that a write to that linear address
        from a CPU with a cached TLB entry will trap a page fault.

This is essentially what the kernel already depends on, but the window is
much larger with this patch applied and is worth highlighting. The
architecture should consider whether the cost of the full TLB flush is
higher than sending an IPI to flush each individual entry. An additional
architecture helper called flush_tlb_local is required. It's a trivial
wrapper with some accounting in the x86 case.

The impact of this patch depends on the workload, as measuring any benefit
requires both mapped pages co-located on the LRU and memory pressure. The
case with the biggest impact is multiple processes reading mapped pages
taken from the vm-scalability test suite. The test case uses NR_CPU
readers of mapped files that consume 10*RAM.

Linear mapped reader on a 4-node machine with 64G RAM and 48 CPUs

                                             4.2.0-rc1          4.2.0-rc1
                                               vanilla       flushfull-v7
Ops lru-file-mmap-read-elapsed        159.62 (  0.00%)   120.68 ( 24.40%)
Ops lru-file-mmap-read-time_range      30.59 (  0.00%)     2.80 ( 90.85%)
Ops lru-file-mmap-read-time_stddv       6.70 (  0.00%)     0.64 ( 90.38%)

           4.2.0-rc1    4.2.0-rc1
             vanilla flushfull-v7
User          581.00       611.43
System       5804.93      4111.76
Elapsed       161.03       122.12

This shows that the readers completed 24.40% faster with 29% less system
CPU time. From vmstats, it is known that the vanilla kernel was
interrupted roughly 900K times per second during the steady phase of the
test and the patched kernel was interrupted roughly 180K times per second.

The impact is lower on a single-socket machine.

                                             4.2.0-rc1          4.2.0-rc1
                                               vanilla       flushfull-v7
Ops lru-file-mmap-read-elapsed         25.33 (  0.00%)    20.38 ( 19.54%)
Ops lru-file-mmap-read-time_range       0.91 (  0.00%)     1.44 (-58.24%)
Ops lru-file-mmap-read-time_stddv       0.28 (  0.00%)     0.47 (-65.34%)

           4.2.0-rc1    4.2.0-rc1
             vanilla flushfull-v7
User           58.09        57.64
System        111.82        76.56
Elapsed        27.29        22.55

It's still a noticeable improvement, with vmstat showing that interrupts
went from roughly 500K per second to 45K per second.

The patch will have no impact on workloads with no memory pressure or with
relatively few mapped pages. It will have an unpredictable impact on the
workload running on the CPU being flushed, as it will depend on how many
TLB entries need to be refilled and how long that takes. Worst case, the
TLB will be completely cleared of active entries when the target PFNs were
not resident at all.
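In outline, the call pattern this creates in the reclaim path looks
roughly as follows. This is a simplified sketch assembled from the
mm/vmscan.c and mm/rmap.c hunks below, not the verbatim kernel code;
error handling and the other ttu_flags are omitted:

	/* For each page being reclaimed, unmap with TTU_BATCH_FLUSH so that
	 * try_to_unmap_one() only records mm_cpumask(mm) in current->tlb_ubc
	 * and clears the PTE without flushing (dirty PTEs are still flushed
	 * immediately). */
	while (!list_empty(page_list)) {
		page = lru_to_page(page_list);
		...
		try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH);
		...
	}

	/* One IPI per CPU in the accumulated cpumask; each receiving CPU
	 * does a full local TLB flush via flush_tlb_local(). This must
	 * happen before the pages are handed back to the allocator. */
	try_to_unmap_flush();
	free_hot_cold_page_list(&free_pages, true);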
[sasha.levin@oracle.com: trace tlb flush after disabling preemption in try_to_unmap_flush]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  arch/x86/Kconfig                |   1
-rw-r--r--  arch/x86/include/asm/tlbflush.h |   6
-rw-r--r--  include/linux/rmap.h            |   3
-rw-r--r--  include/linux/sched.h           |  16
-rw-r--r--  init/Kconfig                    |  10
-rw-r--r--  mm/internal.h                   |  11
-rw-r--r--  mm/rmap.c                       | 104
-rw-r--r--  mm/vmscan.c                     |  23
8 files changed, 172 insertions, 2 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 48f7433dac6f..117e2f373e50 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
 	select ARCH_USE_CMPXCHG_LOCKREF if X86_64
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_IPC_PARSE_VERSION if X86_32
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cd791948b286..6df2029405a3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)
 
 #endif /* SMP */
 
+/* Not inlined due to inc_irq_stat not being defined yet */
+#define flush_tlb_local() {		\
+	inc_irq_stat(irq_tlb_count);	\
+	local_flush_tlb();		\
+}
+
 #ifndef CONFIG_PARAVIRT
 #define flush_tlb_others(mask, mm, start, end)	\
 	native_flush_tlb_others(mask, mm, start, end)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c89c53a113a8..29446aeef36e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -89,6 +89,9 @@ enum ttu_flags {
 	TTU_IGNORE_MLOCK = (1 << 8),	/* ignore mlock */
 	TTU_IGNORE_ACCESS = (1 << 9),	/* don't age */
 	TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+	TTU_BATCH_FLUSH = (1 << 11),	/* Batch TLB flushes where possible
+					 * and caller guarantees they will
+					 * do a final flush if necessary */
 };
 
 #ifdef CONFIG_MMU
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 119823decc46..3c602c20c717 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1344,6 +1344,18 @@ enum perf_event_task_context {
 	perf_nr_task_contexts,
 };
 
+/* Track pages that require TLB flushes */
+struct tlbflush_unmap_batch {
+	/*
+	 * Each bit set is a CPU that potentially has a TLB entry for one of
+	 * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+	 */
+	struct cpumask cpumask;
+
+	/* True if any bit in cpumask is set */
+	bool flush_required;
+};
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1700,6 +1712,10 @@ struct task_struct {
 	unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	struct tlbflush_unmap_batch tlb_ubc;
+#endif
+
 	struct rcu_head rcu;
 
 	/*
diff --git a/init/Kconfig b/init/Kconfig
index 161acd8bc56f..cf7e4824c8d0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -883,6 +883,16 @@ config ARCH_SUPPORTS_NUMA_BALANCING
 	bool
 
 #
+# For architectures that prefer to flush all TLBs after a number of pages
+# are unmapped instead of sending one IPI per page to flush. The architecture
+# must provide guarantees on what happens if a clean TLB cache entry is
+# written after the unmap. Details are in mm/rmap.c near the check for
+# should_defer_flush. The architecture should also consider if the full flush
+# and the refill costs are offset by the savings of sending fewer IPIs.
+config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	bool
+
+#
 # For architectures that know their GCC __int128 support is sound
 #
 config ARCH_SUPPORTS_INT128
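In this series only x86 opts in. Another architecture wishing to use the
batching would mirror the x86 hunks above: select the new symbol from its
Kconfig and provide a flush_tlb_local() helper for mm/rmap.c to call. The
fragment below is illustrative only; the architecture name "foo" and its
local-flush primitive are placeholders, not part of this patch:

	/* arch/foo/Kconfig would add:
	 *	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
	 */

	/* arch/foo/include/asm/tlbflush.h: called from the IPI handler in
	 * mm/rmap.c; it must flush the local CPU's entire TLB.
	 * foo_local_flush_tlb() stands in for whatever primitive the
	 * architecture already has. */
	#define flush_tlb_local()	foo_local_flush_tlb()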
diff --git a/mm/internal.h b/mm/internal.h
index 36b23f1e2ca6..bd6372ac5f7f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -426,4 +426,15 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 #define ALLOC_CMA		0x80 /* allow allocations from CMA areas */
 #define ALLOC_FAIR		0x100 /* fair zone allocation */
 
+enum ttu_flags;
+struct tlbflush_unmap_batch;
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+void try_to_unmap_flush(void);
+#else
+static inline void try_to_unmap_flush(void)
+{
+}
+
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/rmap.c b/mm/rmap.c
index 171b68768df1..326d5d89e45c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,8 @@
 
 #include <asm/tlbflush.h>
 
+#include <trace/events/tlb.h>
+
 #include "internal.h"
 
 static struct kmem_cache *anon_vma_cachep;
@@ -583,6 +585,89 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 	return address;
 }
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void percpu_flush_tlb_batch_pages(void *data)
+{
+	/*
+	 * All TLB entries are flushed on the assumption that it is
+	 * cheaper to flush all TLBs and let them be refilled than
+	 * flushing individual PFNs. Note that we do not track mm's
+	 * to flush as that might simply be multiple full TLB flushes
+	 * for no gain.
+	 */
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+	flush_tlb_local();
+}
+
+/*
+ * Flush TLB entries for recently unmapped pages from remote CPUs. It is
+ * important if a PTE was dirty when it was unmapped that it's flushed
+ * before any IO is initiated on the page to prevent lost writes. Similarly,
+ * it must be flushed before freeing to prevent data leakage.
+ */
+void try_to_unmap_flush(void)
+{
+	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+	int cpu;
+
+	if (!tlb_ubc->flush_required)
+		return;
+
+	cpu = get_cpu();
+
+	trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
+
+	if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
+		percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
+
+	if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
+		smp_call_function_many(&tlb_ubc->cpumask,
+			percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
+	}
+	cpumask_clear(&tlb_ubc->cpumask);
+	tlb_ubc->flush_required = false;
+	put_cpu();
+}
+
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+		struct page *page)
+{
+	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+	cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+	tlb_ubc->flush_required = true;
+}
+
+/*
+ * Returns true if the TLB flush should be deferred to the end of a batch of
+ * unmap operations to reduce IPIs.
+ */
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+	bool should_defer = false;
+
+	if (!(flags & TTU_BATCH_FLUSH))
+		return false;
+
+	/* If remote CPUs need to be flushed then defer batch the flush */
+	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+		should_defer = true;
+	put_cpu();
+
+	return should_defer;
+}
+#else
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+		struct page *page)
+{
+}
+
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+	return false;
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
 /*
  * At what user virtual address is page expected in vma?
  * Caller should check the page is actually part of the vma.
@@ -1220,7 +1305,24 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
-	pteval = ptep_clear_flush(vma, address, pte);
+	if (should_defer_flush(mm, flags)) {
+		/*
+		 * We clear the PTE but do not flush so potentially a remote
+		 * CPU could still be writing to the page. If the entry was
+		 * previously clean then the architecture must guarantee that
+		 * a clear->dirty transition on a cached TLB entry is written
+		 * through and traps if the PTE is unmapped.
+		 */
+		pteval = ptep_get_and_clear(mm, address, pte);
+
+		/* Potentially writable TLBs must be flushed before IO */
+		if (pte_dirty(pteval))
+			flush_tlb_page(vma, address);
+		else
+			set_tlb_ubc_flush_pending(mm, page);
+	} else {
+		pteval = ptep_clear_flush(vma, address, pte);
+	}
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8286938c70de..99ec00d6a5dd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1057,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 	 * processes. Try to unmap it here.
 	 */
 	if (page_mapped(page) && mapping) {
-		switch (try_to_unmap(page, ttu_flags)) {
+		switch (try_to_unmap(page,
+				ttu_flags|TTU_BATCH_FLUSH)) {
 		case SWAP_FAIL:
 			goto activate_locked;
 		case SWAP_AGAIN:
@@ -1208,6 +1209,7 @@ keep:
 	}
 
 	mem_cgroup_uncharge_list(&free_pages);
+	try_to_unmap_flush();
 	free_hot_cold_page_list(&free_pages, true);
 
 	list_splice(&ret_pages, page_list);
@@ -2151,6 +2153,23 @@ out:
 	}
 }
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void init_tlb_ubc(void)
+{
+	/*
+	 * This deliberately does not clear the cpumask as it's expensive
+	 * and unnecessary. If there happens to be data in there then the
+	 * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and
+	 * then will be cleared.
+	 */
+	current->tlb_ubc.flush_required = false;
+}
+#else
+static inline void init_tlb_ubc(void)
+{
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -2185,6 +2204,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
 	scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
 			 sc->priority == DEF_PRIORITY);
 
+	init_tlb_ubc();
+
 	blk_start_plug(&plug);
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 				nr[LRU_INACTIVE_FILE]) {