author     Mel Gorman <mgorman@suse.de>                    2015-09-04 18:47:32 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2015-09-04 19:54:41 -0400
commit     72b252aed506b8f1a03f7abd29caef4cdf6a043b (patch)
tree       a0825c463af7ebca1b172ceb26ab8ea3eb6ff602
parent     5b74283ab251b9db55cbbe31d19ca72482103290 (diff)
mm: send one IPI per CPU to TLB flush all entries after unmapping pages
An IPI is sent to flush remote TLBs when a page is unmapped that was
potentially accessed by other CPUs. There are many circumstances where
this happens, but the obvious one is kswapd reclaiming pages belonging to
a running process, as kswapd and the task are likely running on separate
CPUs.

On small machines this is not a significant problem, but as machines get
larger with more cores and more memory, the cost of these IPIs can be
high. This patch uses a simple structure that tracks CPUs that
potentially have TLB entries for pages being unmapped. When the unmapping
is complete, the full TLB is flushed on the assumption that a refill cost
is lower than flushing individual entries.

Architectures wishing to do this must give the following guarantee:

        If a clean page is unmapped and not immediately flushed, the
        architecture must guarantee that a write to that linear address
        from a CPU with a cached TLB entry will trap a page fault.

This is essentially what the kernel already depends on, but the window is
much larger with this patch applied and is worth highlighting. The
architecture should consider whether the cost of the full TLB flush is
higher than sending an IPI to flush each individual entry. An additional
architecture helper called flush_tlb_local is required. It's a trivial
wrapper with some accounting in the x86 case.

The impact of this patch depends on the workload, as measuring any benefit
requires both mapped pages co-located on the LRU and memory pressure. The
case with the biggest impact is multiple processes reading mapped pages
taken from the vm-scalability test suite. The test case uses NR_CPU
readers of mapped files that consume 10*RAM.

Linear mapped reader on a 4-node machine with 64G RAM and 48 CPUs

                                             4.2.0-rc1          4.2.0-rc1
                                               vanilla       flushfull-v7
Ops lru-file-mmap-read-elapsed        159.62 (  0.00%)   120.68 ( 24.40%)
Ops lru-file-mmap-read-time_range      30.59 (  0.00%)     2.80 ( 90.85%)
Ops lru-file-mmap-read-time_stddv       6.70 (  0.00%)     0.64 ( 90.38%)

           4.2.0-rc1    4.2.0-rc1
             vanilla flushfull-v7
User          581.00       611.43
System       5804.93      4111.76
Elapsed       161.03       122.12

This shows that the readers completed 24.40% faster with 29% less system
CPU time. From vmstats, it is known that the vanilla kernel was
interrupted roughly 900K times per second during the steady phase of the
test and the patched kernel was interrupted roughly 180K times per second.

The impact is lower on a single-socket machine.

                                             4.2.0-rc1          4.2.0-rc1
                                               vanilla       flushfull-v7
Ops lru-file-mmap-read-elapsed         25.33 (  0.00%)    20.38 ( 19.54%)
Ops lru-file-mmap-read-time_range       0.91 (  0.00%)     1.44 (-58.24%)
Ops lru-file-mmap-read-time_stddv       0.28 (  0.00%)     0.47 (-65.34%)

           4.2.0-rc1    4.2.0-rc1
             vanilla flushfull-v7
User           58.09        57.64
System        111.82        76.56
Elapsed        27.29        22.55

It's still a noticeable improvement, with vmstat showing that interrupts
went from roughly 500K per second to 45K per second.

The patch will have no impact on workloads with no memory pressure or with
relatively few mapped pages. It will have an unpredictable impact on the
workload running on the CPU being flushed, as it will depend on how many
TLB entries need to be refilled and how long that takes. Worst case, the
TLB will be completely cleared of active entries when the target PFNs were
not resident at all.
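In outline, the call pattern this creates in the reclaim path looks
roughly as follows. This is a simplified sketch assembled from the
mm/vmscan.c and mm/rmap.c hunks below, not the verbatim kernel code;
error handling and the other ttu_flags are omitted:

	/* For each page being reclaimed, unmap with TTU_BATCH_FLUSH so that
	 * try_to_unmap_one() only records mm_cpumask(mm) in current->tlb_ubc
	 * and clears the PTE without flushing (dirty PTEs are still flushed
	 * immediately). */
	while (!list_empty(page_list)) {
		page = lru_to_page(page_list);
		...
		try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH);
		...
	}

	/* One IPI per CPU in the accumulated cpumask; each receiving CPU
	 * does a full local TLB flush via flush_tlb_local(). This must
	 * happen before the pages are handed back to the allocator. */
	try_to_unmap_flush();
	free_hot_cold_page_list(&free_pages, true);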
[sasha.levin@oracle.com: trace tlb flush after disabling preemption in try_to_unmap_flush]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  arch/x86/Kconfig                |   1
-rw-r--r--  arch/x86/include/asm/tlbflush.h |   6
-rw-r--r--  include/linux/rmap.h            |   3
-rw-r--r--  include/linux/sched.h           |  16
-rw-r--r--  init/Kconfig                    |  10
-rw-r--r--  mm/internal.h                   |  11
-rw-r--r--  mm/rmap.c                       | 104
-rw-r--r--  mm/vmscan.c                     |  23
8 files changed, 172 insertions, 2 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 48f7433dac6f..117e2f373e50 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
 	select ARCH_USE_CMPXCHG_LOCKREF if X86_64
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_IPC_PARSE_VERSION if X86_32
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cd791948b286..6df2029405a3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)
 
 #endif /* SMP */
 
+/* Not inlined due to inc_irq_stat not being defined yet */
+#define flush_tlb_local() {		\
+	inc_irq_stat(irq_tlb_count);	\
+	local_flush_tlb();		\
+}
+
 #ifndef CONFIG_PARAVIRT
 #define flush_tlb_others(mask, mm, start, end)	\
 	native_flush_tlb_others(mask, mm, start, end)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c89c53a113a8..29446aeef36e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -89,6 +89,9 @@ enum ttu_flags {
 	TTU_IGNORE_MLOCK = (1 << 8),	/* ignore mlock */
 	TTU_IGNORE_ACCESS = (1 << 9),	/* don't age */
 	TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+	TTU_BATCH_FLUSH = (1 << 11),	/* Batch TLB flushes where possible
+					 * and caller guarantees they will
+					 * do a final flush if necessary */
 };
 
 #ifdef CONFIG_MMU
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 119823decc46..3c602c20c717 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1344,6 +1344,18 @@ enum perf_event_task_context {
 	perf_nr_task_contexts,
 };
 
+/* Track pages that require TLB flushes */
+struct tlbflush_unmap_batch {
+	/*
+	 * Each bit set is a CPU that potentially has a TLB entry for one of
+	 * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+	 */
+	struct cpumask cpumask;
+
+	/* True if any bit in cpumask is set */
+	bool flush_required;
+};
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1700,6 +1712,10 @@ struct task_struct {
 	unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	struct tlbflush_unmap_batch tlb_ubc;
+#endif
+
 	struct rcu_head rcu;
 
 	/*
diff --git a/init/Kconfig b/init/Kconfig
index 161acd8bc56f..cf7e4824c8d0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -883,6 +883,16 @@ config ARCH_SUPPORTS_NUMA_BALANCING
 	bool
 
 #
+# For architectures that prefer to flush all TLBs after a number of pages
+# are unmapped instead of sending one IPI per page to flush. The architecture
+# must provide guarantees on what happens if a clean TLB cache entry is
+# written after the unmap. Details are in mm/rmap.c near the check for
+# should_defer_flush. The architecture should also consider if the full flush
+# and the refill costs are offset by the savings of sending fewer IPIs.
+config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	bool
+
+#
 # For architectures that know their GCC __int128 support is sound
 #
 config ARCH_SUPPORTS_INT128
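In this series only x86 opts in. Another architecture wishing to use the
batching would mirror the x86 hunks above: select the new symbol from its
Kconfig and provide a flush_tlb_local() helper for mm/rmap.c to call. The
fragment below is illustrative only; the architecture name "foo" and its
local-flush primitive are placeholders, not part of this patch:

	/* arch/foo/Kconfig would add:
	 *	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
	 */

	/* arch/foo/include/asm/tlbflush.h: called from the IPI handler in
	 * mm/rmap.c; it must flush the local CPU's entire TLB.
	 * foo_local_flush_tlb() stands in for whatever primitive the
	 * architecture already has. */
	#define flush_tlb_local()	foo_local_flush_tlb()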
diff --git a/mm/internal.h b/mm/internal.h
index 36b23f1e2ca6..bd6372ac5f7f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -426,4 +426,15 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 #define ALLOC_CMA		0x80 /* allow allocations from CMA areas */
 #define ALLOC_FAIR		0x100 /* fair zone allocation */
 
+enum ttu_flags;
+struct tlbflush_unmap_batch;
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+void try_to_unmap_flush(void);
+#else
+static inline void try_to_unmap_flush(void)
+{
+}
+
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/rmap.c b/mm/rmap.c
index 171b68768df1..326d5d89e45c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,8 @@
 
 #include <asm/tlbflush.h>
 
+#include <trace/events/tlb.h>
+
 #include "internal.h"
 
 static struct kmem_cache *anon_vma_cachep;
@@ -583,6 +585,89 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 	return address;
 }
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void percpu_flush_tlb_batch_pages(void *data)
+{
+	/*
+	 * All TLB entries are flushed on the assumption that it is
+	 * cheaper to flush all TLBs and let them be refilled than
+	 * flushing individual PFNs. Note that we do not track mm's
+	 * to flush as that might simply be multiple full TLB flushes
+	 * for no gain.
+	 */
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+	flush_tlb_local();
+}
+
+/*
+ * Flush TLB entries for recently unmapped pages from remote CPUs. It is
+ * important if a PTE was dirty when it was unmapped that it's flushed
+ * before any IO is initiated on the page to prevent lost writes. Similarly,
+ * it must be flushed before freeing to prevent data leakage.
+ */
+void try_to_unmap_flush(void)
+{
+	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+	int cpu;
+
+	if (!tlb_ubc->flush_required)
+		return;
+
+	cpu = get_cpu();
+
+	trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
+
+	if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
+		percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
+
+	if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
+		smp_call_function_many(&tlb_ubc->cpumask,
+			percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
+	}
+	cpumask_clear(&tlb_ubc->cpumask);
+	tlb_ubc->flush_required = false;
+	put_cpu();
+}
+
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+		struct page *page)
+{
+	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+	cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+	tlb_ubc->flush_required = true;
+}
+
+/*
+ * Returns true if the TLB flush should be deferred to the end of a batch of
+ * unmap operations to reduce IPIs.
+ */
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+	bool should_defer = false;
+
+	if (!(flags & TTU_BATCH_FLUSH))
+		return false;
+
+	/* If remote CPUs need to be flushed then defer batch the flush */
+	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+		should_defer = true;
+	put_cpu();
+
+	return should_defer;
+}
+#else
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+		struct page *page)
+{
+}
+
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+	return false;
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
 /*
  * At what user virtual address is page expected in vma?
  * Caller should check the page is actually part of the vma.
@@ -1220,7 +1305,24 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
-	pteval = ptep_clear_flush(vma, address, pte);
+	if (should_defer_flush(mm, flags)) {
+		/*
+		 * We clear the PTE but do not flush so potentially a remote
+		 * CPU could still be writing to the page. If the entry was
+		 * previously clean then the architecture must guarantee that
+		 * a clear->dirty transition on a cached TLB entry is written
+		 * through and traps if the PTE is unmapped.
+		 */
+		pteval = ptep_get_and_clear(mm, address, pte);
+
+		/* Potentially writable TLBs must be flushed before IO */
+		if (pte_dirty(pteval))
+			flush_tlb_page(vma, address);
+		else
+			set_tlb_ubc_flush_pending(mm, page);
+	} else {
+		pteval = ptep_clear_flush(vma, address, pte);
+	}
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8286938c70de..99ec00d6a5dd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1057,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 	 * processes. Try to unmap it here.
 	 */
 	if (page_mapped(page) && mapping) {
-		switch (try_to_unmap(page, ttu_flags)) {
+		switch (try_to_unmap(page,
+				ttu_flags|TTU_BATCH_FLUSH)) {
 		case SWAP_FAIL:
 			goto activate_locked;
 		case SWAP_AGAIN:
@@ -1208,6 +1209,7 @@ keep:
 	}
 
 	mem_cgroup_uncharge_list(&free_pages);
+	try_to_unmap_flush();
 	free_hot_cold_page_list(&free_pages, true);
 
 	list_splice(&ret_pages, page_list);
@@ -2151,6 +2153,23 @@ out:
 	}
 }
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void init_tlb_ubc(void)
+{
+	/*
+	 * This deliberately does not clear the cpumask as it's expensive
+	 * and unnecessary. If there happens to be data in there then the
+	 * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and
+	 * then will be cleared.
+	 */
+	current->tlb_ubc.flush_required = false;
+}
+#else
+static inline void init_tlb_ubc(void)
+{
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -2185,6 +2204,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
 	scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
 			 sc->priority == DEF_PRIORITY);
 
+	init_tlb_ubc();
+
 	blk_start_plug(&plug);
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 				nr[LRU_INACTIVE_FILE]) {