author		Dave Hansen <dave.hansen@linux.intel.com>	2014-07-31 11:41:01 -0400
committer	H. Peter Anvin <hpa@linux.intel.com>	2014-07-31 11:48:51 -0400
commit		2d040a1ce903ca5d6e7c983621fb29c6883c4c48 (patch)
tree		2969f9de20422d8e247cf6e13d043330878baff6
parent		d17d8f9dedb9dd76fd540a5c497101529d9eb25a (diff)
x86/mm: New tunable for single vs full TLB flush
Most of the logic here is in the documentation file. Please take
a look at it.
I know we've come full-circle here back to a tunable, but this
new one is *WAY* simpler. I challenge anyone to describe in one
sentence how the old one worked. Here's the way the new one
works:
If we are flushing more pages than the ceiling, we use
the full flush, otherwise we use per-page flushes.
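
A minimal sketch of that check (a simplification for illustration
only; the real logic lives in flush_tlb_mm_range() and also handles
TLB_FLUSH_ALL and huge pages):

	if ((end - start) >> PAGE_SHIFT > tlb_single_page_flush_ceiling) {
		local_flush_tlb();			/* full flush */
	} else {
		unsigned long addr;
		for (addr = start; addr < end; addr += PAGE_SIZE)
			__flush_tlb_single(addr);	/* one invlpg each */
	}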
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: http://lkml.kernel.org/r/20140731154101.12B52CAF@viggo.jf.intel.com
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
-rw-r--r--	Documentation/x86/tlb.txt	75
-rw-r--r--	arch/x86/mm/tlb.c	46
2 files changed, 121 insertions, 0 deletions
diff --git a/Documentation/x86/tlb.txt b/Documentation/x86/tlb.txt
new file mode 100644
index 000000000000..2b3a82e69151
--- /dev/null
+++ b/Documentation/x86/tlb.txt
@@ -0,0 +1,75 @@
When the kernel unmaps or modifies the attributes of a range of
memory, it has two choices:
 1. Flush the entire TLB with a two-instruction sequence.  This is
    a quick operation, but it causes collateral damage: TLB entries
    from areas other than the one we are trying to flush will be
    destroyed and must be refilled later, at some cost.
 2. Use the invlpg instruction to invalidate a single page at a
    time.  This could potentially cost many more instructions, but
    it is a much more precise operation, causing no collateral
    damage to other TLB entries.

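As a rough illustration (a sketch only, not code from this patch),
the two mechanisms boil down to the following on x86, where 'addr'
is an address inside the page to be invalidated:

	/* 1. Full flush: rewriting CR3 invalidates all non-global
	 *    TLB entries for the current address space. */
	write_cr3(read_cr3());

	/* 2. Single-page flush: invlpg evicts only the entry (or
	 *    entries) translating one linear address. */
	asm volatile("invlpg (%0)" : : "r" (addr) : "memory");
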
Which method to use depends on a few things:
 1. The size of the flush being performed.  A flush of the entire
    address space is obviously better performed by flushing the
    entire TLB than doing 2^48/PAGE_SIZE individual flushes.
 2. The contents of the TLB.  If the TLB is empty, then there will
    be no collateral damage caused by doing the global flush, and
    all of the individual flushes would have been wasted work.
 3. The size of the TLB.  The larger the TLB, the more collateral
    damage we do with a full flush.  So, the larger the TLB, the
    more attractive an individual flush looks.  Data and
    instructions have separate TLBs, as do different page sizes.
 4. The microarchitecture.  The TLB has become a multi-level
    cache on modern CPUs, and the global flushes have become more
    expensive relative to single-page flushes.

There is obviously no way the kernel can know all of these things,
especially the contents of the TLB during a given flush.  The
sizes of the flushes will also vary greatly depending on the
workload.  There is essentially no "right" point to choose.

You may be doing too many individual invalidations if you see the
invlpg instruction (or instructions _near_ it) show up high in
profiles.  If you believe that individual invalidations are being
performed too often, you can lower the tunable:

	/sys/kernel/debug/x86/tlb_single_page_flush_ceiling

This will cause us to do the global flush for more cases.
Lowering it to 0 will disable the use of the individual flushes
entirely.  Setting it to 1 is already a very conservative setting,
and it should never need to be 0 under normal circumstances.

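For example (assuming debugfs is mounted at its conventional
/sys/kernel/debug location, and using an arbitrary value of 64):

	# cat /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
	# echo 64 > /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
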
Despite the fact that a single individual flush on x86 is
guaranteed to flush a full 2MB [1], hugetlbfs always uses the full
flushes.  THP is treated exactly the same as normal memory.

You might see invlpg inside of flush_tlb_mm_range() show up in
profiles, or you can use the trace_tlb_flush() tracepoints to
determine how long the flush operations are taking.

Essentially, you are balancing the cycles you spend doing invlpg
with the cycles that you spend refilling the TLB later.

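As a purely illustrative calculation (every number below is
invented for the sake of the arithmetic, not measured): if one
invlpg costs ~200 cycles, individually flushing 50 pages costs
~10,000 cycles up front.  A full flush is nearly free at flush
time, but if it later forces 1,000 extra refills at ~30 cycles of
page-walk time each, it costs ~30,000 cycles afterwards.  With a
cold TLB those refills never materialize and the full flush wins
instead.  The ceiling is the knob that picks the crossover point.
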
You can measure how expensive TLB refills are by using
performance counters and 'perf stat', like this:

	perf stat -e
		cpu/event=0x8,umask=0x84,name=dtlb_load_misses_walk_duration/,
		cpu/event=0x8,umask=0x82,name=dtlb_load_misses_walk_completed/,
		cpu/event=0x49,umask=0x4,name=dtlb_store_misses_walk_duration/,
		cpu/event=0x49,umask=0x2,name=dtlb_store_misses_walk_completed/,
		cpu/event=0x85,umask=0x4,name=itlb_misses_walk_duration/,
		cpu/event=0x85,umask=0x2,name=itlb_misses_walk_completed/

That works on an IvyBridge-era CPU (i5-3320M).  Different CPUs
may have differently-named counters, but they should at least
be there in some form.  You can use pmu-tools 'ocperf list'
(https://github.com/andikleen/pmu-tools) to find the right
counters for a given CPU.

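The walk_duration events count cycles spent walking the page
tables (at least on the CPU named above), so dividing
dtlb_load_misses_walk_duration by the total cycle count gives a
rough upper bound on the fraction of runtime lost to dTLB-load
refills.  Counter semantics can differ between microarchitectures,
so verify against your CPU's documentation.
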
1. A footnote in Intel's SDM "4.10.4.2 Recommended Invalidation"
   says: "One execution of INVLPG is sufficient even for a page
   with size greater than 4 KBytes."
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6f00ecb9feeb..cb7caddf0902 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -265,3 +265,49 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
		on_each_cpu(do_kernel_range_flush, &info, 1);
	}
}

static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
				  size_t count, loff_t *ppos)
{
	char buf[32];
	unsigned int len;

	len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

static ssize_t tlbflush_write_file(struct file *file,
		 const char __user *user_buf, size_t count, loff_t *ppos)
{
	char buf[32];
	ssize_t len;
	int ceiling;

	len = min(count, sizeof(buf) - 1);
	if (copy_from_user(buf, user_buf, len))
		return -EFAULT;

	buf[len] = '\0';
	if (kstrtoint(buf, 0, &ceiling))
		return -EINVAL;

	if (ceiling < 0)
		return -EINVAL;

	tlb_single_page_flush_ceiling = ceiling;
	return count;
}

static const struct file_operations fops_tlbflush = {
	.read = tlbflush_read_file,
	.write = tlbflush_write_file,
	.llseek = default_llseek,
};

static int __init create_tlb_single_page_flush_ceiling(void)
{
	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
			    arch_debugfs_dir, NULL, &fops_tlbflush);
	return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);
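
Given the write handler above, a negative value is rejected with
-EINVAL.  For example (again assuming the conventional
/sys/kernel/debug debugfs mount):

	# echo -1 > /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
	-bash: echo: write error: Invalid argument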