| author    | Dave Hansen <dave.hansen@linux.intel.com> | 2014-07-31 11:40:55 -0400 |
|-----------|-------------------------------------------|---------------------------|
| committer | H. Peter Anvin <hpa@linux.intel.com>      | 2014-07-31 11:48:50 -0400 |
| commit    | e9f4e0a9fe2723078b7a1a1169828dd46a7b2f9e  |                           |
| tree      | c16300d2f05f2fce6b7b70b2c6fed1ac58486129 /arch/x86/mm/tlb.c | |
| parent    | 4995ab9cf512e9a6cc07dfd6b1d4e2fc48ce7fef  |                           |
x86/mm: Rip out complicated, out-of-date, buggy TLB flushing
I think the flush_tlb_mm_range() code that tries to tune the
flush sizes based on the CPU needs to get ripped out for
several reasons:
1. It is obviously buggy. It uses mm->total_vm to judge the
task's footprint in the TLB. It should certainly be using
some measure of RSS, *NOT* ->total_vm since only resident
memory can populate the TLB (see the sketch just after this list).
2. Haswell and several other CPUs are missing from the
intel_tlb_flushall_shift_set() function. Thus, it has been
demonstrated to bitrot quickly in practice.
3. It is plain wrong in my VM:
[ 0.037444] Last level iTLB entries: 4KB 0, 2MB 0, 4MB 0
[ 0.037444] Last level dTLB entries: 4KB 0, 2MB 0, 4MB 0
[ 0.037444] tlb_flushall_shift: 6
This leads it to never use invlpg.
4. The assumptions about TLB refill costs are wrong:
http://lkml.kernel.org/r/1337782555-8088-3-git-send-email-alex.shi@intel.com
(more on this in later patches)
5. I cannot reproduce the original data: https://lkml.org/lkml/2012/5/17/59
I believe the sample times were too short. Running the
benchmark in a loop yields times that vary quite a bit.
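To make points 1 and 3 concrete, here is a small, self-contained userspace
model of the heuristic being removed. It is a sketch only: the
old_heuristic() helper and the 512-entry example CPU are invented for
illustration, and the real logic is the code deleted from
flush_tlb_mm_range() in the diff below.

```c
#include <stdio.h>

static const char *old_heuristic(unsigned int tlb_entries, int shift,
                                 unsigned long total_vm,
                                 unsigned long nr_base_pages)
{
	unsigned long act_entries = tlb_entries >> shift;

	/*
	 * The removed code capped act_entries by mm->total_vm: *mapped*
	 * pages, not resident ones, even though only resident memory can
	 * actually sit in the TLB (point 1 above).
	 */
	if (total_vm < act_entries)
		act_entries = total_vm;

	return nr_base_pages > act_entries ? "full flush" : "invlpg loop";
}

int main(void)
{
	/*
	 * The VM from point 3: zero reported 4KB dTLB entries and a
	 * tlb_flushall_shift of 6, so act_entries is 0 and even a
	 * one-page flush escalates to a full flush.
	 */
	printf("0-entry dTLB,    1 page : %s\n", old_heuristic(0, 6, 100000, 1));

	/*
	 * A hypothetical 512-entry dTLB with the same shift yields a
	 * threshold of 8 pages, independent of what is resident.
	 */
	printf("512-entry dTLB,  4 pages: %s\n", old_heuristic(512, 6, 100000, 4));
	printf("512-entry dTLB, 16 pages: %s\n", old_heuristic(512, 6, 100000, 16));
	return 0;
}
```

With zero reported TLB entries the old code can never choose invlpg, and
because the cap is mm->total_vm rather than RSS, a task with a huge mapping
but a tiny resident set is credited with a TLB footprint it cannot
possibly have.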
Note that this leaves us with a static ceiling of 1 page. This
is a conservative, dumb setting, and will be revised in a later
patch.
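The replacement policy is a single comparison against that ceiling. Here is
a minimal sketch, with the use_single_page_flushes() wrapper invented for
illustration; in the kernel the comparison sits inline in
flush_tlb_mm_range() and flush_tlb_kernel_range(), as the diff below shows.

```c
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* in units of pages; a fixed, deliberately conservative value for now */
static unsigned long tlb_single_page_flush_ceiling = 1;

/*
 * true  -> flush the range page by page with invlpg
 * false -> flush the whole TLB
 */
static bool use_single_page_flushes(unsigned long start, unsigned long end)
{
	return (end - start) <= tlb_single_page_flush_ceiling * PAGE_SIZE;
}

int main(void)
{
	printf("1 page : %s\n",
	       use_single_page_flushes(0, 1 * PAGE_SIZE) ? "invlpg" : "full flush");
	printf("2 pages: %s\n",
	       use_single_page_flushes(0, 2 * PAGE_SIZE) ? "invlpg" : "full flush");
	return 0;
}
```

With the ceiling at 1 page, only single-page ranges avoid the full flush;
everything larger flushes the whole TLB.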
This also removes the code which attempts to predict whether we
are flushing data or instructions. We expect instruction flushes
to be relatively rare and not worth tuning for explicitly.
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: http://lkml.kernel.org/r/20140731154055.ABC88E89@viggo.jf.intel.com
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'arch/x86/mm/tlb.c')
-rw-r--r--   arch/x86/mm/tlb.c | 87
1 file changed, 11 insertions(+), 76 deletions(-)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 378fbef279d2..dff6ddebc45f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -158,13 +158,14 @@ void flush_tlb_current_task(void)
 	preempt_enable();
 }
 
+/* in units of pages */
+unsigned long tlb_single_page_flush_ceiling = 1;
+
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				unsigned long end, unsigned long vmflag)
 {
-	bool need_flush_others_all = true;
+	int need_flush_others_all = 1;
 	unsigned long addr;
-	unsigned act_entries, tlb_entries = 0;
-	unsigned long nr_base_pages;
 
 	preempt_disable();
 	if (current->active_mm != mm)
@@ -175,29 +176,16 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 		goto out;
 	}
 
-	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
-					|| vmflag & VM_HUGETLB) {
+	if (end == TLB_FLUSH_ALL || vmflag & VM_HUGETLB) {
 		local_flush_tlb();
 		goto out;
 	}
 
-	/* In modern CPU, last level tlb used for both data/ins */
-	if (vmflag & VM_EXEC)
-		tlb_entries = tlb_lli_4k[ENTRIES];
-	else
-		tlb_entries = tlb_lld_4k[ENTRIES];
-
-	/* Assume all of TLB entries was occupied by this task */
-	act_entries = tlb_entries >> tlb_flushall_shift;
-	act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
-	nr_base_pages = (end - start) >> PAGE_SHIFT;
-
-	/* tlb_flushall_shift is on balance point, details in commit log */
-	if (nr_base_pages > act_entries) {
+	if ((end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
 		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 		local_flush_tlb();
 	} else {
-		need_flush_others_all = false;
+		need_flush_others_all = 0;
 		/* flush range by one by one 'invlpg' */
 		for (addr = start; addr < end; addr += PAGE_SIZE) {
 			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
@@ -259,68 +247,15 @@ static void do_kernel_range_flush(void *info)
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	unsigned act_entries;
-	struct flush_tlb_info info;
-
-	/* In modern CPU, last level tlb used for both data/ins */
-	act_entries = tlb_lld_4k[ENTRIES];
 
 	/* Balance as user space task's flush, a bit conservative */
-	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
-		(end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
-
+	if (end == TLB_FLUSH_ALL ||
+	    (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
 		on_each_cpu(do_flush_tlb_all, NULL, 1);
-	else {
+	} else {
+		struct flush_tlb_info info;
 		info.flush_start = start;
 		info.flush_end = end;
 		on_each_cpu(do_kernel_range_flush, &info, 1);
 	}
 }
-
-#ifdef CONFIG_DEBUG_TLBFLUSH
-static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
-				size_t count, loff_t *ppos)
-{
-	char buf[32];
-	unsigned int len;
-
-	len = sprintf(buf, "%hd\n", tlb_flushall_shift);
-	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
-}
-
-static ssize_t tlbflush_write_file(struct file *file,
-		 const char __user *user_buf, size_t count, loff_t *ppos)
-{
-	char buf[32];
-	ssize_t len;
-	s8 shift;
-
-	len = min(count, sizeof(buf) - 1);
-	if (copy_from_user(buf, user_buf, len))
-		return -EFAULT;
-
-	buf[len] = '\0';
-	if (kstrtos8(buf, 0, &shift))
-		return -EINVAL;
-
-	if (shift < -1 || shift >= BITS_PER_LONG)
-		return -EINVAL;
-
-	tlb_flushall_shift = shift;
-	return count;
-}
-
-static const struct file_operations fops_tlbflush = {
-	.read = tlbflush_read_file,
-	.write = tlbflush_write_file,
-	.llseek = default_llseek,
-};
-
-static int __init create_tlb_flushall_shift(void)
-{
-	debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
-			    arch_debugfs_dir, NULL, &fops_tlbflush);
-	return 0;
-}
-late_initcall(create_tlb_flushall_shift);
-#endif