diff options
author | Will Deacon <will.deacon@arm.com> | 2013-02-11 08:47:48 -0500 |
---|---|---|
committer | Will Deacon <will.deacon@arm.com> | 2013-08-12 07:25:44 -0400 |
commit | f0915781bd5edf78b1154e61efe962dc15872d09 (patch) | |
tree | a8dc2c7b3c4f0f437171a80c6bde06b020418dc3 | |
parent | 792a843a9f353d3e2474b6f5057b7eaecba41675 (diff) |
ARM: tlb: don't perform inner-shareable invalidation for local TLB ops
Inner-shareable TLB invalidation is typically more expensive than local
(non-shareable) invalidation, so performing the broadcasting for
local_flush_tlb_* operations is a waste of cycles and needlessly
clobbers entries in the TLBs of other CPUs.
This patch introduces __flush_tlb_* versions for many of the TLB
invalidation functions, which only respect inner-shareable variants of
the invalidation instructions when presented with the TLB_V7_UIS_FULL
flag. The local version is also inlined to prevent SMP_ON_UP kernels
from missing flushes, where the __flush variant would be called with
the UP flags.
This gains us around 0.5% in hackbench scores for a dual-core A15, but I
would expect this to improve as more cores (and clusters) are added to
the equation.
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reported-by: Albin Tonnerre <Albin.Tonnerre@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
-rw-r--r-- | arch/arm/include/asm/tlbflush.h | 138 | ||||
-rw-r--r-- | arch/arm/kernel/smp_tlb.c | 8 | ||||
-rw-r--r-- | arch/arm/mm/context.c | 7 |
3 files changed, 123 insertions, 30 deletions
diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h index f467e9b3f8d5..3316264916e9 100644 --- a/arch/arm/include/asm/tlbflush.h +++ b/arch/arm/include/asm/tlbflush.h | |||
@@ -319,6 +319,16 @@ extern struct cpu_tlb_fns cpu_tlb; | |||
319 | #define tlb_op(f, regs, arg) __tlb_op(f, "p15, 0, %0, " regs, arg) | 319 | #define tlb_op(f, regs, arg) __tlb_op(f, "p15, 0, %0, " regs, arg) |
320 | #define tlb_l2_op(f, regs, arg) __tlb_op(f, "p15, 1, %0, " regs, arg) | 320 | #define tlb_l2_op(f, regs, arg) __tlb_op(f, "p15, 1, %0, " regs, arg) |
321 | 321 | ||
322 | static inline void __local_flush_tlb_all(void) | ||
323 | { | ||
324 | const int zero = 0; | ||
325 | const unsigned int __tlb_flag = __cpu_tlb_flags; | ||
326 | |||
327 | tlb_op(TLB_V4_U_FULL | TLB_V6_U_FULL, "c8, c7, 0", zero); | ||
328 | tlb_op(TLB_V4_D_FULL | TLB_V6_D_FULL, "c8, c6, 0", zero); | ||
329 | tlb_op(TLB_V4_I_FULL | TLB_V6_I_FULL, "c8, c5, 0", zero); | ||
330 | } | ||
331 | |||
322 | static inline void local_flush_tlb_all(void) | 332 | static inline void local_flush_tlb_all(void) |
323 | { | 333 | { |
324 | const int zero = 0; | 334 | const int zero = 0; |
@@ -327,10 +337,8 @@ static inline void local_flush_tlb_all(void) | |||
327 | if (tlb_flag(TLB_WB)) | 337 | if (tlb_flag(TLB_WB)) |
328 | dsb(); | 338 | dsb(); |
329 | 339 | ||
330 | tlb_op(TLB_V4_U_FULL | TLB_V6_U_FULL, "c8, c7, 0", zero); | 340 | __local_flush_tlb_all(); |
331 | tlb_op(TLB_V4_D_FULL | TLB_V6_D_FULL, "c8, c6, 0", zero); | 341 | tlb_op(TLB_V7_UIS_FULL, "c8, c7, 0", zero); |
332 | tlb_op(TLB_V4_I_FULL | TLB_V6_I_FULL, "c8, c5, 0", zero); | ||
333 | tlb_op(TLB_V7_UIS_FULL, "c8, c3, 0", zero); | ||
334 | 342 | ||
335 | if (tlb_flag(TLB_BARRIER)) { | 343 | if (tlb_flag(TLB_BARRIER)) { |
336 | dsb(); | 344 | dsb(); |
@@ -338,31 +346,69 @@ static inline void local_flush_tlb_all(void) | |||
338 | } | 346 | } |
339 | } | 347 | } |
340 | 348 | ||
341 | static inline void local_flush_tlb_mm(struct mm_struct *mm) | 349 | static inline void __flush_tlb_all(void) |
342 | { | 350 | { |
343 | const int zero = 0; | 351 | const int zero = 0; |
344 | const int asid = ASID(mm); | ||
345 | const unsigned int __tlb_flag = __cpu_tlb_flags; | 352 | const unsigned int __tlb_flag = __cpu_tlb_flags; |
346 | 353 | ||
347 | if (tlb_flag(TLB_WB)) | 354 | if (tlb_flag(TLB_WB)) |
348 | dsb(); | 355 | dsb(); |
349 | 356 | ||
357 | __local_flush_tlb_all(); | ||
358 | tlb_op(TLB_V7_UIS_FULL, "c8, c3, 0", zero); | ||
359 | |||
360 | if (tlb_flag(TLB_BARRIER)) { | ||
361 | dsb(); | ||
362 | isb(); | ||
363 | } | ||
364 | } | ||
365 | |||
366 | static inline void __local_flush_tlb_mm(struct mm_struct *mm) | ||
367 | { | ||
368 | const int zero = 0; | ||
369 | const int asid = ASID(mm); | ||
370 | const unsigned int __tlb_flag = __cpu_tlb_flags; | ||
371 | |||
350 | if (possible_tlb_flags & (TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) { | 372 | if (possible_tlb_flags & (TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) { |
351 | if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) { | 373 | if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) { |
352 | tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero); | 374 | tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero); |
353 | tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero); | 375 | tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero); |
354 | tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero); | 376 | tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero); |
355 | } | 377 | } |
356 | put_cpu(); | ||
357 | } | 378 | } |
358 | 379 | ||
359 | tlb_op(TLB_V6_U_ASID, "c8, c7, 2", asid); | 380 | tlb_op(TLB_V6_U_ASID, "c8, c7, 2", asid); |
360 | tlb_op(TLB_V6_D_ASID, "c8, c6, 2", asid); | 381 | tlb_op(TLB_V6_D_ASID, "c8, c6, 2", asid); |
361 | tlb_op(TLB_V6_I_ASID, "c8, c5, 2", asid); | 382 | tlb_op(TLB_V6_I_ASID, "c8, c5, 2", asid); |
383 | } | ||
384 | |||
385 | static inline void local_flush_tlb_mm(struct mm_struct *mm) | ||
386 | { | ||
387 | const int asid = ASID(mm); | ||
388 | const unsigned int __tlb_flag = __cpu_tlb_flags; | ||
389 | |||
390 | if (tlb_flag(TLB_WB)) | ||
391 | dsb(); | ||
392 | |||
393 | __local_flush_tlb_mm(mm); | ||
394 | tlb_op(TLB_V7_UIS_ASID, "c8, c7, 2", asid); | ||
395 | |||
396 | if (tlb_flag(TLB_BARRIER)) | ||
397 | dsb(); | ||
398 | } | ||
399 | |||
400 | static inline void __flush_tlb_mm(struct mm_struct *mm) | ||
401 | { | ||
402 | const unsigned int __tlb_flag = __cpu_tlb_flags; | ||
403 | |||
404 | if (tlb_flag(TLB_WB)) | ||
405 | dsb(); | ||
406 | |||
407 | __local_flush_tlb_mm(mm); | ||
362 | #ifdef CONFIG_ARM_ERRATA_720789 | 408 | #ifdef CONFIG_ARM_ERRATA_720789 |
363 | tlb_op(TLB_V7_UIS_ASID, "c8, c3, 0", zero); | 409 | tlb_op(TLB_V7_UIS_ASID, "c8, c3, 0", 0); |
364 | #else | 410 | #else |
365 | tlb_op(TLB_V7_UIS_ASID, "c8, c3, 2", asid); | 411 | tlb_op(TLB_V7_UIS_ASID, "c8, c3, 2", ASID(mm)); |
366 | #endif | 412 | #endif |
367 | 413 | ||
368 | if (tlb_flag(TLB_BARRIER)) | 414 | if (tlb_flag(TLB_BARRIER)) |
@@ -370,16 +416,13 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm) | |||
370 | } | 416 | } |
371 | 417 | ||
372 | static inline void | 418 | static inline void |
373 | local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) | 419 | __local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) |
374 | { | 420 | { |
375 | const int zero = 0; | 421 | const int zero = 0; |
376 | const unsigned int __tlb_flag = __cpu_tlb_flags; | 422 | const unsigned int __tlb_flag = __cpu_tlb_flags; |
377 | 423 | ||
378 | uaddr = (uaddr & PAGE_MASK) | ASID(vma->vm_mm); | 424 | uaddr = (uaddr & PAGE_MASK) | ASID(vma->vm_mm); |
379 | 425 | ||
380 | if (tlb_flag(TLB_WB)) | ||
381 | dsb(); | ||
382 | |||
383 | if (possible_tlb_flags & (TLB_V4_U_PAGE|TLB_V4_D_PAGE|TLB_V4_I_PAGE|TLB_V4_I_FULL) && | 426 | if (possible_tlb_flags & (TLB_V4_U_PAGE|TLB_V4_D_PAGE|TLB_V4_I_PAGE|TLB_V4_I_FULL) && |
384 | cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) { | 427 | cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) { |
385 | tlb_op(TLB_V4_U_PAGE, "c8, c7, 1", uaddr); | 428 | tlb_op(TLB_V4_U_PAGE, "c8, c7, 1", uaddr); |
@@ -392,6 +435,36 @@ local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) | |||
392 | tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", uaddr); | 435 | tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", uaddr); |
393 | tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", uaddr); | 436 | tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", uaddr); |
394 | tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", uaddr); | 437 | tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", uaddr); |
438 | } | ||
439 | |||
440 | static inline void | ||
441 | local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) | ||
442 | { | ||
443 | const unsigned int __tlb_flag = __cpu_tlb_flags; | ||
444 | |||
445 | uaddr = (uaddr & PAGE_MASK) | ASID(vma->vm_mm); | ||
446 | |||
447 | if (tlb_flag(TLB_WB)) | ||
448 | dsb(); | ||
449 | |||
450 | __local_flush_tlb_page(vma, uaddr); | ||
451 | tlb_op(TLB_V7_UIS_PAGE, "c8, c7, 1", uaddr); | ||
452 | |||
453 | if (tlb_flag(TLB_BARRIER)) | ||
454 | dsb(); | ||
455 | } | ||
456 | |||
457 | static inline void | ||
458 | __flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) | ||
459 | { | ||
460 | const unsigned int __tlb_flag = __cpu_tlb_flags; | ||
461 | |||
462 | uaddr = (uaddr & PAGE_MASK) | ASID(vma->vm_mm); | ||
463 | |||
464 | if (tlb_flag(TLB_WB)) | ||
465 | dsb(); | ||
466 | |||
467 | __local_flush_tlb_page(vma, uaddr); | ||
395 | #ifdef CONFIG_ARM_ERRATA_720789 | 468 | #ifdef CONFIG_ARM_ERRATA_720789 |
396 | tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 3", uaddr & PAGE_MASK); | 469 | tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 3", uaddr & PAGE_MASK); |
397 | #else | 470 | #else |
@@ -402,16 +475,11 @@ local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) | |||
402 | dsb(); | 475 | dsb(); |
403 | } | 476 | } |
404 | 477 | ||
405 | static inline void local_flush_tlb_kernel_page(unsigned long kaddr) | 478 | static inline void __local_flush_tlb_kernel_page(unsigned long kaddr) |
406 | { | 479 | { |
407 | const int zero = 0; | 480 | const int zero = 0; |
408 | const unsigned int __tlb_flag = __cpu_tlb_flags; | 481 | const unsigned int __tlb_flag = __cpu_tlb_flags; |
409 | 482 | ||
410 | kaddr &= PAGE_MASK; | ||
411 | |||
412 | if (tlb_flag(TLB_WB)) | ||
413 | dsb(); | ||
414 | |||
415 | tlb_op(TLB_V4_U_PAGE, "c8, c7, 1", kaddr); | 483 | tlb_op(TLB_V4_U_PAGE, "c8, c7, 1", kaddr); |
416 | tlb_op(TLB_V4_D_PAGE, "c8, c6, 1", kaddr); | 484 | tlb_op(TLB_V4_D_PAGE, "c8, c6, 1", kaddr); |
417 | tlb_op(TLB_V4_I_PAGE, "c8, c5, 1", kaddr); | 485 | tlb_op(TLB_V4_I_PAGE, "c8, c5, 1", kaddr); |
@@ -421,6 +489,36 @@ static inline void local_flush_tlb_kernel_page(unsigned long kaddr) | |||
421 | tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", kaddr); | 489 | tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", kaddr); |
422 | tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", kaddr); | 490 | tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", kaddr); |
423 | tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", kaddr); | 491 | tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", kaddr); |
492 | } | ||
493 | |||
494 | static inline void local_flush_tlb_kernel_page(unsigned long kaddr) | ||
495 | { | ||
496 | const unsigned int __tlb_flag = __cpu_tlb_flags; | ||
497 | |||
498 | kaddr &= PAGE_MASK; | ||
499 | |||
500 | if (tlb_flag(TLB_WB)) | ||
501 | dsb(); | ||
502 | |||
503 | __local_flush_tlb_kernel_page(kaddr); | ||
504 | tlb_op(TLB_V7_UIS_PAGE, "c8, c7, 1", kaddr); | ||
505 | |||
506 | if (tlb_flag(TLB_BARRIER)) { | ||
507 | dsb(); | ||
508 | isb(); | ||
509 | } | ||
510 | } | ||
511 | |||
512 | static inline void __flush_tlb_kernel_page(unsigned long kaddr) | ||
513 | { | ||
514 | const unsigned int __tlb_flag = __cpu_tlb_flags; | ||
515 | |||
516 | kaddr &= PAGE_MASK; | ||
517 | |||
518 | if (tlb_flag(TLB_WB)) | ||
519 | dsb(); | ||
520 | |||
521 | __local_flush_tlb_kernel_page(kaddr); | ||
424 | tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 1", kaddr); | 522 | tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 1", kaddr); |
425 | 523 | ||
426 | if (tlb_flag(TLB_BARRIER)) { | 524 | if (tlb_flag(TLB_BARRIER)) { |
diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c index c2edfff573c2..5883b8ae77c8 100644 --- a/arch/arm/kernel/smp_tlb.c +++ b/arch/arm/kernel/smp_tlb.c | |||
@@ -104,7 +104,7 @@ void flush_tlb_all(void) | |||
104 | if (tlb_ops_need_broadcast()) | 104 | if (tlb_ops_need_broadcast()) |
105 | on_each_cpu(ipi_flush_tlb_all, NULL, 1); | 105 | on_each_cpu(ipi_flush_tlb_all, NULL, 1); |
106 | else | 106 | else |
107 | local_flush_tlb_all(); | 107 | __flush_tlb_all(); |
108 | broadcast_tlb_a15_erratum(); | 108 | broadcast_tlb_a15_erratum(); |
109 | } | 109 | } |
110 | 110 | ||
@@ -113,7 +113,7 @@ void flush_tlb_mm(struct mm_struct *mm) | |||
113 | if (tlb_ops_need_broadcast()) | 113 | if (tlb_ops_need_broadcast()) |
114 | on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, mm, 1); | 114 | on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, mm, 1); |
115 | else | 115 | else |
116 | local_flush_tlb_mm(mm); | 116 | __flush_tlb_mm(mm); |
117 | broadcast_tlb_mm_a15_erratum(mm); | 117 | broadcast_tlb_mm_a15_erratum(mm); |
118 | } | 118 | } |
119 | 119 | ||
@@ -126,7 +126,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) | |||
126 | on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page, | 126 | on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page, |
127 | &ta, 1); | 127 | &ta, 1); |
128 | } else | 128 | } else |
129 | local_flush_tlb_page(vma, uaddr); | 129 | __flush_tlb_page(vma, uaddr); |
130 | broadcast_tlb_mm_a15_erratum(vma->vm_mm); | 130 | broadcast_tlb_mm_a15_erratum(vma->vm_mm); |
131 | } | 131 | } |
132 | 132 | ||
@@ -137,7 +137,7 @@ void flush_tlb_kernel_page(unsigned long kaddr) | |||
137 | ta.ta_start = kaddr; | 137 | ta.ta_start = kaddr; |
138 | on_each_cpu(ipi_flush_tlb_kernel_page, &ta, 1); | 138 | on_each_cpu(ipi_flush_tlb_kernel_page, &ta, 1); |
139 | } else | 139 | } else |
140 | local_flush_tlb_kernel_page(kaddr); | 140 | __flush_tlb_kernel_page(kaddr); |
141 | broadcast_tlb_a15_erratum(); | 141 | broadcast_tlb_a15_erratum(); |
142 | } | 142 | } |
143 | 143 | ||
diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c index 4a0544492f10..84e6f772e204 100644 --- a/arch/arm/mm/context.c +++ b/arch/arm/mm/context.c | |||
@@ -162,10 +162,7 @@ static void flush_context(unsigned int cpu) | |||
162 | } | 162 | } |
163 | 163 | ||
164 | /* Queue a TLB invalidate and flush the I-cache if necessary. */ | 164 | /* Queue a TLB invalidate and flush the I-cache if necessary. */ |
165 | if (!tlb_ops_need_broadcast()) | 165 | cpumask_setall(&tlb_flush_pending); |
166 | cpumask_set_cpu(cpu, &tlb_flush_pending); | ||
167 | else | ||
168 | cpumask_setall(&tlb_flush_pending); | ||
169 | 166 | ||
170 | if (icache_is_vivt_asid_tagged()) | 167 | if (icache_is_vivt_asid_tagged()) |
171 | __flush_icache_all(); | 168 | __flush_icache_all(); |
@@ -245,8 +242,6 @@ void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk) | |||
245 | if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) { | 242 | if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) { |
246 | local_flush_bp_all(); | 243 | local_flush_bp_all(); |
247 | local_flush_tlb_all(); | 244 | local_flush_tlb_all(); |
248 | if (erratum_a15_798181()) | ||
249 | dummy_flush_tlb_a15_erratum(); | ||
250 | } | 245 | } |
251 | 246 | ||
252 | atomic64_set(&per_cpu(active_asids, cpu), asid); | 247 | atomic64_set(&per_cpu(active_asids, cpu), asid); |