aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChen, Kenneth W <kenneth.w.chen@intel.com>2006-10-13 13:08:13 -0400
committerTony Luck <tony.luck@intel.com>2007-02-06 18:04:48 -0500
commit00b65985fb2fc542b855b03fcda0d0f2bab4f442 (patch)
treedc9372aced10184945862b9adf0848da3e0e946f
parenta0776ec8e97bf109e7d973d09fc3e1814eb32bfb (diff)
[IA64] relax per-cpu TLB requirement to DTC
Instead of pinning per-cpu TLB into a DTR, use DTC. This will free up one TLB entry for application, or even kernel if access pattern to per-cpu data area has high temporal locality. Since per-cpu is mapped at the top of region 7 address, we just need to add special case in alt_dtlb_miss. The physical address of per-cpu data is already conveniently stored in IA64_KR(PER_CPU_DATA). Latency for alt_dtlb_miss is not affected as we can hide all the latency. It was measured that alt_dtlb_miss handler has 23 cycles latency before and after the patch. The performance effect is massive for applications that put lots of tlb pressure on CPU. Workload environment like database online transaction processing or application uses tera-byte of memory would benefit the most. Measurement with industry standard database benchmark shown an upward of 1.6% gain. While smaller workloads like cpu, java also showing small improvement. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
-rw-r--r--arch/ia64/kernel/ivt.S19
-rw-r--r--arch/ia64/kernel/mca_asm.S24
-rw-r--r--arch/ia64/mm/init.c11
-rw-r--r--include/asm-ia64/kregs.h3
4 files changed, 16 insertions, 41 deletions
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
index 6b7fcbd3f6f1..34f44d8be00d 100644
--- a/arch/ia64/kernel/ivt.S
+++ b/arch/ia64/kernel/ivt.S
@@ -374,6 +374,7 @@ ENTRY(alt_dtlb_miss)
374 movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff) 374 movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
375 mov r21=cr.ipsr 375 mov r21=cr.ipsr
376 mov r31=pr 376 mov r31=pr
377 mov r24=PERCPU_ADDR
377 ;; 378 ;;
378#ifdef CONFIG_DISABLE_VHPT 379#ifdef CONFIG_DISABLE_VHPT
379 shr.u r22=r16,61 // get the region number into r21 380 shr.u r22=r16,61 // get the region number into r21
@@ -386,22 +387,30 @@ ENTRY(alt_dtlb_miss)
386(p8) mov r29=b0 // save b0 387(p8) mov r29=b0 // save b0
387(p8) br.cond.dptk dtlb_fault 388(p8) br.cond.dptk dtlb_fault
388#endif 389#endif
390 cmp.ge p10,p11=r16,r24 // access to per_cpu_data?
391 tbit.z p12,p0=r16,61 // access to region 6?
392 mov r25=PERCPU_PAGE_SHIFT << 2
393 mov r26=PERCPU_PAGE_SIZE
394 nop.m 0
395 nop.b 0
396 ;;
397(p10) mov r19=IA64_KR(PER_CPU_DATA)
398(p11) and r19=r19,r16 // clear non-ppn fields
389 extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl 399 extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl
390 and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field 400 and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field
391 tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on? 401 tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on?
392 shr.u r18=r16,57 // move address bit 61 to bit 4
393 and r19=r19,r16 // clear ed, reserved bits, and PTE control bits
394 tbit.nz p9,p0=r20,IA64_ISR_NA_BIT // is non-access bit on? 402 tbit.nz p9,p0=r20,IA64_ISR_NA_BIT // is non-access bit on?
395 ;; 403 ;;
396 andcm r18=0x10,r18 // bit 4=~address-bit(61) 404(p10) sub r19=r19,r26
405(p10) mov cr.itir=r25
397 cmp.ne p8,p0=r0,r23 406 cmp.ne p8,p0=r0,r23
398(p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field 407(p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field
408(p12) dep r17=-1,r17,4,1 // set ma=UC for region 6 addr
399(p8) br.cond.spnt page_fault 409(p8) br.cond.spnt page_fault
400 410
401 dep r21=-1,r21,IA64_PSR_ED_BIT,1 411 dep r21=-1,r21,IA64_PSR_ED_BIT,1
402 or r19=r19,r17 // insert PTE control bits into r19
403 ;; 412 ;;
404 or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6 413 or r19=r19,r17 // insert PTE control bits into r19
405(p6) mov cr.ipsr=r21 414(p6) mov cr.ipsr=r21
406 ;; 415 ;;
407(p7) itc.d r19 // insert the TLB entry 416(p7) itc.d r19 // insert the TLB entry
diff --git a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S
index c6b607c00dee..8c9c26aa6ae0 100644
--- a/arch/ia64/kernel/mca_asm.S
+++ b/arch/ia64/kernel/mca_asm.S
@@ -101,14 +101,6 @@ ia64_do_tlb_purge:
101 ;; 101 ;;
102 srlz.d 102 srlz.d
103 ;; 103 ;;
104 // 2. Purge DTR for PERCPU data.
105 movl r16=PERCPU_ADDR
106 mov r18=PERCPU_PAGE_SHIFT<<2
107 ;;
108 ptr.d r16,r18
109 ;;
110 srlz.d
111 ;;
112 // 3. Purge ITR for PAL code. 104 // 3. Purge ITR for PAL code.
113 GET_THIS_PADDR(r2, ia64_mca_pal_base) 105 GET_THIS_PADDR(r2, ia64_mca_pal_base)
114 ;; 106 ;;
@@ -196,22 +188,6 @@ ia64_reload_tr:
196 srlz.i 188 srlz.i
197 srlz.d 189 srlz.d
198 ;; 190 ;;
199 // 2. Reload DTR register for PERCPU data.
200 GET_THIS_PADDR(r2, ia64_mca_per_cpu_pte)
201 ;;
202 movl r16=PERCPU_ADDR // vaddr
203 movl r18=PERCPU_PAGE_SHIFT<<2
204 ;;
205 mov cr.itir=r18
206 mov cr.ifa=r16
207 ;;
208 ld8 r18=[r2] // load per-CPU PTE
209 mov r16=IA64_TR_PERCPU_DATA;
210 ;;
211 itr.d dtr[r16]=r18
212 ;;
213 srlz.d
214 ;;
215 // 3. Reload ITR for PAL code. 191 // 3. Reload ITR for PAL code.
216 GET_THIS_PADDR(r2, ia64_mca_pal_pte) 192 GET_THIS_PADDR(r2, ia64_mca_pal_pte)
217 ;; 193 ;;
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 1373fae7657f..07d82cd7cbdd 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -337,7 +337,7 @@ setup_gate (void)
337void __devinit 337void __devinit
338ia64_mmu_init (void *my_cpu_data) 338ia64_mmu_init (void *my_cpu_data)
339{ 339{
340 unsigned long psr, pta, impl_va_bits; 340 unsigned long pta, impl_va_bits;
341 extern void __devinit tlb_init (void); 341 extern void __devinit tlb_init (void);
342 342
343#ifdef CONFIG_DISABLE_VHPT 343#ifdef CONFIG_DISABLE_VHPT
@@ -346,15 +346,6 @@ ia64_mmu_init (void *my_cpu_data)
346# define VHPT_ENABLE_BIT 1 346# define VHPT_ENABLE_BIT 1
347#endif 347#endif
348 348
349 /* Pin mapping for percpu area into TLB */
350 psr = ia64_clear_ic();
351 ia64_itr(0x2, IA64_TR_PERCPU_DATA, PERCPU_ADDR,
352 pte_val(pfn_pte(__pa(my_cpu_data) >> PAGE_SHIFT, PAGE_KERNEL)),
353 PERCPU_PAGE_SHIFT);
354
355 ia64_set_psr(psr);
356 ia64_srlz_i();
357
358 /* 349 /*
359 * Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped 350 * Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped
360 * address space. The IA-64 architecture guarantees that at least 50 bits of 351 * address space. The IA-64 architecture guarantees that at least 50 bits of
diff --git a/include/asm-ia64/kregs.h b/include/asm-ia64/kregs.h
index 221b5cb564b2..7e55a584975c 100644
--- a/include/asm-ia64/kregs.h
+++ b/include/asm-ia64/kregs.h
@@ -29,8 +29,7 @@
29 */ 29 */
30#define IA64_TR_KERNEL 0 /* itr0, dtr0: maps kernel image (code & data) */ 30#define IA64_TR_KERNEL 0 /* itr0, dtr0: maps kernel image (code & data) */
31#define IA64_TR_PALCODE 1 /* itr1: maps PALcode as required by EFI */ 31#define IA64_TR_PALCODE 1 /* itr1: maps PALcode as required by EFI */
32#define IA64_TR_PERCPU_DATA 1 /* dtr1: percpu data */ 32#define IA64_TR_CURRENT_STACK 1 /* dtr1: maps kernel's memory- & register-stacks */
33#define IA64_TR_CURRENT_STACK 2 /* dtr2: maps kernel's memory- & register-stacks */
34 33
35/* Processor status register bits: */ 34/* Processor status register bits: */
36#define IA64_PSR_BE_BIT 1 35#define IA64_PSR_BE_BIT 1