OMAP3: PM: Update clean_l2 to use v7_flush_dcache_all

Analysis of the TI kernel with ETM showed that using the kernel's cache-mapped flush instead of the SO-mapped flush reduces the cost of clean_l2, which is used during sleep sequences, by 65% (from 3.39 ms down to 1.17 ms). Overall: - speed-up; - unfortunately, there isn't a good alternative flush method today; - code reduction, less maintenance, and less potential for bugs in unmaintained code. This also fixes the bug with the clean_l2 function usage. Reported-by: Tony Lindgren <tony@atomide.com> Cc: Kevin Hilman <khilman@deeprootsystems.com> Cc: Tony Lindgren <tony@atomide.com> Acked-by: Santosh Shilimkar <santosh.shilimkar@ti.com> Acked-by: Jean Pihet <j-pihet@ti.com> [nm@ti.com: ported rkw's proposal to 2.6.37-rc2] Signed-off-by: Nishanth Menon <nm@ti.com> Signed-off-by: Richard Woodruff <r-woodruff2@ti.com> Signed-off-by: Kevin Hilman <khilman@deeprootsystems.com>
author: Richard Woodruff <r-woodruff2@ti.com> 2010-12-20 15:05:03 -0500
committer: Kevin Hilman <khilman@deeprootsystems.com> 2010-12-21 17:45:47 -0500
commit: 0bd40535365c318e331f5e872030a710d5746167 (patch)
tree: e08c1d72b3eac6e260f2d8841c465d154a03c2af /arch/arm/mach-omap2/sleep34xx.S
parent: 1cbbe37ac5c78fb59ce02f639d6c4f69b610cf5e (diff)
1 files changed, 14 insertions, 66 deletions
diff --git a/arch/arm/mach-omap2/sleep34xx.S b/arch/arm/mach-omap2/sleep34xx.S
index 2fb205a7f28..aa43da5176e 100644
--- a/arch/arm/mach-omap2/sleep34xx.S
+++ b/arch/arm/mach-omap2/sleep34xx.S
@@ -520,72 +520,18 @@ clean_caches:
        cmp     r9, #1 /* Check whether L2 inval is required or not*/
        bne     skip_l2_inval
 clean_l2:
-        /* read clidr */
+        /*
-        mrc     p15, 1, r0, c0, c0, 1
+         * Jump out to kernel flush routine
-        /* extract loc from clidr */
+         *  - reuse that code is better
-        ands    r3, r0, #0x7000000
+         *  - it executes in a cached space so is faster than refetch per-block
-        /* left align loc bit field */
+         *  - should be faster and will change with kernel
-        mov     r3, r3, lsr #23
+         *  - 'might' have to copy address, load and jump to it
-        /* if loc is 0, then no need to clean */
+         *  - lr is used since we are running in SRAM currently.
-        beq     finished
+         */
-        /* start clean at cache level 0 */
+        ldr r1, kernel_flush
-        mov     r10, #0
+        mov lr, pc
-loop1:
+        bx  r1
-        /* work out 3x current cache level */
-        add     r2, r10, r10, lsr #1
-        /* extract cache type bits from clidr*/
-        mov     r1, r0, lsr r2
-        /* mask of the bits for current cache only */
-        and     r1, r1, #7
-        /* see what cache we have at this level */
-        cmp     r1, #2
-        /* skip if no cache, or just i-cache */
-        blt     skip
-        /* select current cache level in cssr */
-        mcr     p15, 2, r10, c0, c0, 0
-        /* isb to sych the new cssr&csidr */
-        isb
-        /* read the new csidr */
-        mrc     p15, 1, r1, c0, c0, 0
-        /* extract the length of the cache lines */
-        and     r2, r1, #7
-        /* add 4 (line length offset) */
-        add     r2, r2, #4
-        ldr     r4, assoc_mask
-        /* find maximum number on the way size */
-        ands    r4, r4, r1, lsr #3
-        /* find bit position of way size increment */
-        clz     r5, r4
-        ldr     r7, numset_mask
-        /* extract max number of the index size*/
-        ands    r7, r7, r1, lsr #13
-loop2:
-        mov     r9, r4
-        /* create working copy of max way size*/
-loop3:
-        /* factor way and cache number into r11 */
-        orr     r11, r10, r9, lsl r5
-        /* factor index number into r11 */
-        orr     r11, r11, r7, lsl r2
-        /*clean & invalidate by set/way */
-        mcr     p15, 0, r11, c7, c10, 2
-        /* decrement the way*/
-        subs    r9, r9, #1
-        bge     loop3
-        /*decrement the index */
-        subs    r7, r7, #1
-        bge     loop2
-skip:
-        add     r10, r10, #2
-        /* increment cache number */
-        cmp     r3, r10
-        bgt     loop1
-finished:
-        /*swith back to cache level 0 */
-        mov     r10, #0
-        /* select current cache level in cssr */
-        mcr     p15, 2, r10, c0, c0, 0
-        isb
 skip_l2_inval:
        /* Data memory barrier and Data sync barrier */
        mov     r1, #0
@@ -668,5 +614,7 @@ cache_pred_disable_mask:
        .word   0xFFFFE7FB
 control_stat:
        .word   CONTROL_STAT
+kernel_flush:
+        .word v7_flush_dcache_all
 ENTRY(omap34xx_cpu_suspend_sz)
        .word   . - omap34xx_cpu_suspend
author	Richard Woodruff <r-woodruff2@ti.com>	2010-12-20 15:05:03 -0500
committer	Kevin Hilman <khilman@deeprootsystems.com>	2010-12-21 17:45:47 -0500
commit	0bd40535365c318e331f5e872030a710d5746167 (patch)
tree	e08c1d72b3eac6e260f2d8841c465d154a03c2af /arch/arm/mach-omap2/sleep34xx.S
parent	1cbbe37ac5c78fb59ce02f639d6c4f69b610cf5e (diff)

diff --git a/arch/arm/mach-omap2/sleep34xx.S b/arch/arm/mach-omap2/sleep34xx.S index 2fb205a7f28..aa43da5176e 100644 --- a/arch/arm/mach-omap2/sleep34xx.S +++ b/arch/arm/mach-omap2/sleep34xx.S
@@ -520,72 +520,18 @@ clean_caches:
520	cmp r9, #1 /* Check whether L2 inval is required or not*/	520	cmp r9, #1 /* Check whether L2 inval is required or not*/
521	bne skip_l2_inval	521	bne skip_l2_inval
522	clean_l2:	522	clean_l2:
523	/* read clidr */	523	/*
524	mrc p15, 1, r0, c0, c0, 1	524	* Jump out to kernel flush routine
525	/* extract loc from clidr */	525	* - reuse that code is better
526	ands r3, r0, #0x7000000	526	* - it executes in a cached space so is faster than refetch per-block
527	/* left align loc bit field */	527	* - should be faster and will change with kernel
528	mov r3, r3, lsr #23	528	* - 'might' have to copy address, load and jump to it
529	/* if loc is 0, then no need to clean */	529	* - lr is used since we are running in SRAM currently.
530	beq finished	530	*/
531	/* start clean at cache level 0 */	531	ldr r1, kernel_flush
532	mov r10, #0	532	mov lr, pc
533	loop1:	533	bx r1
534	/* work out 3x current cache level */	534
535	add r2, r10, r10, lsr #1
536	/* extract cache type bits from clidr*/
537	mov r1, r0, lsr r2
538	/* mask of the bits for current cache only */
539	and r1, r1, #7
540	/* see what cache we have at this level */
541	cmp r1, #2
542	/* skip if no cache, or just i-cache */
543	blt skip
544	/* select current cache level in cssr */
545	mcr p15, 2, r10, c0, c0, 0
546	/* isb to sych the new cssr&csidr */
547	isb
548	/* read the new csidr */
549	mrc p15, 1, r1, c0, c0, 0
550	/* extract the length of the cache lines */
551	and r2, r1, #7
552	/* add 4 (line length offset) */
553	add r2, r2, #4
554	ldr r4, assoc_mask
555	/* find maximum number on the way size */
556	ands r4, r4, r1, lsr #3
557	/* find bit position of way size increment */
558	clz r5, r4
559	ldr r7, numset_mask
560	/* extract max number of the index size*/
561	ands r7, r7, r1, lsr #13
562	loop2:
563	mov r9, r4
564	/* create working copy of max way size*/
565	loop3:
566	/* factor way and cache number into r11 */
567	orr r11, r10, r9, lsl r5
568	/* factor index number into r11 */
569	orr r11, r11, r7, lsl r2
570	/*clean & invalidate by set/way */
571	mcr p15, 0, r11, c7, c10, 2
572	/* decrement the way*/
573	subs r9, r9, #1
574	bge loop3
575	/*decrement the index */
576	subs r7, r7, #1
577	bge loop2
578	skip:
579	add r10, r10, #2
580	/* increment cache number */
581	cmp r3, r10
582	bgt loop1
583	finished:
584	/*swith back to cache level 0 */
585	mov r10, #0
586	/* select current cache level in cssr */
587	mcr p15, 2, r10, c0, c0, 0
588	isb
589	skip_l2_inval:	535	skip_l2_inval:
590	/* Data memory barrier and Data sync barrier */	536	/* Data memory barrier and Data sync barrier */
591	mov r1, #0	537	mov r1, #0
@@ -668,5 +614,7 @@ cache_pred_disable_mask:
668	.word 0xFFFFE7FB	614	.word 0xFFFFE7FB
669	control_stat:	615	control_stat:
670	.word CONTROL_STAT	616	.word CONTROL_STAT
		617	kernel_flush:
		618	.word v7_flush_dcache_all
671	ENTRY(omap34xx_cpu_suspend_sz)	619	ENTRY(omap34xx_cpu_suspend_sz)
672	.word . - omap34xx_cpu_suspend	620	.word . - omap34xx_cpu_suspend