diff options
Diffstat (limited to 'arch/sh/lib64/copy_page.S')
-rw-r--r-- | arch/sh/lib64/copy_page.S | 89 |
1 files changed, 89 insertions, 0 deletions
diff --git a/arch/sh/lib64/copy_page.S b/arch/sh/lib64/copy_page.S new file mode 100644 index 00000000000..0ec6fca63b5 --- /dev/null +++ b/arch/sh/lib64/copy_page.S | |||
@@ -0,0 +1,89 @@ | |||
1 | /* | ||
2 | Copyright 2003 Richard Curnow, SuperH (UK) Ltd. | ||
3 | |||
4 | This file is subject to the terms and conditions of the GNU General Public | ||
5 | License. See the file "COPYING" in the main directory of this archive | ||
6 | for more details. | ||
7 | |||
8 | Tight version of memcpy for the case of just copying a page. | ||
9 | Prefetch strategy empirically optimised against RTL simulations | ||
10 | of SH5-101 cut2 eval chip with Cayman board DDR memory. | ||
11 | |||
12 | Parameters: | ||
13 | r2 : destination effective address (start of page) | ||
14 | r3 : source effective address (start of page) | ||
15 | |||
16 | Always copies 4096 bytes. | ||
17 | |||
18 | Points to review. | ||
19 | * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead. | ||
20 | It seems like the prefetch needs to be at least 4 lines ahead to get | ||
21 | the data into the cache in time, and the allocos contend with outstanding | ||
22 | prefetches for the same cache set, so it's better to have the numbers | ||
23 | different. | ||
24 | */ | ||
25 | |||
26 | .section .text..SHmedia32,"ax" | ||
27 | .little | ||
28 | |||
29 | .balign 8 | ||
30 | .global copy_page | ||
31 | copy_page: | ||
32 | |||
33 | /* Copy 4096 bytes worth of data from r3 (source) to r2 (destination), one 32-byte line per loop iteration. | ||
34 | Do prefetches 4 lines ahead (both prefetch sequences below are currently compiled out with #if 0, tagged TAKum03020). | ||
35 | Do alloco 2 lines ahead. Writes r2, r6-r8, r22, r23, r36-r39, r60-r62 and tr0-tr3; on exit r2 = destination + 4096. */ | ||
36 | |||
37 | pta 1f, tr1 /* tr1 = label 1, the loop head targeted by the closing bgt/l */ | ||
38 | pta 2f, tr2 /* tr2 = label 2; only referenced by the disabled prefetch-skip branch */ | ||
39 | pta 3f, tr3 /* tr3 = label 3; taken to skip the alloco on the last 2 lines */ | ||
40 | ptabs r18, tr0 /* tr0 = address taken from r18, used by the final blink to return */ | ||
41 | |||
42 | #if 0 | ||
43 | /* TAKum03020: disabled loads into r63 from the first 4 source lines (offsets 0x00-0x60) */ | ||
44 | ld.q r3, 0x00, r63 | ||
45 | ld.q r3, 0x20, r63 | ||
46 | ld.q r3, 0x40, r63 | ||
47 | ld.q r3, 0x60, r63 | ||
48 | #endif | ||
49 | alloco r2, 0x00 /* allocate destination line 0; the in-loop alloco only starts at +0x40 */ | ||
50 | synco ! TAKum03020 | ||
51 | alloco r2, 0x20 /* allocate destination line 1 */ | ||
52 | synco ! TAKum03020 | ||
53 | |||
54 | movi 3968, r6 /* 3968 = 4096 - 128, i.e. the page size minus 4 lines of 32 bytes */ | ||
55 | add r2, r6, r6 /* r6 = dest + 3968; only compared in the disabled prefetch check */ | ||
56 | addi r6, 64, r7 /* r7 = dest + 4032: the alloco is skipped once r2 reaches this */ | ||
57 | addi r7, 64, r8 /* r8 = dest + 4096: end of the destination page, loop bound */ | ||
58 | sub r3, r2, r60 /* r60 = src - dest, so r2 + r60 addresses the matching source byte */ | ||
59 | addi r60, 8, r61 /* r61 = (src - dest) + 8: index of the 2nd quadword of a line */ | ||
60 | addi r61, 8, r62 /* r62 = (src - dest) + 16: index of the 3rd quadword */ | ||
61 | addi r62, 8, r23 /* r23 = (src - dest) + 24: index of the 4th quadword */ | ||
62 | addi r60, 0x80, r22 /* r22 = (src - dest) + 128: source 4 lines ahead; only used by the disabled prefetch */ | ||
63 | |||
64 | /* Minimal code size. The extra branches inside the loop don't cost much | ||
65 | because they overlap with the time spent waiting for prefetches to | ||
66 | complete. */ | ||
67 | 1: | ||
68 | #if 0 | ||
69 | /* TAKum03020 */ | ||
70 | bge/u r2, r6, tr2 ! skip prefetch for last 4 lines | ||
71 | ldx.q r2, r22, r63 ! prefetch 4 lines hence | ||
72 | #endif | ||
73 | 2: | ||
74 | bge/u r2, r7, tr3 ! skip alloco for last 2 lines | ||
75 | alloco r2, 0x40 ! alloc destination line 2 lines ahead | ||
76 | synco ! TAKum03020 | ||
77 | 3: | ||
78 | ldx.q r2, r60, r36 /* r36-r39 = the four quadwords of the current source line (address r2 + index) */ | ||
79 | ldx.q r2, r61, r37 | ||
80 | ldx.q r2, r62, r38 | ||
81 | ldx.q r2, r23, r39 | ||
82 | st.q r2, 0, r36 /* write the four quadwords to the current destination line */ | ||
83 | st.q r2, 8, r37 | ||
84 | st.q r2, 16, r38 | ||
85 | st.q r2, 24, r39 | ||
86 | addi r2, 32, r2 /* advance the destination pointer by one 32-byte line */ | ||
87 | bgt/l r8, r2, tr1 /* loop back to label 1 while r2 is still below the end of the page */ | ||
88 | |||
89 | blink tr0, r63 ! return | ||