diff options
Diffstat (limited to 'arch/cris/arch-v32/lib')
-rw-r--r-- | arch/cris/arch-v32/lib/Makefile | 6 | ||||
-rw-r--r-- | arch/cris/arch-v32/lib/checksum.S | 111 | ||||
-rw-r--r-- | arch/cris/arch-v32/lib/checksumcopy.S | 120 | ||||
-rw-r--r-- | arch/cris/arch-v32/lib/csumcpfruser.S | 69 | ||||
-rw-r--r-- | arch/cris/arch-v32/lib/dram_init.S | 120 | ||||
-rw-r--r-- | arch/cris/arch-v32/lib/hw_settings.S | 73 | ||||
-rw-r--r-- | arch/cris/arch-v32/lib/memset.c | 253 | ||||
-rw-r--r-- | arch/cris/arch-v32/lib/nand_init.S | 179 | ||||
-rw-r--r-- | arch/cris/arch-v32/lib/spinlock.S | 33 | ||||
-rw-r--r-- | arch/cris/arch-v32/lib/string.c | 219 | ||||
-rw-r--r-- | arch/cris/arch-v32/lib/usercopy.c | 470 |
11 files changed, 1653 insertions, 0 deletions
diff --git a/arch/cris/arch-v32/lib/Makefile b/arch/cris/arch-v32/lib/Makefile new file mode 100644 index 000000000000..05b3ec6978d6 --- /dev/null +++ b/arch/cris/arch-v32/lib/Makefile | |||
@@ -0,0 +1,6 @@ | |||
1 | # | ||
2 | # Makefile for Etrax-specific library files.. | ||
3 | # | ||
4 | |||
5 | lib-y = checksum.o checksumcopy.o string.o usercopy.o memset.o csumcpfruser.o spinlock.o | ||
6 | |||
diff --git a/arch/cris/arch-v32/lib/checksum.S b/arch/cris/arch-v32/lib/checksum.S new file mode 100644 index 000000000000..32e66181b826 --- /dev/null +++ b/arch/cris/arch-v32/lib/checksum.S | |||
@@ -0,0 +1,111 @@ | |||
1 | /* | ||
2 | * A fast checksum routine using movem | ||
3 | * Copyright (c) 1998-2001, 2003 Axis Communications AB | ||
4 | * | ||
5 | * csum_partial(const unsigned char * buff, int len, unsigned int sum) | ||
6 | */ | ||
7 | |||
8 | .globl csum_partial | ||
9 | csum_partial: | ||
10 | |||
11 | ;; r10 - src | ||
12 | ;; r11 - length | ||
13 | ;; r12 - checksum | ||
14 | |||
15 | ;; check for breakeven length between movem and normal word looping versions | ||
16 | ;; we also do _NOT_ want to compute a checksum over more than the | ||
17 | ;; actual length when length < 40 | ||
18 | |||
19 | cmpu.w 80,$r11 | ||
20 | blo _word_loop | ||
21 | nop | ||
22 | |||
23 | ;; need to save the registers we use below in the movem loop | ||
24 | ;; this overhead is why we have a check above for breakeven length | ||
25 | ;; only r0 - r8 have to be saved, the other ones are clobber-able | ||
26 | ;; according to the ABI | ||
27 | |||
28 | subq 9*4,$sp | ||
29 | subq 10*4,$r11 ; update length for the first loop | ||
30 | movem $r8,[$sp] | ||
31 | |||
32 | ;; do a movem checksum | ||
33 | |||
34 | _mloop: movem [$r10+],$r9 ; read 10 longwords | ||
35 | |||
36 | ;; perform dword checksumming on the 10 longwords | ||
37 | |||
38 | add.d $r0,$r12 | ||
39 | addc $r1,$r12 | ||
40 | addc $r2,$r12 | ||
41 | addc $r3,$r12 | ||
42 | addc $r4,$r12 | ||
43 | addc $r5,$r12 | ||
44 | addc $r6,$r12 | ||
45 | addc $r7,$r12 | ||
46 | addc $r8,$r12 | ||
47 | addc $r9,$r12 | ||
48 | |||
49 | ;; fold the carry into the checksum, to avoid having to loop the carry | ||
50 | ;; back into the top | ||
51 | |||
52 | addc 0,$r12 | ||
53 | addc 0,$r12 ; do it again, since we might have generated a carry | ||
54 | |||
55 | subq 10*4,$r11 | ||
56 | bge _mloop | ||
57 | nop | ||
58 | |||
59 | addq 10*4,$r11 ; compensate for last loop underflowing length | ||
60 | |||
61 | movem [$sp+],$r8 ; restore regs | ||
62 | |||
63 | _word_loop: | ||
64 | ;; only fold if there is anything to fold. | ||
65 | |||
66 | cmpq 0,$r12 | ||
67 | beq _no_fold | ||
68 | |||
69 | ;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below. | ||
70 | ;; r9 and r13 can be used as temporaries. | ||
71 | |||
72 | moveq -1,$r9 ; put 0xffff in r9, faster than move.d 0xffff,r9 | ||
73 | lsrq 16,$r9 | ||
74 | |||
75 | move.d $r12,$r13 | ||
76 | lsrq 16,$r13 ; r13 = checksum >> 16 | ||
77 | and.d $r9,$r12 ; checksum = checksum & 0xffff | ||
78 | add.d $r13,$r12 ; checksum += r13 | ||
79 | move.d $r12,$r13 ; do the same again, maybe we got a carry last add | ||
80 | lsrq 16,$r13 | ||
81 | and.d $r9,$r12 | ||
82 | add.d $r13,$r12 | ||
83 | |||
84 | _no_fold: | ||
85 | cmpq 2,$r11 | ||
86 | blt _no_words | ||
87 | nop | ||
88 | |||
89 | ;; checksum the rest of the words | ||
90 | |||
91 | subq 2,$r11 | ||
92 | |||
93 | _wloop: subq 2,$r11 | ||
94 | bge _wloop | ||
95 | addu.w [$r10+],$r12 | ||
96 | |||
97 | addq 2,$r11 | ||
98 | |||
99 | _no_words: | ||
100 | ;; see if we have one odd byte more | ||
101 | cmpq 1,$r11 | ||
102 | beq _do_byte | ||
103 | nop | ||
104 | ret | ||
105 | move.d $r12,$r10 | ||
106 | |||
107 | _do_byte: | ||
108 | ;; checksum the last byte (nothing is copied in csum_partial) | ||
109 | addu.b [$r10],$r12 | ||
110 | ret | ||
111 | move.d $r12,$r10 | ||
diff --git a/arch/cris/arch-v32/lib/checksumcopy.S b/arch/cris/arch-v32/lib/checksumcopy.S new file mode 100644 index 000000000000..9303ccbadc6d --- /dev/null +++ b/arch/cris/arch-v32/lib/checksumcopy.S | |||
@@ -0,0 +1,120 @@ | |||
1 | /* | ||
2 | * A fast checksum+copy routine using movem | ||
3 | * Copyright (c) 1998, 2001, 2003 Axis Communications AB | ||
4 | * | ||
5 | * Authors: Bjorn Wesen | ||
6 | * | ||
7 | * csum_partial_copy_nocheck(const char *src, char *dst, | ||
8 | * int len, unsigned int sum) | ||
9 | */ | ||
10 | |||
11 | .globl csum_partial_copy_nocheck | ||
12 | csum_partial_copy_nocheck: | ||
13 | |||
14 | ;; r10 - src | ||
15 | ;; r11 - dst | ||
16 | ;; r12 - length | ||
17 | ;; r13 - checksum | ||
18 | |||
19 | ;; check for breakeven length between movem and normal word looping versions | ||
20 | ;; we also do _NOT_ want to compute a checksum over more than the | ||
21 | ;; actual length when length < 40 | ||
22 | |||
23 | cmpu.w 80,$r12 | ||
24 | blo _word_loop | ||
25 | nop | ||
26 | |||
27 | ;; need to save the registers we use below in the movem loop | ||
28 | ;; this overhead is why we have a check above for breakeven length | ||
29 | ;; only r0 - r8 have to be saved, the other ones are clobber-able | ||
30 | ;; according to the ABI | ||
31 | |||
32 | subq 9*4,$sp | ||
33 | subq 10*4,$r12 ; update length for the first loop | ||
34 | movem $r8,[$sp] | ||
35 | |||
36 | ;; do a movem copy and checksum | ||
37 | |||
38 | 1: ;; A failing userspace access (the read) will have this as PC. | ||
39 | _mloop: movem [$r10+],$r9 ; read 10 longwords | ||
40 | movem $r9,[$r11+] ; write 10 longwords | ||
41 | |||
42 | ;; perform dword checksumming on the 10 longwords | ||
43 | |||
44 | add.d $r0,$r13 | ||
45 | addc $r1,$r13 | ||
46 | addc $r2,$r13 | ||
47 | addc $r3,$r13 | ||
48 | addc $r4,$r13 | ||
49 | addc $r5,$r13 | ||
50 | addc $r6,$r13 | ||
51 | addc $r7,$r13 | ||
52 | addc $r8,$r13 | ||
53 | addc $r9,$r13 | ||
54 | |||
55 | ;; fold the carry into the checksum, to avoid having to loop the carry | ||
56 | ;; back into the top | ||
57 | |||
58 | addc 0,$r13 | ||
59 | addc 0,$r13 ; do it again, since we might have generated a carry | ||
60 | |||
61 | subq 10*4,$r12 | ||
62 | bge _mloop | ||
63 | nop | ||
64 | |||
65 | addq 10*4,$r12 ; compensate for last loop underflowing length | ||
66 | |||
67 | movem [$sp+],$r8 ; restore regs | ||
68 | |||
69 | _word_loop: | ||
70 | ;; only fold if there is anything to fold. | ||
71 | |||
72 | cmpq 0,$r13 | ||
73 | beq _no_fold | ||
74 | |||
75 | ;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below | ||
76 | ;; r9 can be used as temporary. | ||
77 | |||
78 | move.d $r13,$r9 | ||
79 | lsrq 16,$r9 ; r9 = checksum >> 16 | ||
80 | and.d 0xffff,$r13 ; checksum = checksum & 0xffff | ||
81 | add.d $r9,$r13 ; checksum += r9 | ||
82 | move.d $r13,$r9 ; do the same again, maybe we got a carry last add | ||
83 | lsrq 16,$r9 | ||
84 | and.d 0xffff,$r13 | ||
85 | add.d $r9,$r13 | ||
86 | |||
87 | _no_fold: | ||
88 | cmpq 2,$r12 | ||
89 | blt _no_words | ||
90 | nop | ||
91 | |||
92 | ;; copy and checksum the rest of the words | ||
93 | |||
94 | subq 2,$r12 | ||
95 | |||
96 | 2: ;; A failing userspace access for the read below will have this as PC. | ||
97 | _wloop: move.w [$r10+],$r9 | ||
98 | addu.w $r9,$r13 | ||
99 | subq 2,$r12 | ||
100 | bge _wloop | ||
101 | move.w $r9,[$r11+] | ||
102 | |||
103 | addq 2,$r12 | ||
104 | |||
105 | _no_words: | ||
106 | ;; see if we have one odd byte more | ||
107 | cmpq 1,$r12 | ||
108 | beq _do_byte | ||
109 | nop | ||
110 | ret | ||
111 | move.d $r13,$r10 | ||
112 | |||
113 | _do_byte: | ||
114 | ;; copy and checksum the last byte | ||
115 | 3: ;; A failing userspace access for the read below will have this as PC. | ||
116 | move.b [$r10],$r9 | ||
117 | addu.b $r9,$r13 | ||
118 | move.b $r9,[$r11] | ||
119 | ret | ||
120 | move.d $r13,$r10 | ||
diff --git a/arch/cris/arch-v32/lib/csumcpfruser.S b/arch/cris/arch-v32/lib/csumcpfruser.S new file mode 100644 index 000000000000..600ec16b9f28 --- /dev/null +++ b/arch/cris/arch-v32/lib/csumcpfruser.S | |||
@@ -0,0 +1,69 @@ | |||
1 | /* | ||
2 | * Add-on to transform csum_partial_copy_nocheck in checksumcopy.S into | ||
3 | * csum_partial_copy_from_user by adding exception records. | ||
4 | * | ||
5 | * Copyright (C) 2001, 2003 Axis Communications AB. | ||
6 | * | ||
7 | * Author: Hans-Peter Nilsson. | ||
8 | */ | ||
9 | |||
10 | #include <asm/errno.h> | ||
11 | |||
12 | /* Same function body, but a different name. If we just added exception | ||
13 | records to _csum_partial_copy_nocheck and made it generic, we wouldn't | ||
14 | know a user fault from a kernel fault and we would have overhead in | ||
15 | each kernel caller for the error-pointer argument. | ||
16 | |||
17 | unsigned int csum_partial_copy_from_user | ||
18 | (const char *src, char *dst, int len, unsigned int sum, int *errptr); | ||
19 | |||
20 | Note that the errptr argument is only set if we encounter an error. | ||
21 | It is conveniently located on the stack, so the normal function body | ||
22 | does not have to handle it. */ | ||
23 | |||
24 | #define csum_partial_copy_nocheck csum_partial_copy_from_user | ||
25 | |||
26 | /* There are local labels numbered 1, 2 and 3 present to mark the | ||
27 | different from-user accesses. */ | ||
28 | #include "checksumcopy.S" | ||
29 | |||
30 | .section .fixup,"ax" | ||
31 | |||
32 | ;; Here from the movem loop; restore stack. | ||
33 | 4: | ||
34 | movem [$sp+],$r8 | ||
35 | ;; r12 is already decremented. Add back chunk_size-2. | ||
36 | addq 40-2,$r12 | ||
37 | |||
38 | ;; Here from the word loop; r12 is off by 2; add it back. | ||
39 | 5: | ||
40 | addq 2,$r12 | ||
41 | |||
42 | ;; Here from a failing single byte. | ||
43 | 6: | ||
44 | |||
45 | ;; Signal in *errptr that we had a failing access. | ||
46 | move.d [$sp],$acr | ||
47 | moveq -EFAULT,$r9 | ||
48 | subq 4,$sp | ||
49 | move.d $r9,[$acr] | ||
50 | |||
51 | ;; Clear the rest of the destination area using memset. Preserve the | ||
52 | ;; checksum for the readable bytes. | ||
53 | move.d $r13,[$sp] | ||
54 | subq 4,$sp | ||
55 | move.d $r11,$r10 | ||
56 | move $srp,[$sp] | ||
57 | jsr memset | ||
58 | clear.d $r11 | ||
59 | |||
60 | move [$sp+],$srp | ||
61 | ret | ||
62 | move.d [$sp+],$r10 | ||
63 | |||
64 | .previous | ||
65 | .section __ex_table,"a" | ||
66 | .dword 1b,4b | ||
67 | .dword 2b,5b | ||
68 | .dword 3b,6b | ||
69 | .previous | ||
diff --git a/arch/cris/arch-v32/lib/dram_init.S b/arch/cris/arch-v32/lib/dram_init.S new file mode 100644 index 000000000000..47b6cf5f4afd --- /dev/null +++ b/arch/cris/arch-v32/lib/dram_init.S | |||
@@ -0,0 +1,120 @@ | |||
1 | /* $Id: dram_init.S,v 1.4 2005/04/24 18:48:32 starvik Exp $ | ||
2 | * | ||
3 | * DRAM/SDRAM initialization - alter with care | ||
4 | * This file is intended to be included from other assembler files | ||
5 | * | ||
6 | * Note: This file may not modify r8 or r9 because they are used to | ||
7 | * carry information from the decompresser to the kernel | ||
8 | * | ||
9 | * Copyright (C) 2000-2003 Axis Communications AB | ||
10 | * | ||
11 | * Authors: Mikael Starvik (starvik@axis.com) | ||
12 | */ | ||
13 | |||
14 | /* Just to be certain the config file is included, we include it here | ||
15 | * explicitly instead of depending on it being included in the file that | ||
16 | * uses this code. | ||
17 | */ | ||
18 | |||
19 | #include <linux/config.h> | ||
20 | #include <asm/arch/hwregs/asm/reg_map_asm.h> | ||
21 | #include <asm/arch/hwregs/asm/bif_core_defs_asm.h> | ||
22 | |||
23 | ;; WARNING! The registers r8 and r9 are used as parameters carrying | ||
24 | ;; information from the decompressor (if the kernel was compressed). | ||
25 | ;; They should not be used in the code below. | ||
26 | |||
27 | ; Refer to BIF MDS for a description of SDRAM initialization | ||
28 | |||
29 | ; Bank configuration | ||
30 | move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp0), $r0 | ||
31 | move.d CONFIG_ETRAX_SDRAM_GRP0_CONFIG, $r1 | ||
32 | move.d $r1, [$r0] | ||
33 | move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp1), $r0 | ||
34 | move.d CONFIG_ETRAX_SDRAM_GRP1_CONFIG, $r1 | ||
35 | move.d $r1, [$r0] | ||
36 | |||
37 | ; Calculate value of mrs_data | ||
38 | ; CAS latency = 2 && bus_width = 32 => 0x40 | ||
39 | ; CAS latency = 3 && bus_width = 32 => 0x60 | ||
40 | ; CAS latency = 2 && bus_width = 16 => 0x20 | ||
41 | ; CAS latency = 3 && bus_width = 16 => 0x30 | ||
42 | |||
43 | ; Check if value is already supplied in kernel config | ||
44 | move.d CONFIG_ETRAX_SDRAM_COMMAND, $r2 | ||
45 | bne _set_timing | ||
46 | nop | ||
47 | |||
48 | move.d 0x40, $r4 ; Assume 32 bits and CAS latency = 2 | ||
49 | move.d CONFIG_ETRAX_SDRAM_TIMING, $r1 | ||
50 | and.d 0x07, $r1 ; Get CAS latency | ||
51 | cmpq 2, $r1 ; CL = 2 ? | ||
52 | beq _bw_check | ||
53 | nop | ||
54 | move.d 0x60, $r4 | ||
55 | |||
56 | _bw_check: | ||
57 | ; Assume that group 0 width is equal to group 1. This assumption | ||
58 | ; is wrong for a group 1 only hardware (such as the grand old | ||
59 | ; StorPoint+). | ||
60 | move.d CONFIG_ETRAX_SDRAM_GRP0_CONFIG, $r1 | ||
61 | and.d 0x200, $r1 ; DRAM width is bit 9 | ||
62 | beq _set_timing | ||
63 | lslq 2, $r4 ; mrs_data starts at bit 2 | ||
64 | lsrq 1, $r4 ; 16 bits. Shift down value. | ||
65 | |||
66 | ; Set timing parameters (refresh off to avoid Guinness TR 83) | ||
67 | _set_timing: | ||
68 | move.d CONFIG_ETRAX_SDRAM_TIMING, $r1 | ||
69 | and.d ~(3 << reg_bif_core_rw_sdram_timing___ref___lsb), $r1 | ||
70 | move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_timing), $r0 | ||
71 | move.d $r1, [$r0] | ||
72 | |||
73 | ; Issue NOP command | ||
74 | move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_cmd), $r5 | ||
75 | moveq regk_bif_core_nop, $r1 | ||
76 | move.d $r1, [$r5] | ||
77 | |||
78 | ; Wait 200us | ||
79 | move.d 10000, $r2 | ||
80 | 1: bne 1b | ||
81 | subq 1, $r2 | ||
82 | |||
83 | ; Issue initialization command sequence | ||
84 | move.d _sdram_commands_start, $r2 | ||
85 | and.d 0x000fffff, $r2 ; Make sure commands are read from flash | ||
86 | move.d _sdram_commands_end, $r3 | ||
87 | and.d 0x000fffff, $r3 | ||
88 | 1: clear.d $r6 | ||
89 | move.b [$r2+], $r6 ; Load command | ||
90 | or.d $r4, $r6 ; Add calculated mrs | ||
91 | move.d $r6, [$r5] ; Write rw_sdram_cmd | ||
92 | ; Wait 80 ns between each command | ||
93 | move.d 4000, $r7 | ||
94 | 2: bne 2b | ||
95 | subq 1, $r7 | ||
96 | cmp.d $r2, $r3 ; Last command? | ||
97 | bne 1b | ||
98 | nop | ||
99 | |||
100 | ; Start refresh | ||
101 | move.d CONFIG_ETRAX_SDRAM_TIMING, $r1 | ||
102 | move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_timing), $r0 | ||
103 | move.d $r1, [$r0] | ||
104 | |||
105 | ; Initialization finished | ||
106 | ba _sdram_commands_end | ||
107 | nop | ||
108 | |||
109 | _sdram_commands_start: | ||
110 | .byte regk_bif_core_pre ; Precharge | ||
111 | .byte regk_bif_core_ref ; refresh | ||
112 | .byte regk_bif_core_ref ; refresh | ||
113 | .byte regk_bif_core_ref ; refresh | ||
114 | .byte regk_bif_core_ref ; refresh | ||
115 | .byte regk_bif_core_ref ; refresh | ||
116 | .byte regk_bif_core_ref ; refresh | ||
117 | .byte regk_bif_core_ref ; refresh | ||
118 | .byte regk_bif_core_ref ; refresh | ||
119 | .byte regk_bif_core_mrs ; mrs | ||
120 | _sdram_commands_end: | ||
diff --git a/arch/cris/arch-v32/lib/hw_settings.S b/arch/cris/arch-v32/lib/hw_settings.S new file mode 100644 index 000000000000..5182e8c2cff2 --- /dev/null +++ b/arch/cris/arch-v32/lib/hw_settings.S | |||
@@ -0,0 +1,73 @@ | |||
1 | /* | ||
2 | * $Id: hw_settings.S,v 1.3 2005/04/24 18:36:57 starvik Exp $ | ||
3 | * | ||
4 | * This table is used by some tools to extract hardware parameters. | ||
5 | * The table should be included in the kernel and the decompressor. | ||
6 | * Don't forget to update the tools if you change this table. | ||
7 | * | ||
8 | * Copyright (C) 2001 Axis Communications AB | ||
9 | * | ||
10 | * Authors: Mikael Starvik (starvik@axis.com) | ||
11 | */ | ||
12 | |||
13 | #include <linux/config.h> | ||
14 | #include <asm/arch/hwregs/asm/reg_map_asm.h> | ||
15 | #include <asm/arch/hwregs/asm/bif_core_defs_asm.h> | ||
16 | #include <asm/arch/hwregs/asm/gio_defs_asm.h> | ||
17 | |||
18 | .ascii "HW_PARAM_MAGIC" ; Magic number | ||
19 | .dword 0xc0004000 ; Kernel start address | ||
20 | |||
21 | ; Debug port | ||
22 | #ifdef CONFIG_ETRAX_DEBUG_PORT0 | ||
23 | .dword 0 | ||
24 | #elif defined(CONFIG_ETRAX_DEBUG_PORT1) | ||
25 | .dword 1 | ||
26 | #elif defined(CONFIG_ETRAX_DEBUG_PORT2) | ||
27 | .dword 2 | ||
28 | #elif defined(CONFIG_ETRAX_DEBUG_PORT3) | ||
29 | .dword 3 | ||
30 | #else | ||
31 | .dword 4 ; No debug | ||
32 | #endif | ||
33 | |||
34 | ; Register values | ||
35 | .dword REG_ADDR(bif_core, regi_bif_core, rw_grp1_cfg) | ||
36 | .dword CONFIG_ETRAX_MEM_GRP1_CONFIG | ||
37 | .dword REG_ADDR(bif_core, regi_bif_core, rw_grp2_cfg) | ||
38 | .dword CONFIG_ETRAX_MEM_GRP2_CONFIG | ||
39 | .dword REG_ADDR(bif_core, regi_bif_core, rw_grp3_cfg) | ||
40 | .dword CONFIG_ETRAX_MEM_GRP3_CONFIG | ||
41 | .dword REG_ADDR(bif_core, regi_bif_core, rw_grp4_cfg) | ||
42 | .dword CONFIG_ETRAX_MEM_GRP4_CONFIG | ||
43 | .dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp0) | ||
44 | .dword CONFIG_ETRAX_SDRAM_GRP0_CONFIG | ||
45 | .dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp1) | ||
46 | .dword CONFIG_ETRAX_SDRAM_GRP1_CONFIG | ||
47 | .dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_timing) | ||
48 | .dword CONFIG_ETRAX_SDRAM_TIMING | ||
49 | .dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_cmd) | ||
50 | .dword CONFIG_ETRAX_SDRAM_COMMAND | ||
51 | |||
52 | .dword REG_ADDR(gio, regi_gio, rw_pa_dout) | ||
53 | .dword CONFIG_ETRAX_DEF_GIO_PA_OUT | ||
54 | .dword REG_ADDR(gio, regi_gio, rw_pa_oe) | ||
55 | .dword CONFIG_ETRAX_DEF_GIO_PA_OE | ||
56 | .dword REG_ADDR(gio, regi_gio, rw_pb_dout) | ||
57 | .dword CONFIG_ETRAX_DEF_GIO_PB_OUT | ||
58 | .dword REG_ADDR(gio, regi_gio, rw_pb_oe) | ||
59 | .dword CONFIG_ETRAX_DEF_GIO_PB_OE | ||
60 | .dword REG_ADDR(gio, regi_gio, rw_pc_dout) | ||
61 | .dword CONFIG_ETRAX_DEF_GIO_PC_OUT | ||
62 | .dword REG_ADDR(gio, regi_gio, rw_pc_oe) | ||
63 | .dword CONFIG_ETRAX_DEF_GIO_PC_OE | ||
64 | .dword REG_ADDR(gio, regi_gio, rw_pd_dout) | ||
65 | .dword CONFIG_ETRAX_DEF_GIO_PD_OUT | ||
66 | .dword REG_ADDR(gio, regi_gio, rw_pd_oe) | ||
67 | .dword CONFIG_ETRAX_DEF_GIO_PD_OE | ||
68 | .dword REG_ADDR(gio, regi_gio, rw_pe_dout) | ||
69 | .dword CONFIG_ETRAX_DEF_GIO_PE_OUT | ||
70 | .dword REG_ADDR(gio, regi_gio, rw_pe_oe) | ||
71 | .dword CONFIG_ETRAX_DEF_GIO_PE_OE | ||
72 | |||
73 | .dword 0 ; No more register values | ||
diff --git a/arch/cris/arch-v32/lib/memset.c b/arch/cris/arch-v32/lib/memset.c new file mode 100644 index 000000000000..ffca1214674e --- /dev/null +++ b/arch/cris/arch-v32/lib/memset.c | |||
@@ -0,0 +1,253 @@ | |||
1 | /*#************************************************************************#*/ | ||
2 | /*#-------------------------------------------------------------------------*/ | ||
3 | /*# */ | ||
4 | /*# FUNCTION NAME: memset() */ | ||
5 | /*# */ | ||
6 | /*# PARAMETERS: void* dst; Destination address. */ | ||
7 | /*# int c; Value of byte to write. */ | ||
8 | /*# int len; Number of bytes to write. */ | ||
9 | /*# */ | ||
10 | /*# RETURNS: dst. */ | ||
11 | /*# */ | ||
12 | /*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */ | ||
13 | /*# Framework taken from memcpy. This routine is */ | ||
14 | /*# very sensitive to compiler changes in register allocation. */ | ||
15 | /*# Should really be rewritten to avoid this problem. */ | ||
16 | /*# */ | ||
17 | /*#-------------------------------------------------------------------------*/ | ||
18 | /*# */ | ||
19 | /*# HISTORY */ | ||
20 | /*# */ | ||
21 | /*# DATE NAME CHANGES */ | ||
22 | /*# ---- ---- ------- */ | ||
23 | /*# 990713 HP Tired of watching this function (or */ | ||
24 | /*# really, the nonoptimized generic */ | ||
25 | /*# implementation) take up 90% of simulator */ | ||
26 | /*# output. Measurements needed. */ | ||
27 | /*# */ | ||
28 | /*#-------------------------------------------------------------------------*/ | ||
29 | |||
30 | #include <linux/types.h> | ||
31 | |||
32 | /* No, there's no macro saying 12*4, since it is "hard" to get it into | ||
33 | the asm in a good way. Thus better to expose the problem everywhere. | ||
34 | */ | ||
35 | |||
36 | /* Assuming 1 cycle per dword written or read (ok, not really true), and | ||
37 | one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1) | ||
38 | so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */ | ||
39 | |||
40 | #define ZERO_BLOCK_SIZE (1*12*4) | ||
41 | |||
42 | void *memset(void *pdst, | ||
43 | int c, | ||
44 | size_t plen) | ||
45 | { | ||
46 | /* Fill plen bytes at pdst with the byte value c; pdst is returned. | ||
47 | The locals are bound to fixed registers; the movem asm further down verifies that n and lc really ended up in r12 and r11. */ | ||
48 | |||
49 | register char *return_dst __asm__ ("r10") = pdst; | ||
50 | register int n __asm__ ("r12") = plen; | ||
51 | register int lc __asm__ ("r11") = c; | ||
52 | |||
53 | /* Most apps use memset sanely. Only those memsetting about 3..4 | ||
54 | bytes or less get penalized compared to the generic implementation | ||
55 | - and that's not really sane use. */ | ||
56 | |||
57 | /* Replicate the low byte of lc into all four bytes of lc, using r13 as scratch. | ||
58 | Ugh. This is fragile at best. Check with newer GCC releases, if they compile cascaded "x |= x << 8" sanely! */ | ||
59 | __asm__("movu.b %0,$r13 \n\ | ||
60 | lslq 8,$r13 \n\ | ||
61 | move.b %0,$r13 \n\ | ||
62 | move.d $r13,%0 \n\ | ||
63 | lslq 16,$r13 \n\ | ||
64 | or.d $r13,%0" | ||
65 | : "=r" (lc) : "0" (lc) : "r13"); | ||
66 | |||
67 | { | ||
68 | register char *dst __asm__ ("r13") = pdst; | ||
69 | |||
70 | /* This is NONPORTABLE, but since this whole routine is */ | ||
71 | /* grossly nonportable that doesn't matter. */ | ||
72 | |||
73 | if (((unsigned long) pdst & 3) != 0 | ||
74 | /* Oops! n=0 must be a legal call, regardless of alignment. Requiring n >= 3 keeps the at most 1+2 alignment bytes stored below inside the buffer. */ | ||
75 | && n >= 3) | ||
76 | { | ||
77 | if ((unsigned long)dst & 1) | ||
78 | { | ||
79 | *dst = (char) lc; | ||
80 | n--; | ||
81 | dst++; | ||
82 | } | ||
83 | |||
84 | if ((unsigned long)dst & 2) | ||
85 | { | ||
86 | *(short *)dst = lc; | ||
87 | n -= 2; | ||
88 | dst += 2; | ||
89 | } | ||
90 | } | ||
91 | |||
92 | /* Now the fun part. For the threshold value of this, check the equation | ||
93 | above (before the ZERO_BLOCK_SIZE definition). */ | ||
94 | /* Decide which fill method to use. */ | ||
95 | if (n >= ZERO_BLOCK_SIZE) | ||
96 | { | ||
97 | /* For large fills we use 'movem' (12 dwords = 48 bytes per store) */ | ||
98 | |||
99 | /* It is not optimal to tell the compiler about clobbering any | ||
100 | registers; that will move the saving/restoring of those registers | ||
101 | to the function prologue/epilogue, and make non-movem sizes | ||
102 | suboptimal. | ||
103 | |||
104 | This method is not foolproof; it assumes that the "asm reg" | ||
105 | declarations at the beginning of the function really are used | ||
106 | here (beware: they may be moved to temporary registers). | ||
107 | This way, we do not have to save/move the registers around into | ||
108 | temporaries; we can safely use them straight away. | ||
109 | |||
110 | The .ifnc directive at the top of the asm makes the assembler | ||
111 | report an error unless dst, n and lc were allocated to | ||
112 | r13, r12 and r11 respectively. */ | ||
113 | __asm__ volatile (" \n\ | ||
114 | ;; Check that the register asm declaration got right. \n\ | ||
115 | ;; The GCC manual says it will work, but there *has* been bugs. \n\ | ||
116 | .ifnc %0-%1-%4,$r13-$r12-$r11 \n\ | ||
117 | .err \n\ | ||
118 | .endif \n\ | ||
119 | \n\ | ||
120 | ;; Save the registers we'll clobber in the movem process \n\ | ||
121 | ;; on the stack. Don't mention them to gcc, it will only be \n\ | ||
122 | ;; upset. \n\ | ||
123 | subq 11*4,$sp \n\ | ||
124 | movem $r10,[$sp] \n\ | ||
125 | \n\ | ||
126 | move.d $r11,$r0 \n\ | ||
127 | move.d $r11,$r1 \n\ | ||
128 | move.d $r11,$r2 \n\ | ||
129 | move.d $r11,$r3 \n\ | ||
130 | move.d $r11,$r4 \n\ | ||
131 | move.d $r11,$r5 \n\ | ||
132 | move.d $r11,$r6 \n\ | ||
133 | move.d $r11,$r7 \n\ | ||
134 | move.d $r11,$r8 \n\ | ||
135 | move.d $r11,$r9 \n\ | ||
136 | move.d $r11,$r10 \n\ | ||
137 | \n\ | ||
138 | ;; Now we've got this: \n\ | ||
139 | ;; r13 - dst \n\ | ||
140 | ;; r12 - n \n\ | ||
141 | \n\ | ||
142 | ;; Update n for the first loop \n\ | ||
143 | subq 12*4,$r12 \n\ | ||
144 | 0: \n\ | ||
145 | subq 12*4,$r12 \n\ | ||
146 | bge 0b \n\ | ||
147 | movem $r11,[$r13+] \n\ | ||
148 | \n\ | ||
149 | addq 12*4,$r12 ;; compensate for last loop underflowing n \n\ | ||
150 | \n\ | ||
151 | ;; Restore registers from stack \n\ | ||
152 | movem [$sp+],$r10" | ||
153 | |||
154 | /* Outputs */ : "=r" (dst), "=r" (n) | ||
155 | /* Inputs */ : "0" (dst), "1" (n), "r" (lc)); | ||
156 | } | ||
157 | |||
158 | /* Either we directly start filling, using dword stores | ||
159 | in a loop, or we fill as much as possible with 'movem' | ||
160 | and then the last block (<48 bytes) is filled here. | ||
161 | This will work since the 'movem' asm will have updated dst and n. NOTE(review): the '*((long*)dst)++' stores below use a cast as an lvalue, which is a GCC extension and not ISO C. */ | ||
162 | |||
163 | while ( n >= 16 ) | ||
164 | { | ||
165 | *((long*)dst)++ = lc; | ||
166 | *((long*)dst)++ = lc; | ||
167 | *((long*)dst)++ = lc; | ||
168 | *((long*)dst)++ = lc; | ||
169 | n -= 16; | ||
170 | } | ||
171 | |||
172 | /* A switch() is definitely the fastest although it takes a LOT of code. | ||
173 | * Particularly if you inline code this. Each case stores the remaining 0..15 bytes. | ||
174 | */ | ||
175 | switch (n) | ||
176 | { | ||
177 | case 0: | ||
178 | break; | ||
179 | case 1: | ||
180 | *(char*)dst = (char) lc; | ||
181 | break; | ||
182 | case 2: | ||
183 | *(short*)dst = (short) lc; | ||
184 | break; | ||
185 | case 3: | ||
186 | *((short*)dst)++ = (short) lc; | ||
187 | *(char*)dst = (char) lc; | ||
188 | break; | ||
189 | case 4: | ||
190 | *((long*)dst)++ = lc; | ||
191 | break; | ||
192 | case 5: | ||
193 | *((long*)dst)++ = lc; | ||
194 | *(char*)dst = (char) lc; | ||
195 | break; | ||
196 | case 6: | ||
197 | *((long*)dst)++ = lc; | ||
198 | *(short*)dst = (short) lc; | ||
199 | break; | ||
200 | case 7: | ||
201 | *((long*)dst)++ = lc; | ||
202 | *((short*)dst)++ = (short) lc; | ||
203 | *(char*)dst = (char) lc; | ||
204 | break; | ||
205 | case 8: | ||
206 | *((long*)dst)++ = lc; | ||
207 | *((long*)dst)++ = lc; | ||
208 | break; | ||
209 | case 9: | ||
210 | *((long*)dst)++ = lc; | ||
211 | *((long*)dst)++ = lc; | ||
212 | *(char*)dst = (char) lc; | ||
213 | break; | ||
214 | case 10: | ||
215 | *((long*)dst)++ = lc; | ||
216 | *((long*)dst)++ = lc; | ||
217 | *(short*)dst = (short) lc; | ||
218 | break; | ||
219 | case 11: | ||
220 | *((long*)dst)++ = lc; | ||
221 | *((long*)dst)++ = lc; | ||
222 | *((short*)dst)++ = (short) lc; | ||
223 | *(char*)dst = (char) lc; | ||
224 | break; | ||
225 | case 12: | ||
226 | *((long*)dst)++ = lc; | ||
227 | *((long*)dst)++ = lc; | ||
228 | *((long*)dst)++ = lc; | ||
229 | break; | ||
230 | case 13: | ||
231 | *((long*)dst)++ = lc; | ||
232 | *((long*)dst)++ = lc; | ||
233 | *((long*)dst)++ = lc; | ||
234 | *(char*)dst = (char) lc; | ||
235 | break; | ||
236 | case 14: | ||
237 | *((long*)dst)++ = lc; | ||
238 | *((long*)dst)++ = lc; | ||
239 | *((long*)dst)++ = lc; | ||
240 | *(short*)dst = (short) lc; | ||
241 | break; | ||
242 | case 15: | ||
243 | *((long*)dst)++ = lc; | ||
244 | *((long*)dst)++ = lc; | ||
245 | *((long*)dst)++ = lc; | ||
246 | *((short*)dst)++ = (short) lc; | ||
247 | *(char*)dst = (char) lc; | ||
248 | break; | ||
249 | } | ||
250 | } | ||
251 | |||
252 | return return_dst; /* the original pdst, as bound to r10 at function entry. */ | ||
253 | } /* memset() */ | ||
diff --git a/arch/cris/arch-v32/lib/nand_init.S b/arch/cris/arch-v32/lib/nand_init.S new file mode 100644 index 000000000000..aba5c751c282 --- /dev/null +++ b/arch/cris/arch-v32/lib/nand_init.S | |||
@@ -0,0 +1,179 @@ | |||
1 | ##============================================================================= | ||
2 | ## | ||
3 | ## nand_init.S | ||
4 | ## | ||
5 | ## The bootrom copies data from the NAND flash to the internal RAM but | ||
6 | ## due to a bug/feature we can only trust the 256 first bytes. So this | ||
7 | ## code copies more data from NAND flash to internal RAM. Obviously this | ||
8 | ## code must fit in the first 256 bytes so alter with care. | ||
9 | ## | ||
10 | ## Some notes about the bug/feature for future reference: | ||
11 | ## The bootrom copies the first 127 KB from NAND flash to internal | ||
12 | ## memory. The problem is that it does a bytewise copy. NAND flashes | ||
13 | ## do autoincrement on the address so for a 16-bit device each | ||
14 | ## read/write increases the address by two. So the copy loop in the | ||
15 | ## bootrom will discard every second byte. This is solved by inserting | ||
16 | ## zeroes in every second byte in the first erase block. | ||
17 | ## | ||
18 | ## The bootrom also incorrectly assumes that it can read the flash | ||
19 | ## linearly with only one read command but the flash will actually | ||
20 | ## switch between normal area and spare area if you do that so we | ||
21 | ## can't trust more than the first 256 bytes. | ||
22 | ## | ||
23 | ##============================================================================= | ||
24 | |||
25 | #include <asm/arch/hwregs/asm/reg_map_asm.h> | ||
26 | #include <asm/arch/hwregs/asm/gio_defs_asm.h> | ||
27 | #include <asm/arch/hwregs/asm/pinmux_defs_asm.h> | ||
28 | #include <asm/arch/hwregs/asm/bif_core_defs_asm.h> | ||
29 | #include <asm/arch/hwregs/asm/config_defs_asm.h> | ||
30 | #include <linux/config.h> | ||
31 | |||
32 | ;; There are 8-bit NAND flashes and 16-bit NAND flashes. | ||
33 | ;; We need to treat them slightly differently. | ||
34 | #if CONFIG_ETRAX_FLASH_BUSWIDTH==2 | ||
35 | #define PAGE_SIZE 256 | ||
36 | #else | ||
37 | #error 2 | ||
38 | #define PAGE_SIZE 512 | ||
39 | #endif | ||
40 | #define ERASE_BLOCK 16384 | ||
41 | |||
42 | ;; GPIO pins connected to NAND flash | ||
43 | #define CE 4 | ||
44 | #define CLE 5 | ||
45 | #define ALE 6 | ||
46 | #define BY 7 | ||
47 | |||
48 | ;; Address space for NAND flash | ||
49 | #define NAND_RD_ADDR 0x90000000 | ||
50 | #define NAND_WR_ADDR 0x94000000 | ||
51 | |||
52 | #define READ_CMD 0x00 | ||
53 | |||
54 | ;; Readability macros | ||
55 | #define CSP_MASK \ | ||
56 | REG_MASK(bif_core, rw_grp3_cfg, gated_csp0) | \ | ||
57 | REG_MASK(bif_core, rw_grp3_cfg, gated_csp1) | ||
58 | #define CSP_VAL \ | ||
59 | REG_STATE(bif_core, rw_grp3_cfg, gated_csp0, rd) | \ | ||
60 | REG_STATE(bif_core, rw_grp3_cfg, gated_csp1, wr) | ||
61 | |||
62 | ;;---------------------------------------------------------------------------- | ||
63 | ;; Macros to set/clear GPIO bits | ||
64 | |||
65 | .macro SET x | ||
66 | or.b (1<<\x),$r9 | ||
67 | move.d $r9, [$r2] | ||
68 | .endm | ||
69 | |||
70 | .macro CLR x | ||
71 | and.b ~(1<<\x),$r9 | ||
72 | move.d $r9, [$r2] | ||
73 | .endm | ||
74 | |||
75 | ;;---------------------------------------------------------------------------- | ||
76 | |||
77 | nand_boot: | ||
78 | ;; Check if nand boot was selected | ||
79 | move.d REG_ADDR(config, regi_config, r_bootsel), $r0 | ||
80 | move.d [$r0], $r0 | ||
81 | and.d REG_MASK(config, r_bootsel, boot_mode), $r0 | ||
82 | cmp.d REG_STATE(config, r_bootsel, boot_mode, nand), $r0 | ||
83 | bne normal_boot ; No NAND boot | ||
84 | nop | ||
85 | |||
86 | copy_nand_to_ram: | ||
87 | ;; copy_nand_to_ram | ||
88 | ;; Arguments | ||
89 | ;; r10 - destination | ||
90 | ;; r11 - source offset | ||
91 | ;; r12 - size | ||
92 | ;; r13 - Address to jump to after completion | ||
93 | ;; Note : r10-r12 are clobbered on return | ||
94 | ;; Registers used: | ||
95 | ;; r0 - NAND_RD_ADDR | ||
96 | ;; r1 - NAND_WR_ADDR | ||
97 | ;; r2 - reg_gio_rw_pa_dout | ||
98 | ;; r3 - reg_gio_r_pa_din | ||
99 | ;; r4 - tmp | ||
100 | ;; r5 - byte counter within a page | ||
101 | ;; r6 - reg_pinmux_rw_pa | ||
102 | ;; r7 - reg_gio_rw_pa_oe | ||
103 | ;; r8 - reg_bif_core_rw_grp3_cfg | ||
104 | ;; r9 - reg_gio_rw_pa_dout shadow | ||
105 | move.d 0x90000000, $r0 | ||
106 | move.d 0x94000000, $r1 | ||
107 | move.d REG_ADDR(gio, regi_gio, rw_pa_dout), $r2 | ||
108 | move.d REG_ADDR(gio, regi_gio, r_pa_din), $r3 | ||
109 | move.d REG_ADDR(pinmux, regi_pinmux, rw_pa), $r6 | ||
110 | move.d REG_ADDR(gio, regi_gio, rw_pa_oe), $r7 | ||
111 | move.d REG_ADDR(bif_core, regi_bif_core, rw_grp3_cfg), $r8 | ||
112 | |||
113 | #if CONFIG_ETRAX_FLASH_BUSWIDTH==2 | ||
114 | lsrq 1, $r11 | ||
115 | #endif | ||
116 | ;; Set up GPIO | ||
117 | move.d [$r2], $r9 | ||
118 | move.d [$r7], $r4 | ||
119 | or.b (1<<ALE) | (1 << CLE) | (1<<CE), $r4 | ||
120 | move.d $r4, [$r7] | ||
121 | |||
122 | ;; Set up bif | ||
123 | move.d [$r8], $r4 | ||
124 | and.d CSP_MASK, $r4 | ||
125 | or.d CSP_VAL, $r4 | ||
126 | move.d $r4, [$r8] | ||
127 | |||
128 | 1: ;; Copy one page | ||
129 | CLR CE | ||
130 | SET CLE | ||
131 | moveq READ_CMD, $r4 | ||
132 | move.b $r4, [$r1] | ||
133 | moveq 20, $r4 | ||
134 | 2: bne 2b | ||
135 | subq 1, $r4 | ||
136 | CLR CLE | ||
137 | SET ALE | ||
138 | clear.w [$r1] ; Column address = 0 | ||
139 | move.d $r11, $r4 | ||
140 | lsrq 8, $r4 | ||
141 | move.b $r4, [$r1] ; Row address | ||
142 | lsrq 8, $r4 | ||
143 | move.b $r4, [$r1] ; Row address | ||
144 | moveq 20, $r4 | ||
145 | 2: bne 2b | ||
146 | subq 1, $r4 | ||
147 | CLR ALE | ||
148 | 2: move.d [$r3], $r4 | ||
149 | and.d 1 << BY, $r4 | ||
150 | beq 2b | ||
151 | movu.w PAGE_SIZE, $r5 | ||
152 | 2: ; Copy one byte/word | ||
153 | #if CONFIG_ETRAX_FLASH_BUSWIDTH==2 | ||
154 | move.w [$r0], $r4 | ||
155 | #else | ||
156 | move.b [$r0], $r4 | ||
157 | #endif | ||
158 | subq 1, $r5 | ||
159 | bne 2b | ||
160 | #if CONFIG_ETRAX_FLASH_BUSWIDTH==2 | ||
161 | move.w $r4, [$r10+] | ||
162 | subu.w PAGE_SIZE*2, $r12 | ||
163 | #else | ||
164 | move.b $r4, [$r10+] | ||
165 | subu.w PAGE_SIZE, $r12 | ||
166 | #endif | ||
167 | bpl 1b | ||
168 | addu.w PAGE_SIZE, $r11 | ||
169 | |||
170 | ;; End of copy | ||
171 | jump $r13 | ||
172 | nop | ||
173 | |||
174 | ;; This will warn if the code above is too large. If you consider | ||
175 | ;; removing this you don't understand the bug/feature. | ||
176 | .org 256 | ||
177 | .org ERASE_BLOCK | ||
178 | |||
179 | normal_boot: | ||
diff --git a/arch/cris/arch-v32/lib/spinlock.S b/arch/cris/arch-v32/lib/spinlock.S new file mode 100644 index 000000000000..2437ae7f6ed2 --- /dev/null +++ b/arch/cris/arch-v32/lib/spinlock.S | |||
@@ -0,0 +1,33 @@ | |||
1 | ;; Core of the spinlock implementation | ||
2 | ;; | ||
3 | ;; Copyright (C) 2004 Axis Communications AB. | ||
4 | ;; | ||
5 | ;; Author: Mikael Starvik | ||
6 | |||
7 | |||
8 | .global cris_spin_lock | ||
9 | .global cris_spin_trylock | ||
10 | |||
11 | .text | ||
12 | |||
13 | cris_spin_lock: | ||
14 | clearf p | ||
15 | 1: test.d [$r10] | ||
16 | beq 1b | ||
17 | clearf p | ||
18 | ax | ||
19 | clear.d [$r10] | ||
20 | bcs 1b | ||
21 | clearf p | ||
22 | ret | ||
23 | nop | ||
24 | |||
25 | cris_spin_trylock: | ||
26 | clearf p | ||
27 | 1: move.d [$r10], $r11 | ||
28 | ax | ||
29 | clear.d [$r10] | ||
30 | bcs 1b | ||
31 | clearf p | ||
32 | ret | ||
33 | move.d $r11,$r10 | ||
diff --git a/arch/cris/arch-v32/lib/string.c b/arch/cris/arch-v32/lib/string.c new file mode 100644 index 000000000000..98e282ac824a --- /dev/null +++ b/arch/cris/arch-v32/lib/string.c | |||
@@ -0,0 +1,219 @@ | |||
1 | /*#************************************************************************#*/ | ||
2 | /*#-------------------------------------------------------------------------*/ | ||
3 | /*# */ | ||
4 | /*# FUNCTION NAME: memcpy() */ | ||
5 | /*# */ | ||
6 | /*# PARAMETERS: void* dst; Destination address. */ | ||
7 | /*# void* src; Source address. */ | ||
8 | /*# int len; Number of bytes to copy. */ | ||
9 | /*# */ | ||
10 | /*# RETURNS: dst. */ | ||
11 | /*# */ | ||
12 | /*# DESCRIPTION: Copies len bytes of memory from src to dst. No guarantees */ | ||
13 | /*# about copying of overlapping memory areas. This routine is */ | ||
14 | /*# very sensitive to compiler changes in register allocation. */ | ||
15 | /*# Should really be rewritten to avoid this problem. */ | ||
16 | /*# */ | ||
17 | /*#-------------------------------------------------------------------------*/ | ||
18 | /*# */ | ||
19 | /*# HISTORY */ | ||
20 | /*# */ | ||
21 | /*# DATE NAME CHANGES */ | ||
22 | /*# ---- ---- ------- */ | ||
23 | /*# 941007 Kenny R Creation */ | ||
24 | /*# 941011 Kenny R Lots of optimizations and inlining. */ | ||
25 | /*# 941129 Ulf A Adapted for use in libc. */ | ||
26 | /*# 950216 HP N==0 forgotten if non-aligned src/dst. */ | ||
27 | /*# Added some optimizations. */ | ||
28 | /*# 001025 HP Make src and dst char *. Align dst to */ | ||
29 | /*# dword, not just word-if-both-src-and-dst- */ | ||
30 | /*# are-misaligned. */ | ||
31 | /*# */ | ||
32 | /*#-------------------------------------------------------------------------*/ | ||
33 | |||
34 | #include <linux/types.h> | ||
35 | |||
36 | void *memcpy(void *pdst, | ||
37 | const void *psrc, | ||
38 | size_t pn) | ||
39 | { | ||
40 | /* Ok. Now we want the parameters put in special registers. | ||
41 | Make sure the compiler is able to make something useful of this. | ||
42 | As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). | ||
43 | |||
44 | If gcc was all right, it really would need no temporaries, and no | ||
45 | stack space to save stuff on. */ | ||
46 | |||
47 | register void *return_dst __asm__ ("r10") = pdst; | ||
48 | register char *dst __asm__ ("r13") = pdst; | ||
49 | register const char *src __asm__ ("r11") = psrc; | ||
50 | register int n __asm__ ("r12") = pn; | ||
51 | |||
52 | |||
53 | /* When src is aligned but not dst, this makes a few extra needless | ||
54 | cycles. I believe it would take as many to check that the | ||
55 | re-alignment was unnecessary. */ | ||
56 | if (((unsigned long) dst & 3) != 0 | ||
57 | /* Don't align if we wouldn't copy more than a few bytes; so we | ||
58 | don't have to check further for overflows. */ | ||
59 | && n >= 3) | ||
60 | { | ||
61 | if ((unsigned long) dst & 1) | ||
62 | { | ||
63 | n--; | ||
64 | *(char*)dst = *(char*)src; | ||
65 | src++; | ||
66 | dst++; | ||
67 | } | ||
68 | |||
69 | if ((unsigned long) dst & 2) | ||
70 | { | ||
71 | n -= 2; | ||
72 | *(short*)dst = *(short*)src; | ||
73 | src += 2; | ||
74 | dst += 2; | ||
75 | } | ||
76 | } | ||
77 | |||
78 | /* Decide which copying method to use. Movem is dirt cheap, so the | ||
79 | overhead is low enough to always use the minimum block size as the | ||
80 | threshold. */ | ||
81 | if (n >= 44) | ||
82 | { | ||
83 | /* For large copies we use 'movem' */ | ||
84 | |||
85 | /* It is not optimal to tell the compiler about clobbering any | ||
86 | registers; that will move the saving/restoring of those registers | ||
87 | to the function prologue/epilogue, and make non-movem sizes | ||
88 | suboptimal. */ | ||
89 | __asm__ volatile (" \n\ | ||
90 | ;; Check that the register asm declaration got right. \n\ | ||
91 | ;; The GCC manual explicitly says TRT will happen. \n\ | ||
92 | .ifnc %0-%1-%2,$r13-$r11-$r12 \n\ | ||
93 | .err \n\ | ||
94 | .endif \n\ | ||
95 | \n\ | ||
96 | ;; Save the registers we'll use in the movem process \n\ | ||
97 | \n\ | ||
98 | ;; on the stack. \n\ | ||
99 | subq 11*4,$sp \n\ | ||
100 | movem $r10,[$sp] \n\ | ||
101 | \n\ | ||
102 | ;; Now we've got this: \n\ | ||
103 | ;; r11 - src \n\ | ||
104 | ;; r13 - dst \n\ | ||
105 | ;; r12 - n \n\ | ||
106 | \n\ | ||
107 | ;; Update n for the first loop \n\ | ||
108 | subq 44,$r12 \n\ | ||
109 | 0: \n\ | ||
110 | movem [$r11+],$r10 \n\ | ||
111 | subq 44,$r12 \n\ | ||
112 | bge 0b \n\ | ||
113 | movem $r10,[$r13+] \n\ | ||
114 | \n\ | ||
115 | addq 44,$r12 ;; compensate for last loop underflowing n \n\ | ||
116 | \n\ | ||
117 | ;; Restore registers from stack \n\ | ||
118 | movem [$sp+],$r10" | ||
119 | |||
120 | /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n) | ||
121 | /* Inputs */ : "0" (dst), "1" (src), "2" (n)); | ||
122 | |||
123 | } | ||
124 | |||
125 | /* Either we directly start copying, using dword copying | ||
126 | in a loop, or we copy as much as possible with 'movem' | ||
127 | and then the last block (<44 bytes) is copied here. | ||
128 | This will work since 'movem' will have updated src,dst,n. */ | ||
129 | |||
130 | while ( n >= 16 ) | ||
131 | { | ||
132 | *((long*)dst)++ = *((long*)src)++; | ||
133 | *((long*)dst)++ = *((long*)src)++; | ||
134 | *((long*)dst)++ = *((long*)src)++; | ||
135 | *((long*)dst)++ = *((long*)src)++; | ||
136 | n -= 16; | ||
137 | } | ||
138 | |||
139 | /* A switch() is definitely the fastest although it takes a LOT of code. | ||
140 | * Particularly if you inline this code. | ||
141 | */ | ||
142 | switch (n) | ||
143 | { | ||
144 | case 0: | ||
145 | break; | ||
146 | case 1: | ||
147 | *(char*)dst = *(char*)src; | ||
148 | break; | ||
149 | case 2: | ||
150 | *(short*)dst = *(short*)src; | ||
151 | break; | ||
152 | case 3: | ||
153 | *((short*)dst)++ = *((short*)src)++; | ||
154 | *(char*)dst = *(char*)src; | ||
155 | break; | ||
156 | case 4: | ||
157 | *((long*)dst)++ = *((long*)src)++; | ||
158 | break; | ||
159 | case 5: | ||
160 | *((long*)dst)++ = *((long*)src)++; | ||
161 | *(char*)dst = *(char*)src; | ||
162 | break; | ||
163 | case 6: | ||
164 | *((long*)dst)++ = *((long*)src)++; | ||
165 | *(short*)dst = *(short*)src; | ||
166 | break; | ||
167 | case 7: | ||
168 | *((long*)dst)++ = *((long*)src)++; | ||
169 | *((short*)dst)++ = *((short*)src)++; | ||
170 | *(char*)dst = *(char*)src; | ||
171 | break; | ||
172 | case 8: | ||
173 | *((long*)dst)++ = *((long*)src)++; | ||
174 | *((long*)dst)++ = *((long*)src)++; | ||
175 | break; | ||
176 | case 9: | ||
177 | *((long*)dst)++ = *((long*)src)++; | ||
178 | *((long*)dst)++ = *((long*)src)++; | ||
179 | *(char*)dst = *(char*)src; | ||
180 | break; | ||
181 | case 10: | ||
182 | *((long*)dst)++ = *((long*)src)++; | ||
183 | *((long*)dst)++ = *((long*)src)++; | ||
184 | *(short*)dst = *(short*)src; | ||
185 | break; | ||
186 | case 11: | ||
187 | *((long*)dst)++ = *((long*)src)++; | ||
188 | *((long*)dst)++ = *((long*)src)++; | ||
189 | *((short*)dst)++ = *((short*)src)++; | ||
190 | *(char*)dst = *(char*)src; | ||
191 | break; | ||
192 | case 12: | ||
193 | *((long*)dst)++ = *((long*)src)++; | ||
194 | *((long*)dst)++ = *((long*)src)++; | ||
195 | *((long*)dst)++ = *((long*)src)++; | ||
196 | break; | ||
197 | case 13: | ||
198 | *((long*)dst)++ = *((long*)src)++; | ||
199 | *((long*)dst)++ = *((long*)src)++; | ||
200 | *((long*)dst)++ = *((long*)src)++; | ||
201 | *(char*)dst = *(char*)src; | ||
202 | break; | ||
203 | case 14: | ||
204 | *((long*)dst)++ = *((long*)src)++; | ||
205 | *((long*)dst)++ = *((long*)src)++; | ||
206 | *((long*)dst)++ = *((long*)src)++; | ||
207 | *(short*)dst = *(short*)src; | ||
208 | break; | ||
209 | case 15: | ||
210 | *((long*)dst)++ = *((long*)src)++; | ||
211 | *((long*)dst)++ = *((long*)src)++; | ||
212 | *((long*)dst)++ = *((long*)src)++; | ||
213 | *((short*)dst)++ = *((short*)src)++; | ||
214 | *(char*)dst = *(char*)src; | ||
215 | break; | ||
216 | } | ||
217 | |||
218 | return return_dst; /* destination pointer. */ | ||
219 | } /* memcpy() */ | ||
diff --git a/arch/cris/arch-v32/lib/usercopy.c b/arch/cris/arch-v32/lib/usercopy.c new file mode 100644 index 000000000000..f0b08460c1be --- /dev/null +++ b/arch/cris/arch-v32/lib/usercopy.c | |||
@@ -0,0 +1,470 @@ | |||
1 | /* | ||
2 | * User address space access functions. | ||
3 | * The non-inlined parts of asm-cris/uaccess.h are here. | ||
4 | * | ||
5 | * Copyright (C) 2000, 2003 Axis Communications AB. | ||
6 | * | ||
7 | * Written by Hans-Peter Nilsson. | ||
8 | * Pieces used from memcpy, originally by Kenny Ranerup long time ago. | ||
9 | */ | ||
10 | |||
11 | #include <asm/uaccess.h> | ||
12 | |||
13 | /* Asm:s have been tweaked (within the domain of correctness) to give | ||
14 | satisfactory results for "gcc version 3.2.1 Axis release R53/1.53-v32". | ||
15 | |||
16 | Check regularly... | ||
17 | |||
18 | Note that for CRISv32, the PC saved at a bus-fault is the address | ||
19 | *at* the faulting instruction, with a special case for instructions | ||
20 | in delay slots: then it's the address of the branch. Note also that | ||
21 | in contrast to v10, a postincrement in the instruction is *not* | ||
22 | performed at a bus-fault; the register is seen having the original | ||
23 | value in fault handlers. */ | ||
24 | |||
25 | |||
26 | /* Copy to userspace. This is based on the memcpy used for | ||
27 | kernel-to-kernel copying; see "string.c". */ | ||
28 | |||
29 | unsigned long | ||
30 | __copy_user (void __user *pdst, const void *psrc, unsigned long pn) | ||
31 | { | ||
32 | /* We want the parameters put in special registers. | ||
33 | Make sure the compiler is able to make something useful of this. | ||
34 | As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). | ||
35 | |||
36 | FIXME: Comment for old gcc version. Check. | ||
37 | If gcc was all right, it really would need no temporaries, and no | ||
38 | stack space to save stuff on. */ | ||
39 | |||
40 | register char *dst __asm__ ("r13") = pdst; | ||
41 | register const char *src __asm__ ("r11") = psrc; | ||
42 | register int n __asm__ ("r12") = pn; | ||
43 | register int retn __asm__ ("r10") = 0; | ||
44 | |||
45 | |||
46 | /* When src is aligned but not dst, this makes a few extra needless | ||
47 | cycles. I believe it would take as many to check that the | ||
48 | re-alignment was unnecessary. */ | ||
49 | if (((unsigned long) dst & 3) != 0 | ||
50 | /* Don't align if we wouldn't copy more than a few bytes; so we | ||
51 | don't have to check further for overflows. */ | ||
52 | && n >= 3) | ||
53 | { | ||
54 | if ((unsigned long) dst & 1) | ||
55 | { | ||
56 | __asm_copy_to_user_1 (dst, src, retn); | ||
57 | n--; | ||
58 | } | ||
59 | |||
60 | if ((unsigned long) dst & 2) | ||
61 | { | ||
62 | __asm_copy_to_user_2 (dst, src, retn); | ||
63 | n -= 2; | ||
64 | } | ||
65 | } | ||
66 | |||
67 | /* Movem is dirt cheap. The overhead is low enough to always use the | ||
68 | minimum possible block size as the threshold. */ | ||
69 | if (n >= 44) | ||
70 | { | ||
71 | /* For large copies we use 'movem'. */ | ||
72 | |||
73 | /* It is not optimal to tell the compiler about clobbering any | ||
74 | registers; that will move the saving/restoring of those registers | ||
75 | to the function prologue/epilogue, and make non-movem sizes | ||
76 | suboptimal. */ | ||
77 | __asm__ volatile ("\ | ||
78 | ;; Check that the register asm declaration got right. \n\ | ||
79 | ;; The GCC manual explicitly says TRT will happen. \n\ | ||
80 | .ifnc %0%1%2%3,$r13$r11$r12$r10 \n\ | ||
81 | .err \n\ | ||
82 | .endif \n\ | ||
83 | \n\ | ||
84 | ;; Save the registers we'll use in the movem process \n\ | ||
85 | ;; on the stack. \n\ | ||
86 | subq 11*4,$sp \n\ | ||
87 | movem $r10,[$sp] \n\ | ||
88 | \n\ | ||
89 | ;; Now we've got this: \n\ | ||
90 | ;; r11 - src \n\ | ||
91 | ;; r13 - dst \n\ | ||
92 | ;; r12 - n \n\ | ||
93 | \n\ | ||
94 | ;; Update n for the first loop \n\ | ||
95 | subq 44,$r12 \n\ | ||
96 | 0: \n\ | ||
97 | movem [$r11+],$r10 \n\ | ||
98 | subq 44,$r12 \n\ | ||
99 | 1: bge 0b \n\ | ||
100 | movem $r10,[$r13+] \n\ | ||
101 | 3: \n\ | ||
102 | addq 44,$r12 ;; compensate for last loop underflowing n \n\ | ||
103 | \n\ | ||
104 | ;; Restore registers from stack \n\ | ||
105 | movem [$sp+],$r10 \n\ | ||
106 | 2: \n\ | ||
107 | .section .fixup,\"ax\" \n\ | ||
108 | 4: \n\ | ||
109 | ; When failing on any of the 1..44 bytes in a chunk, we adjust back the \n\ | ||
110 | ; source pointer and just drop through to the by-16 and by-4 loops to \n\ | ||
111 | ; get the correct number of failing bytes. This necessarily means a \n\ | ||
112 | ; few extra exceptions, but invalid user pointers shouldn't happen in \n\ | ||
113 | ; time-critical code anyway. \n\ | ||
114 | jump 3b \n\ | ||
115 | subq 44,$r11 \n\ | ||
116 | \n\ | ||
117 | .previous \n\ | ||
118 | .section __ex_table,\"a\" \n\ | ||
119 | .dword 1b,4b \n\ | ||
120 | .previous" | ||
121 | |||
122 | /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n), "=r" (retn) | ||
123 | /* Inputs */ : "0" (dst), "1" (src), "2" (n), "3" (retn)); | ||
124 | |||
125 | } | ||
126 | |||
127 | while (n >= 16) | ||
128 | { | ||
129 | __asm_copy_to_user_16 (dst, src, retn); | ||
130 | n -= 16; | ||
131 | } | ||
132 | |||
133 | /* Having a separate by-four loop cuts down on cache footprint. | ||
134 | FIXME: Test with and without; increasing switch to be 0..15. */ | ||
135 | while (n >= 4) | ||
136 | { | ||
137 | __asm_copy_to_user_4 (dst, src, retn); | ||
138 | n -= 4; | ||
139 | } | ||
140 | |||
141 | switch (n) | ||
142 | { | ||
143 | case 0: | ||
144 | break; | ||
145 | case 1: | ||
146 | __asm_copy_to_user_1 (dst, src, retn); | ||
147 | break; | ||
148 | case 2: | ||
149 | __asm_copy_to_user_2 (dst, src, retn); | ||
150 | break; | ||
151 | case 3: | ||
152 | __asm_copy_to_user_3 (dst, src, retn); | ||
153 | break; | ||
154 | } | ||
155 | |||
156 | return retn; | ||
157 | } | ||
158 | |||
159 | /* Copy from user to kernel, zeroing the bytes that were inaccessible in | ||
160 | userland. The return-value is the number of bytes that were | ||
161 | inaccessible. */ | ||
162 | |||
163 | unsigned long | ||
164 | __copy_user_zeroing (void __user *pdst, const void *psrc, unsigned long pn) | ||
165 | { | ||
166 | /* We want the parameters put in special registers. | ||
167 | Make sure the compiler is able to make something useful of this. | ||
168 | As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). | ||
169 | |||
170 | FIXME: Comment for old gcc version. Check. | ||
171 | If gcc was all right, it really would need no temporaries, and no | ||
172 | stack space to save stuff on. */ | ||
173 | |||
174 | register char *dst __asm__ ("r13") = pdst; | ||
175 | register const char *src __asm__ ("r11") = psrc; | ||
176 | register int n __asm__ ("r12") = pn; | ||
177 | register int retn __asm__ ("r10") = 0; | ||
178 | |||
179 | /* The best reason to align src is that we then know that a read-fault | ||
180 | was for aligned bytes; there's no 1..3 remaining good bytes to | ||
181 | pickle. */ | ||
182 | if (((unsigned long) src & 3) != 0) | ||
183 | { | ||
184 | if (((unsigned long) src & 1) && n != 0) | ||
185 | { | ||
186 | __asm_copy_from_user_1 (dst, src, retn); | ||
187 | n--; | ||
188 | } | ||
189 | |||
190 | if (((unsigned long) src & 2) && n >= 2) | ||
191 | { | ||
192 | __asm_copy_from_user_2 (dst, src, retn); | ||
193 | n -= 2; | ||
194 | } | ||
195 | |||
196 | /* We only need one check after the unalignment-adjustments, because | ||
197 | if both adjustments were done, either both or neither reference | ||
198 | had an exception. */ | ||
199 | if (retn != 0) | ||
200 | goto copy_exception_bytes; | ||
201 | } | ||
202 | |||
203 | /* Movem is dirt cheap. The overhead is low enough to always use the | ||
204 | minimum possible block size as the threshold. */ | ||
205 | if (n >= 44) | ||
206 | { | ||
207 | /* It is not optimal to tell the compiler about clobbering any | ||
208 | registers; that will move the saving/restoring of those registers | ||
209 | to the function prologue/epilogue, and make non-movem sizes | ||
210 | suboptimal. */ | ||
211 | __asm__ volatile ("\ | ||
212 | .ifnc %0%1%2%3,$r13$r11$r12$r10 \n\ | ||
213 | .err \n\ | ||
214 | .endif \n\ | ||
215 | \n\ | ||
216 | ;; Save the registers we'll use in the movem process \n\ | ||
217 | ;; on the stack. \n\ | ||
218 | subq 11*4,$sp \n\ | ||
219 | movem $r10,[$sp] \n\ | ||
220 | \n\ | ||
221 | ;; Now we've got this: \n\ | ||
222 | ;; r11 - src \n\ | ||
223 | ;; r13 - dst \n\ | ||
224 | ;; r12 - n \n\ | ||
225 | \n\ | ||
226 | ;; Update n for the first loop \n\ | ||
227 | subq 44,$r12 \n\ | ||
228 | 0: \n\ | ||
229 | movem [$r11+],$r10 \n\ | ||
230 | \n\ | ||
231 | subq 44,$r12 \n\ | ||
232 | bge 0b \n\ | ||
233 | movem $r10,[$r13+] \n\ | ||
234 | \n\ | ||
235 | 4: \n\ | ||
236 | addq 44,$r12 ;; compensate for last loop underflowing n \n\ | ||
237 | \n\ | ||
238 | ;; Restore registers from stack \n\ | ||
239 | movem [$sp+],$r10 \n\ | ||
240 | .section .fixup,\"ax\" \n\ | ||
241 | \n\ | ||
242 | ;; Do not jump back into the loop if we fail. For some uses, we get a \n\ | ||
243 | ;; page fault somewhere on the line. Without checking for page limits, \n\ | ||
244 | ;; we don't know where, but we need to copy accurately and keep an \n\ | ||
245 | ;; accurate count; not just clear the whole line. To do that, we fall \n\ | ||
246 | ;; down in the code below, proceeding with smaller amounts. It should \n\ | ||
247 | ;; be kept in mind that we have to cater to code like what at one time \n\ | ||
248 | ;; was in fs/super.c: \n\ | ||
249 | ;; i = size - copy_from_user((void *)page, data, size); \n\ | ||
250 | ;; which would cause repeated faults while clearing the remainder of \n\ | ||
251 | ;; the SIZE bytes at PAGE after the first fault. \n\ | ||
252 | ;; A caveat here is that we must not fall through from a failing page \n\ | ||
253 | ;; to a valid page. \n\ | ||
254 | \n\ | ||
255 | 3: \n\ | ||
256 | jump 4b ;; Fall through, pretending the fault didn't happen. \n\ | ||
257 | nop \n\ | ||
258 | \n\ | ||
259 | .previous \n\ | ||
260 | .section __ex_table,\"a\" \n\ | ||
261 | .dword 0b,3b \n\ | ||
262 | .previous" | ||
263 | |||
264 | /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n), "=r" (retn) | ||
265 | /* Inputs */ : "0" (dst), "1" (src), "2" (n), "3" (retn)); | ||
266 | } | ||
267 | |||
268 | /* Either we directly start copying here, using dword copying in a loop, | ||
269 | or we copy as much as possible with 'movem' and then the last block | ||
270 | (<44 bytes) is copied here. This will work since 'movem' will have | ||
271 | updated src, dst and n. (Except with failing src.) | ||
272 | |||
273 | Since we want to keep src accurate, we can't use | ||
274 | __asm_copy_from_user_N with N != (1, 2, 4); it updates dst and | ||
275 | retn, but not src (by design; its value is ignored elsewhere). */ | ||
276 | |||
277 | while (n >= 4) | ||
278 | { | ||
279 | __asm_copy_from_user_4 (dst, src, retn); | ||
280 | n -= 4; | ||
281 | |||
282 | if (retn) | ||
283 | goto copy_exception_bytes; | ||
284 | } | ||
285 | |||
286 | /* If we get here, there were no memory read faults. */ | ||
287 | switch (n) | ||
288 | { | ||
289 | /* These copies are at least "naturally aligned" (so we don't have | ||
290 | to check each byte), due to the src alignment code before the | ||
291 | movem loop. The *_3 case *will* get the correct count for retn. */ | ||
292 | case 0: | ||
293 | /* This case deliberately left in (if you have doubts check the | ||
294 | generated assembly code). */ | ||
295 | break; | ||
296 | case 1: | ||
297 | __asm_copy_from_user_1 (dst, src, retn); | ||
298 | break; | ||
299 | case 2: | ||
300 | __asm_copy_from_user_2 (dst, src, retn); | ||
301 | break; | ||
302 | case 3: | ||
303 | __asm_copy_from_user_3 (dst, src, retn); | ||
304 | break; | ||
305 | } | ||
306 | |||
307 | /* If we get here, retn correctly reflects the number of failing | ||
308 | bytes. */ | ||
309 | return retn; | ||
310 | |||
311 | copy_exception_bytes: | ||
312 | /* We already have "retn" bytes cleared, and need to clear the | ||
313 | remaining "n" bytes. A non-optimized simple byte-for-byte in-line | ||
314 | memset is preferred here, since this isn't speed-critical code and | ||
315 | we'd rather have this a leaf-function than calling memset. */ | ||
316 | { | ||
317 | char *endp; | ||
318 | for (endp = dst + n; dst < endp; dst++) | ||
319 | *dst = 0; | ||
320 | } | ||
321 | |||
322 | return retn + n; | ||
323 | } | ||
324 | |||
325 | /* Zero userspace. */ | ||
326 | |||
327 | unsigned long | ||
328 | __do_clear_user (void __user *pto, unsigned long pn) | ||
329 | { | ||
330 | /* We want the parameters put in special registers. | ||
331 | Make sure the compiler is able to make something useful of this. | ||
332 | As it is now: r10 -> r13; r11 -> r12. | ||
333 | |||
334 | FIXME: Comment for old gcc version. Check. | ||
335 | If gcc was all right, it really would need no temporaries, and no | ||
336 | stack space to save stuff on. */ | ||
337 | |||
338 | register char *dst __asm__ ("r13") = pto; | ||
339 | register int n __asm__ ("r12") = pn; | ||
340 | register int retn __asm__ ("r10") = 0; | ||
341 | |||
342 | |||
343 | if (((unsigned long) dst & 3) != 0 | ||
344 | /* Don't align if we wouldn't copy more than a few bytes. */ | ||
345 | && n >= 3) | ||
346 | { | ||
347 | if ((unsigned long) dst & 1) | ||
348 | { | ||
349 | __asm_clear_1 (dst, retn); | ||
350 | n--; | ||
351 | } | ||
352 | |||
353 | if ((unsigned long) dst & 2) | ||
354 | { | ||
355 | __asm_clear_2 (dst, retn); | ||
356 | n -= 2; | ||
357 | } | ||
358 | } | ||
359 | |||
360 | /* Decide which copying method to use. | ||
361 | FIXME: This number is from the "ordinary" kernel memset. */ | ||
362 | if (n >= 48) | ||
363 | { | ||
364 | /* For large clears we use 'movem' */ | ||
365 | |||
366 | /* It is not optimal to tell the compiler about clobbering any | ||
367 | call-saved registers; that will move the saving/restoring of | ||
368 | those registers to the function prologue/epilogue, and make | ||
369 | non-movem sizes suboptimal. | ||
370 | |||
371 | This method is not foolproof; it assumes that the "asm reg" | ||
372 | declarations at the beginning of the function really are used | ||
373 | here (beware: they may be moved to temporary registers). | ||
374 | This way, we do not have to save/move the registers around into | ||
375 | temporaries; we can safely use them straight away. | ||
376 | |||
377 | If you want to check that the allocation was right; then | ||
378 | check the equalities in the first comment. It should say | ||
379 | something like "r13=r13, r11=r11, r12=r12". */ | ||
380 | __asm__ volatile ("\ | ||
381 | .ifnc %0%1%2,$r13$r12$r10 \n\ | ||
382 | .err \n\ | ||
383 | .endif \n\ | ||
384 | \n\ | ||
385 | ;; Save the registers we'll clobber in the movem process \n\ | ||
386 | ;; on the stack. Don't mention them to gcc, it will only be \n\ | ||
387 | ;; upset. \n\ | ||
388 | subq 11*4,$sp \n\ | ||
389 | movem $r10,[$sp] \n\ | ||
390 | \n\ | ||
391 | clear.d $r0 \n\ | ||
392 | clear.d $r1 \n\ | ||
393 | clear.d $r2 \n\ | ||
394 | clear.d $r3 \n\ | ||
395 | clear.d $r4 \n\ | ||
396 | clear.d $r5 \n\ | ||
397 | clear.d $r6 \n\ | ||
398 | clear.d $r7 \n\ | ||
399 | clear.d $r8 \n\ | ||
400 | clear.d $r9 \n\ | ||
401 | clear.d $r10 \n\ | ||
402 | clear.d $r11 \n\ | ||
403 | \n\ | ||
404 | ;; Now we've got this: \n\ | ||
405 | ;; r13 - dst \n\ | ||
406 | ;; r12 - n \n\ | ||
407 | \n\ | ||
408 | ;; Update n for the first loop \n\ | ||
409 | subq 12*4,$r12 \n\ | ||
410 | 0: \n\ | ||
411 | subq 12*4,$r12 \n\ | ||
412 | 1: \n\ | ||
413 | bge 0b \n\ | ||
414 | movem $r11,[$r13+] \n\ | ||
415 | \n\ | ||
416 | addq 12*4,$r12 ;; compensate for last loop underflowing n \n\ | ||
417 | \n\ | ||
418 | ;; Restore registers from stack \n\ | ||
419 | movem [$sp+],$r10 \n\ | ||
420 | 2: \n\ | ||
421 | .section .fixup,\"ax\" \n\ | ||
422 | 3: \n\ | ||
423 | movem [$sp],$r10 \n\ | ||
424 | addq 12*4,$r10 \n\ | ||
425 | addq 12*4,$r13 \n\ | ||
426 | movem $r10,[$sp] \n\ | ||
427 | jump 0b \n\ | ||
428 | clear.d $r10 \n\ | ||
429 | \n\ | ||
430 | .previous \n\ | ||
431 | .section __ex_table,\"a\" \n\ | ||
432 | .dword 1b,3b \n\ | ||
433 | .previous" | ||
434 | |||
435 | /* Outputs */ : "=r" (dst), "=r" (n), "=r" (retn) | ||
436 | /* Inputs */ : "0" (dst), "1" (n), "2" (retn) | ||
437 | /* Clobber */ : "r11"); | ||
438 | } | ||
439 | |||
440 | while (n >= 16) | ||
441 | { | ||
442 | __asm_clear_16 (dst, retn); | ||
443 | n -= 16; | ||
444 | } | ||
445 | |||
446 | /* Having a separate by-four loop cuts down on cache footprint. | ||
447 | FIXME: Test with and without; increasing switch to be 0..15. */ | ||
448 | while (n >= 4) | ||
449 | { | ||
450 | __asm_clear_4 (dst, retn); | ||
451 | n -= 4; | ||
452 | } | ||
453 | |||
454 | switch (n) | ||
455 | { | ||
456 | case 0: | ||
457 | break; | ||
458 | case 1: | ||
459 | __asm_clear_1 (dst, retn); | ||
460 | break; | ||
461 | case 2: | ||
462 | __asm_clear_2 (dst, retn); | ||
463 | break; | ||
464 | case 3: | ||
465 | __asm_clear_3 (dst, retn); | ||
466 | break; | ||
467 | } | ||
468 | |||
469 | return retn; | ||
470 | } | ||