diff options
Diffstat (limited to 'arch/ia64/lib')
-rw-r--r-- | arch/ia64/lib/Makefile | 52 | ||||
-rw-r--r-- | arch/ia64/lib/bitop.c | 88 | ||||
-rw-r--r-- | arch/ia64/lib/carta_random.S | 54 | ||||
-rw-r--r-- | arch/ia64/lib/checksum.c | 102 | ||||
-rw-r--r-- | arch/ia64/lib/clear_page.S | 77 | ||||
-rw-r--r-- | arch/ia64/lib/clear_user.S | 209 | ||||
-rw-r--r-- | arch/ia64/lib/copy_page.S | 98 | ||||
-rw-r--r-- | arch/ia64/lib/copy_page_mck.S | 185 | ||||
-rw-r--r-- | arch/ia64/lib/copy_user.S | 610 | ||||
-rw-r--r-- | arch/ia64/lib/csum_partial_copy.c | 151 | ||||
-rw-r--r-- | arch/ia64/lib/dec_and_lock.c | 42 | ||||
-rw-r--r-- | arch/ia64/lib/do_csum.S | 323 | ||||
-rw-r--r-- | arch/ia64/lib/flush.S | 39 | ||||
-rw-r--r-- | arch/ia64/lib/idiv32.S | 83 | ||||
-rw-r--r-- | arch/ia64/lib/idiv64.S | 80 | ||||
-rw-r--r-- | arch/ia64/lib/io.c | 165 | ||||
-rw-r--r-- | arch/ia64/lib/ip_fast_csum.S | 90 | ||||
-rw-r--r-- | arch/ia64/lib/memcpy.S | 301 | ||||
-rw-r--r-- | arch/ia64/lib/memcpy_mck.S | 661 | ||||
-rw-r--r-- | arch/ia64/lib/memset.S | 362 | ||||
-rw-r--r-- | arch/ia64/lib/strlen.S | 192 | ||||
-rw-r--r-- | arch/ia64/lib/strlen_user.S | 198 | ||||
-rw-r--r-- | arch/ia64/lib/strncpy_from_user.S | 44 | ||||
-rw-r--r-- | arch/ia64/lib/strnlen_user.S | 45 | ||||
-rw-r--r-- | arch/ia64/lib/swiotlb.c | 658 | ||||
-rw-r--r-- | arch/ia64/lib/xor.S | 184 |
26 files changed, 5093 insertions, 0 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile new file mode 100644 index 000000000000..1902c3c2ef92 --- /dev/null +++ b/arch/ia64/lib/Makefile | |||
@@ -0,0 +1,52 @@ | |||
1 | # | ||
2 | # Makefile for ia64-specific library routines: io.o is listed in obj-y, everything else in lib-y. | ||
3 | # | ||
4 | |||
5 | obj-y := io.o | ||
6 | |||
7 | lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ | ||
8 | __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \ | ||
9 | bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \ | ||
10 | clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \ | ||
11 | flush.o ip_fast_csum.o do_csum.o \ | ||
12 | memset.o strlen.o swiotlb.o | ||
13 | # NOTE(review): copy_page.o is already in lib-y above and is added again for CONFIG_ITANIUM; copy_page_mck.S also defines copy_page - confirm which object the link is meant to pick. | ||
14 | lib-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o | ||
15 | lib-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o | ||
16 | lib-$(CONFIG_PERFMON) += carta_random.o | ||
17 | lib-$(CONFIG_MD_RAID5) += xor.o | ||
18 | lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o | ||
19 | # Per-object assembler flags: the __div/__mod objects are all assembled from idiv64.S or idiv32.S (rules below) and differ only in these -D switches. | ||
20 | AFLAGS___divdi3.o = | ||
21 | AFLAGS___udivdi3.o = -DUNSIGNED | ||
22 | AFLAGS___moddi3.o = -DMODULO | ||
23 | AFLAGS___umoddi3.o = -DUNSIGNED -DMODULO | ||
24 | |||
25 | AFLAGS___divsi3.o = | ||
26 | AFLAGS___udivsi3.o = -DUNSIGNED | ||
27 | AFLAGS___modsi3.o = -DMODULO | ||
28 | AFLAGS___umodsi3.o = -DUNSIGNED -DMODULO | ||
29 | # Explicit rules, needed because several objects share one source file. | ||
30 | $(obj)/__divdi3.o: $(src)/idiv64.S FORCE | ||
31 | $(call if_changed_dep,as_o_S) | ||
32 | |||
33 | $(obj)/__udivdi3.o: $(src)/idiv64.S FORCE | ||
34 | $(call if_changed_dep,as_o_S) | ||
35 | |||
36 | $(obj)/__moddi3.o: $(src)/idiv64.S FORCE | ||
37 | $(call if_changed_dep,as_o_S) | ||
38 | |||
39 | $(obj)/__umoddi3.o: $(src)/idiv64.S FORCE | ||
40 | $(call if_changed_dep,as_o_S) | ||
41 | |||
42 | $(obj)/__divsi3.o: $(src)/idiv32.S FORCE | ||
43 | $(call if_changed_dep,as_o_S) | ||
44 | |||
45 | $(obj)/__udivsi3.o: $(src)/idiv32.S FORCE | ||
46 | $(call if_changed_dep,as_o_S) | ||
47 | |||
48 | $(obj)/__modsi3.o: $(src)/idiv32.S FORCE | ||
49 | $(call if_changed_dep,as_o_S) | ||
50 | |||
51 | $(obj)/__umodsi3.o: $(src)/idiv32.S FORCE | ||
52 | $(call if_changed_dep,as_o_S) | ||
diff --git a/arch/ia64/lib/bitop.c b/arch/ia64/lib/bitop.c new file mode 100644 index 000000000000..82e299c8464e --- /dev/null +++ b/arch/ia64/lib/bitop.c | |||
@@ -0,0 +1,88 @@ | |||
1 | #include <linux/compiler.h> | ||
2 | #include <linux/types.h> | ||
3 | #include <asm/intrinsics.h> | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/bitops.h> | ||
6 | |||
7 | /* | ||
8 | * Find the next zero bit in a bitmap, starting the search at bit "offset". Returns the index located via ffz(), or "size" if no zero bit is found below "size". | ||
9 | */ | ||
10 | |||
11 | int __find_next_zero_bit (const void *addr, unsigned long size, unsigned long offset) | ||
12 | { | ||
13 | unsigned long *p = ((unsigned long *) addr) + (offset >> 6); /* 64-bit word that holds bit "offset" */ | ||
14 | unsigned long result = offset & ~63UL; /* bit index of bit 0 of *p */ | ||
15 | unsigned long tmp; | ||
16 | |||
17 | if (offset >= size) | ||
18 | return size; | ||
19 | size -= result; /* from here on, size counts bits starting at *p */ | ||
20 | offset &= 63UL; | ||
21 | if (offset) { /* search starts part-way into a word */ | ||
22 | tmp = *(p++); | ||
23 | tmp |= ~0UL >> (64-offset); /* force the bits below offset to 1 so they are never reported */ | ||
24 | if (size < 64) | ||
25 | goto found_first; | ||
26 | if (~tmp) | ||
27 | goto found_middle; | ||
28 | size -= 64; | ||
29 | result += 64; | ||
30 | } | ||
31 | while (size & ~63UL) { /* scan whole 64-bit words */ | ||
32 | if (~(tmp = *(p++))) | ||
33 | goto found_middle; | ||
34 | result += 64; | ||
35 | size -= 64; | ||
36 | } | ||
37 | if (!size) | ||
38 | return result; | ||
39 | tmp = *p; /* final partial word */ | ||
40 | found_first: | ||
41 | tmp |= ~0UL << size; /* force the bits at and above size to 1 */ | ||
42 | if (tmp == ~0UL) /* any bits zero? */ | ||
43 | return result + size; /* nope */ | ||
44 | found_middle: | ||
45 | return result + ffz(tmp); | ||
46 | } | ||
47 | EXPORT_SYMBOL(__find_next_zero_bit); | ||
48 | |||
49 | /* | ||
50 | * Find the next set bit in a bitmap, starting the search at bit "offset". Returns the index located via __ffs(), or "size" if no set bit is found below "size". | ||
51 | */ | ||
52 | int __find_next_bit(const void *addr, unsigned long size, unsigned long offset) | ||
53 | { | ||
54 | unsigned long *p = ((unsigned long *) addr) + (offset >> 6); /* 64-bit word that holds bit "offset" */ | ||
55 | unsigned long result = offset & ~63UL; /* bit index of bit 0 of *p */ | ||
56 | unsigned long tmp; | ||
57 | |||
58 | if (offset >= size) | ||
59 | return size; | ||
60 | size -= result; /* from here on, size counts bits starting at *p */ | ||
61 | offset &= 63UL; | ||
62 | if (offset) { /* search starts part-way into a word */ | ||
63 | tmp = *(p++); | ||
64 | tmp &= ~0UL << offset; /* clear the bits below offset so they are never reported */ | ||
65 | if (size < 64) | ||
66 | goto found_first; | ||
67 | if (tmp) | ||
68 | goto found_middle; | ||
69 | size -= 64; | ||
70 | result += 64; | ||
71 | } | ||
72 | while (size & ~63UL) { /* scan whole 64-bit words */ | ||
73 | if ((tmp = *(p++))) | ||
74 | goto found_middle; | ||
75 | result += 64; | ||
76 | size -= 64; | ||
77 | } | ||
78 | if (!size) | ||
79 | return result; | ||
80 | tmp = *p; /* final partial word */ | ||
81 | found_first: | ||
82 | tmp &= ~0UL >> (64-size); /* clear the bits at and above size */ | ||
83 | if (tmp == 0UL) /* Are any bits set? */ | ||
84 | return result + size; /* Nope. */ | ||
85 | found_middle: | ||
86 | return result + __ffs(tmp); | ||
87 | } | ||
88 | EXPORT_SYMBOL(__find_next_bit); | ||
diff --git a/arch/ia64/lib/carta_random.S b/arch/ia64/lib/carta_random.S new file mode 100644 index 000000000000..d0674c360364 --- /dev/null +++ b/arch/ia64/lib/carta_random.S | |||
@@ -0,0 +1,54 @@ | |||
1 | /* | ||
2 | * Fast, simple, yet decent quality random number generator based on | ||
3 | * a paper by David G. Carta ("Two Fast Implementations of the | ||
4 | * `Minimal Standard' Random Number Generator," Communications of the | ||
5 | * ACM, January, 1990). | ||
6 | * | ||
7 | * Copyright (C) 2002 Hewlett-Packard Co | ||
8 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
9 | */ | ||
10 | |||
11 | #include <asm/asmmacro.h> | ||
12 | |||
13 | #define a r2 | ||
14 | #define m r3 | ||
15 | #define lo r8 | ||
16 | #define hi r9 | ||
17 | #define t0 r16 | ||
18 | #define t1 r17 | ||
19 | #define seed r32 | ||
20 | |||
21 | GLOBAL_ENTRY(carta_random32) // in: seed (r32); result is left in lo (r8) | ||
22 | movl a = (16807 << 16) | 16807 // multiplier 16807 in both low 16-bit lanes | ||
23 | ;; | ||
24 | pmpyshr2.u t0 = a, seed, 0 // per 16-bit lane: low 16 bits of the product | ||
25 | pmpyshr2.u t1 = a, seed, 16 // per 16-bit lane: high 16 bits of the product | ||
26 | ;; | ||
27 | unpack2.l t0 = t1, t0 // interleave the halves back into two 32-bit products | ||
28 | dep m = -1, r0, 0, 31 // m = 0x7fffffff (2^31 - 1) | ||
29 | ;; | ||
30 | zxt4 lo = t0 // lo = low 32 bits of t0 | ||
31 | shr.u hi = t0, 32 // hi = high 32 bits of t0 | ||
32 | ;; | ||
33 | dep t0 = 0, hi, 15, 49 // t0 = (hi & 0x7fff) | ||
34 | ;; | ||
35 | shl t0 = t0, 16 // t0 = (hi & 0x7fff) << 16 | ||
36 | shr t1 = hi, 15 // t1 = (hi >> 15) | ||
37 | ;; | ||
38 | add lo = lo, t0 | ||
39 | ;; | ||
40 | cmp.gtu p6, p0 = lo, m // p6 = (lo > m) | ||
41 | ;; | ||
42 | (p6) and lo = lo, m // if so, fold: lo = (lo & m) + 1 | ||
43 | ;; | ||
44 | (p6) add lo = 1, lo | ||
45 | ;; | ||
46 | add lo = lo, t1 | ||
47 | ;; | ||
48 | cmp.gtu p6, p0 = lo, m // same fold again after adding t1 | ||
49 | ;; | ||
50 | (p6) and lo = lo, m | ||
51 | ;; | ||
52 | (p6) add lo = 1, lo | ||
53 | br.ret.sptk.many rp | ||
54 | END(carta_random32) | ||
diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c new file mode 100644 index 000000000000..beb11721d9f5 --- /dev/null +++ b/arch/ia64/lib/checksum.c | |||
@@ -0,0 +1,102 @@ | |||
1 | /* | ||
2 | * Network checksum routines | ||
3 | * | ||
4 | * Copyright (C) 1999, 2003 Hewlett-Packard Co | ||
5 | * Stephane Eranian <eranian@hpl.hp.com> | ||
6 | * | ||
7 | * Most of the code coming from arch/alpha/lib/checksum.c | ||
8 | * | ||
9 | * This file contains network checksum routines that are better done | ||
10 | * in an architecture-specific manner due to speed.. | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/string.h> | ||
15 | |||
16 | #include <asm/byteorder.h> | ||
17 | |||
18 | static inline unsigned short /* fold a 64-bit sum down to 16 bits, adding each overflow back into the low part */ | ||
19 | from64to16 (unsigned long x) | ||
20 | { | ||
21 | /* add up 32-bit words for 33 bits */ | ||
22 | x = (x & 0xffffffff) + (x >> 32); | ||
23 | /* add up 16-bit and 17-bit words for 17+c bits */ | ||
24 | x = (x & 0xffff) + (x >> 16); | ||
25 | /* add up 16-bit and 2-bit for 16+c bit */ | ||
26 | x = (x & 0xffff) + (x >> 16); | ||
27 | /* add up carry.. */ | ||
28 | x = (x & 0xffff) + (x >> 16); | ||
29 | return x; /* truncated to unsigned short by the return type */ | ||
30 | } | ||
31 | |||
32 | /* | ||
33 | * computes the checksum of the TCP/UDP pseudo-header: saddr + daddr + sum + (ntohs(len) << 16) + (proto << 8), | ||
34 | * folded to 16 bits by from64to16(); returns the 16-bit checksum, already complemented. | ||
35 | */ | ||
36 | unsigned short int | ||
37 | csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len, | ||
38 | unsigned short proto, unsigned int sum) | ||
39 | { | ||
40 | return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) + | ||
41 | ((unsigned long) proto << 8)); | ||
42 | } | ||
43 | |||
44 | EXPORT_SYMBOL(csum_tcpudp_magic); | ||
45 | |||
46 | unsigned int /* same pseudo-header sum as csum_tcpudp_magic, but folded only to 32 bits and not complemented */ | ||
47 | csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len, | ||
48 | unsigned short proto, unsigned int sum) | ||
49 | { | ||
50 | unsigned long result; /* 64-bit accumulator so carries out of bit 31 are kept */ | ||
51 | |||
52 | result = (saddr + daddr + sum + | ||
53 | ((unsigned long) ntohs(len) << 16) + | ||
54 | ((unsigned long) proto << 8)); | ||
55 | |||
56 | /* Fold down to 32-bits so we don't lose in the typedef-less network stack. */ | ||
57 | /* 64 to 33 */ | ||
58 | result = (result & 0xffffffff) + (result >> 32); | ||
59 | /* 33 to 32 */ | ||
60 | result = (result & 0xffffffff) + (result >> 32); | ||
61 | return result; | ||
62 | } | ||
63 | |||
64 | extern unsigned long do_csum (const unsigned char *, long); | ||
65 | |||
66 | /* | ||
67 | * computes the checksum of a memory block at buff, length len, | ||
68 | * and adds in "sum" (32-bit) | ||
69 | * | ||
70 | * returns a 32-bit number suitable for feeding into itself | ||
71 | * or csum_tcpudp_magic | ||
72 | * | ||
73 | * this function must be called with even lengths, except | ||
74 | * for the last fragment, which may be odd | ||
75 | * | ||
76 | * it's best to have buff aligned on a 32-bit boundary | ||
77 | */ | ||
78 | unsigned int | ||
79 | csum_partial (const unsigned char * buff, int len, unsigned int sum) | ||
80 | { | ||
81 | unsigned long result = do_csum(buff, len); /* do_csum() is only declared in this file; it is implemented elsewhere */ | ||
82 | |||
83 | /* add in old sum, and carry.. */ | ||
84 | result += sum; | ||
85 | /* 32+c bits -> 32 bits */ | ||
86 | result = (result & 0xffffffff) + (result >> 32); | ||
87 | return result; /* truncated to unsigned int by the return type */ | ||
88 | } | ||
89 | |||
90 | EXPORT_SYMBOL(csum_partial); | ||
91 | |||
92 | /* | ||
93 | * this routine is used for miscellaneous IP-like checksums, mainly | ||
94 | * in icmp.c; it returns the complement of do_csum(buff, len), truncated to unsigned short | ||
95 | */ | ||
96 | unsigned short | ||
97 | ip_compute_csum (unsigned char * buff, int len) | ||
98 | { | ||
99 | return ~do_csum(buff,len); /* NOTE(review): no from64to16() fold here; presumably do_csum() already returns a 16-bit folded value - confirm */ | ||
100 | } | ||
101 | |||
102 | EXPORT_SYMBOL(ip_compute_csum); | ||
diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S new file mode 100644 index 000000000000..d4987061dda7 --- /dev/null +++ b/arch/ia64/lib/clear_page.S | |||
@@ -0,0 +1,77 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1999-2002 Hewlett-Packard Co | ||
3 | * Stephane Eranian <eranian@hpl.hp.com> | ||
4 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
5 | * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> | ||
6 | * | ||
7 | * 1/06/01 davidm Tuned for Itanium. | ||
8 | * 2/12/02 kchen Tuned for both Itanium and McKinley | ||
9 | * 3/08/02 davidm Some more tweaking | ||
10 | */ | ||
11 | #include <linux/config.h> | ||
12 | |||
13 | #include <asm/asmmacro.h> | ||
14 | #include <asm/page.h> | ||
15 | |||
16 | #ifdef CONFIG_ITANIUM | ||
17 | # define L3_LINE_SIZE 64 // Itanium L3 line size | ||
18 | # define PREFETCH_LINES 9 // magic number | ||
19 | #else | ||
20 | # define L3_LINE_SIZE 128 // McKinley L3 line size | ||
21 | # define PREFETCH_LINES 12 // magic number | ||
22 | #endif | ||
23 | |||
24 | #define saved_lc r2 | ||
25 | #define dst_fetch r3 | ||
26 | #define dst1 r8 | ||
27 | #define dst2 r9 | ||
28 | #define dst3 r10 | ||
29 | #define dst4 r11 | ||
30 | |||
31 | #define dst_last r31 | ||
32 | |||
33 | GLOBAL_ENTRY(clear_page) // in0: address of the page to clear | ||
34 | .prologue | ||
35 | .regstk 1,0,0,0 | ||
36 | mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until | ||
37 | .save ar.lc, saved_lc | ||
38 | mov saved_lc = ar.lc | ||
39 | |||
40 | .body | ||
41 | mov ar.lc = (PREFETCH_LINES - 1) // the .fetch loop below runs PREFETCH_LINES times | ||
42 | mov dst_fetch = in0 | ||
43 | adds dst1 = 16, in0 | ||
44 | adds dst2 = 32, in0 | ||
45 | ;; | ||
46 | .fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE // store to the start of each of the first PREFETCH_LINES lines | ||
47 | adds dst3 = 48, in0 // executing this multiple times is harmless | ||
48 | br.cloop.sptk.few .fetch | ||
49 | ;; | ||
50 | addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch // = in0 + PAGE_SIZE, since dst_fetch has advanced PREFETCH_LINES lines | ||
51 | mov ar.lc = r16 // one L3 line per iteration | ||
52 | adds dst4 = 64, in0 | ||
53 | ;; | ||
54 | #ifdef CONFIG_ITANIUM | ||
55 | // Optimized for Itanium | ||
56 | 1: stf.spill.nta [dst1] = f0, 64 | ||
57 | stf.spill.nta [dst2] = f0, 64 | ||
58 | cmp.lt p8,p0=dst_fetch, dst_last // p8: dst_fetch is still below dst_last | ||
59 | ;; | ||
60 | #else | ||
61 | // Optimized for McKinley | ||
62 | 1: stf.spill.nta [dst1] = f0, 64 | ||
63 | stf.spill.nta [dst2] = f0, 64 | ||
64 | stf.spill.nta [dst3] = f0, 64 | ||
65 | stf.spill.nta [dst4] = f0, 128 | ||
66 | cmp.lt p8,p0=dst_fetch, dst_last // p8: dst_fetch is still below dst_last | ||
67 | ;; | ||
68 | stf.spill.nta [dst1] = f0, 64 | ||
69 | stf.spill.nta [dst2] = f0, 64 | ||
70 | #endif | ||
71 | stf.spill.nta [dst3] = f0, 64 | ||
72 | (p8) stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE // keep storing one line ahead only while p8 holds | ||
73 | br.cloop.sptk.few 1b | ||
74 | ;; | ||
75 | mov ar.lc = saved_lc // restore lc | ||
76 | br.ret.sptk.many rp | ||
77 | END(clear_page) | ||
diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S new file mode 100644 index 000000000000..eecd8577b209 --- /dev/null +++ b/arch/ia64/lib/clear_user.S | |||
@@ -0,0 +1,209 @@ | |||
1 | /* | ||
2 | * This routine clears to zero a linear memory buffer in user space. | ||
3 | * | ||
4 | * Inputs: | ||
5 | * in0: address of buffer | ||
6 | * in1: length of buffer in bytes | ||
7 | * Outputs: | ||
8 | * r8: number of bytes that didn't get cleared due to a fault | ||
9 | * | ||
10 | * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co | ||
11 | * Stephane Eranian <eranian@hpl.hp.com> | ||
12 | */ | ||
13 | |||
14 | #include <asm/asmmacro.h> | ||
15 | |||
16 | // | ||
17 | // arguments | ||
18 | // | ||
19 | #define buf r32 | ||
20 | #define len r33 | ||
21 | |||
22 | // | ||
23 | // local registers | ||
24 | // | ||
25 | #define cnt r16 | ||
26 | #define buf2 r17 | ||
27 | #define saved_lc r18 | ||
28 | #define saved_pfs r19 | ||
29 | #define tmp r20 | ||
30 | #define len2 r21 | ||
31 | #define len3 r22 | ||
32 | |||
33 | // | ||
34 | // Theory of operations: | ||
35 | // - we check whether or not the buffer is small, i.e., less than 17 | ||
36 | // in which case we do the byte by byte loop. | ||
37 | // | ||
38 | // - Otherwise we go progressively from 1 byte store to 8byte store in | ||
39 | // the head part, the body is a 16byte store loop and we finish with the | ||
40 | // tail for the last 15 bytes. | ||
41 | // The good point about this breakdown is that the long buffer handling | ||
42 | // contains only 2 branches. | ||
43 | // | ||
44 | // The reason for not using shifting & masking for both the head and the | ||
45 | // tail is to stay semantically correct. This routine is not supposed | ||
46 | // to write bytes outside of the buffer. While most of the time this would | ||
47 | // be ok, we can't tolerate a mistake. A classical example is the case | ||
48 | // of multithreaded code where the extra bytes touched are actually owned | ||
49 | // by another thread which runs concurrently to ours. Another, less likely, | ||
50 | // example is with device drivers where reading an I/O mapped location may | ||
51 | // have side effects (same thing for writing). | ||
52 | // | ||
53 | |||
54 | GLOBAL_ENTRY(__do_clear_user) // in0: buffer, in1: length in bytes; r8 returns the number of bytes not cleared (0 on success) | ||
55 | .prologue | ||
56 | .save ar.pfs, saved_pfs | ||
57 | alloc saved_pfs=ar.pfs,2,0,0,0 | ||
58 | cmp.eq p6,p0=r0,len // check for zero length | ||
59 | .save ar.lc, saved_lc | ||
60 | mov saved_lc=ar.lc // preserve ar.lc (slow) | ||
61 | .body | ||
62 | ;; // avoid WAW on CFM | ||
63 | adds tmp=-1,len // br.ctop is repeat/until | ||
64 | mov ret0=len // return value is length at this point | ||
65 | (p6) br.ret.spnt.many rp | ||
66 | ;; | ||
67 | cmp.lt p6,p0=16,len // if len > 16 then long memset | ||
68 | mov ar.lc=tmp // initialize lc for small count (also executed when the long path is taken) | ||
69 | (p6) br.cond.dptk .long_do_clear | ||
70 | ;; // WAR on ar.lc | ||
71 | // | ||
72 | // worst case 16 iterations, avg 8 iterations | ||
73 | // | ||
74 | // We could have played with the predicates to use the extra | ||
75 | // M slot for 2 stores/iteration but the cost of initializing | ||
76 | // the various counters compared to how long the loop is supposed | ||
77 | // to last on average does not make this solution viable. | ||
78 | // | ||
79 | 1: | ||
80 | EX( .Lexit1, st1 [buf]=r0,1 ) | ||
81 | adds len=-1,len // countdown length using len | ||
82 | br.cloop.dptk 1b | ||
83 | ;; // avoid RAW on ar.lc | ||
84 | // | ||
85 | // .Lexit1: comes from byte by byte loop | ||
86 | // len contains bytes left | ||
87 | .Lexit1: | ||
88 | mov ret0=len // faster than using ar.lc | ||
89 | mov ar.lc=saved_lc | ||
90 | br.ret.sptk.many rp // end of short clear_user | ||
91 | |||
92 | |||
93 | // | ||
94 | // At this point we know we have more than 16 bytes to clear | ||
95 | // so we focus on alignment (no branches required) | ||
96 | // | ||
97 | // The use of len/len2 for countdown of the number of bytes left | ||
98 | // instead of ret0 is due to the fact that the exception code | ||
99 | // changes the values of r8. | ||
100 | // | ||
101 | .long_do_clear: | ||
102 | tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear) | ||
103 | ;; | ||
104 | EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned | ||
105 | (p6) adds len=-1,len;; // sync because buf is modified | ||
106 | tbit.nz p6,p0=buf,1 | ||
107 | ;; | ||
108 | EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned | ||
109 | (p6) adds len=-2,len;; | ||
110 | tbit.nz p6,p0=buf,2 | ||
111 | ;; | ||
112 | EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned | ||
113 | (p6) adds len=-4,len;; | ||
114 | tbit.nz p6,p0=buf,3 | ||
115 | ;; | ||
116 | EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned | ||
117 | (p6) adds len=-8,len;; | ||
118 | shr.u cnt=len,4 // number of 128-bit (2x64bit) words | ||
119 | ;; | ||
120 | cmp.eq p6,p0=r0,cnt | ||
121 | adds tmp=-1,cnt | ||
122 | (p6) br.cond.dpnt .dotail // we have less than 16 bytes left | ||
123 | ;; | ||
124 | adds buf2=8,buf // setup second base pointer | ||
125 | mov ar.lc=tmp | ||
126 | ;; | ||
127 | |||
128 | // | ||
129 | // 16bytes/iteration core loop | ||
130 | // | ||
131 | // The second store can never generate a fault because | ||
132 | // we come into the loop only when we are 16-byte aligned. | ||
133 | // This means that if we cross a page then it will always be | ||
134 | // in the first store and never in the second. | ||
135 | // | ||
136 | // | ||
137 | // We need to keep track of the remaining length. A possible (optimistic) | ||
138 | // way would be to use ar.lc and derive how many bytes were left by | ||
139 | // doing : left= 16*ar.lc + 16. this would avoid the addition at | ||
140 | // every iteration. | ||
141 | // However we need to keep the synchronization point. A template | ||
142 | // M;;MB does not exist and thus we can keep the addition at no | ||
143 | // extra cycle cost (use a nop slot anyway). It also simplifies the | ||
144 | // (unlikely) error recovery code | ||
145 | // | ||
146 | |||
147 | 2: EX(.Lexit3, st8 [buf]=r0,16 ) | ||
148 | ;; // needed to get len correct when error | ||
149 | st8 [buf2]=r0,16 | ||
150 | adds len=-16,len | ||
151 | br.cloop.dptk 2b | ||
152 | ;; | ||
153 | // | ||
154 | // tail correction based on len only | ||
155 | // | ||
156 | // We alternate the use of len3,len2 to allow parallelism and correct | ||
157 | // error handling. We also reuse p6/p7 to return correct value. | ||
158 | // The addition of len2/len3 does not cost anything more compared to | ||
159 | // the regular memset as we had empty slots. | ||
160 | // | ||
161 | .dotail: | ||
162 | mov ar.lc=saved_lc // restore lc here so the direct branch to .dotail (cnt == 0) restores it as well | ||
163 | mov len2=len // for parallelization of error handling | ||
164 | mov len3=len | ||
165 | tbit.nz p6,p0=len,3 | ||
166 | ;; | ||
167 | EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes | ||
168 | (p6) adds len3=-8,len2 | ||
169 | tbit.nz p7,p6=len,2 | ||
170 | ;; | ||
171 | EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes | ||
172 | (p7) adds len2=-4,len3 | ||
173 | tbit.nz p6,p7=len,1 | ||
174 | ;; | ||
175 | EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes | ||
176 | (p6) adds len3=-2,len2 | ||
177 | tbit.nz p7,p6=len,0 | ||
178 | ;; | ||
179 | EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left | ||
180 | mov ret0=r0 // success | ||
181 | br.ret.sptk.many rp // end of most likely path | ||
182 | |||
183 | // | ||
184 | // Outlined error handling code | ||
185 | // | ||
186 | |||
187 | // | ||
188 | // .Lexit3: comes from the head stores and the core loop, need restore lc | ||
189 | // len contains bytes left | ||
190 | // | ||
191 | // | ||
192 | // .Lexit2: | ||
193 | // if p6 -> coming from st8 or st2 : len2 contains what's left | ||
194 | // if p7 -> coming from st4 or st1 : len3 contains what's left | ||
195 | // We must restore lc even though it might not have been used. | ||
196 | .Lexit2: | ||
197 | .pred.rel "mutex", p6, p7 | ||
198 | (p6) mov len=len2 | ||
199 | (p7) mov len=len3 | ||
200 | ;; | ||
201 | // | ||
202 | // .Lexit3: shared exit (also reached by falling through from .Lexit2) | ||
203 | // len contains bytes left | ||
204 | // | ||
205 | .Lexit3: | ||
206 | mov ret0=len | ||
207 | mov ar.lc=saved_lc | ||
208 | br.ret.sptk.many rp | ||
209 | END(__do_clear_user) | ||
diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S new file mode 100644 index 000000000000..127d1d050d78 --- /dev/null +++ b/arch/ia64/lib/copy_page.S | |||
@@ -0,0 +1,98 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Optimized version of the standard copy_page() function | ||
4 | * | ||
5 | * Inputs: | ||
6 | * in0: address of target page | ||
7 | * in1: address of source page | ||
8 | * Output: | ||
9 | * no return value | ||
10 | * | ||
11 | * Copyright (C) 1999, 2001 Hewlett-Packard Co | ||
12 | * Stephane Eranian <eranian@hpl.hp.com> | ||
13 | * David Mosberger <davidm@hpl.hp.com> | ||
14 | * | ||
15 | * 4/06/01 davidm Tuned to make it perform well both for cached and uncached copies. | ||
16 | */ | ||
17 | #include <asm/asmmacro.h> | ||
18 | #include <asm/page.h> | ||
19 | |||
20 | #define PIPE_DEPTH 3 | ||
21 | #define EPI p[PIPE_DEPTH-1] | ||
22 | |||
23 | #define lcount r16 | ||
24 | #define saved_pr r17 | ||
25 | #define saved_lc r18 | ||
26 | #define saved_pfs r19 | ||
27 | #define src1 r20 | ||
28 | #define src2 r21 | ||
29 | #define tgt1 r22 | ||
30 | #define tgt2 r23 | ||
31 | #define srcf r24 | ||
32 | #define tgtf r25 | ||
33 | #define tgt_last r26 | ||
34 | |||
35 | #define Nrot ((8*PIPE_DEPTH+7)&~7) | ||
36 | |||
37 | GLOBAL_ENTRY(copy_page) // in0: target page, in1: source page | ||
38 | .prologue | ||
39 | .save ar.pfs, saved_pfs | ||
40 | alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot | ||
41 | |||
42 | .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \ | ||
43 | t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH] | ||
44 | .rotp p[PIPE_DEPTH] | ||
45 | |||
46 | .save ar.lc, saved_lc | ||
47 | mov saved_lc=ar.lc | ||
48 | mov ar.ec=PIPE_DEPTH // epilogue count = number of pipeline stages | ||
49 | |||
50 | mov lcount=PAGE_SIZE/64-1 // 64 bytes (eight ld8/st8 pairs) per iteration | ||
51 | .save pr, saved_pr | ||
52 | mov saved_pr=pr | ||
53 | mov pr.rot=1<<16 // p16 = 1, remaining rotating predicates = 0 | ||
54 | |||
55 | .body | ||
56 | |||
57 | mov src1=in1 | ||
58 | adds src2=8,in1 // src1/src2 and tgt1/tgt2 walk alternating 8-byte words | ||
59 | mov tgt_last = PAGE_SIZE | ||
60 | ;; | ||
61 | adds tgt2=8,in0 | ||
62 | add srcf=512,in1 // prefetch pointers start 512 bytes ahead | ||
63 | mov ar.lc=lcount | ||
64 | mov tgt1=in0 | ||
65 | add tgtf=512,in0 | ||
66 | add tgt_last = tgt_last, in0 // tgt_last = in0 + PAGE_SIZE | ||
67 | ;; | ||
68 | 1: | ||
69 | (p[0]) ld8 t1[0]=[src1],16 | ||
70 | (EPI) st8 [tgt1]=t1[PIPE_DEPTH-1],16 | ||
71 | (p[0]) ld8 t2[0]=[src2],16 | ||
72 | (EPI) st8 [tgt2]=t2[PIPE_DEPTH-1],16 | ||
73 | cmp.ltu p6,p0 = tgtf, tgt_last // p6: target prefetch pointer is still below tgt_last | ||
74 | ;; | ||
75 | (p[0]) ld8 t3[0]=[src1],16 | ||
76 | (EPI) st8 [tgt1]=t3[PIPE_DEPTH-1],16 | ||
77 | (p[0]) ld8 t4[0]=[src2],16 | ||
78 | (EPI) st8 [tgt2]=t4[PIPE_DEPTH-1],16 | ||
79 | ;; | ||
80 | (p[0]) ld8 t5[0]=[src1],16 | ||
81 | (EPI) st8 [tgt1]=t5[PIPE_DEPTH-1],16 | ||
82 | (p[0]) ld8 t6[0]=[src2],16 | ||
83 | (EPI) st8 [tgt2]=t6[PIPE_DEPTH-1],16 | ||
84 | ;; | ||
85 | (p[0]) ld8 t7[0]=[src1],16 | ||
86 | (EPI) st8 [tgt1]=t7[PIPE_DEPTH-1],16 | ||
87 | (p[0]) ld8 t8[0]=[src2],16 | ||
88 | (EPI) st8 [tgt2]=t8[PIPE_DEPTH-1],16 | ||
89 | |||
90 | (p6) lfetch [srcf], 64 // both prefetches are gated by the target-side compare above | ||
91 | (p6) lfetch [tgtf], 64 | ||
92 | br.ctop.sptk.few 1b | ||
93 | ;; | ||
94 | mov pr=saved_pr,0xffffffffffff0000 // restore predicates | ||
95 | mov ar.pfs=saved_pfs | ||
96 | mov ar.lc=saved_lc | ||
97 | br.ret.sptk.many rp | ||
98 | END(copy_page) | ||
diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S new file mode 100644 index 000000000000..3c45d60a81b4 --- /dev/null +++ b/arch/ia64/lib/copy_page_mck.S | |||
@@ -0,0 +1,185 @@ | |||
1 | /* | ||
2 | * McKinley-optimized version of copy_page(). | ||
3 | * | ||
4 | * Copyright (C) 2002 Hewlett-Packard Co | ||
5 | * David Mosberger <davidm@hpl.hp.com> | ||
6 | * | ||
7 | * Inputs: | ||
8 | * in0: address of target page | ||
9 | * in1: address of source page | ||
10 | * Output: | ||
11 | * no return value | ||
12 | * | ||
13 | * General idea: | ||
14 | * - use regular loads and stores to prefetch data to avoid consuming M-slot just for | ||
15 | * lfetches => good for in-cache performance | ||
16 | * - avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single | ||
17 | * cycle | ||
18 | * | ||
19 | * Principle of operation: | ||
20 | * First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes. | ||
21 | * To avoid secondary misses in L2, we prefetch both source and destination with a line-size | ||
22 | * of 128 bytes. When both of these lines are in the L2 and the first half of the | ||
23 | * source line is in L1, we start copying the remaining words. The second half of the | ||
24 | * source line is prefetched in an earlier iteration, so that by the time we start | ||
25 | * accessing it, it's also present in the L1. | ||
26 | * | ||
27 | * We use a software-pipelined loop to control the overall operation. The pipeline | ||
28 | * has 2*PREFETCH_DIST+K stages. The first PREFETCH_DIST stages are used for prefetching | ||
29 | * source cache-lines. The second PREFETCH_DIST stages are used for prefetching destination | ||
30 | * cache-lines, the last K stages are used to copy the cache-line words not copied by | ||
31 | * the prefetches. The four relevant points in the pipeline are called A, B, C, D: | ||
32 | * p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line | ||
33 | * should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought | ||
34 | * into L1D and p[D] is TRUE if a cacheline needs to be copied. | ||
35 | * | ||
36 | * This all sounds very complicated, but thanks to the modulo-scheduled loop support, | ||
37 | * the resulting code is very regular and quite easy to follow (once you get the idea). | ||
38 | * | ||
39 | * As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented | ||
40 | * as the separate .prefetch_loop. Logically, this loop performs exactly like the | ||
41 | * main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed, | ||
42 | * so that each loop iteration is faster (again, good for cached case). | ||
43 | * | ||
44 | * When reading the code, it helps to keep the following picture in mind: | ||
45 | * | ||
46 | * word 0 word 1 | ||
47 | * +------+------+--- | ||
48 | * | v[x] | t1 | ^ | ||
49 | * | t2 | t3 | | | ||
50 | * | t4 | t5 | | | ||
51 | * | t6 | t7 | | 128 bytes | ||
52 | * | n[y] | t9 | | (L2 cache line) | ||
53 | * | t10 | t11 | | | ||
54 | * | t12 | t13 | | | ||
55 | * | t14 | t15 | v | ||
56 | * +------+------+--- | ||
57 | * | ||
58 | * Here, v[x] is copied by the (memory) prefetch. n[y] is loaded at p[C] | ||
59 | * to fetch the second-half of the L2 cache line into L1, and the tX words are copied in | ||
60 | * an order that avoids bank conflicts. | ||
61 | */ | ||
62 | #include <asm/asmmacro.h> | ||
63 | #include <asm/page.h> | ||
64 | |||
65 | #define PREFETCH_DIST 8 // McKinley sustains 16 outstanding L2 misses (8 ld, 8 st) | ||
66 | |||
67 | #define src0 r2 | ||
68 | #define src1 r3 | ||
69 | #define dst0 r9 | ||
70 | #define dst1 r10 | ||
71 | #define src_pre_mem r11 | ||
72 | #define dst_pre_mem r14 | ||
73 | #define src_pre_l2 r15 | ||
74 | #define dst_pre_l2 r16 | ||
75 | #define t1 r17 | ||
76 | #define t2 r18 | ||
77 | #define t3 r19 | ||
78 | #define t4 r20 | ||
79 | #define t5 t1 // alias! | ||
80 | #define t6 t2 // alias! | ||
81 | #define t7 t3 // alias! | ||
82 | #define t9 t5 // alias! | ||
83 | #define t10 t4 // alias! | ||
84 | #define t11 t7 // alias! | ||
85 | #define t12 t6 // alias! | ||
86 | #define t14 t10 // alias! | ||
87 | #define t13 r21 | ||
88 | #define t15 r22 | ||
89 | |||
90 | #define saved_lc r23 | ||
91 | #define saved_pr r24 | ||
92 | |||
93 | #define A 0 | ||
94 | #define B (PREFETCH_DIST) | ||
95 | #define C (B + PREFETCH_DIST) | ||
96 | #define D (C + 3) | ||
97 | #define N (D + 1) | ||
98 | #define Nrot ((N + 7) & ~7) | ||
99 | |||
100 | GLOBAL_ENTRY(copy_page) // in0: target page, in1: source page | ||
101 | .prologue | ||
102 | alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot | ||
103 | |||
104 | .rotr v[2*PREFETCH_DIST], n[D-C+1] | ||
105 | .rotp p[N] | ||
106 | |||
107 | .save ar.lc, saved_lc | ||
108 | mov saved_lc = ar.lc | ||
109 | .save pr, saved_pr | ||
110 | mov saved_pr = pr | ||
111 | .body | ||
112 | |||
113 | mov src_pre_mem = in1 | ||
114 | mov pr.rot = 0x10000 // p16 = 1, remaining rotating predicates = 0 | ||
115 | mov ar.ec = 1 // special unrolled loop | ||
116 | |||
117 | mov dst_pre_mem = in0 | ||
118 | mov ar.lc = 2*PREFETCH_DIST - 1 // .prefetch_loop runs 2*PREFETCH_DIST times | ||
119 | |||
120 | add src_pre_l2 = 8*8, in1 // second 64-byte half of the first 128-byte line | ||
121 | add dst_pre_l2 = 8*8, in0 | ||
122 | add src0 = 8, in1 // first t1 src | ||
123 | add src1 = 3*8, in1 // first t3 src | ||
124 | add dst0 = 8, in0 // first t1 dst | ||
125 | add dst1 = 3*8, in0 // first t3 dst | ||
126 | mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1 // ar.lc value for .line_copy (t1 is reused as a data register inside the loop) | ||
127 | nop.m 0 | ||
128 | nop.i 0 | ||
129 | ;; | ||
130 | // same as .line_copy loop, but with all predicated-off instructions removed: | ||
131 | .prefetch_loop: | ||
132 | (p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 | ||
133 | (p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 | ||
134 | br.ctop.sptk .prefetch_loop | ||
135 | ;; | ||
136 | cmp.eq p16, p0 = r0, r0 // reset p16 to 1 (br.ctop cleared it to zero) | ||
137 | mov ar.lc = t1 // with 64KB pages, t1 is too big to fit in 8 bits! | ||
138 | mov ar.ec = N // # of stages in pipeline | ||
139 | ;; | ||
140 | .line_copy: | ||
141 | (p[D]) ld8 t2 = [src0], 3*8 // M0 | ||
142 | (p[D]) ld8 t4 = [src1], 3*8 // M1 | ||
143 | (p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 prefetch dst from memory | ||
144 | (p[D]) st8 [dst_pre_l2] = n[D-C], 128 // M3 prefetch dst from L2 | ||
145 | ;; | ||
146 | (p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 prefetch src from memory | ||
147 | (p[C]) ld8 n[0] = [src_pre_l2], 128 // M1 prefetch src from L2 | ||
148 | (p[D]) st8 [dst0] = t1, 8 // M2 | ||
149 | (p[D]) st8 [dst1] = t3, 8 // M3 | ||
150 | ;; | ||
151 | (p[D]) ld8 t5 = [src0], 8 | ||
152 | (p[D]) ld8 t7 = [src1], 3*8 | ||
153 | (p[D]) st8 [dst0] = t2, 3*8 | ||
154 | (p[D]) st8 [dst1] = t4, 3*8 | ||
155 | ;; | ||
156 | (p[D]) ld8 t6 = [src0], 3*8 | ||
157 | (p[D]) ld8 t10 = [src1], 8 | ||
158 | (p[D]) st8 [dst0] = t5, 8 | ||
159 | (p[D]) st8 [dst1] = t7, 3*8 | ||
160 | ;; | ||
161 | (p[D]) ld8 t9 = [src0], 3*8 | ||
162 | (p[D]) ld8 t11 = [src1], 3*8 | ||
163 | (p[D]) st8 [dst0] = t6, 3*8 | ||
164 | (p[D]) st8 [dst1] = t10, 8 | ||
165 | ;; | ||
166 | (p[D]) ld8 t12 = [src0], 8 | ||
167 | (p[D]) ld8 t14 = [src1], 8 | ||
168 | (p[D]) st8 [dst0] = t9, 3*8 | ||
169 | (p[D]) st8 [dst1] = t11, 3*8 | ||
170 | ;; | ||
171 | (p[D]) ld8 t13 = [src0], 4*8 | ||
172 | (p[D]) ld8 t15 = [src1], 4*8 | ||
173 | (p[D]) st8 [dst0] = t12, 8 | ||
174 | (p[D]) st8 [dst1] = t14, 8 | ||
175 | ;; | ||
176 | (p[D-1])ld8 t1 = [src0], 8 // t1/t3 are loaded one stage early (p[D-1]) and stored by the next iteration | ||
177 | (p[D-1])ld8 t3 = [src1], 8 | ||
178 | (p[D]) st8 [dst0] = t13, 4*8 | ||
179 | (p[D]) st8 [dst1] = t15, 4*8 | ||
180 | br.ctop.sptk .line_copy | ||
181 | ;; | ||
182 | mov ar.lc = saved_lc | ||
183 | mov pr = saved_pr, -1 // restore all predicates | ||
184 | br.ret.sptk.many rp | ||
185 | END(copy_page) | ||
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S new file mode 100644 index 000000000000..c952bdc6a093 --- /dev/null +++ b/arch/ia64/lib/copy_user.S | |||
@@ -0,0 +1,610 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Optimized version of the copy_user() routine. | ||
4 | * It is used to copy data across the kernel/user boundary. | |
5 | * | ||
6 | * The source and destination are always on opposite side of | ||
7 | * the boundary. When reading from user space we must catch | ||
8 | * faults on loads. When writing to user space we must catch | ||
9 | * errors on stores. Note that because of the nature of the copy | ||
10 | * we don't need to worry about overlapping regions. | ||
11 | * | ||
12 | * | ||
13 | * Inputs: | ||
14 | * in0 address of destination buffer | |
15 | * in1 address of source buffer | |
16 | * in2 number of bytes to copy | ||
17 | * | ||
18 | * Outputs: | ||
19 | * ret0 0 in case of success. The number of bytes NOT copied in | ||
20 | * case of error. | ||
21 | * | ||
22 | * Copyright (C) 2000-2001 Hewlett-Packard Co | ||
23 | * Stephane Eranian <eranian@hpl.hp.com> | ||
24 | * | ||
25 | * Fixme: | ||
26 | * - handle the case where we have more than 16 bytes and the alignment | ||
27 | * are different. | ||
28 | * - more benchmarking | ||
29 | * - fix extraneous stop bit introduced by the EX() macro. | ||
30 | */ | ||
31 | |||
32 | #include <asm/asmmacro.h> | ||
33 | |||
34 | // | ||
35 | // Tuneable parameters | ||
36 | // | ||
37 | #define COPY_BREAK 16 // we do byte copy below (must be >=16) | ||
38 | #define PIPE_DEPTH 21 // pipe depth | ||
39 | |||
40 | #define EPI p[PIPE_DEPTH-1] | ||
41 | |||
42 | // | ||
43 | // arguments | ||
44 | // | ||
45 | #define dst in0 | ||
46 | #define src in1 | ||
47 | #define len in2 | ||
48 | |||
49 | // | ||
50 | // local registers | ||
51 | // | ||
52 | #define t1 r2 // rshift in bytes | ||
53 | #define t2 r3 // lshift in bytes | ||
54 | #define rshift r14 // right shift in bits | ||
55 | #define lshift r15 // left shift in bits | ||
56 | #define word1 r16 | ||
57 | #define word2 r17 | ||
58 | #define cnt r18 | ||
59 | #define len2 r19 | ||
60 | #define saved_lc r20 | ||
61 | #define saved_pr r21 | ||
62 | #define tmp r22 | ||
63 | #define val r23 | ||
64 | #define src1 r24 | ||
65 | #define dst1 r25 | ||
66 | #define src2 r26 | ||
67 | #define dst2 r27 | ||
68 | #define len1 r28 | ||
69 | #define enddst r29 | ||
70 | #define endsrc r30 | ||
71 | #define saved_pfs r31 | ||
72 | |||
73 | GLOBAL_ENTRY(__copy_user) | ||
74 | .prologue | ||
75 | .save ar.pfs, saved_pfs | ||
76 | alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7) | ||
77 | |||
78 | .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH] | ||
79 | .rotp p[PIPE_DEPTH] | ||
80 | |||
81 | adds len2=-1,len // br.ctop is repeat/until | ||
82 | mov ret0=r0 | ||
83 | |||
84 | ;; // RAW of cfm when len=0 | ||
85 | cmp.eq p8,p0=r0,len // check for zero length | ||
86 | .save ar.lc, saved_lc | ||
87 | mov saved_lc=ar.lc // preserve ar.lc (slow) | ||
88 | (p8) br.ret.spnt.many rp // empty memcpy() | |
89 | ;; | ||
90 | add enddst=dst,len // first byte after end of destination | |
91 | add endsrc=src,len // first byte after end of source | |
92 | .save pr, saved_pr | ||
93 | mov saved_pr=pr // preserve predicates | ||
94 | |||
95 | .body | ||
96 | |||
97 | mov dst1=dst // copy because of rotation | ||
98 | mov ar.ec=PIPE_DEPTH | ||
99 | mov pr.rot=1<<16 // p16=true all others are false | ||
100 | |||
101 | mov src1=src // copy because of rotation | ||
102 | mov ar.lc=len2 // initialize lc for small count | ||
103 | cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy | ||
104 | |||
105 | xor tmp=src,dst // same alignment test prepare | ||
106 | (p10) br.cond.dptk .long_copy_user | ||
107 | ;; // RAW pr.rot/p16 ? | ||
108 | // | ||
109 | // Now we do the byte by byte loop with software pipeline | ||
110 | // | ||
111 | // p7 is necessarily false by now | ||
112 | 1: | ||
113 | EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1) | ||
114 | EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) | ||
115 | br.ctop.dptk.few 1b | ||
116 | ;; | ||
117 | mov ar.lc=saved_lc | ||
118 | mov pr=saved_pr,0xffffffffffff0000 | ||
119 | mov ar.pfs=saved_pfs // restore ar.ec | ||
120 | br.ret.sptk.many rp // end of short memcpy | ||
121 | |||
122 | // | ||
123 | // Not 8-byte aligned | ||
124 | // | ||
125 | .diff_align_copy_user: | ||
126 | // At this point we know we have more than 16 bytes to copy | ||
127 | // and also that src and dest do _not_ have the same alignment. | ||
128 | and src2=0x7,src1 // src offset | ||
129 | and dst2=0x7,dst1 // dst offset | ||
130 | ;; | ||
131 | // The basic idea is that we copy byte-by-byte at the head so | ||
132 | // that we can reach 8-byte alignment for both src1 and dst1. | ||
133 | // Then copy the body using software pipelined 8-byte copy, | ||
134 | // shifting the two back-to-back words right and left, then copy | ||
135 | // the tail by copying byte-by-byte. | ||
136 | // | ||
137 | // Fault handling. If the byte-by-byte at the head fails on the | ||
138 | // load, then restart and finish the pipeline by copying zeros | |
139 | // to the dst1. Then copy zeros for the rest of dst1. | ||
140 | // If 8-byte software pipeline fails on the load, do the same as | ||
141 | // failure_in3 does. If the byte-by-byte at the tail fails, it is | ||
142 | // handled simply by failure_in_pipe1. | ||
143 | // | ||
144 | // The case p14 represents the source has more bytes in the | ||
145 | // the first word (by the shifted part), whereas the p15 needs to | ||
146 | // copy some bytes from the 2nd word of the source that has the | ||
147 | // tail of the 1st of the destination. | ||
148 | // | ||
149 | |||
150 | // | ||
151 | // Optimization. If dst1 is 8-byte aligned (quite common), we don't need | ||
152 | // to copy the head to dst1, to start 8-byte copy software pipeline. | ||
153 | // We know src1 is not 8-byte aligned in this case. | ||
154 | // | ||
155 | cmp.eq p14,p15=r0,dst2 | ||
156 | (p15) br.cond.spnt 1f | ||
157 | ;; | ||
158 | sub t1=8,src2 | ||
159 | mov t2=src2 | ||
160 | ;; | ||
161 | shl rshift=t2,3 | ||
162 | sub len1=len,t1 // set len1 | ||
163 | ;; | ||
164 | sub lshift=64,rshift | ||
165 | ;; | ||
166 | br.cond.spnt .word_copy_user | ||
167 | ;; | ||
168 | 1: | ||
169 | cmp.leu p14,p15=src2,dst2 | ||
170 | sub t1=dst2,src2 | ||
171 | ;; | ||
172 | .pred.rel "mutex", p14, p15 | ||
173 | (p14) sub word1=8,src2 // (8 - src offset) | ||
174 | (p15) sub t1=r0,t1 // absolute value | ||
175 | (p15) sub word1=8,dst2 // (8 - dst offset) | ||
176 | ;; | ||
177 | // For the case p14, we don't need to copy the shifted part to | ||
178 | // the 1st word of destination. | ||
179 | sub t2=8,t1 | ||
180 | (p14) sub word1=word1,t1 | ||
181 | ;; | ||
182 | sub len1=len,word1 // resulting len | ||
183 | (p15) shl rshift=t1,3 // in bits | ||
184 | (p14) shl rshift=t2,3 | ||
185 | ;; | ||
186 | (p14) sub len1=len1,t1 | ||
187 | adds cnt=-1,word1 | ||
188 | ;; | ||
189 | sub lshift=64,rshift | ||
190 | mov ar.ec=PIPE_DEPTH | ||
191 | mov pr.rot=1<<16 // p16=true all others are false | ||
192 | mov ar.lc=cnt | ||
193 | ;; | ||
194 | 2: | ||
195 | EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1) | ||
196 | EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) | ||
197 | br.ctop.dptk.few 2b | ||
198 | ;; | ||
199 | clrrrb | ||
200 | ;; | ||
201 | .word_copy_user: | ||
202 | cmp.gtu p9,p0=16,len1 | ||
203 | (p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy | ||
204 | ;; | ||
205 | shr.u cnt=len1,3 // number of 64-bit words | ||
206 | ;; | ||
207 | adds cnt=-1,cnt | ||
208 | ;; | ||
209 | .pred.rel "mutex", p14, p15 | ||
210 | (p14) sub src1=src1,t2 | ||
211 | (p15) sub src1=src1,t1 | ||
212 | // | ||
213 | // Now both src1 and dst1 point to an 8-byte aligned address. And | ||
214 | // we have more than 8 bytes to copy. | ||
215 | // | ||
216 | mov ar.lc=cnt | ||
217 | mov ar.ec=PIPE_DEPTH | ||
218 | mov pr.rot=1<<16 // p16=true all others are false | ||
219 | ;; | ||
220 | 3: | ||
221 | // | ||
222 | // The pipeline consists of 3 stages: | |
223 | // 1 (p16): Load a word from src1 | ||
224 | // 2 (EPI_1): Shift right pair, saving to tmp | ||
225 | // 3 (EPI): Store tmp to dst1 | ||
226 | // | ||
227 | // To make it simple, use at least 2 (p16) loops to set up val1[n] | ||
228 | // because we need 2 back-to-back val1[] to get tmp. | ||
229 | // Note that this implies EPI_2 must be p18 or greater. | ||
230 | // | ||
231 | |||
232 | #define EPI_1 p[PIPE_DEPTH-2] | ||
233 | #define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift | ||
234 | #define CASE(pred, shift) \ | ||
235 | (pred) br.cond.spnt .copy_user_bit##shift | ||
236 | #define BODY(rshift) \ | ||
237 | .copy_user_bit##rshift: \ | ||
238 | 1: \ | ||
239 | EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \ | ||
240 | (EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \ | ||
241 | EX(3f,(p16) ld8 val1[1]=[src1],8); \ | ||
242 | (p16) mov val1[0]=r0; \ | ||
243 | br.ctop.dptk 1b; \ | ||
244 | ;; \ | ||
245 | br.cond.sptk.many .diff_align_do_tail; \ | ||
246 | 2: \ | ||
247 | (EPI) st8 [dst1]=tmp,8; \ | ||
248 | (EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \ | ||
249 | 3: \ | ||
250 | (p16) mov val1[1]=r0; \ | ||
251 | (p16) mov val1[0]=r0; \ | ||
252 | br.ctop.dptk 2b; \ | ||
253 | ;; \ | ||
254 | br.cond.sptk.many .failure_in2 | ||
255 | |||
256 | // | ||
257 | // Since the instruction 'shrp' requires a fixed 128-bit value | ||
258 | // specifying the bits to shift, we need to provide 7 cases | ||
259 | // below. | ||
260 | // | ||
261 | SWITCH(p6, 8) | ||
262 | SWITCH(p7, 16) | ||
263 | SWITCH(p8, 24) | ||
264 | SWITCH(p9, 32) | ||
265 | SWITCH(p10, 40) | ||
266 | SWITCH(p11, 48) | ||
267 | SWITCH(p12, 56) | ||
268 | ;; | ||
269 | CASE(p6, 8) | ||
270 | CASE(p7, 16) | ||
271 | CASE(p8, 24) | ||
272 | CASE(p9, 32) | ||
273 | CASE(p10, 40) | ||
274 | CASE(p11, 48) | ||
275 | CASE(p12, 56) | ||
276 | ;; | ||
277 | BODY(8) | ||
278 | BODY(16) | ||
279 | BODY(24) | ||
280 | BODY(32) | ||
281 | BODY(40) | ||
282 | BODY(48) | ||
283 | BODY(56) | ||
284 | ;; | ||
285 | .diff_align_do_tail: | ||
286 | .pred.rel "mutex", p14, p15 | ||
287 | (p14) sub src1=src1,t1 | ||
288 | (p14) adds dst1=-8,dst1 | ||
289 | (p15) sub dst1=dst1,t1 | ||
290 | ;; | ||
291 | 4: | ||
292 | // Tail correction. | ||
293 | // | ||
294 | // The problem with this pipelined loop is that the last word is not | |
295 | // loaded and thus part of the last word written is not correct. | |
296 | // To fix that, we simply copy the tail byte by byte. | ||
297 | |||
298 | sub len1=endsrc,src1,1 | ||
299 | clrrrb | ||
300 | ;; | ||
301 | mov ar.ec=PIPE_DEPTH | ||
302 | mov pr.rot=1<<16 // p16=true all others are false | ||
303 | mov ar.lc=len1 | ||
304 | ;; | ||
305 | 5: | ||
306 | EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1) | ||
307 | EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) | ||
308 | br.ctop.dptk.few 5b | ||
309 | ;; | ||
310 | mov ar.lc=saved_lc | ||
311 | mov pr=saved_pr,0xffffffffffff0000 | ||
312 | mov ar.pfs=saved_pfs | ||
313 | br.ret.sptk.many rp | ||
314 | |||
315 | // | ||
316 | // Beginning of long memcpy (i.e. > 16 bytes) | |
317 | // | ||
318 | .long_copy_user: | ||
319 | tbit.nz p6,p7=src1,0 // odd alignment | ||
320 | and tmp=7,tmp | ||
321 | ;; | ||
322 | cmp.eq p10,p8=r0,tmp | ||
323 | mov len1=len // copy because of rotation | ||
324 | (p8) br.cond.dpnt .diff_align_copy_user | ||
325 | ;; | ||
326 | // At this point we know we have more than 16 bytes to copy | ||
327 | // and also that both src and dest have the same alignment | ||
328 | // which may not be the one we want. So for now we must move | ||
329 | // forward slowly until we reach 16byte alignment: no need to | ||
330 | // worry about reaching the end of buffer. | ||
331 | // | ||
332 | EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned | ||
333 | (p6) adds len1=-1,len1;; | ||
334 | tbit.nz p7,p0=src1,1 | ||
335 | ;; | ||
336 | EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned | ||
337 | (p7) adds len1=-2,len1;; | ||
338 | tbit.nz p8,p0=src1,2 | ||
339 | ;; | ||
340 | // | ||
341 | // Stop bit not required after ld4 because if we fail on ld4 | ||
342 | // we have never executed the ld1, therefore st1 is not executed. | ||
343 | // | ||
344 | EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned | ||
345 | ;; | ||
346 | EX(.failure_out,(p6) st1 [dst1]=val1[0],1) | ||
347 | tbit.nz p9,p0=src1,3 | ||
348 | ;; | ||
349 | // | ||
350 | // Stop bit not required after ld8 because if we fail on ld8 | ||
351 | // we have never executed the ld2, therefore st2 is not executed. | ||
352 | // | ||
353 | EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned | ||
354 | EX(.failure_out,(p7) st2 [dst1]=val1[1],2) | ||
355 | (p8) adds len1=-4,len1 | ||
356 | ;; | ||
357 | EX(.failure_out, (p8) st4 [dst1]=val2[0],4) | ||
358 | (p9) adds len1=-8,len1;; | ||
359 | shr.u cnt=len1,4 // number of 128-bit (2x64bit) words | ||
360 | ;; | ||
361 | EX(.failure_out, (p9) st8 [dst1]=val2[1],8) | ||
362 | tbit.nz p6,p0=len1,3 | ||
363 | cmp.eq p7,p0=r0,cnt | ||
364 | adds tmp=-1,cnt // br.ctop is repeat/until | ||
365 | (p7) br.cond.dpnt .dotail // we have less than 16 bytes left | ||
366 | ;; | ||
367 | adds src2=8,src1 | ||
368 | adds dst2=8,dst1 | ||
369 | mov ar.lc=tmp | ||
370 | ;; | ||
371 | // | ||
372 | // 16bytes/iteration | ||
373 | // | ||
374 | 2: | ||
375 | EX(.failure_in3,(p16) ld8 val1[0]=[src1],16) | ||
376 | (p16) ld8 val2[0]=[src2],16 | ||
377 | |||
378 | EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16) | ||
379 | (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16 | ||
380 | br.ctop.dptk 2b | ||
381 | ;; // RAW on src1 when fall through from loop | ||
382 | // | ||
383 | // Tail correction based on len only | ||
384 | // | ||
385 | // No matter where we come from (loop or test) the src1 pointer | ||
386 | // is 16 byte aligned AND we have less than 16 bytes to copy. | ||
387 | // | ||
388 | .dotail: | ||
389 | EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes | ||
390 | tbit.nz p7,p0=len1,2 | ||
391 | ;; | ||
392 | EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes | ||
393 | tbit.nz p8,p0=len1,1 | ||
394 | ;; | ||
395 | EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes | ||
396 | tbit.nz p9,p0=len1,0 | ||
397 | ;; | ||
398 | EX(.failure_out, (p6) st8 [dst1]=val1[0],8) | ||
399 | ;; | ||
400 | EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left | ||
401 | mov ar.lc=saved_lc | ||
402 | ;; | ||
403 | EX(.failure_out,(p7) st4 [dst1]=val1[1],4) | ||
404 | mov pr=saved_pr,0xffffffffffff0000 | ||
405 | ;; | ||
406 | EX(.failure_out, (p8) st2 [dst1]=val2[0],2) | ||
407 | mov ar.pfs=saved_pfs | ||
408 | ;; | ||
409 | EX(.failure_out, (p9) st1 [dst1]=val2[1]) | ||
410 | br.ret.sptk.many rp | ||
411 | |||
412 | |||
413 | // | ||
414 | // Here we handle the case where the byte by byte copy fails | ||
415 | // on the load. | ||
416 | // Several factors make the zeroing of the rest of the buffer kind of | ||
417 | // tricky: | ||
418 | // - the pipeline: loads/stores are not in sync (pipeline) | ||
419 | // | ||
420 | // In the same loop iteration, the dst1 pointer does not directly | ||
421 | // reflect where the faulty load was. | ||
422 | // | ||
423 | // - pipeline effect | ||
424 | // When you get a fault on load, you may have valid data from | ||
425 | // previous loads in transit and not yet stored. Such data must be | |
426 | // stored normally before moving on to zeroing the rest. | |
427 | // | ||
428 | // - single/multi dispersal independence. | ||
429 | // | ||
430 | // solution: | ||
431 | // - we don't disrupt the pipeline, i.e. data in transit in | ||
432 | // the software pipeline will eventually be moved to memory. | |
433 | // We simply replace the load with a simple mov and keep the | ||
434 | // pipeline going. We can't really do this inline because | ||
435 | // p16 is always reset to 1 when lc > 0. | ||
436 | // | ||
437 | .failure_in_pipe1: | ||
438 | sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied | ||
439 | 1: | ||
440 | (p16) mov val1[0]=r0 | ||
441 | (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1 | ||
442 | br.ctop.dptk 1b | ||
443 | ;; | ||
444 | mov pr=saved_pr,0xffffffffffff0000 | ||
445 | mov ar.lc=saved_lc | ||
446 | mov ar.pfs=saved_pfs | ||
447 | br.ret.sptk.many rp | ||
448 | |||
449 | // | ||
450 | // This is the case where the byte by byte copy fails on the load | ||
451 | // when we copy the head. We need to finish the pipeline and copy | ||
452 | // zeros for the rest of the destination. Since this happens | ||
453 | // at the top we still need to fill the body and tail. | ||
454 | .failure_in_pipe2: | ||
455 | sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied | ||
456 | 2: | ||
457 | (p16) mov val1[0]=r0 | ||
458 | (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1 | ||
459 | br.ctop.dptk 2b | ||
460 | ;; | ||
461 | sub len=enddst,dst1,1 // precompute len | ||
462 | br.cond.dptk.many .failure_in1bis | ||
463 | ;; | ||
464 | |||
465 | // | ||
466 | // Here we handle the head & tail part when we check for alignment. | ||
467 | // The following code handles only the load failures. The | ||
468 | // main difficulty comes from the fact that loads/stores are | |
469 | // scheduled. So when you fail on a load, the stores corresponding | ||
470 | // to previous successful loads must be executed. | ||
471 | // | ||
472 | // However some simplifications are possible given the way | ||
473 | // things work. | ||
474 | // | ||
475 | // 1) HEAD | ||
476 | // Theory of operation: | ||
477 | // | ||
478 | // Page A | Page B | ||
479 | // ---------|----- | ||
480 | // 1|8 x | ||
481 | // 1 2|8 x | ||
482 | // 4|8 x | ||
483 | // 1 4|8 x | ||
484 | // 2 4|8 x | ||
485 | // 1 2 4|8 x | ||
486 | // |1 | ||
487 | // |2 x | ||
488 | // |4 x | ||
489 | // | ||
490 | // page_size >= 4k (2^12). (x means 4, 2, 1) | ||
491 | // Here we suppose Page A exists and Page B does not. | ||
492 | // | ||
493 | // As we move towards eight byte alignment we may encounter faults. | ||
494 | // The numbers on each page show the size of the load (current alignment). | ||
495 | // | ||
496 | // Key point: | ||
497 | // - if you fail on 1, 2, 4 then you have never executed any smaller | ||
498 | // size loads, e.g. failing ld4 means no ld1 nor ld2 executed | ||
499 | // before. | ||
500 | // | ||
501 | // This allows us to simplify the cleanup code, because basically you | ||
502 | // only have to worry about "pending" stores in the case of a failing | ||
503 | // ld8(). Given the way the code is written today, this means only | ||
504 | // worry about st2, st4. There we can use the information encapsulated | ||
505 | // into the predicates. | ||
506 | // | ||
507 | // Other key point: | ||
508 | // - if you fail on the ld8 in the head, it means you went straight | ||
509 | // to it, i.e. 8byte alignment within an unexisting page. | ||
510 | // Again this comes from the fact that if you crossed just for the ld8 then | ||
511 | // you are 8byte aligned but also 16byte align, therefore you would | ||
512 | // either go for the 16byte copy loop OR the ld8 in the tail part. | ||
513 | // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible | ||
514 | // because it would mean you had 15bytes to copy in which case you | ||
515 | // would have defaulted to the byte by byte copy. | ||
516 | // | ||
517 | // | ||
518 | // 2) TAIL | ||
519 | // Here we know we have less than 16 bytes AND we are either 8 or 16 byte | |
520 | // aligned. | ||
521 | // | ||
522 | // Key point: | ||
523 | // This means that we either: | ||
524 | // - are right on a page boundary | ||
525 | // OR | ||
526 | // - are at more than 16 bytes from a page boundary with | ||
527 | // at most 15 bytes to copy: no chance of crossing. | ||
528 | // | ||
529 | // This allows us to assume that if we fail on a load we haven't possibly | ||
530 | // executed any of the previous (tail) ones, so we don't need to do | ||
531 | // any stores. For instance, if we fail on ld2, this means we had | ||
532 | // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4. | ||
533 | // | ||
534 | // This means that we are in a situation similar to a fault in the | |
535 | // head part. That's nice! | ||
536 | // | ||
537 | .failure_in1: | ||
538 | sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied | ||
539 | sub len=endsrc,src1,1 | ||
540 | // | ||
541 | // we know that ret0 can never be zero at this point | ||
542 | // because we failed while trying to do a load, i.e. there is still | |
543 | // some work to do. | ||
544 | // The failure_in1bis and length problem is taken care of at the | ||
545 | // calling side. | ||
546 | // | ||
547 | ;; | ||
548 | .failure_in1bis: // from (.failure_in3) | ||
549 | mov ar.lc=len // Continue with a stupid byte store. | ||
550 | ;; | ||
551 | 5: | ||
552 | st1 [dst1]=r0,1 | ||
553 | br.cloop.dptk 5b | ||
554 | ;; | ||
555 | mov pr=saved_pr,0xffffffffffff0000 | ||
556 | mov ar.lc=saved_lc | ||
557 | mov ar.pfs=saved_pfs | ||
558 | br.ret.sptk.many rp | ||
559 | |||
560 | // | ||
561 | // Here we simply restart the loop but instead | ||
562 | // of doing loads we fill the pipeline with zeroes | ||
563 | // We can't simply store r0 because we may have valid | ||
564 | // data in transit in the pipeline. | ||
565 | // ar.lc and ar.ec are setup correctly at this point | ||
566 | // | ||
567 | // we MUST use src1/endsrc here and not dst1/enddst because | ||
568 | // of the pipeline effect. | ||
569 | // | ||
570 | .failure_in3: | ||
571 | sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied | ||
572 | ;; | ||
573 | 2: | ||
574 | (p16) mov val1[0]=r0 | ||
575 | (p16) mov val2[0]=r0 | ||
576 | (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16 | ||
577 | (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16 | ||
578 | br.ctop.dptk 2b | ||
579 | ;; | ||
580 | cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ? | ||
581 | sub len=enddst,dst1,1 // precompute len | ||
582 | (p6) br.cond.dptk .failure_in1bis | ||
583 | ;; | ||
584 | mov pr=saved_pr,0xffffffffffff0000 | ||
585 | mov ar.lc=saved_lc | ||
586 | mov ar.pfs=saved_pfs | ||
587 | br.ret.sptk.many rp | ||
588 | |||
589 | .failure_in2: | ||
590 | sub ret0=endsrc,src1 | ||
591 | cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ? | ||
592 | sub len=enddst,dst1,1 // precompute len | ||
593 | (p6) br.cond.dptk .failure_in1bis | ||
594 | ;; | ||
595 | mov pr=saved_pr,0xffffffffffff0000 | ||
596 | mov ar.lc=saved_lc | ||
597 | mov ar.pfs=saved_pfs | ||
598 | br.ret.sptk.many rp | ||
599 | |||
600 | // | ||
601 | // handling of failures on stores: that's the easy part | ||
602 | // | ||
603 | .failure_out: | ||
604 | sub ret0=enddst,dst1 | ||
605 | mov pr=saved_pr,0xffffffffffff0000 | ||
606 | mov ar.lc=saved_lc | ||
607 | |||
608 | mov ar.pfs=saved_pfs | ||
609 | br.ret.sptk.many rp | ||
610 | END(__copy_user) | ||
diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c new file mode 100644 index 000000000000..36866e8a5d2b --- /dev/null +++ b/arch/ia64/lib/csum_partial_copy.c | |||
@@ -0,0 +1,151 @@ | |||
1 | /* | ||
2 | * Network Checksum & Copy routine | ||
3 | * | ||
4 | * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co | ||
5 | * Stephane Eranian <eranian@hpl.hp.com> | ||
6 | * | ||
7 | * Most of the code has been imported from Linux/Alpha | ||
8 | */ | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/string.h> | ||
13 | |||
14 | #include <asm/uaccess.h> | ||
15 | |||
16 | /* | ||
17 | * XXX Fixme: those 2 inlines are meant for debugging and will go away | ||
18 | */ | ||
/*
 * Fold a 64-bit one's-complement accumulator down to a 16-bit value.
 * Each step adds the upper part back into the lower part so that no
 * carry is lost (end-around carry).
 */
static inline unsigned
short from64to16(unsigned long x)
{
	/* add up 32-bit words for 33 bits */
	x = (x & 0xffffffff) + (x >> 32);
	/* add up 16-bit and 17-bit words for 17+c bits */
	x = (x & 0xffff) + (x >> 16);
	/* add up 16-bit and 2-bit for 16+c bit */
	x = (x & 0xffff) + (x >> 16);
	/* add up carry.. */
	x = (x & 0xffff) + (x >> 16);
	return x;
}

/*
 * Reference C implementation of the Internet checksum over a buffer.
 *
 * buff: start of the data (any alignment)
 * len:  number of bytes to sum; if len <= 0, psum is returned unchanged
 * psum: partial sum to add to the checksum of the buffer
 *
 * Returns the 16-bit folded sum of the buffer plus psum.
 *
 * The buffer is summed starting from zero and psum is only added after
 * the odd-alignment byte swap.  Seeding the accumulator with psum (as was
 * done before) lost psum entirely when buff was odd-aligned, because the
 * head byte was assigned rather than added, and adding it instead would
 * have subjected psum to the byte swap as well.
 */
static inline
unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
{
	int odd, count;
	unsigned long result = 0;

	if (len <= 0)
		return (unsigned long)psum;

	/* peel off a leading byte so that the rest is 16-bit aligned */
	odd = 1 & (unsigned long) buff;
	if (odd) {
		result = *buff << 8;
		len--;
		buff++;
	}
	count = len >> 1;		/* nr of 16-bit words.. */
	if (count) {
		if (2 & (unsigned long) buff) {
			result += *(unsigned short *) buff;
			count--;
			len -= 2;
			buff += 2;
		}
		count >>= 1;		/* nr of 32-bit words.. */
		if (count) {
			if (4 & (unsigned long) buff) {
				result += *(unsigned int *) buff;
				count--;
				len -= 4;
				buff += 4;
			}
			count >>= 1;	/* nr of 64-bit words.. */
			if (count) {
				unsigned long carry = 0;
				do {
					unsigned long w = *(unsigned long *) buff;
					count--;
					buff += 8;
					result += carry;
					result += w;
					/* unsigned wrap-around means a carry out of bit 63 */
					carry = (w > result);
				} while (count);
				result += carry;
				result = (result & 0xffffffff) + (result >> 32);
			}
			if (len & 4) {
				result += *(unsigned int *) buff;
				buff += 4;
			}
		}
		if (len & 2) {
			result += *(unsigned short *) buff;
			buff += 2;
		}
	}
	if (len & 1)
		result += *buff;

	result = from64to16(result);

	/* an odd start address means every byte was summed in the wrong lane */
	if (odd)
		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);

	/* add the caller's partial sum last so the swap above cannot touch it */
	return from64to16(result + psum);
}
98 | |||
99 | /* | ||
100 | * XXX Fixme | ||
101 | * | ||
102 | * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS. | ||
103 | * But it's very tricky to get right even in C. | ||
104 | */ | ||
105 | extern unsigned long do_csum(const unsigned char *, long); | ||
106 | |||
107 | static unsigned int | ||
108 | do_csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst, | ||
109 | int len, unsigned int psum, int *errp) | ||
110 | { | ||
111 | unsigned long result; | ||
112 | |||
113 | /* XXX Fixme | ||
114 | * for now we separate the copy from checksum for obvious | ||
115 | * alignment difficulties. Look at the Alpha code and you'll be | ||
116 | * scared. | ||
117 | */ | ||
118 | |||
119 | if (__copy_from_user(dst, src, len) != 0 && errp) | ||
120 | *errp = -EFAULT; | ||
121 | |||
122 | result = do_csum(dst, len); | ||
123 | |||
124 | /* add in old sum, and carry.. */ | ||
125 | result += psum; | ||
126 | /* 32+c bits -> 32 bits */ | ||
127 | result = (result & 0xffffffff) + (result >> 32); | ||
128 | return result; | ||
129 | } | ||
130 | |||
131 | unsigned int | ||
132 | csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst, | ||
133 | int len, unsigned int sum, int *errp) | ||
134 | { | ||
135 | if (!access_ok(VERIFY_READ, src, len)) { | ||
136 | *errp = -EFAULT; | ||
137 | memset(dst, 0, len); | ||
138 | return sum; | ||
139 | } | ||
140 | |||
141 | return do_csum_partial_copy_from_user(src, dst, len, sum, errp); | ||
142 | } | ||
143 | |||
144 | unsigned int | ||
145 | csum_partial_copy_nocheck(const unsigned char __user *src, unsigned char *dst, | ||
146 | int len, unsigned int sum) | ||
147 | { | ||
148 | return do_csum_partial_copy_from_user(src, dst, len, sum, NULL); | ||
149 | } | ||
150 | |||
151 | EXPORT_SYMBOL(csum_partial_copy_nocheck); | ||
diff --git a/arch/ia64/lib/dec_and_lock.c b/arch/ia64/lib/dec_and_lock.c new file mode 100644 index 000000000000..c7ce92f968f1 --- /dev/null +++ b/arch/ia64/lib/dec_and_lock.c | |||
@@ -0,0 +1,42 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Jerome Marchand, Bull S.A. | ||
3 | * Cleaned up by David Mosberger-Tang <davidm@hpl.hp.com> | ||
4 | * | ||
5 | * This file is released under the GPLv2, or at your option any later version. | ||
6 | * | ||
7 | * ia64 version of "atomic_dec_and_lock()" using the atomic "cmpxchg" instruction. This | ||
8 | * code is an adaptation of the x86 version of "atomic_dec_and_lock()". | ||
9 | */ | ||
10 | |||
11 | #include <linux/compiler.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/spinlock.h> | ||
14 | #include <asm/atomic.h> | ||
15 | |||
16 | /* | ||
17 | * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock. Both of these | ||
18 | * operations have to be done atomically, so that the count doesn't drop to zero without | ||
19 | * acquiring the spinlock first. | ||
20 | */ | ||
21 | int | ||
22 | _atomic_dec_and_lock (atomic_t *refcount, spinlock_t *lock) | ||
23 | { | ||
24 | int old, new; | ||
25 | |||
26 | do { | ||
27 | old = atomic_read(refcount); | ||
28 | new = old - 1; | ||
29 | |||
30 | if (unlikely (old == 1)) { | ||
31 | /* oops, we may be decrementing to zero, do it the slow way... */ | ||
32 | spin_lock(lock); | ||
33 | if (atomic_dec_and_test(refcount)) | ||
34 | return 1; | ||
35 | spin_unlock(lock); | ||
36 | return 0; | ||
37 | } | ||
38 | } while (cmpxchg(&refcount->counter, old, new) != old); | ||
39 | return 0; | ||
40 | } | ||
41 | |||
42 | EXPORT_SYMBOL(_atomic_dec_and_lock); | ||
diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S new file mode 100644 index 000000000000..6bec2fc9f5b2 --- /dev/null +++ b/arch/ia64/lib/do_csum.S | |||
@@ -0,0 +1,323 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Optimized version of the standard do_csum() function | ||
4 | * | ||
5 | * Return: a 64bit quantity containing the 16bit Internet checksum | ||
6 | * | ||
7 | * Inputs: | ||
8 | * in0: address of buffer to checksum (char *) | ||
9 | * in1: length of the buffer (int) | ||
10 | * | ||
11 | * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co | ||
12 | * Stephane Eranian <eranian@hpl.hp.com> | ||
13 | * | ||
14 | * 02/04/22 Ken Chen <kenneth.w.chen@intel.com> | ||
15 | * Data locality study on the checksum buffer. | ||
16 | * More optimization cleanup - remove excessive stop bits. | ||
17 | * 02/04/08 David Mosberger <davidm@hpl.hp.com> | ||
18 | * More cleanup and tuning. | ||
19 | * 01/04/18 Jun Nakajima <jun.nakajima@intel.com> | ||
20 | * Clean up and optimize the software pipeline, loading two | ||
21 | * back-to-back 8-byte words per loop. Clean up the initialization | ||
22 | * for the loop. Support the cases where load latency = 1 or 2. | ||
23 | * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default). | ||
24 | */ | ||
25 | |||
26 | #include <asm/asmmacro.h> | ||
27 | |||
28 | // | ||
29 | // Theory of operations: | ||
30 | // The goal is to go as quickly as possible to the point where | ||
31 | // we can checksum 16 bytes/loop. Before reaching that point we must | ||
32 | // take care of incorrect alignment of first byte. | ||
33 | // | ||
34 | // The code hereafter also takes care of the "tail" part of the buffer | ||
35 | // before entering the core loop, if any. The checksum is a sum so it | ||
36 | // allows us to commute operations. So we do the "head" and "tail" | ||
37 | // first to finish at full speed in the body. Once we get the head and | ||
38 | // tail values, we feed them into the pipeline, very handy initialization. | ||
39 | // | ||
40 | // Of course we deal with the special case where the whole buffer fits | ||
41 | // into one 8 byte word. In this case we have only one entry in the pipeline. | ||
42 | // | ||
43 | // We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for | ||
44 | // possible load latency and also to accommodate head and tail. | ||
45 | // | ||
46 | // The end of the function deals with folding the checksum from 64bits | ||
47 | // down to 16bits taking care of the carry. | ||
48 | // | ||
49 | // This version avoids synchronization in the core loop by also using a | ||
50 | // pipeline for the accumulation of the checksum in resultx[] (x=1,2). | ||
51 | // | ||
52 | // wordx[] (x=1,2) | ||
53 | // |---| | ||
54 | // | | 0 : new value loaded in pipeline | ||
55 | // |---| | ||
56 | // | | - : in transit data | ||
57 | // |---| | ||
58 | // | | LOAD_LATENCY : current value to add to checksum | ||
59 | // |---| | ||
60 | // | | LOAD_LATENCY+1 : previous value added to checksum | ||
61 | // |---| (previous iteration) | ||
62 | // | ||
63 | // resultx[] (x=1,2) | ||
64 | // |---| | ||
65 | // | | 0 : initial value | ||
66 | // |---| | ||
67 | // | | LOAD_LATENCY-1 : new checksum | ||
68 | // |---| | ||
69 | // | | LOAD_LATENCY : previous value of checksum | ||
70 | // |---| | ||
71 | // | | LOAD_LATENCY+1 : final checksum when out of the loop | ||
72 | // |---| | ||
73 | // | ||
74 | // | ||
75 | // See RFC1071 "Computing the Internet Checksum" for various techniques for | ||
76 | // calculating the Internet checksum. | ||
77 | // | ||
78 | // NOT YET DONE: | ||
79 | // - Maybe another algorithm which would take care of the folding at the | ||
80 | // end in a different manner | ||
81 | // - Work with people more knowledgeable than me on the network stack | ||
82 | // to figure out if we could not split the function depending on the | ||
83 | // type of packet or alignment we get. Like the ip_fast_csum() routine | ||
84 | // where we know we have at least 20bytes worth of data to checksum. | ||
85 | // - Do a better job of handling small packets. | ||
86 | // - Note on prefetching: it was found that under various load, i.e. ftp read/write, | ||
87 | // nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8% | ||
88 | // on the data that buffer points to (partly because the checksum is often preceded by | ||
89 | // a copy_from_user()). This finding indicates that lfetch will not be beneficial since | ||
90 | // the data is already in the cache. | ||
91 | // | ||
92 | |||
93 | #define saved_pfs r11 | ||
94 | #define hmask r16 | ||
95 | #define tmask r17 | ||
96 | #define first1 r18 | ||
97 | #define firstval r19 | ||
98 | #define firstoff r20 | ||
99 | #define last r21 | ||
100 | #define lastval r22 | ||
101 | #define lastoff r23 | ||
102 | #define saved_lc r24 | ||
103 | #define saved_pr r25 | ||
104 | #define tmp1 r26 | ||
105 | #define tmp2 r27 | ||
106 | #define tmp3 r28 | ||
107 | #define carry1 r29 | ||
108 | #define carry2 r30 | ||
109 | #define first2 r31 | ||
110 | |||
111 | #define buf in0 | ||
112 | #define len in1 | ||
113 | |||
114 | #define LOAD_LATENCY 2 // XXX fix me | ||
115 | |||
116 | #if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2) | ||
117 | # error "Only 1 or 2 is supported/tested for LOAD_LATENCY." | ||
118 | #endif | ||
119 | |||
120 | #define PIPE_DEPTH (LOAD_LATENCY+2) | ||
121 | #define ELD p[LOAD_LATENCY] // end of load | ||
122 | #define ELD_1 p[LOAD_LATENCY+1] // and next stage | ||
123 | |||
124 | // unsigned long do_csum(unsigned char *buf,long len) | ||
125 | |||
126 | GLOBAL_ENTRY(do_csum) | ||
127 | .prologue | ||
128 | .save ar.pfs, saved_pfs | ||
129 | alloc saved_pfs=ar.pfs,2,16,0,16 | ||
130 | .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2] | ||
131 | .rotp p[PIPE_DEPTH], pC1[2], pC2[2] | ||
132 | mov ret0=r0 // in case we have zero length | ||
133 | cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len) | ||
134 | ;; | ||
135 | add tmp1=buf,len // last byte's address | ||
136 | .save pr, saved_pr | ||
137 | mov saved_pr=pr // preserve predicates (rotation) | ||
138 | (p6) br.ret.spnt.many rp // return if zero or negative length | ||
139 | |||
140 | mov hmask=-1 // initialize head mask | ||
141 | tbit.nz p15,p0=buf,0 // is buf an odd address? | ||
142 | and first1=-8,buf // 8-byte align down address of first1 element | ||
143 | |||
144 | and firstoff=7,buf // how many bytes off for first1 element | ||
145 | mov tmask=-1 // initialize tail mask | ||
146 | |||
147 | ;; | ||
148 | adds tmp2=-1,tmp1 // last-1 | ||
149 | and lastoff=7,tmp1 // how many bytes off for last element | ||
150 | ;; | ||
151 | sub tmp1=8,lastoff // complement to lastoff | ||
152 | and last=-8,tmp2 // address of word containing last byte | ||
153 | ;; | ||
154 | sub tmp3=last,first1 // tmp3=distance from first1 to last | ||
155 | .save ar.lc, saved_lc | ||
156 | mov saved_lc=ar.lc // save lc | ||
157 | cmp.eq p8,p9=last,first1 // everything fits in one word ? | ||
158 | |||
159 | ld8 firstval=[first1],8 // load, ahead of time, "first1" word | ||
160 | and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0 | ||
161 | shl tmp2=firstoff,3 // number of bits | ||
162 | ;; | ||
163 | (p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed | ||
164 | shl tmp1=tmp1,3 // number of bits | ||
165 | (p9) adds tmp3=-8,tmp3 // effectively loaded | ||
166 | ;; | ||
167 | (p8) mov lastval=r0 // we don't need lastval if first1==last | ||
168 | shl hmask=hmask,tmp2 // build head mask, mask off [0,first1off[ | ||
169 | shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff] | ||
170 | ;; | ||
171 | .body | ||
172 | #define count tmp3 | ||
173 | |||
174 | (p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only | ||
175 | (p9) and word2[0]=lastval,tmask // mask last it as appropriate | ||
176 | shr.u count=count,3 // how many 8-byte? | ||
177 | ;; | ||
178 | // If count is odd, finish this 8-byte word so that we can | ||
179 | // load two back-to-back 8-byte words per loop thereafter. | ||
180 | and word1[0]=firstval,hmask // and mask it as appropriate | ||
181 | tbit.nz p10,p11=count,0 // if (count is odd) | ||
182 | ;; | ||
183 | (p8) mov result1[0]=word1[0] | ||
184 | (p9) add result1[0]=word1[0],word2[0] | ||
185 | ;; | ||
186 | cmp.ltu p6,p0=result1[0],word1[0] // check the carry | ||
187 | cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte | ||
188 | ;; | ||
189 | (p6) adds result1[0]=1,result1[0] | ||
190 | (p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word) | ||
191 | (p11) br.cond.dptk .do_csum16 // if (count is even) | ||
192 | |||
193 | // Here count is odd. | ||
194 | ld8 word1[1]=[first1],8 // load an 8-byte word | ||
195 | cmp.eq p9,p10=1,count // if (count == 1) | ||
196 | adds count=-1,count // loaded an 8-byte word | ||
197 | ;; | ||
198 | add result1[0]=result1[0],word1[1] | ||
199 | ;; | ||
200 | cmp.ltu p6,p0=result1[0],word1[1] | ||
201 | ;; | ||
202 | (p6) adds result1[0]=1,result1[0] | ||
203 | (p9) br.cond.sptk .do_csum_exit // if (count == 1) exit | ||
204 | // Fall through to calculate the checksum, feeding result1[0] as | ||
205 | // the initial value in result1[0]. | ||
206 | // | ||
207 | // Calculate the checksum loading two 8-byte words per loop. | ||
208 | // | ||
209 | .do_csum16: | ||
210 | add first2=8,first1 | ||
211 | shr.u count=count,1 // we do 16 bytes per loop | ||
212 | ;; | ||
213 | adds count=-1,count | ||
214 | mov carry1=r0 | ||
215 | mov carry2=r0 | ||
216 | brp.loop.imp 1f,2f | ||
217 | ;; | ||
218 | mov ar.ec=PIPE_DEPTH | ||
219 | mov ar.lc=count // set lc | ||
220 | mov pr.rot=1<<16 | ||
221 | // result1[0] must be initialized in advance. | ||
222 | mov result2[0]=r0 | ||
223 | ;; | ||
224 | .align 32 | ||
225 | 1: | ||
226 | (ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1] | ||
227 | (pC1[1])adds carry1=1,carry1 | ||
228 | (ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1] | ||
229 | (pC2[1])adds carry2=1,carry2 | ||
230 | (ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY] | ||
231 | (ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY] | ||
232 | 2: | ||
233 | (p[0]) ld8 word1[0]=[first1],16 | ||
234 | (p[0]) ld8 word2[0]=[first2],16 | ||
235 | br.ctop.sptk 1b | ||
236 | ;; | ||
237 | // Since len is a 32-bit value, carry cannot be larger than a 64-bit value. | ||
238 | (pC1[1])adds carry1=1,carry1 // since we miss the last one | ||
239 | (pC2[1])adds carry2=1,carry2 | ||
240 | ;; | ||
241 | add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1 | ||
242 | add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2 | ||
243 | ;; | ||
244 | cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1 | ||
245 | cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2 | ||
246 | ;; | ||
247 | (p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1] | ||
248 | (p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1] | ||
249 | ;; | ||
250 | add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1] | ||
251 | ;; | ||
252 | cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1] | ||
253 | ;; | ||
254 | (p6) adds result1[0]=1,result1[0] | ||
255 | ;; | ||
256 | .do_csum_exit: | ||
257 | // | ||
258 | // now fold 64 into 16 bits taking care of carry | ||
259 | // that's not very good because it has lots of sequentiality | ||
260 | // | ||
261 | mov tmp3=0xffff | ||
262 | zxt4 tmp1=result1[0] | ||
263 | shr.u tmp2=result1[0],32 | ||
264 | ;; | ||
265 | add result1[0]=tmp1,tmp2 | ||
266 | ;; | ||
267 | and tmp1=result1[0],tmp3 | ||
268 | shr.u tmp2=result1[0],16 | ||
269 | ;; | ||
270 | add result1[0]=tmp1,tmp2 | ||
271 | ;; | ||
272 | and tmp1=result1[0],tmp3 | ||
273 | shr.u tmp2=result1[0],16 | ||
274 | ;; | ||
275 | add result1[0]=tmp1,tmp2 | ||
276 | ;; | ||
277 | and tmp1=result1[0],tmp3 | ||
278 | shr.u tmp2=result1[0],16 | ||
279 | ;; | ||
280 | add ret0=tmp1,tmp2 | ||
281 | mov pr=saved_pr,0xffffffffffff0000 | ||
282 | ;; | ||
283 | // if buf was odd then swap bytes | ||
284 | mov ar.pfs=saved_pfs // restore ar.ec | ||
285 | (p15) mux1 ret0=ret0,@rev // reverse word | ||
286 | ;; | ||
287 | mov ar.lc=saved_lc | ||
288 | (p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes | ||
289 | br.ret.sptk.many rp | ||
290 | |||
291 | // I (Jun Nakajima) wrote an equivalent code (see below), but it was | ||
292 | // not much better than the original. So keep the original there so that | ||
293 | // someone else can challenge. | ||
294 | // | ||
295 | // shr.u word1[0]=result1[0],32 | ||
296 | // zxt4 result1[0]=result1[0] | ||
297 | // ;; | ||
298 | // add result1[0]=result1[0],word1[0] | ||
299 | // ;; | ||
300 | // zxt2 result2[0]=result1[0] | ||
301 | // extr.u word1[0]=result1[0],16,16 | ||
302 | // shr.u carry1=result1[0],32 | ||
303 | // ;; | ||
304 | // add result2[0]=result2[0],word1[0] | ||
305 | // ;; | ||
306 | // add result2[0]=result2[0],carry1 | ||
307 | // ;; | ||
308 | // extr.u ret0=result2[0],16,16 | ||
309 | // ;; | ||
310 | // add ret0=ret0,result2[0] | ||
311 | // ;; | ||
312 | // zxt2 ret0=ret0 | ||
313 | // mov ar.pfs=saved_pfs // restore ar.ec | ||
314 | // mov pr=saved_pr,0xffffffffffff0000 | ||
315 | // ;; | ||
316 | // // if buf was odd then swap bytes | ||
317 | // mov ar.lc=saved_lc | ||
318 | //(p15) mux1 ret0=ret0,@rev // reverse word | ||
319 | // ;; | ||
320 | //(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes | ||
321 | // br.ret.sptk.many rp | ||
322 | |||
323 | END(do_csum) | ||
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S new file mode 100644 index 000000000000..29c802b19669 --- /dev/null +++ b/arch/ia64/lib/flush.S | |||
@@ -0,0 +1,39 @@ | |||
1 | /* | ||
2 | * Cache flushing routines. | ||
3 | * | ||
4 | * Copyright (C) 1999-2001 Hewlett-Packard Co | ||
5 | * Copyright (C) 1999-2001 David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | */ | ||
7 | #include <asm/asmmacro.h> | ||
8 | #include <asm/page.h> | ||
9 | |||
10 | /* | ||
11 | * flush_icache_range(start,end) | ||
12 | * Must flush range from start to end-1 but nothing else (need to | ||
13 | * be careful not to touch addresses that may be unmapped). | ||
14 | */ | ||
15 | GLOBAL_ENTRY(flush_icache_range) | ||
16 | .prologue | ||
17 | alloc r2=ar.pfs,2,0,0,0 | ||
18 | sub r8=in1,in0,1 | ||
19 | ;; | ||
20 | shr.u r8=r8,5 // we flush 32 bytes per iteration | ||
21 | .save ar.lc, r3 | ||
22 | mov r3=ar.lc // save ar.lc | ||
23 | ;; | ||
24 | |||
25 | .body | ||
26 | |||
27 | mov ar.lc=r8 | ||
28 | ;; | ||
29 | .Loop: fc in0 // issuable on M0 only | ||
30 | add in0=32,in0 | ||
31 | br.cloop.sptk.few .Loop | ||
32 | ;; | ||
33 | sync.i | ||
34 | ;; | ||
35 | srlz.i | ||
36 | ;; | ||
37 | mov ar.lc=r3 // restore ar.lc | ||
38 | br.ret.sptk.many rp | ||
39 | END(flush_icache_range) | ||
diff --git a/arch/ia64/lib/idiv32.S b/arch/ia64/lib/idiv32.S new file mode 100644 index 000000000000..2ac28bf0a662 --- /dev/null +++ b/arch/ia64/lib/idiv32.S | |||
@@ -0,0 +1,83 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2000 Hewlett-Packard Co | ||
3 | * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> | ||
4 | * | ||
5 | * 32-bit integer division. | ||
6 | * | ||
7 | * This code is based on the application note entitled "Divide, Square Root | ||
8 | * and Remainder Algorithms for the IA-64 Architecture". This document | ||
9 | * is available as Intel document number 248725-002 or via the web at | ||
10 | * http://developer.intel.com/software/opensource/numerics/ | ||
11 | * | ||
12 | * For more details on the theory behind these algorithms, see "IA-64 | ||
13 | * and Elementary Functions" by Peter Markstein; HP Professional Books | ||
14 | * (http://www.hp.com/go/retailbooks/) | ||
15 | */ | ||
16 | |||
17 | #include <asm/asmmacro.h> | ||
18 | |||
19 | #ifdef MODULO | ||
20 | # define OP mod | ||
21 | #else | ||
22 | # define OP div | ||
23 | #endif | ||
24 | |||
25 | #ifdef UNSIGNED | ||
26 | # define SGN u | ||
27 | # define EXTEND zxt4 | ||
28 | # define INT_TO_FP(a,b) fcvt.xuf.s1 a=b | ||
29 | # define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b | ||
30 | #else | ||
31 | # define SGN | ||
32 | # define EXTEND sxt4 | ||
33 | # define INT_TO_FP(a,b) fcvt.xf a=b | ||
34 | # define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b | ||
35 | #endif | ||
36 | |||
37 | #define PASTE1(a,b) a##b | ||
38 | #define PASTE(a,b) PASTE1(a,b) | ||
39 | #define NAME PASTE(PASTE(__,SGN),PASTE(OP,si3)) | ||
40 | |||
41 | GLOBAL_ENTRY(NAME) | ||
42 | .regstk 2,0,0,0 | ||
43 | // Transfer inputs to FP registers. | ||
44 | mov r2 = 0xffdd // r2 = -34 + 65535 (fp reg format bias) | ||
45 | EXTEND in0 = in0 // in0 = a | ||
46 | EXTEND in1 = in1 // in1 = b | ||
47 | ;; | ||
48 | setf.sig f8 = in0 | ||
49 | setf.sig f9 = in1 | ||
50 | #ifdef MODULO | ||
51 | sub in1 = r0, in1 // in1 = -b | ||
52 | #endif | ||
53 | ;; | ||
54 | // Convert the inputs to FP, to avoid FP software-assist faults. | ||
55 | INT_TO_FP(f8, f8) | ||
56 | INT_TO_FP(f9, f9) | ||
57 | ;; | ||
58 | setf.exp f7 = r2 // f7 = 2^-34 | ||
59 | frcpa.s1 f6, p6 = f8, f9 // y0 = frcpa(b) | ||
60 | ;; | ||
61 | (p6) fmpy.s1 f8 = f8, f6 // q0 = a*y0 | ||
62 | (p6) fnma.s1 f6 = f9, f6, f1 // e0 = -b*y0 + 1 | ||
63 | ;; | ||
64 | #ifdef MODULO | ||
65 | setf.sig f9 = in1 // f9 = -b | ||
66 | #endif | ||
67 | (p6) fma.s1 f8 = f6, f8, f8 // q1 = e0*q0 + q0 | ||
68 | (p6) fma.s1 f6 = f6, f6, f7 // e1 = e0*e0 + 2^-34 | ||
69 | ;; | ||
70 | #ifdef MODULO | ||
71 | setf.sig f7 = in0 | ||
72 | #endif | ||
73 | (p6) fma.s1 f6 = f6, f8, f8 // q2 = e1*q1 + q1 | ||
74 | ;; | ||
75 | FP_TO_INT(f6, f6) // q = trunc(q2) | ||
76 | ;; | ||
77 | #ifdef MODULO | ||
78 | xma.l f6 = f6, f9, f7 // r = q*(-b) + a | ||
79 | ;; | ||
80 | #endif | ||
81 | getf.sig r8 = f6 // transfer result to result register | ||
82 | br.ret.sptk.many rp | ||
83 | END(NAME) | ||
diff --git a/arch/ia64/lib/idiv64.S b/arch/ia64/lib/idiv64.S new file mode 100644 index 000000000000..f69bd2b0987a --- /dev/null +++ b/arch/ia64/lib/idiv64.S | |||
@@ -0,0 +1,80 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1999-2000 Hewlett-Packard Co | ||
3 | * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com> | ||
4 | * | ||
5 | * 64-bit integer division. | ||
6 | * | ||
7 | * This code is based on the application note entitled "Divide, Square Root | ||
8 | * and Remainder Algorithms for the IA-64 Architecture". This document | ||
9 | * is available as Intel document number 248725-002 or via the web at | ||
10 | * http://developer.intel.com/software/opensource/numerics/ | ||
11 | * | ||
12 | * For more details on the theory behind these algorithms, see "IA-64 | ||
13 | * and Elementary Functions" by Peter Markstein; HP Professional Books | ||
14 | * (http://www.hp.com/go/retailbooks/) | ||
15 | */ | ||
16 | |||
17 | #include <asm/asmmacro.h> | ||
18 | |||
19 | #ifdef MODULO | ||
20 | # define OP mod | ||
21 | #else | ||
22 | # define OP div | ||
23 | #endif | ||
24 | |||
25 | #ifdef UNSIGNED | ||
26 | # define SGN u | ||
27 | # define INT_TO_FP(a,b) fcvt.xuf.s1 a=b | ||
28 | # define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b | ||
29 | #else | ||
30 | # define SGN | ||
31 | # define INT_TO_FP(a,b) fcvt.xf a=b | ||
32 | # define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b | ||
33 | #endif | ||
34 | |||
35 | #define PASTE1(a,b) a##b | ||
36 | #define PASTE(a,b) PASTE1(a,b) | ||
37 | #define NAME PASTE(PASTE(__,SGN),PASTE(OP,di3)) | ||
38 | |||
39 | GLOBAL_ENTRY(NAME) | ||
40 | .regstk 2,0,0,0 | ||
41 | // Transfer inputs to FP registers. | ||
42 | setf.sig f8 = in0 | ||
43 | setf.sig f9 = in1 | ||
44 | ;; | ||
45 | // Convert the inputs to FP, to avoid FP software-assist faults. | ||
46 | INT_TO_FP(f8, f8) | ||
47 | INT_TO_FP(f9, f9) | ||
48 | ;; | ||
49 | frcpa.s1 f11, p6 = f8, f9 // y0 = frcpa(b) | ||
50 | ;; | ||
51 | (p6) fmpy.s1 f7 = f8, f11 // q0 = a*y0 | ||
52 | (p6) fnma.s1 f6 = f9, f11, f1 // e0 = -b*y0 + 1 | ||
53 | ;; | ||
54 | (p6) fma.s1 f10 = f7, f6, f7 // q1 = q0*e0 + q0 | ||
55 | (p6) fmpy.s1 f7 = f6, f6 // e1 = e0*e0 | ||
56 | ;; | ||
57 | #ifdef MODULO | ||
58 | sub in1 = r0, in1 // in1 = -b | ||
59 | #endif | ||
60 | (p6) fma.s1 f10 = f10, f7, f10 // q2 = q1*e1 + q1 | ||
61 | (p6) fma.s1 f6 = f11, f6, f11 // y1 = y0*e0 + y0 | ||
62 | ;; | ||
63 | (p6) fma.s1 f6 = f6, f7, f6 // y2 = y1*e1 + y1 | ||
64 | (p6) fnma.s1 f7 = f9, f10, f8 // r = -b*q2 + a | ||
65 | ;; | ||
66 | #ifdef MODULO | ||
67 | setf.sig f8 = in0 // f8 = a | ||
68 | setf.sig f9 = in1 // f9 = -b | ||
69 | #endif | ||
70 | (p6) fma.s1 f11 = f7, f6, f10 // q3 = r*y2 + q2 | ||
71 | ;; | ||
72 | FP_TO_INT(f11, f11) // q = trunc(q3) | ||
73 | ;; | ||
74 | #ifdef MODULO | ||
75 | xma.l f11 = f11, f9, f8 // r = q*(-b) + a | ||
76 | ;; | ||
77 | #endif | ||
78 | getf.sig r8 = f11 // transfer result to result register | ||
79 | br.ret.sptk.many rp | ||
80 | END(NAME) | ||
diff --git a/arch/ia64/lib/io.c b/arch/ia64/lib/io.c new file mode 100644 index 000000000000..8949e44091ac --- /dev/null +++ b/arch/ia64/lib/io.c | |||
@@ -0,0 +1,165 @@ | |||
1 | #include <linux/config.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/types.h> | ||
4 | |||
5 | #include <asm/io.h> | ||
6 | |||
7 | /* | ||
8 | * Copy data from IO memory space to "real" memory space. | ||
9 | * This needs to be optimized. | ||
10 | */ | ||
11 | void memcpy_fromio(void *to, const volatile void __iomem *from, long count) | ||
12 | { | ||
13 | char *dst = to; | ||
14 | |||
15 | while (count) { | ||
16 | count--; | ||
17 | *dst++ = readb(from++); | ||
18 | } | ||
19 | } | ||
20 | EXPORT_SYMBOL(memcpy_fromio); | ||
21 | |||
22 | /* | ||
23 | * Copy data from "real" memory space to IO memory space. | ||
24 | * This needs to be optimized. | ||
25 | */ | ||
26 | void memcpy_toio(volatile void __iomem *to, const void *from, long count) | ||
27 | { | ||
28 | const char *src = from; | ||
29 | |||
30 | while (count) { | ||
31 | count--; | ||
32 | writeb(*src++, to++); | ||
33 | } | ||
34 | } | ||
35 | EXPORT_SYMBOL(memcpy_toio); | ||
36 | |||
37 | /* | ||
38 | * "memset" on IO memory space. | ||
39 | * This needs to be optimized. | ||
40 | */ | ||
41 | void memset_io(volatile void __iomem *dst, int c, long count) | ||
42 | { | ||
43 | unsigned char ch = (char)(c & 0xff); | ||
44 | |||
45 | while (count) { | ||
46 | count--; | ||
47 | writeb(ch, dst); | ||
48 | dst++; | ||
49 | } | ||
50 | } | ||
51 | EXPORT_SYMBOL(memset_io); | ||
52 | |||
53 | #ifdef CONFIG_IA64_GENERIC | ||
54 | |||
55 | #undef __ia64_inb | ||
56 | #undef __ia64_inw | ||
57 | #undef __ia64_inl | ||
58 | #undef __ia64_outb | ||
59 | #undef __ia64_outw | ||
60 | #undef __ia64_outl | ||
61 | #undef __ia64_readb | ||
62 | #undef __ia64_readw | ||
63 | #undef __ia64_readl | ||
64 | #undef __ia64_readq | ||
65 | #undef __ia64_readb_relaxed | ||
66 | #undef __ia64_readw_relaxed | ||
67 | #undef __ia64_readl_relaxed | ||
68 | #undef __ia64_readq_relaxed | ||
69 | #undef __ia64_writeb | ||
70 | #undef __ia64_writew | ||
71 | #undef __ia64_writel | ||
72 | #undef __ia64_writeq | ||
73 | #undef __ia64_mmiowb | ||
74 | |||
75 | unsigned int | ||
76 | __ia64_inb (unsigned long port) | ||
77 | { | ||
78 | return ___ia64_inb(port); | ||
79 | } | ||
80 | |||
81 | unsigned int | ||
82 | __ia64_inw (unsigned long port) | ||
83 | { | ||
84 | return ___ia64_inw(port); | ||
85 | } | ||
86 | |||
87 | unsigned int | ||
88 | __ia64_inl (unsigned long port) | ||
89 | { | ||
90 | return ___ia64_inl(port); | ||
91 | } | ||
92 | |||
93 | void | ||
94 | __ia64_outb (unsigned char val, unsigned long port) | ||
95 | { | ||
96 | ___ia64_outb(val, port); | ||
97 | } | ||
98 | |||
99 | void | ||
100 | __ia64_outw (unsigned short val, unsigned long port) | ||
101 | { | ||
102 | ___ia64_outw(val, port); | ||
103 | } | ||
104 | |||
105 | void | ||
106 | __ia64_outl (unsigned int val, unsigned long port) | ||
107 | { | ||
108 | ___ia64_outl(val, port); | ||
109 | } | ||
110 | |||
111 | unsigned char | ||
112 | __ia64_readb (void __iomem *addr) | ||
113 | { | ||
114 | return ___ia64_readb (addr); | ||
115 | } | ||
116 | |||
117 | unsigned short | ||
118 | __ia64_readw (void __iomem *addr) | ||
119 | { | ||
120 | return ___ia64_readw (addr); | ||
121 | } | ||
122 | |||
123 | unsigned int | ||
124 | __ia64_readl (void __iomem *addr) | ||
125 | { | ||
126 | return ___ia64_readl (addr); | ||
127 | } | ||
128 | |||
129 | unsigned long | ||
130 | __ia64_readq (void __iomem *addr) | ||
131 | { | ||
132 | return ___ia64_readq (addr); | ||
133 | } | ||
134 | |||
135 | unsigned char | ||
136 | __ia64_readb_relaxed (void __iomem *addr) | ||
137 | { | ||
138 | return ___ia64_readb (addr); | ||
139 | } | ||
140 | |||
141 | unsigned short | ||
142 | __ia64_readw_relaxed (void __iomem *addr) | ||
143 | { | ||
144 | return ___ia64_readw (addr); | ||
145 | } | ||
146 | |||
147 | unsigned int | ||
148 | __ia64_readl_relaxed (void __iomem *addr) | ||
149 | { | ||
150 | return ___ia64_readl (addr); | ||
151 | } | ||
152 | |||
153 | unsigned long | ||
154 | __ia64_readq_relaxed (void __iomem *addr) | ||
155 | { | ||
156 | return ___ia64_readq (addr); | ||
157 | } | ||
158 | |||
159 | void | ||
160 | __ia64_mmiowb(void) | ||
161 | { | ||
162 | ___ia64_mmiowb(); | ||
163 | } | ||
164 | |||
165 | #endif /* CONFIG_IA64_GENERIC */ | ||
diff --git a/arch/ia64/lib/ip_fast_csum.S b/arch/ia64/lib/ip_fast_csum.S new file mode 100644 index 000000000000..19674ca2acfc --- /dev/null +++ b/arch/ia64/lib/ip_fast_csum.S | |||
@@ -0,0 +1,90 @@ | |||
1 | /* | ||
2 | * Optimized version of the ip_fast_csum() function | ||
3 | * Used for calculating IP header checksum | ||
4 | * | ||
5 | * Return: 16bit checksum, complemented | ||
6 | * | ||
7 | * Inputs: | ||
8 | * in0: address of buffer to checksum (char *) | ||
9 | * in1: length of the buffer in 32-bit words (int) | ||
10 | * | ||
11 | * Copyright (C) 2002 Intel Corp. | ||
12 | * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> | ||
13 | */ | ||
14 | |||
15 | #include <asm/asmmacro.h> | ||
16 | |||
17 | /* | ||
18 | * Since we know that most likely this function is called with buf aligned | ||
19 | * on 4-byte boundary and 20 bytes in length, we can execute rather quickly | ||
20 | * versus calling the generic version of do_csum, which has lots of overhead in | ||
21 | * handling various alignments and sizes. However, due to the lack of constraints | ||
22 | * put on the function input arguments, cases with alignment not on 4-byte or | ||
23 | * size not equal to 20 bytes will be handled by the generic do_csum function. | ||
24 | */ | ||
25 | |||
26 | #define in0 r32 | ||
27 | #define in1 r33 | ||
28 | #define ret0 r8 | ||
29 | |||
30 | GLOBAL_ENTRY(ip_fast_csum) | ||
31 | .prologue | ||
32 | .body | ||
33 | cmp.ne p6,p7=5,in1 // size other than 20 byte? | ||
34 | and r14=3,in0 // is it aligned on 4-byte? | ||
35 | add r15=4,in0 // second source pointer | ||
36 | ;; | ||
37 | cmp.ne.or.andcm p6,p7=r14,r0 | ||
38 | ;; | ||
39 | (p7) ld4 r20=[in0],8 | ||
40 | (p7) ld4 r21=[r15],8 | ||
41 | (p6) br.spnt .generic | ||
42 | ;; | ||
43 | ld4 r22=[in0],8 | ||
44 | ld4 r23=[r15],8 | ||
45 | ;; | ||
46 | ld4 r24=[in0] | ||
47 | add r20=r20,r21 | ||
48 | add r22=r22,r23 | ||
49 | ;; | ||
50 | add r20=r20,r22 | ||
51 | ;; | ||
52 | add r20=r20,r24 | ||
53 | ;; | ||
54 | shr.u ret0=r20,16 // now need to add the carry | ||
55 | zxt2 r20=r20 | ||
56 | ;; | ||
57 | add r20=ret0,r20 | ||
58 | ;; | ||
59 | shr.u ret0=r20,16 // add carry again | ||
60 | zxt2 r20=r20 | ||
61 | ;; | ||
62 | add r20=ret0,r20 | ||
63 | ;; | ||
64 | shr.u ret0=r20,16 | ||
65 | zxt2 r20=r20 | ||
66 | ;; | ||
67 | add r20=ret0,r20 | ||
68 | ;; | ||
69 | andcm ret0=-1,r20 | ||
70 | .restore sp // reset frame state | ||
71 | br.ret.sptk.many b0 | ||
72 | ;; | ||
73 | |||
74 | .generic: | ||
75 | .prologue | ||
76 | .save ar.pfs, r35 | ||
77 | alloc r35=ar.pfs,2,2,2,0 | ||
78 | .save rp, r34 | ||
79 | mov r34=b0 | ||
80 | .body | ||
81 | dep.z out1=in1,2,30 | ||
82 | mov out0=in0 | ||
83 | ;; | ||
84 | br.call.sptk.many b0=do_csum | ||
85 | ;; | ||
86 | andcm ret0=-1,ret0 | ||
87 | mov ar.pfs=r35 | ||
88 | mov b0=r34 | ||
89 | br.ret.sptk.many b0 | ||
90 | END(ip_fast_csum) | ||
diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S new file mode 100644 index 000000000000..448908d80b69 --- /dev/null +++ b/arch/ia64/lib/memcpy.S | |||
@@ -0,0 +1,301 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Optimized version of the standard memcpy() function | ||
4 | * | ||
5 | * Inputs: | ||
6 | * in0: destination address | ||
7 | * in1: source address | ||
8 | * in2: number of bytes to copy | ||
9 | * Output: | ||
10 | * no return value | ||
11 | * | ||
12 | * Copyright (C) 2000-2001 Hewlett-Packard Co | ||
13 | * Stephane Eranian <eranian@hpl.hp.com> | ||
14 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
15 | */ | ||
16 | #include <asm/asmmacro.h> | ||
17 | |||
18 | GLOBAL_ENTRY(memcpy) | ||
19 | |||
20 | # define MEM_LAT 21 /* latency to memory */ | ||
21 | |||
22 | # define dst r2 | ||
23 | # define src r3 | ||
24 | # define retval r8 | ||
25 | # define saved_pfs r9 | ||
26 | # define saved_lc r10 | ||
27 | # define saved_pr r11 | ||
28 | # define cnt r16 | ||
29 | # define src2 r17 | ||
30 | # define t0 r18 | ||
31 | # define t1 r19 | ||
32 | # define t2 r20 | ||
33 | # define t3 r21 | ||
34 | # define t4 r22 | ||
35 | # define src_end r23 | ||
36 | |||
37 | # define N (MEM_LAT + 4) | ||
38 | # define Nrot ((N + 7) & ~7) | ||
39 | |||
40 | /* | ||
41 | * First, check if everything (src, dst, len) is a multiple of eight. If | ||
42 | * so, we handle everything with no taken branches (other than the loop | ||
43 | * itself) and a small icache footprint. Otherwise, we jump off to | ||
44 | * the more general copy routine handling arbitrary | ||
45 | * sizes/alignment etc. | ||
46 | */ | ||
47 | .prologue | ||
48 | .save ar.pfs, saved_pfs | ||
49 | alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot | ||
50 | .save ar.lc, saved_lc | ||
51 | mov saved_lc=ar.lc | ||
52 | or t0=in0,in1 | ||
53 | ;; | ||
54 | |||
55 | or t0=t0,in2 | ||
56 | .save pr, saved_pr | ||
57 | mov saved_pr=pr | ||
58 | |||
59 | .body | ||
60 | |||
61 | cmp.eq p6,p0=in2,r0 // zero length? | ||
62 | mov retval=in0 // return dst | ||
63 | (p6) br.ret.spnt.many rp // zero length, return immediately | ||
64 | ;; | ||
65 | |||
66 | mov dst=in0 // copy because of rotation | ||
67 | shr.u cnt=in2,3 // number of 8-byte words to copy | ||
68 | mov pr.rot=1<<16 | ||
69 | ;; | ||
70 | |||
71 | adds cnt=-1,cnt // br.ctop is repeat/until | ||
72 | cmp.gtu p7,p0=16,in2 // copying less than 16 bytes? | ||
73 | mov ar.ec=N | ||
74 | ;; | ||
75 | |||
76 | and t0=0x7,t0 | ||
77 | mov ar.lc=cnt | ||
78 | ;; | ||
79 | cmp.ne p6,p0=t0,r0 | ||
80 | |||
81 | mov src=in1 // copy because of rotation | ||
82 | (p7) br.cond.spnt.few .memcpy_short | ||
83 | (p6) br.cond.spnt.few .memcpy_long | ||
84 | ;; | ||
85 | nop.m 0 | ||
86 | ;; | ||
87 | nop.m 0 | ||
88 | nop.i 0 | ||
89 | ;; | ||
90 | nop.m 0 | ||
91 | ;; | ||
92 | .rotr val[N] | ||
93 | .rotp p[N] | ||
94 | .align 32 | ||
95 | 1: { .mib | ||
96 | (p[0]) ld8 val[0]=[src],8 | ||
97 | nop.i 0 | ||
98 | brp.loop.imp 1b, 2f | ||
99 | } | ||
100 | 2: { .mfb | ||
101 | (p[N-1])st8 [dst]=val[N-1],8 | ||
102 | nop.f 0 | ||
103 | br.ctop.dptk.few 1b | ||
104 | } | ||
105 | ;; | ||
106 | mov ar.lc=saved_lc | ||
107 | mov pr=saved_pr,-1 | ||
108 | mov ar.pfs=saved_pfs | ||
109 | br.ret.sptk.many rp | ||
110 | |||
111 | /* | ||
112 | * Small (<16 bytes) unaligned copying is done via a simple byte-at-a-time | ||
113 | * copy loop. This performs relatively poorly on Itanium, but it doesn't | ||
114 | * get used very often (gcc inlines small copies) and due to atomicity | ||
115 | * issues, we want to avoid read-modify-write of entire words. | ||
116 | */ | ||
117 | .align 32 | ||
118 | .memcpy_short: | ||
119 | adds cnt=-1,in2 // br.ctop is repeat/until | ||
120 | mov ar.ec=MEM_LAT | ||
121 | brp.loop.imp 1f, 2f | ||
122 | ;; | ||
123 | mov ar.lc=cnt | ||
124 | ;; | ||
125 | nop.m 0 | ||
126 | ;; | ||
127 | nop.m 0 | ||
128 | nop.i 0 | ||
129 | ;; | ||
130 | nop.m 0 | ||
131 | ;; | ||
132 | nop.m 0 | ||
133 | ;; | ||
134 | /* | ||
135 | * It is faster to put a stop bit in the loop here because it makes | ||
136 | * the pipeline shorter (and latency is what matters on short copies). | ||
137 | */ | ||
138 | .align 32 | ||
139 | 1: { .mib | ||
140 | (p[0]) ld1 val[0]=[src],1 | ||
141 | nop.i 0 | ||
142 | brp.loop.imp 1b, 2f | ||
143 | } ;; | ||
144 | 2: { .mfb | ||
145 | (p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1 | ||
146 | nop.f 0 | ||
147 | br.ctop.dptk.few 1b | ||
148 | } ;; | ||
149 | mov ar.lc=saved_lc | ||
150 | mov pr=saved_pr,-1 | ||
151 | mov ar.pfs=saved_pfs | ||
152 | br.ret.sptk.many rp | ||
153 | |||
154 | /* | ||
155 | * Large (>= 16 bytes) copying is done in a fancy way. Latency isn't | ||
156 | * an overriding concern here, but throughput is. We first do | ||
157 | * sub-word copying until the destination is aligned, then we check | ||
158 | * if the source is also aligned. If so, we do a simple load/store-loop | ||
159 | * until there are less than 8 bytes left over and then we do the tail, | ||
160 | * by storing the last few bytes using sub-word copying. If the source | ||
161 | * is not aligned, we branch off to the non-congruent loop. | ||
162 | * | ||
163 | * stage: op: | ||
164 | * 0 ld | ||
165 | * : | ||
166 | * MEM_LAT+3 shrp | ||
167 | * MEM_LAT+4 st | ||
168 | * | ||
169 | * On Itanium, the pipeline itself runs without stalls. However, br.ctop | ||
170 | * seems to introduce an unavoidable bubble in the pipeline so the overall | ||
171 | * latency is 2 cycles/iteration. This gives us a _copy_ throughput | ||
172 | * of 4 byte/cycle. Still not bad. | ||
173 | */ | ||
174 | # undef N | ||
175 | # undef Nrot | ||
176 | # define N (MEM_LAT + 5) /* number of stages */ | ||
177 | # define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */ | ||
178 | |||
179 | #define LOG_LOOP_SIZE 6 | ||
180 | |||
181 | .memcpy_long: | ||
182 | alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame | ||
183 | and t0=-8,src // t0 = src & ~7 | ||
184 | and t2=7,src // t2 = src & 7 | ||
185 | ;; | ||
186 | ld8 t0=[t0] // t0 = 1st source word | ||
187 | adds src2=7,src // src2 = (src + 7) | ||
188 | sub t4=r0,dst // t4 = -dst | ||
189 | ;; | ||
190 | and src2=-8,src2 // src2 = (src + 7) & ~7 | ||
191 | shl t2=t2,3 // t2 = 8*(src & 7) | ||
192 | shl t4=t4,3 // t4 = 8*(-dst); bits 3-5 hold (-dst & 7) | ||
193 | ;; | ||
194 | ld8 t1=[src2] // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise | ||
195 | sub t3=64,t2 // t3 = 64-8*(src & 7) | ||
196 | shr.u t0=t0,t2 | ||
197 | ;; | ||
198 | add src_end=src,in2 | ||
199 | shl t1=t1,t3 | ||
200 | mov pr=t4,0x38 // (p5,p4,p3)=(-dst & 7), the byte count needed to 8-byte align dst | ||
201 | ;; | ||
202 | or t0=t0,t1 | ||
203 | mov cnt=r0 | ||
204 | adds src_end=-1,src_end | ||
205 | ;; | ||
206 | (p3) st1 [dst]=t0,1 | ||
207 | (p3) shr.u t0=t0,8 | ||
208 | (p3) adds cnt=1,cnt | ||
209 | ;; | ||
210 | (p4) st2 [dst]=t0,2 | ||
211 | (p4) shr.u t0=t0,16 | ||
212 | (p4) adds cnt=2,cnt | ||
213 | ;; | ||
214 | (p5) st4 [dst]=t0,4 | ||
215 | (p5) adds cnt=4,cnt | ||
216 | and src_end=-8,src_end // src_end = last word of source buffer | ||
217 | ;; | ||
218 | |||
219 | // At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy: | ||
220 | |||
221 | 1:{ add src=cnt,src // make src point to remainder of source buffer | ||
222 | sub cnt=in2,cnt // cnt = number of bytes left to copy | ||
223 | mov t4=ip | ||
224 | } ;; | ||
225 | and src2=-8,src // align source pointer | ||
226 | adds t4=.memcpy_loops-1b,t4 // t4 = address of .memcpy_loops | ||
227 | mov ar.ec=N | ||
228 | |||
229 | and t0=7,src // t0 = src & 7 | ||
230 | shr.u t2=cnt,3 // t2 = number of 8-byte words left to copy | ||
231 | shl cnt=cnt,3 // move bits 0-2 to 3-5 | ||
232 | ;; | ||
233 | |||
234 | .rotr val[N+1], w[2] | ||
235 | .rotp p[N] | ||
236 | |||
237 | cmp.ne p6,p0=t0,r0 // is src aligned, too? | ||
238 | shl t0=t0,LOG_LOOP_SIZE // t0 = (src & 7) << LOG_LOOP_SIZE, byte offset into .memcpy_loops | ||
239 | adds t2=-1,t2 // br.ctop is repeat/until | ||
240 | ;; | ||
241 | add t4=t0,t4 | ||
242 | mov pr=cnt,0x38 // set (p5,p4,p3) to # of last-word bytes to copy | ||
243 | mov ar.lc=t2 | ||
244 | ;; | ||
245 | nop.m 0 | ||
246 | ;; | ||
247 | nop.m 0 | ||
248 | nop.i 0 | ||
249 | ;; | ||
250 | nop.m 0 | ||
251 | ;; | ||
252 | (p6) ld8 val[1]=[src2],8 // prime the pump... | ||
253 | mov b6=t4 | ||
254 | br.sptk.few b6 // enter the copy loop selected by the source alignment | ||
255 | ;; | ||
256 | |||
257 | .memcpy_tail: | ||
258 | // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is | ||
259 | // less than 8) and t0 contains the last few bytes of the src buffer: | ||
260 | (p5) st4 [dst]=t0,4 | ||
261 | (p5) shr.u t0=t0,32 | ||
262 | mov ar.lc=saved_lc | ||
263 | ;; | ||
264 | (p4) st2 [dst]=t0,2 | ||
265 | (p4) shr.u t0=t0,16 | ||
266 | mov ar.pfs=saved_pfs | ||
267 | ;; | ||
268 | (p3) st1 [dst]=t0 | ||
269 | mov pr=saved_pr,-1 | ||
270 | br.ret.sptk.many rp | ||
271 | |||
272 | /////////////////////////////////////////////////////// | ||
273 | .align 64 | ||
274 | |||
275 | #define COPY(shift,index) \ | ||
276 | 1: { .mib \ | ||
277 | (p[0]) ld8 val[0]=[src2],8; \ | ||
278 | (p[MEM_LAT+3]) shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \ | ||
279 | brp.loop.imp 1b, 2f \ | ||
280 | }; \ | ||
281 | 2: { .mfb \ | ||
282 | (p[MEM_LAT+4]) st8 [dst]=w[1],8; \ | ||
283 | nop.f 0; \ | ||
284 | br.ctop.dptk.few 1b; \ | ||
285 | }; \ | ||
286 | ;; \ | ||
287 | ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */ \ | ||
288 | ;; \ | ||
289 | shrp t0=val[N-1],val[N-index],shift; \ | ||
290 | br .memcpy_tail | ||
291 | .memcpy_loops: | ||
292 | COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */ | ||
293 | COPY(8, 0) | ||
294 | COPY(16, 0) | ||
295 | COPY(24, 0) | ||
296 | COPY(32, 0) | ||
297 | COPY(40, 0) | ||
298 | COPY(48, 0) | ||
299 | COPY(56, 0) | ||
300 | |||
301 | END(memcpy) | ||
diff --git a/arch/ia64/lib/memcpy_mck.S b/arch/ia64/lib/memcpy_mck.S new file mode 100644 index 000000000000..6f26ef7cc236 --- /dev/null +++ b/arch/ia64/lib/memcpy_mck.S | |||
@@ -0,0 +1,661 @@ | |||
1 | /* | ||
2 | * Itanium 2-optimized version of memcpy and copy_user function | ||
3 | * | ||
4 | * Inputs: | ||
5 | * in0: destination address | ||
6 | * in1: source address | ||
7 | * in2: number of bytes to copy | ||
8 | * Output: | ||
9 | * 0 on success, or the number of bytes NOT copied if an error occurred. | ||
10 | * | ||
11 | * Copyright (C) 2002 Intel Corp. | ||
12 | * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> | ||
13 | */ | ||
14 | #include <linux/config.h> | ||
15 | #include <asm/asmmacro.h> | ||
16 | #include <asm/page.h> | ||
17 | |||
18 | #define EK(y...) EX(y) | ||
19 | |||
20 | /* McKinley specific optimization */ | ||
21 | |||
22 | #define retval r8 | ||
23 | #define saved_pfs r31 | ||
24 | #define saved_lc r10 | ||
25 | #define saved_pr r11 | ||
26 | #define saved_in0 r14 | ||
27 | #define saved_in1 r15 | ||
28 | #define saved_in2 r16 | ||
29 | |||
30 | #define src0 r2 | ||
31 | #define src1 r3 | ||
32 | #define dst0 r17 | ||
33 | #define dst1 r18 | ||
34 | #define cnt r9 | ||
35 | |||
36 | /* r19-r30 are temp for each code section */ | ||
37 | #define PREFETCH_DIST 8 | ||
38 | #define src_pre_mem r19 | ||
39 | #define dst_pre_mem r20 | ||
40 | #define src_pre_l2 r21 | ||
41 | #define dst_pre_l2 r22 | ||
42 | #define t1 r23 | ||
43 | #define t2 r24 | ||
44 | #define t3 r25 | ||
45 | #define t4 r26 | ||
46 | #define t5 t1 // alias! | ||
47 | #define t6 t2 // alias! | ||
48 | #define t7 t3 // alias! | ||
49 | #define n8 r27 | ||
50 | #define t9 t5 // alias! | ||
51 | #define t10 t4 // alias! | ||
52 | #define t11 t7 // alias! | ||
53 | #define t12 t6 // alias! | ||
54 | #define t14 t10 // alias! | ||
55 | #define t13 r28 | ||
56 | #define t15 r29 | ||
57 | #define tmp r30 | ||
58 | |||
59 | /* defines for long_copy block */ | ||
60 | #define A 0 | ||
61 | #define B (PREFETCH_DIST) | ||
62 | #define C (B + PREFETCH_DIST) | ||
63 | #define D (C + 1) | ||
64 | #define N (D + 1) | ||
65 | #define Nrot ((N + 7) & ~7) | ||
66 | |||
67 | /* alias */ | ||
68 | #define in0 r32 | ||
69 | #define in1 r33 | ||
70 | #define in2 r34 | ||
71 | |||
72 | GLOBAL_ENTRY(memcpy) | ||
73 | and r28=0x7,in0 | ||
74 | and r29=0x7,in1 | ||
75 | mov f6=f0 | ||
76 | br.cond.sptk .common_code | ||
77 | ;; | ||
78 | GLOBAL_ENTRY(__copy_user) | ||
79 | .prologue | ||
80 | // check dest alignment | ||
81 | and r28=0x7,in0 | ||
82 | and r29=0x7,in1 | ||
83 | mov f6=f1 | ||
84 | mov saved_in0=in0 // save dest pointer | ||
85 | mov saved_in1=in1 // save src pointer | ||
86 | mov saved_in2=in2 // save len | ||
87 | ;; | ||
88 | .common_code: | ||
89 | cmp.gt p15,p0=8,in2 // check for small size | ||
90 | cmp.ne p13,p0=0,r28 // check dest alignment | ||
91 | cmp.ne p14,p0=0,r29 // check src alignment | ||
92 | add src0=0,in1 | ||
93 | sub r30=8,r28 // for .align_dest | ||
94 | mov retval=r0 // initialize return value | ||
95 | ;; | ||
96 | add dst0=0,in0 | ||
97 | add dst1=1,in0 // dest odd index | ||
98 | cmp.le p6,p0 = 1,r30 // for .align_dest | ||
99 | (p15) br.cond.dpnt .memcpy_short | ||
100 | (p13) br.cond.dpnt .align_dest | ||
101 | (p14) br.cond.dpnt .unaligned_src | ||
102 | ;; | ||
103 | |||
104 | // both dest and src are aligned on 8-byte boundary | ||
105 | .aligned_src: | ||
106 | .save ar.pfs, saved_pfs | ||
107 | alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot | ||
108 | .save pr, saved_pr | ||
109 | mov saved_pr=pr | ||
110 | |||
111 | shr.u cnt=in2,7 // this much cache line | ||
112 | ;; | ||
113 | cmp.lt p6,p0=2*PREFETCH_DIST,cnt | ||
114 | cmp.lt p7,p8=1,cnt | ||
115 | .save ar.lc, saved_lc | ||
116 | mov saved_lc=ar.lc | ||
117 | .body | ||
118 | add cnt=-1,cnt | ||
119 | add src_pre_mem=0,in1 // prefetch src pointer | ||
120 | add dst_pre_mem=0,in0 // prefetch dest pointer | ||
121 | ;; | ||
122 | (p7) mov ar.lc=cnt // prefetch count | ||
123 | (p8) mov ar.lc=r0 | ||
124 | (p6) br.cond.dpnt .long_copy | ||
125 | ;; | ||
126 | |||
127 | .prefetch: | ||
128 | lfetch.fault [src_pre_mem], 128 | ||
129 | lfetch.fault.excl [dst_pre_mem], 128 | ||
130 | br.cloop.dptk.few .prefetch | ||
131 | ;; | ||
132 | |||
133 | .medium_copy: | ||
134 | and tmp=31,in2 // copy length after iteration | ||
135 | shr.u r29=in2,5 // number of 32-byte iteration | ||
136 | add dst1=8,dst0 // 2nd dest pointer | ||
137 | ;; | ||
138 | add cnt=-1,r29 // ctop iteration adjustment | ||
139 | cmp.eq p10,p0=r29,r0 // do we really need to loop? | ||
140 | add src1=8,src0 // 2nd src pointer | ||
141 | cmp.le p6,p0=8,tmp | ||
142 | ;; | ||
143 | cmp.le p7,p0=16,tmp | ||
144 | mov ar.lc=cnt // loop setup | ||
145 | cmp.eq p16,p17 = r0,r0 | ||
146 | mov ar.ec=2 | ||
147 | (p10) br.dpnt.few .aligned_src_tail | ||
148 | ;; | ||
149 | TEXT_ALIGN(32) | ||
150 | 1: | ||
151 | EX(.ex_handler, (p16) ld8 r34=[src0],16) | ||
152 | EK(.ex_handler, (p16) ld8 r38=[src1],16) | ||
153 | EX(.ex_handler, (p17) st8 [dst0]=r33,16) | ||
154 | EK(.ex_handler, (p17) st8 [dst1]=r37,16) | ||
155 | ;; | ||
156 | EX(.ex_handler, (p16) ld8 r32=[src0],16) | ||
157 | EK(.ex_handler, (p16) ld8 r36=[src1],16) | ||
158 | EX(.ex_handler, (p16) st8 [dst0]=r34,16) | ||
159 | EK(.ex_handler, (p16) st8 [dst1]=r38,16) | ||
160 | br.ctop.dptk.few 1b | ||
161 | ;; | ||
162 | |||
163 | .aligned_src_tail: | ||
164 | EX(.ex_handler, (p6) ld8 t1=[src0]) | ||
165 | mov ar.lc=saved_lc | ||
166 | mov ar.pfs=saved_pfs | ||
167 | EX(.ex_hndlr_s, (p7) ld8 t2=[src1],8) | ||
168 | cmp.le p8,p0=24,tmp | ||
169 | and r21=-8,tmp | ||
170 | ;; | ||
171 | EX(.ex_hndlr_s, (p8) ld8 t3=[src1]) | ||
172 | EX(.ex_handler, (p6) st8 [dst0]=t1) // store byte 1 | ||
173 | and in2=7,tmp // remaining length | ||
174 | EX(.ex_hndlr_d, (p7) st8 [dst1]=t2,8) // store byte 2 | ||
175 | add src0=src0,r21 // setting up src pointer | ||
176 | add dst0=dst0,r21 // setting up dest pointer | ||
177 | ;; | ||
178 | EX(.ex_handler, (p8) st8 [dst1]=t3) // store byte 3 | ||
179 | mov pr=saved_pr,-1 | ||
180 | br.dptk.many .memcpy_short | ||
181 | ;; | ||
182 | |||
183 | /* code taken from copy_page_mck */ | ||
184 | .long_copy: | ||
185 | .rotr v[2*PREFETCH_DIST] | ||
186 | .rotp p[N] | ||
187 | |||
188 | mov src_pre_mem = src0 | ||
189 | mov pr.rot = 0x10000 | ||
190 | mov ar.ec = 1 // special unrolled loop | ||
191 | |||
192 | mov dst_pre_mem = dst0 | ||
193 | |||
194 | add src_pre_l2 = 8*8, src0 | ||
195 | add dst_pre_l2 = 8*8, dst0 | ||
196 | ;; | ||
197 | add src0 = 8, src_pre_mem // first t1 src | ||
198 | mov ar.lc = 2*PREFETCH_DIST - 1 | ||
199 | shr.u cnt=in2,7 // number of lines | ||
200 | add src1 = 3*8, src_pre_mem // first t3 src | ||
201 | add dst0 = 8, dst_pre_mem // first t1 dst | ||
202 | add dst1 = 3*8, dst_pre_mem // first t3 dst | ||
203 | ;; | ||
204 | and tmp=127,in2 // remaining bytes after this block | ||
205 | add cnt = -(2*PREFETCH_DIST) - 1, cnt | ||
206 | // same as .line_copy loop, but with all predicated-off instructions removed: | ||
207 | .prefetch_loop: | ||
208 | EX(.ex_hndlr_lcpy_1, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 | ||
209 | EK(.ex_hndlr_lcpy_1, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 | ||
210 | br.ctop.sptk .prefetch_loop | ||
211 | ;; | ||
212 | cmp.eq p16, p0 = r0, r0 // reset p16 to 1 | ||
213 | mov ar.lc = cnt | ||
214 | mov ar.ec = N // # of stages in pipeline | ||
215 | ;; | ||
216 | .line_copy: | ||
217 | EX(.ex_handler, (p[D]) ld8 t2 = [src0], 3*8) // M0 | ||
218 | EK(.ex_handler, (p[D]) ld8 t4 = [src1], 3*8) // M1 | ||
219 | EX(.ex_handler_lcpy, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 prefetch dst from memory | ||
220 | EK(.ex_handler_lcpy, (p[D]) st8 [dst_pre_l2] = n8, 128) // M3 prefetch dst from L2 | ||
221 | ;; | ||
222 | EX(.ex_handler_lcpy, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 prefetch src from memory | ||
223 | EK(.ex_handler_lcpy, (p[C]) ld8 n8 = [src_pre_l2], 128) // M1 prefetch src from L2 | ||
224 | EX(.ex_handler, (p[D]) st8 [dst0] = t1, 8) // M2 | ||
225 | EK(.ex_handler, (p[D]) st8 [dst1] = t3, 8) // M3 | ||
226 | ;; | ||
227 | EX(.ex_handler, (p[D]) ld8 t5 = [src0], 8) | ||
228 | EK(.ex_handler, (p[D]) ld8 t7 = [src1], 3*8) | ||
229 | EX(.ex_handler, (p[D]) st8 [dst0] = t2, 3*8) | ||
230 | EK(.ex_handler, (p[D]) st8 [dst1] = t4, 3*8) | ||
231 | ;; | ||
232 | EX(.ex_handler, (p[D]) ld8 t6 = [src0], 3*8) | ||
233 | EK(.ex_handler, (p[D]) ld8 t10 = [src1], 8) | ||
234 | EX(.ex_handler, (p[D]) st8 [dst0] = t5, 8) | ||
235 | EK(.ex_handler, (p[D]) st8 [dst1] = t7, 3*8) | ||
236 | ;; | ||
237 | EX(.ex_handler, (p[D]) ld8 t9 = [src0], 3*8) | ||
238 | EK(.ex_handler, (p[D]) ld8 t11 = [src1], 3*8) | ||
239 | EX(.ex_handler, (p[D]) st8 [dst0] = t6, 3*8) | ||
240 | EK(.ex_handler, (p[D]) st8 [dst1] = t10, 8) | ||
241 | ;; | ||
242 | EX(.ex_handler, (p[D]) ld8 t12 = [src0], 8) | ||
243 | EK(.ex_handler, (p[D]) ld8 t14 = [src1], 8) | ||
244 | EX(.ex_handler, (p[D]) st8 [dst0] = t9, 3*8) | ||
245 | EK(.ex_handler, (p[D]) st8 [dst1] = t11, 3*8) | ||
246 | ;; | ||
247 | EX(.ex_handler, (p[D]) ld8 t13 = [src0], 4*8) | ||
248 | EK(.ex_handler, (p[D]) ld8 t15 = [src1], 4*8) | ||
249 | EX(.ex_handler, (p[D]) st8 [dst0] = t12, 8) | ||
250 | EK(.ex_handler, (p[D]) st8 [dst1] = t14, 8) | ||
251 | ;; | ||
252 | EX(.ex_handler, (p[C]) ld8 t1 = [src0], 8) | ||
253 | EK(.ex_handler, (p[C]) ld8 t3 = [src1], 8) | ||
254 | EX(.ex_handler, (p[D]) st8 [dst0] = t13, 4*8) | ||
255 | EK(.ex_handler, (p[D]) st8 [dst1] = t15, 4*8) | ||
256 | br.ctop.sptk .line_copy | ||
257 | ;; | ||
258 | |||
259 | add dst0=-8,dst0 | ||
260 | add src0=-8,src0 | ||
261 | mov in2=tmp | ||
262 | .restore sp | ||
263 | br.sptk.many .medium_copy | ||
264 | ;; | ||
265 | |||
266 | #define BLOCK_SIZE 128*32 | ||
267 | #define blocksize r23 | ||
268 | #define curlen r24 | ||
269 | |||
270 | // dest is on 8-byte boundary, src is not. We need to do | ||
271 | // ld8-ld8, shrp, then st8. Max 8 byte copy per cycle. | ||
272 | .unaligned_src: | ||
273 | .prologue | ||
274 | .save ar.pfs, saved_pfs | ||
275 | alloc saved_pfs=ar.pfs,3,5,0,8 | ||
276 | .save ar.lc, saved_lc | ||
277 | mov saved_lc=ar.lc | ||
278 | .save pr, saved_pr | ||
279 | mov saved_pr=pr | ||
280 | .body | ||
281 | .4k_block: | ||
282 | mov saved_in0=dst0 // need to save all input arguments | ||
283 | mov saved_in2=in2 | ||
284 | mov blocksize=BLOCK_SIZE | ||
285 | ;; | ||
286 | cmp.lt p6,p7=blocksize,in2 // more than one block left? | ||
287 | mov saved_in1=src0 | ||
288 | ;; | ||
289 | (p6) mov in2=blocksize // clamp this pass to BLOCK_SIZE bytes | ||
290 | ;; | ||
291 | shr.u r21=in2,7 // number of 128-byte cache lines in this pass | ||
292 | shr.u r22=in2,4 // number of 16-byte iterations | ||
293 | and curlen=15,in2 // copy length after iteration | ||
294 | and r30=7,src0 // source alignment | ||
295 | ;; | ||
296 | cmp.lt p7,p8=1,r21 | ||
297 | add cnt=-1,r21 // br.cloop executes ar.lc+1 times | ||
298 | ;; | ||
299 | |||
300 | add src_pre_mem=0,src0 // prefetch src pointer | ||
301 | add dst_pre_mem=0,dst0 // prefetch dest pointer | ||
302 | and src0=-8,src0 // 1st src pointer | ||
303 | (p7) mov ar.lc = cnt // r21 prefetches, so lfetch.fault never touches a line past the buffer | ||
304 | (p8) mov ar.lc = r0 // zero or one line: prefetch once | ||
305 | ;; | ||
306 | TEXT_ALIGN(32) | ||
307 | 1: lfetch.fault [src_pre_mem], 128 | ||
308 | lfetch.fault.excl [dst_pre_mem], 128 | ||
309 | br.cloop.dptk.few 1b | ||
310 | ;; | ||
311 | |||
312 | shladd dst1=r22,3,dst0 // 2nd dest pointer | ||
313 | shladd src1=r22,3,src0 // 2nd src pointer | ||
314 | cmp.eq p8,p9=r22,r0 // do we really need to loop? | ||
315 | cmp.le p6,p7=8,curlen; // have at least 8 byte remaining? | ||
316 | add cnt=-1,r22 // ctop iteration adjustment | ||
317 | ;; | ||
318 | EX(.ex_handler, (p9) ld8 r33=[src0],8) // loop primer | ||
319 | EK(.ex_handler, (p9) ld8 r37=[src1],8) | ||
320 | (p8) br.dpnt.few .noloop | ||
321 | ;; | ||
322 | |||
323 | // The jump address is calculated based on src alignment. The COPYU | ||
324 | // macro below needs to confine its size to a power of two, so an entry | ||
325 | // can be calculated using shl instead of an expensive multiply. The | ||
326 | // size is then hard coded by the following #define to match the | ||
327 | // actual size. This makes it somewhat tedious when the COPYU macro gets | ||
328 | // changed and this needs to be adjusted to match. | ||
329 | #define LOOP_SIZE 6 | ||
330 | 1: | ||
331 | mov r29=ip // jmp_table thread | ||
332 | mov ar.lc=cnt | ||
333 | ;; | ||
334 | add r29=.jump_table - 1b - (.jmp1-.jump_table), r29 | ||
335 | shl r28=r30, LOOP_SIZE // jmp_table thread | ||
336 | mov ar.ec=2 // loop setup | ||
337 | ;; | ||
338 | add r29=r29,r28 // jmp_table thread | ||
339 | cmp.eq p16,p17=r0,r0 | ||
340 | ;; | ||
341 | mov b6=r29 // jmp_table thread | ||
342 | ;; | ||
343 | br.cond.sptk.few b6 | ||
344 | |||
345 | // for 8-15 byte case | ||
346 | // We will skip the loop, but need to replicate the side effect | ||
347 | // that the loop produces. | ||
348 | .noloop: | ||
349 | EX(.ex_handler, (p6) ld8 r37=[src1],8) | ||
350 | add src0=8,src0 | ||
351 | (p6) shl r25=r30,3 | ||
352 | ;; | ||
353 | EX(.ex_handler, (p6) ld8 r27=[src1]) | ||
354 | (p6) shr.u r28=r37,r25 | ||
355 | (p6) sub r26=64,r25 | ||
356 | ;; | ||
357 | (p6) shl r27=r27,r26 | ||
358 | ;; | ||
359 | (p6) or r21=r28,r27 | ||
360 | |||
361 | .unaligned_src_tail: | ||
362 | /* check if we have more than blocksize to copy, if so go back */ | ||
363 | cmp.gt p8,p0=saved_in2,blocksize | ||
364 | ;; | ||
365 | (p8) add dst0=saved_in0,blocksize | ||
366 | (p8) add src0=saved_in1,blocksize | ||
367 | (p8) sub in2=saved_in2,blocksize | ||
368 | (p8) br.dpnt .4k_block | ||
369 | ;; | ||
370 | |||
371 | /* we have up to 15 byte to copy in the tail. | ||
372 | * part of work is already done in the jump table code | ||
373 | * we are at the following state. | ||
374 | * src side: | ||
375 | * | ||
376 | * xxxxxx xx <----- r21 has xxxxxxxx already | ||
377 | * -------- -------- -------- | ||
378 | * 0 8 16 | ||
379 | * ^ | ||
380 | * | | ||
381 | * src1 | ||
382 | * | ||
383 | * dst | ||
384 | * -------- -------- -------- | ||
385 | * ^ | ||
386 | * | | ||
387 | * dst1 | ||
388 | */ | ||
389 | EX(.ex_handler, (p6) st8 [dst1]=r21,8) // more than 8 byte to copy | ||
390 | (p6) add curlen=-8,curlen // update length | ||
391 | mov ar.pfs=saved_pfs | ||
392 | ;; | ||
393 | mov ar.lc=saved_lc | ||
394 | mov pr=saved_pr,-1 | ||
395 | mov in2=curlen // remaining length | ||
396 | mov dst0=dst1 // dest pointer | ||
397 | add src0=src1,r30 // forward by src alignment | ||
398 | ;; | ||
399 | |||
400 | // 7 byte or smaller. | ||
401 | .memcpy_short: | ||
402 | cmp.le p8,p9 = 1,in2 | ||
403 | cmp.le p10,p11 = 2,in2 | ||
404 | cmp.le p12,p13 = 3,in2 | ||
405 | cmp.le p14,p15 = 4,in2 | ||
406 | add src1=1,src0 // second src pointer | ||
407 | add dst1=1,dst0 // second dest pointer | ||
408 | ;; | ||
409 | |||
410 | EX(.ex_handler_short, (p8) ld1 t1=[src0],2) | ||
411 | EK(.ex_handler_short, (p10) ld1 t2=[src1],2) | ||
412 | (p9) br.ret.dpnt rp // 0 byte copy | ||
413 | ;; | ||
414 | |||
415 | EX(.ex_handler_short, (p8) st1 [dst0]=t1,2) | ||
416 | EK(.ex_handler_short, (p10) st1 [dst1]=t2,2) | ||
417 | (p11) br.ret.dpnt rp // 1 byte copy | ||
418 | |||
419 | EX(.ex_handler_short, (p12) ld1 t3=[src0],2) | ||
420 | EK(.ex_handler_short, (p14) ld1 t4=[src1],2) | ||
421 | (p13) br.ret.dpnt rp // 2 byte copy | ||
422 | ;; | ||
423 | |||
424 | cmp.le p6,p7 = 5,in2 | ||
425 | cmp.le p8,p9 = 6,in2 | ||
426 | cmp.le p10,p11 = 7,in2 | ||
427 | |||
428 | EX(.ex_handler_short, (p12) st1 [dst0]=t3,2) | ||
429 | EK(.ex_handler_short, (p14) st1 [dst1]=t4,2) | ||
430 | (p15) br.ret.dpnt rp // 3 byte copy | ||
431 | ;; | ||
432 | |||
433 | EX(.ex_handler_short, (p6) ld1 t5=[src0],2) | ||
434 | EK(.ex_handler_short, (p8) ld1 t6=[src1],2) | ||
435 | (p7) br.ret.dpnt rp // 4 byte copy | ||
436 | ;; | ||
437 | |||
438 | EX(.ex_handler_short, (p6) st1 [dst0]=t5,2) | ||
439 | EK(.ex_handler_short, (p8) st1 [dst1]=t6,2) | ||
440 | (p9) br.ret.dptk rp // 5 byte copy | ||
441 | |||
442 | EX(.ex_handler_short, (p10) ld1 t7=[src0],2) | ||
443 | (p11) br.ret.dptk rp // 6 byte copy | ||
444 | ;; | ||
445 | |||
446 | EX(.ex_handler_short, (p10) st1 [dst0]=t7,2) | ||
447 | br.ret.dptk rp // done all cases | ||
448 | |||
449 | |||
450 | /* Align dest to the next 8-byte boundary. We know we have at | ||
451 | * least 7 bytes to copy, enough to crawl to the 8-byte boundary. | ||
452 | * The actual number of bytes to crawl depends on the dest alignment. | ||
453 | * 7 bytes or less is taken care of at .memcpy_short | ||
454 | |||
455 | * src0 - source even index | ||
456 | * src1 - source odd index | ||
457 | * dst0 - dest even index | ||
458 | * dst1 - dest odd index | ||
459 | * r30 - distance to 8-byte boundary | ||
460 | */ | ||
461 | |||
462 | .align_dest: | ||
463 | add src1=1,in1 // source odd index | ||
464 | cmp.le p7,p0 = 2,r30 // for .align_dest | ||
465 | cmp.le p8,p0 = 3,r30 // for .align_dest | ||
466 | EX(.ex_handler_short, (p6) ld1 t1=[src0],2) | ||
467 | cmp.le p9,p0 = 4,r30 // for .align_dest | ||
468 | cmp.le p10,p0 = 5,r30 | ||
469 | ;; | ||
470 | EX(.ex_handler_short, (p7) ld1 t2=[src1],2) | ||
471 | EK(.ex_handler_short, (p8) ld1 t3=[src0],2) | ||
472 | cmp.le p11,p0 = 6,r30 | ||
473 | EX(.ex_handler_short, (p6) st1 [dst0] = t1,2) | ||
474 | cmp.le p12,p0 = 7,r30 | ||
475 | ;; | ||
476 | EX(.ex_handler_short, (p9) ld1 t4=[src1],2) | ||
477 | EK(.ex_handler_short, (p10) ld1 t5=[src0],2) | ||
478 | EX(.ex_handler_short, (p7) st1 [dst1] = t2,2) | ||
479 | EK(.ex_handler_short, (p8) st1 [dst0] = t3,2) | ||
480 | ;; | ||
481 | EX(.ex_handler_short, (p11) ld1 t6=[src1],2) | ||
482 | EK(.ex_handler_short, (p12) ld1 t7=[src0],2) | ||
483 | cmp.eq p6,p7=r28,r29 | ||
484 | EX(.ex_handler_short, (p9) st1 [dst1] = t4,2) | ||
485 | EK(.ex_handler_short, (p10) st1 [dst0] = t5,2) | ||
486 | sub in2=in2,r30 | ||
487 | ;; | ||
488 | EX(.ex_handler_short, (p11) st1 [dst1] = t6,2) | ||
489 | EK(.ex_handler_short, (p12) st1 [dst0] = t7) | ||
490 | add dst0=in0,r30 // setup arguments | ||
491 | add src0=in1,r30 | ||
492 | (p6) br.cond.dptk .aligned_src | ||
493 | (p7) br.cond.dpnt .unaligned_src | ||
494 | ;; | ||
495 | |||
496 | /* main loop body in jump table format */ | ||
497 | #define COPYU(shift) \ | ||
498 | 1: \ | ||
499 | EX(.ex_handler, (p16) ld8 r32=[src0],8); /* 1 */ \ | ||
500 | EK(.ex_handler, (p16) ld8 r36=[src1],8); \ | ||
501 | (p17) shrp r35=r33,r34,shift;; /* 1 */ \ | ||
502 | EX(.ex_handler, (p6) ld8 r22=[src1]); /* common, prime for tail section */ \ | ||
503 | nop.m 0; \ | ||
504 | (p16) shrp r38=r36,r37,shift; \ | ||
505 | EX(.ex_handler, (p17) st8 [dst0]=r35,8); /* 1 */ \ | ||
506 | EK(.ex_handler, (p17) st8 [dst1]=r39,8); \ | ||
507 | br.ctop.dptk.few 1b;; \ | ||
508 | (p7) add src1=-8,src1; /* back out for <8 byte case */ \ | ||
509 | shrp r21=r22,r38,shift; /* speculative work */ \ | ||
510 | br.sptk.few .unaligned_src_tail /* branch out of jump table */ \ | ||
511 | ;; | ||
512 | TEXT_ALIGN(32) | ||
513 | .jump_table: | ||
514 | COPYU(8) // unaligned cases | ||
515 | .jmp1: | ||
516 | COPYU(16) | ||
517 | COPYU(24) | ||
518 | COPYU(32) | ||
519 | COPYU(40) | ||
520 | COPYU(48) | ||
521 | COPYU(56) | ||
522 | |||
523 | #undef A | ||
524 | #undef B | ||
525 | #undef C | ||
526 | #undef D | ||
527 | END(memcpy) | ||
528 | |||
529 | /* | ||
530 | * Due to the lack of local tag support in the gcc 2.x assembler, it is not clear which | ||
531 | * instruction failed in the bundle. The exception algorithm is that we | ||
532 | * first figure out the faulting address, then detect if there is any | ||
533 | * progress made on the copy; if so, redo the copy from the last known copied | ||
534 | * location up to the faulting address (exclusive). In the copy_from_user | ||
535 | * case, the remaining bytes in the kernel buffer will be zeroed. | ||
536 | * | ||
537 | * Take copy_from_user as an example, in the code there are multiple loads | ||
538 | * in a bundle and those multiple loads could span over two pages, the | ||
539 | * faulting address is calculated as page_round_down(max(src0, src1)). | ||
540 | * This is based on knowledge that if we can access one byte in a page, we | ||
541 | * can access any byte in that page. | ||
542 | * | ||
543 | * predicate used in the exception handler: | ||
544 | * p6-p7: direction | ||
545 | * p10-p11: src faulting addr calculation | ||
546 | * p12-p13: dst faulting addr calculation | ||
547 | */ | ||
548 | |||
549 | #define A r19 | ||
550 | #define B r20 | ||
551 | #define C r21 | ||
552 | #define D r22 | ||
553 | #define F r28 | ||
554 | |||
555 | #define memset_arg0 r32 | ||
556 | #define memset_arg2 r33 | ||
557 | |||
558 | #define saved_retval loc0 | ||
559 | #define saved_rtlink loc1 | ||
560 | #define saved_pfs_stack loc2 | ||
561 | |||
562 | .ex_hndlr_s: | ||
563 | add src0=8,src0 | ||
564 | br.sptk .ex_handler | ||
565 | ;; | ||
566 | .ex_hndlr_d: | ||
567 | add dst0=8,dst0 | ||
568 | br.sptk .ex_handler | ||
569 | ;; | ||
570 | .ex_hndlr_lcpy_1: | ||
571 | mov src1=src_pre_mem | ||
572 | mov dst1=dst_pre_mem | ||
573 | cmp.gtu p10,p11=src_pre_mem,saved_in1 | ||
574 | cmp.gtu p12,p13=dst_pre_mem,saved_in0 | ||
575 | ;; | ||
576 | (p10) add src0=8,saved_in1 | ||
577 | (p11) mov src0=saved_in1 | ||
578 | (p12) add dst0=8,saved_in0 | ||
579 | (p13) mov dst0=saved_in0 | ||
580 | br.sptk .ex_handler | ||
581 | .ex_handler_lcpy: | ||
582 | // In the line_copy block, the preload addresses should always be ahead | ||
583 | // of the other two src/dst pointers. Furthermore, src1/dst1 should | ||
584 | // always be ahead of src0/dst0. | ||
585 | mov src1=src_pre_mem | ||
586 | mov dst1=dst_pre_mem | ||
587 | .ex_handler: | ||
588 | mov pr=saved_pr,-1 // first restore pr, lc, and pfs | ||
589 | mov ar.lc=saved_lc | ||
590 | mov ar.pfs=saved_pfs | ||
591 | ;; | ||
592 | .ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs | ||
593 | cmp.ltu p6,p7=saved_in0, saved_in1 // get the copy direction | ||
594 | cmp.ltu p10,p11=src0,src1 | ||
595 | cmp.ltu p12,p13=dst0,dst1 | ||
596 | fcmp.eq p8,p0=f6,f0 // is it memcpy? | ||
597 | mov tmp = dst0 | ||
598 | ;; | ||
599 | (p11) mov src1 = src0 // pick the larger of the two | ||
600 | (p13) mov dst0 = dst1 // make dst0 the smaller one | ||
601 | (p13) mov dst1 = tmp // and dst1 the larger one | ||
602 | ;; | ||
603 | (p6) dep F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary | ||
604 | (p7) dep F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary | ||
605 | ;; | ||
606 | (p6) cmp.le p14,p0=dst0,saved_in0 // no progress has been made on store | ||
607 | (p7) cmp.le p14,p0=src0,saved_in1 // no progress has been made on load | ||
608 | mov retval=saved_in2 | ||
609 | (p8) ld1 tmp=[src1] // force an oops for memcpy call | ||
610 | (p8) st1 [dst1]=r0 // force an oops for memcpy call | ||
611 | (p14) br.ret.sptk.many rp | ||
612 | |||
613 | /* | ||
614 | * The number of remaining bytes to copy is calculated as: | ||
615 | * | ||
616 | * A = (faulting_addr - orig_src) -> len to faulting ld address | ||
617 | * or | ||
618 | * (faulting_addr - orig_dst) -> len to faulting st address | ||
619 | * B = (cur_dst - orig_dst) -> len copied so far | ||
620 | * C = A - B -> len need to be copied | ||
621 | * D = orig_len - A -> len need to be zeroed | ||
622 | */ | ||
623 | (p6) sub A = F, saved_in0 | ||
624 | (p7) sub A = F, saved_in1 | ||
625 | clrrrb | ||
626 | ;; | ||
627 | alloc saved_pfs_stack=ar.pfs,3,3,3,0 | ||
628 | sub B = dst0, saved_in0 // how many byte copied so far | ||
629 | ;; | ||
630 | sub C = A, B | ||
631 | sub D = saved_in2, A | ||
632 | ;; | ||
633 | cmp.gt p8,p0=C,r0 // more than 1 byte? | ||
634 | add memset_arg0=saved_in0, A | ||
635 | (p6) mov memset_arg2=0 // copy_to_user should not call memset | ||
636 | (p7) mov memset_arg2=D // copy_from_user need to have kbuf zeroed | ||
637 | mov r8=0 | ||
638 | mov saved_retval = D | ||
639 | mov saved_rtlink = b0 | ||
640 | |||
641 | add out0=saved_in0, B | ||
642 | add out1=saved_in1, B | ||
643 | mov out2=C | ||
644 | (p8) br.call.sptk.few b0=__copy_user // recursive call | ||
645 | ;; | ||
646 | |||
647 | add saved_retval=saved_retval,r8 // above might return non-zero value | ||
648 | cmp.gt p8,p0=memset_arg2,r0 // more than 1 byte? | ||
649 | mov out0=memset_arg0 // *s | ||
650 | mov out1=r0 // c | ||
651 | mov out2=memset_arg2 // n | ||
652 | (p8) br.call.sptk.few b0=memset | ||
653 | ;; | ||
654 | |||
655 | mov retval=saved_retval | ||
656 | mov ar.pfs=saved_pfs_stack | ||
657 | mov b0=saved_rtlink | ||
658 | br.ret.sptk.many rp | ||
659 | |||
660 | /* end of McKinley specific optimization */ | ||
661 | END(__copy_user) | ||
diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S new file mode 100644 index 000000000000..bd8cf907fe22 --- /dev/null +++ b/arch/ia64/lib/memset.S | |||
@@ -0,0 +1,362 @@ | |||
1 | /* Optimized version of the standard memset() function. | ||
2 | |||
3 | Copyright (c) 2002 Hewlett-Packard Co/CERN | ||
4 | Sverre Jarp <Sverre.Jarp@cern.ch> | ||
5 | |||
6 | Return: dest | ||
7 | |||
8 | Inputs: | ||
9 | in0: dest | ||
10 | in1: value | ||
11 | in2: count | ||
12 | |||
13 | The algorithm is fairly straightforward: set byte by byte until we | ||
14 | get to a 16B-aligned address, then loop on 128 B chunks using an | ||
15 | early store as prefetching, then loop on 32B chunks, then clear remaining | ||
16 | words, finally clear remaining bytes. | ||
17 | Since a stf.spill f0 can store 16B in one go, we use this instruction | ||
18 | to get peak speed when value = 0. */ | ||
19 | |||
20 | #include <asm/asmmacro.h> | ||
21 | #undef ret | ||
22 | |||
// Argument registers: in0 = destination, in1 = fill value, in2 = byte count.
23 | #define dest in0 | ||
24 | #define value in1 | ||
25 | #define cnt in2 | ||
26 | |||
// General registers used by memset for temporaries, store pointers and counters.
27 | #define tmp r31 | ||
28 | #define save_lc r30 | ||
29 | #define ptr0 r29 | ||
30 | #define ptr1 r28 | ||
31 | #define ptr2 r27 | ||
32 | #define ptr3 r26 | ||
33 | #define ptr9 r24 | ||
34 | #define loopcnt r23 | ||
35 | #define linecnt r22 | ||
36 | #define bytecnt r21 | ||
37 | |||
// Floating-point register that receives the 8-byte fill pattern (via setf.sig) for the stf8 stores.
38 | #define fvalue f6 | ||
39 | |||
40 | // This routine uses only scratch predicate registers (p6 - p15) | ||
41 | #define p_scr p6 // default register for same-cycle branches | ||
// p_nz / p_zr: the fill value is non-zero / zero (set by one cmp.ne at entry).
42 | #define p_nz p7 | ||
43 | #define p_zr p8 | ||
// p_unalgn: dest is not 16B-aligned.  p_y/p_n and p_yy/p_nn are complementary yes/no pairs.
44 | #define p_unalgn p9 | ||
45 | #define p_y p11 | ||
46 | #define p_n p12 | ||
47 | #define p_yy p13 | ||
48 | #define p_nn p14 | ||
49 | |||
// MIN1 is the 16-byte alignment mask (alignment - 1); MIN1P1HALF is half of MIN1+1.
// PREF_AHEAD is the maximum number of early stores issued one LINE_SIZE apart before the fill loop.
50 | #define MIN1 15 | ||
51 | #define MIN1P1HALF 8 | ||
52 | #define LINE_SIZE 128 | ||
53 | #define LSIZE_SH 7 // shift amount: log2(LINE_SIZE) | ||
54 | #define PREF_AHEAD 8 | ||
55 | |||
// memset: store the byte in `value` (replicated 8 times by mux1 @brcst) into `cnt` bytes
// starting at `dest`; ret0 = dest.
// Flow: cnt == 0 returns at once; cnt < 16 is handled by .move_bytes_unaligned; otherwise
// the head is aligned to 16B with st8/st4/st2/st1; if more than LINE_SIZE bytes remain, whole
// 128B lines are written by the L1A loop (stf8 fvalue) or, for a zero value, the L1B loop
// (stf.spill f0); the tail is finished by .fraction_of_line (32B steps), .store_words
// (8B steps) and .move_bytes_from_alignment (st4/st2/st1).
// ar.lc is saved in save_lc on entry and written back at .restore_and_exit and in
// .move_bytes_unaligned.
56 | GLOBAL_ENTRY(memset) | ||
57 | { .mmi | ||
58 | .prologue | ||
59 | alloc tmp = ar.pfs, 3, 0, 0, 0 | ||
60 | .body | ||
61 | lfetch.nt1 [dest] // prefetch the start of the destination | ||
62 | .save ar.lc, save_lc | ||
63 | mov.i save_lc = ar.lc | ||
64 | } { .mmi | ||
65 | mov ret0 = dest // return value | ||
66 | cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero | ||
67 | cmp.eq p_scr, p0 = cnt, r0 | ||
68 | ;; } | ||
69 | { .mmi | ||
70 | and ptr2 = -(MIN1+1), dest // aligned address | ||
71 | and tmp = MIN1, dest // prepare to check for correct alignment | ||
72 | tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U) | ||
73 | } { .mib | ||
74 | mov ptr1 = dest | ||
75 | mux1 value = value, @brcst // create 8 identical bytes in word | ||
76 | (p_scr) br.ret.dpnt.many rp // return immediately if count = 0 | ||
77 | ;; } | ||
78 | { .mib | ||
79 | cmp.ne p_unalgn, p0 = tmp, r0 // is dest not 16B-aligned? | ||
80 | } { .mib | ||
81 | sub bytecnt = (MIN1+1), tmp // NB: # of bytes to move is 1 higher than loopcnt | ||
82 | cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task? | ||
83 | (p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U) | ||
84 | ;; } | ||
85 | { .mmi | ||
86 | (p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment | ||
87 | (p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment | ||
88 | (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ? | ||
89 | ;; } | ||
90 | { .mib | ||
91 | (p_y) add cnt = -8, cnt // account for the st8 | ||
92 | (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ? | ||
93 | } { .mib | ||
94 | (p_y) st8 [ptr2] = value,-4 // | ||
95 | (p_n) add ptr2 = 4, ptr2 // | ||
96 | ;; } | ||
97 | { .mib | ||
98 | (p_yy) add cnt = -4, cnt // | ||
99 | (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ? | ||
100 | } { .mib | ||
101 | (p_yy) st4 [ptr2] = value,-2 // | ||
102 | (p_nn) add ptr2 = 2, ptr2 // | ||
103 | ;; } | ||
104 | { .mmi | ||
105 | mov tmp = LINE_SIZE+1 // for compare | ||
106 | (p_y) add cnt = -2, cnt // | ||
107 | (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ? | ||
108 | } { .mmi | ||
109 | setf.sig fvalue=value // transfer value to FLP side | ||
110 | (p_y) st2 [ptr2] = value,-1 // | ||
111 | (p_n) add ptr2 = 1, ptr2 // | ||
112 | ;; } | ||
113 | |||
114 | { .mmi | ||
115 | (p_yy) st1 [ptr2] = value // | ||
116 | cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task? | ||
117 | } { .mbb | ||
118 | (p_yy) add cnt = -1, cnt // | ||
119 | (p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few | ||
120 | ;; } | ||
121 | |||
122 | { .mib | ||
123 | nop.m 0 | ||
124 | shr.u linecnt = cnt, LSIZE_SH | ||
125 | (p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill | ||
126 | ;; } | ||
127 | |||
128 | TEXT_ALIGN(32) // --------------------- // L1A: store ahead into cache lines; fill later | ||
129 | { .mmi | ||
130 | and tmp = -(LINE_SIZE), cnt // compute end of range | ||
131 | mov ptr9 = ptr1 // used for prefetching | ||
132 | and cnt = (LINE_SIZE-1), cnt // remainder | ||
133 | } { .mmi | ||
134 | mov loopcnt = PREF_AHEAD-1 // default prefetch loop | ||
135 | cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value | ||
136 | ;; } | ||
137 | { .mmi | ||
138 | (p_scr) add loopcnt = -1, linecnt // | ||
139 | add ptr2 = 8, ptr1 // start of stores (beyond prefetch stores) | ||
140 | add ptr1 = tmp, ptr1 // first address beyond total range | ||
141 | ;; } | ||
142 | { .mmi | ||
143 | add tmp = -1, linecnt // next loop count | ||
144 | mov.i ar.lc = loopcnt // | ||
145 | ;; } | ||
146 | .pref_l1a: | ||
147 | { .mib | ||
148 | stf8 [ptr9] = fvalue, 128 // Do stores one cache line apart | ||
149 | nop.i 0 | ||
150 | br.cloop.dptk.few .pref_l1a | ||
151 | ;; } | ||
152 | { .mmi | ||
153 | add ptr0 = 16, ptr2 // Two stores in parallel | ||
154 | mov.i ar.lc = tmp // | ||
155 | ;; } | ||
156 | .l1ax: | ||
157 | { .mmi | ||
158 | stf8 [ptr2] = fvalue, 8 | ||
159 | stf8 [ptr0] = fvalue, 8 | ||
160 | ;; } | ||
161 | { .mmi | ||
162 | stf8 [ptr2] = fvalue, 24 | ||
163 | stf8 [ptr0] = fvalue, 24 | ||
164 | ;; } | ||
165 | { .mmi | ||
166 | stf8 [ptr2] = fvalue, 8 | ||
167 | stf8 [ptr0] = fvalue, 8 | ||
168 | ;; } | ||
169 | { .mmi | ||
170 | stf8 [ptr2] = fvalue, 24 | ||
171 | stf8 [ptr0] = fvalue, 24 | ||
172 | ;; } | ||
173 | { .mmi | ||
174 | stf8 [ptr2] = fvalue, 8 | ||
175 | stf8 [ptr0] = fvalue, 8 | ||
176 | ;; } | ||
177 | { .mmi | ||
178 | stf8 [ptr2] = fvalue, 24 | ||
179 | stf8 [ptr0] = fvalue, 24 | ||
180 | ;; } | ||
181 | { .mmi | ||
182 | stf8 [ptr2] = fvalue, 8 | ||
183 | stf8 [ptr0] = fvalue, 32 | ||
184 | cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching? | ||
185 | ;; } | ||
186 | { .mmb | ||
187 | stf8 [ptr2] = fvalue, 24 | ||
188 | (p_scr) stf8 [ptr9] = fvalue, 128 | ||
189 | br.cloop.dptk.few .l1ax | ||
190 | ;; } | ||
191 | { .mbb | ||
192 | cmp.le p_scr, p0 = 8, cnt // at least one 8B word left ? | ||
193 | (p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2 | ||
194 | br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3 | ||
195 | ;; } | ||
196 | |||
197 | TEXT_ALIGN(32) | ||
198 | .l1b: // ------------------------------------ // L1B: store ahead into cache lines; fill later | ||
199 | { .mmi | ||
200 | and tmp = -(LINE_SIZE), cnt // compute end of range | ||
201 | mov ptr9 = ptr1 // used for prefetching | ||
202 | and cnt = (LINE_SIZE-1), cnt // remainder | ||
203 | } { .mmi | ||
204 | mov loopcnt = PREF_AHEAD-1 // default prefetch loop | ||
205 | cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value | ||
206 | ;; } | ||
207 | { .mmi | ||
208 | (p_scr) add loopcnt = -1, linecnt | ||
209 | add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores) | ||
210 | add ptr1 = tmp, ptr1 // first address beyond total range | ||
211 | ;; } | ||
212 | { .mmi | ||
213 | add tmp = -1, linecnt // next loop count | ||
214 | mov.i ar.lc = loopcnt | ||
215 | ;; } | ||
216 | .pref_l1b: | ||
217 | { .mib | ||
218 | stf.spill [ptr9] = f0, 128 // Do stores one cache line apart | ||
219 | nop.i 0 | ||
220 | br.cloop.dptk.few .pref_l1b | ||
221 | ;; } | ||
222 | { .mmi | ||
223 | add ptr0 = 16, ptr2 // Two stores in parallel | ||
224 | mov.i ar.lc = tmp | ||
225 | ;; } | ||
226 | .l1bx: | ||
227 | { .mmi | ||
228 | stf.spill [ptr2] = f0, 32 | ||
229 | stf.spill [ptr0] = f0, 32 | ||
230 | ;; } | ||
231 | { .mmi | ||
232 | stf.spill [ptr2] = f0, 32 | ||
233 | stf.spill [ptr0] = f0, 32 | ||
234 | ;; } | ||
235 | { .mmi | ||
236 | stf.spill [ptr2] = f0, 32 | ||
237 | stf.spill [ptr0] = f0, 64 | ||
238 | cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching? | ||
239 | ;; } | ||
240 | { .mmb | ||
241 | stf.spill [ptr2] = f0, 32 | ||
242 | (p_scr) stf.spill [ptr9] = f0, 128 | ||
243 | br.cloop.dptk.few .l1bx | ||
244 | ;; } | ||
245 | { .mib | ||
246 | cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? | ||
247 | (p_scr) br.cond.dpnt.many .move_bytes_from_alignment // | ||
248 | ;; } | ||
249 | |||
250 | .fraction_of_line: | ||
251 | { .mib | ||
252 | add ptr2 = 16, ptr1 | ||
253 | shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32 | ||
254 | ;; } | ||
255 | { .mib | ||
256 | cmp.eq p_scr, p0 = loopcnt, r0 | ||
257 | add loopcnt = -1, loopcnt | ||
258 | (p_scr) br.cond.dpnt.many .store_words | ||
259 | ;; } | ||
260 | { .mib | ||
261 | and cnt = 0x1f, cnt // compute the remaining cnt | ||
262 | mov.i ar.lc = loopcnt | ||
263 | ;; } | ||
264 | TEXT_ALIGN(32) | ||
265 | .l2: // ------------------------------------ // L2A: store 32B in 2 cycles | ||
266 | { .mmb | ||
267 | stf8 [ptr1] = fvalue, 8 | ||
268 | stf8 [ptr2] = fvalue, 8 | ||
269 | ;; } { .mmb | ||
270 | stf8 [ptr1] = fvalue, 24 | ||
271 | stf8 [ptr2] = fvalue, 24 | ||
272 | br.cloop.dptk.many .l2 | ||
273 | ;; } | ||
274 | .store_words: | ||
275 | { .mib | ||
276 | cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? | ||
277 | (p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch | ||
278 | ;; } | ||
279 | |||
280 | { .mmi | ||
281 | stf8 [ptr1] = fvalue, 8 // store | ||
282 | cmp.le p_y, p_n = 16, cnt | ||
283 | add cnt = -8, cnt // subtract | ||
284 | ;; } | ||
285 | { .mmi | ||
286 | (p_y) stf8 [ptr1] = fvalue, 8 // store | ||
287 | (p_y) cmp.le.unc p_yy, p_nn = 16, cnt | ||
288 | (p_y) add cnt = -8, cnt // subtract | ||
289 | ;; } | ||
290 | { .mmi // store | ||
291 | (p_yy) stf8 [ptr1] = fvalue, 8 | ||
292 | (p_yy) add cnt = -8, cnt // subtract | ||
293 | ;; } | ||
294 | |||
295 | .move_bytes_from_alignment: | ||
296 | { .mib | ||
297 | cmp.eq p_scr, p0 = cnt, r0 | ||
298 | tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ? | ||
299 | (p_scr) br.cond.dpnt.few .restore_and_exit | ||
300 | ;; } | ||
301 | { .mib | ||
302 | (p_y) st4 [ptr1] = value,4 | ||
303 | tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ? | ||
304 | ;; } | ||
305 | { .mib | ||
306 | (p_yy) st2 [ptr1] = value,2 | ||
307 | tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ? | ||
308 | ;; } | ||
309 | |||
310 | { .mib | ||
311 | (p_y) st1 [ptr1] = value | ||
312 | ;; } | ||
313 | .restore_and_exit: | ||
314 | { .mib | ||
315 | nop.m 0 | ||
316 | mov.i ar.lc = save_lc | ||
317 | br.ret.sptk.many rp | ||
318 | ;; } | ||
319 | |||
320 | .move_bytes_unaligned: | ||
321 | { .mmi | ||
322 | .pred.rel "mutex",p_y, p_n | ||
323 | .pred.rel "mutex",p_yy, p_nn | ||
324 | (p_n) cmp.le p_yy, p_nn = 4, cnt | ||
325 | (p_y) cmp.le p_yy, p_nn = 5, cnt | ||
326 | (p_n) add ptr2 = 2, ptr1 | ||
327 | } { .mmi | ||
328 | (p_y) add ptr2 = 3, ptr1 | ||
329 | (p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte [15, 14 (or less) left] | ||
330 | (p_y) add cnt = -1, cnt | ||
331 | ;; } | ||
332 | { .mmi | ||
333 | (p_yy) cmp.le.unc p_y, p0 = 8, cnt | ||
334 | add ptr3 = ptr1, cnt // prepare last store | ||
335 | mov.i ar.lc = save_lc | ||
336 | } { .mmi | ||
337 | (p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes | ||
338 | (p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [11, 10 (or less) left] | ||
339 | (p_yy) add cnt = -4, cnt | ||
340 | ;; } | ||
341 | { .mmi | ||
342 | (p_y) cmp.le.unc p_yy, p0 = 8, cnt | ||
343 | add ptr3 = -1, ptr3 // last store | ||
344 | tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ? | ||
345 | } { .mmi | ||
346 | (p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes | ||
347 | (p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [7, 6 (or less) left] | ||
348 | (p_y) add cnt = -4, cnt | ||
349 | ;; } | ||
350 | { .mmi | ||
351 | (p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes | ||
352 | (p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [3, 2 (or less) left] | ||
353 | tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ? | ||
354 | } { .mmi | ||
355 | (p_yy) add cnt = -4, cnt | ||
356 | ;; } | ||
357 | { .mmb | ||
358 | (p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes | ||
359 | (p_y) st1 [ptr3] = value // fill last byte (using ptr3) | ||
360 | br.ret.sptk.many rp | ||
361 | } | ||
362 | END(memset) | ||
diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S new file mode 100644 index 000000000000..e0cdac0a85b8 --- /dev/null +++ b/arch/ia64/lib/strlen.S | |||
@@ -0,0 +1,192 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Optimized version of the standard strlen() function | ||
4 | * | ||
5 | * | ||
6 | * Inputs: | ||
7 | * in0 address of string | ||
8 | * | ||
9 | * Outputs: | ||
10 | * ret0 the number of characters in the string (0 if empty string) | ||
11 | * does not count the \0 | ||
12 | * | ||
13 | * Copyright (C) 1999, 2001 Hewlett-Packard Co | ||
14 | * Stephane Eranian <eranian@hpl.hp.com> | ||
15 | * | ||
16 | * 09/24/99 S.Eranian add speculation recovery code | ||
17 | */ | ||
18 | |||
19 | #include <asm/asmmacro.h> | ||
20 | |||
21 | // | ||
22 | // | ||
23 | // This is an enhanced version of the basic strlen. it includes a combination | ||
24 | // of compute zero index (czx), parallel comparisons, speculative loads and | ||
25 | // loop unroll using rotating registers. | ||
26 | // | ||
27 | // General Ideas about the algorithm: | ||
28 | // The goal is to look at the string in chunks of 8 bytes. | ||
29 | // so we need to do a few extra checks at the beginning because the | ||
30 | // string may not be 8-byte aligned. In this case we load the 8byte | ||
31 | // quantity which includes the start of the string and mask the unused | ||
32 | // bytes with 0xff to avoid confusing czx. | ||
33 | // We use speculative loads and software pipelining to hide memory | ||
34 | // latency and do read ahead safely. This way we defer any exception. | ||
35 | // | ||
36 | // Because we don't want the kernel to be relying on particular | ||
37 | // settings of the DCR register, we provide recovery code in case | ||
38 | // speculation fails. The recovery code is going to "redo" the work using | ||
39 | // only normal loads. If we still get a fault then we generate a | ||
40 | // kernel panic. Otherwise we return the strlen as usual. | ||
41 | // | ||
42 | // The fact that speculation may fail can be caused, for instance, by | ||
43 | // the DCR.dm bit being set. In this case TLB misses are deferred, i.e., | ||
44 | // a NaT bit will be set if the translation is not present. The normal | ||
45 | // load, on the other hand, will cause the translation to be inserted | ||
46 | // if the mapping exists. | ||
47 | // | ||
48 | // It should be noted that we execute recovery code only when we need | ||
49 | // to use the data that has been speculatively loaded: we don't execute | ||
50 | // recovery code on pure read ahead data. | ||
51 | // | ||
52 | // Remarks: | ||
53 | // - the cmp r0,r0 is used as a fast way to initialize a predicate | ||
54 | // register to 1. This is required to make sure that we get the parallel | ||
55 | // compare correct. | ||
56 | // | ||
57 | // - we don't use the epilogue counter to exit the loop but we need to set | ||
58 | // it to zero beforehand. | ||
59 | // | ||
60 | // - after the loop we must test for Nat values because neither the | ||
61 | // czx nor cmp instruction raise a NaT consumption fault. We must be | ||
62 | // careful not to look too far for a Nat for which we don't care. | ||
63 | // For instance we don't need to look at a NaT in val2 if the zero byte | ||
64 | // was in val1. | ||
65 | // | ||
66 | // - Clearly performance tuning is required. | ||
67 | // | ||
68 | // | ||
69 | // | ||
// Register aliases used by strlen:
//   saved_pfs, saved_pr - copies of ar.pfs and pr, written back before returning
//   orig                - the caller's string address (in0)
//   src                 - read pointer, starts at in0 rounded down to 8 bytes
//   base                - aligned start address, consumed by the recovery loop
//   mask                - OR-ed into the first word to hide the bytes before the string start
//   val                 - word loaded by the recovery loop
//   val1, val2          - czx1.r results (8 is what the loops treat as "no zero byte")
//   tmp                 - scratch
70 | #define saved_pfs r11 | ||
71 | #define tmp r10 | ||
72 | #define base r16 | ||
73 | #define orig r17 | ||
74 | #define saved_pr r18 | ||
75 | #define src r19 | ||
76 | #define mask r20 | ||
77 | #define val r21 | ||
78 | #define val1 r22 | ||
79 | #define val2 r23 | ||
80 | |||
// strlen(in0): returns in ret0 the number of bytes that precede the terminating NUL.
// The main loop checks two 8-byte words per iteration using speculative read-ahead loads;
// .recover rescans from the aligned start (base) with normal loads when a word that is
// actually needed came back as NaT.
81 | GLOBAL_ENTRY(strlen) | ||
82 | .prologue | ||
83 | .save ar.pfs, saved_pfs | ||
84 | alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8 | ||
85 | |||
86 | .rotr v[2], w[2] // declares our 4 aliases | ||
87 | |||
88 | extr.u tmp=in0,0,3 // tmp=least significant 3 bits | ||
89 | mov orig=in0 // keep track of initial byte address | ||
90 | dep src=0,in0,0,3 // src=8byte-aligned in0 address | ||
91 | .save pr, saved_pr | ||
92 | mov saved_pr=pr // preserve predicates (rotation) | ||
93 | ;; | ||
94 | |||
95 | .body | ||
96 | |||
97 | ld8 v[1]=[src],8 // must not speculate: can fail here | ||
98 | shl tmp=tmp,3 // multiply by 8bits/byte | ||
99 | mov mask=-1 // our mask | ||
100 | ;; | ||
101 | ld8.s w[1]=[src],8 // speculatively load next | ||
102 | cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and | ||
103 | sub tmp=64,tmp // how many bits to shift our mask on the right | ||
104 | ;; | ||
105 | shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part | ||
106 | mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs) | ||
107 | ;; | ||
108 | add base=-16,src // keep track of aligned base | ||
109 | or v[1]=v[1],mask // now we have a safe initial byte pattern | ||
110 | ;; | ||
111 | 1: | ||
112 | ld8.s v[0]=[src],8 // speculatively load next | ||
113 | czx1.r val1=v[1] // search 0 byte from right | ||
114 | czx1.r val2=w[1] // search 0 byte from right following 8bytes | ||
115 | ;; | ||
116 | ld8.s w[0]=[src],8 // speculatively load next to next | ||
117 | cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8 | ||
118 | cmp.eq.and p6,p0=8,val2 // p6 = p6 and val2==8 | ||
119 | (p6) br.wtop.dptk 1b // loop until p6 == 0 | ||
120 | ;; | ||
121 | // | ||
122 | // We must try the recovery code iff | ||
123 | // val1_is_nat || (val1==8 && val2_is_nat) | ||
124 | // | ||
125 | // XXX Fixme | ||
126 | // - there must be a better way of doing the test | ||
127 | // | ||
128 | cmp.eq p8,p9=8,val1 // p8 = val1 had no zero byte (disambiguate) | ||
129 | tnat.nz p6,p7=val1 // test NaT on val1 | ||
130 | (p6) br.cond.spnt .recover // jump to recovery if val1 is NaT | ||
131 | ;; | ||
132 | // | ||
133 | // if we come here p7 is true, i.e., initialized for // cmp | ||
134 | // | ||
135 | cmp.eq.and p7,p0=8,val1// val1==8? | ||
136 | tnat.nz.and p7,p0=val2 // test NaT on val2 | ||
137 | (p7) br.cond.spnt .recover // jump to recovery if val2 is NaT | ||
138 | ;; | ||
139 | (p8) mov val1=val2 // the other test got us out of the loop | ||
140 | (p8) adds src=-16,src // correct position when 3 ahead | ||
141 | (p9) adds src=-24,src // correct position when 4 ahead | ||
142 | ;; | ||
143 | sub ret0=src,orig // distance from base | ||
144 | sub tmp=8,val1 // which byte in word | ||
145 | mov pr=saved_pr,0xffffffffffff0000 | ||
146 | ;; | ||
147 | sub ret0=ret0,tmp // adjust | ||
148 | mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what | ||
149 | br.ret.sptk.many rp // end of normal execution | ||
150 | |||
151 | // | ||
152 | // Outlined recovery code when speculation failed | ||
153 | // | ||
154 | // This time we don't use speculation and rely on the normal exception | ||
155 | // mechanism. that's why the loop is not as good as the previous one | ||
156 | // because read ahead is not possible | ||
157 | // | ||
158 | // IMPORTANT: | ||
159 | // Please note that in the case of strlen() as opposed to strlen_user() | ||
160 | // we don't use the exception mechanism, as this function is not | ||
161 | // supposed to fail. If that happens it means we have a bug and the | ||
162 | // code will cause a kernel fault. | ||
163 | // | ||
164 | // XXX Fixme | ||
165 | // - today we restart from the beginning of the string instead | ||
166 | // of trying to continue where we left off. | ||
167 | // | ||
168 | .recover: | ||
169 | ld8 val=[base],8 // will fail if unrecoverable fault | ||
170 | ;; | ||
171 | or val=val,mask // remask first bytes | ||
172 | cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop | ||
173 | ;; | ||
174 | // | ||
175 | // ar.ec is still zero here | ||
176 | // | ||
177 | 2: | ||
178 | (p6) ld8 val=[base],8 // will fail if unrecoverable fault | ||
179 | ;; | ||
180 | czx1.r val1=val // search 0 byte from right | ||
181 | ;; | ||
182 | cmp.eq p6,p0=8,val1 // val1==8 ? | ||
183 | (p6) br.wtop.dptk 2b // loop until p6 == 0 | ||
184 | ;; // (avoid WAW on p63) | ||
185 | sub ret0=base,orig // distance from base | ||
186 | sub tmp=8,val1 | ||
187 | mov pr=saved_pr,0xffffffffffff0000 | ||
188 | ;; | ||
189 | sub ret0=ret0,tmp // adjust (same computation as the normal path) | ||
190 | mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what | ||
191 | br.ret.sptk.many rp // end of successful recovery code | ||
192 | END(strlen) | ||
diff --git a/arch/ia64/lib/strlen_user.S b/arch/ia64/lib/strlen_user.S new file mode 100644 index 000000000000..c71eded4285e --- /dev/null +++ b/arch/ia64/lib/strlen_user.S | |||
@@ -0,0 +1,198 @@ | |||
1 | /* | ||
2 | * Optimized version of the strlen_user() function | ||
3 | * | ||
4 | * Inputs: | ||
5 | * in0 address of buffer | ||
6 | * | ||
7 | * Outputs: | ||
8 | * ret0 0 in case of fault, strlen(buffer)+1 otherwise | ||
9 | * | ||
10 | * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co | ||
11 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
12 | * Stephane Eranian <eranian@hpl.hp.com> | ||
13 | * | ||
14 | * 01/19/99 S.Eranian heavily enhanced version (see details below) | ||
15 | * 09/24/99 S.Eranian added speculation recovery code | ||
16 | */ | ||
17 | |||
18 | #include <asm/asmmacro.h> | ||
19 | |||
20 | // | ||
21 | // int strlen_user(char *) | ||
22 | // ------------------------ | ||
23 | // Returns: | ||
24 | // - length of string + 1 | ||
25 | // - 0 in case an exception is raised | ||
26 | // | ||
27 | // This is an enhanced version of the basic strlen_user. it includes a | ||
28 | // combination of compute zero index (czx), parallel comparisons, speculative | ||
29 | // loads and loop unroll using rotating registers. | ||
30 | // | ||
31 | // General Ideas about the algorithm: | ||
32 | // The goal is to look at the string in chunks of 8 bytes. | ||
33 | // so we need to do a few extra checks at the beginning because the | ||
34 | // string may not be 8-byte aligned. In this case we load the 8byte | ||
35 | // quantity which includes the start of the string and mask the unused | ||
36 | // bytes with 0xff to avoid confusing czx. | ||
37 | // We use speculative loads and software pipelining to hide memory | ||
38 | // latency and do read ahead safely. This way we defer any exception. | ||
39 | // | ||
40 | // Because we don't want the kernel to be relying on particular | ||
41 | // settings of the DCR register, we provide recovery code in case | ||
42 | // speculation fails. The recovery code is going to "redo" the work using | ||
43 | // only normal loads. If we still get a fault then we return an | ||
44 | // error (ret0=0). Otherwise we return the strlen+1 as usual. | ||
45 | // The fact that speculation may fail can be caused, for instance, by | ||
46 | // the DCR.dm bit being set. In this case TLB misses are deferred, i.e., | ||
47 | // a NaT bit will be set if the translation is not present. The normal | ||
48 | // load, on the other hand, will cause the translation to be inserted | ||
49 | // if the mapping exists. | ||
50 | // | ||
51 | // It should be noted that we execute recovery code only when we need | ||
52 | // to use the data that has been speculatively loaded: we don't execute | ||
53 | // recovery code on pure read ahead data. | ||
54 | // | ||
55 | // Remarks: | ||
56 | // - the cmp r0,r0 is used as a fast way to initialize a predicate | ||
57 | // register to 1. This is required to make sure that we get the parallel | ||
58 | // compare correct. | ||
59 | // | ||
60 | // - we don't use the epilogue counter to exit the loop but we need to set | ||
61 | // it to zero beforehand. | ||
62 | // | ||
63 | // - after the loop we must test for Nat values because neither the | ||
64 | // czx nor cmp instruction raise a NaT consumption fault. We must be | ||
65 | // careful not to look too far for a Nat for which we don't care. | ||
66 | // For instance we don't need to look at a NaT in val2 if the zero byte | ||
67 | // was in val1. | ||
68 | // | ||
69 | // - Clearly performance tuning is required. | ||
70 | // | ||
71 | |||
// Register aliases used by __strlen_user:
//   saved_pfs, saved_pr - copies of ar.pfs and pr, written back before returning
//   orig                - the caller's buffer address (in0)
//   src                 - read pointer, starts at in0 rounded down to 8 bytes
//   base                - aligned start address, consumed by the recovery loop
//   mask                - OR-ed into the first word to hide the bytes before the string start
//   val                 - word loaded by the recovery loop
//   val1, val2          - czx1.r results (8 is what the loops treat as "no zero byte")
//   tmp                 - scratch
72 | #define saved_pfs r11 | ||
73 | #define tmp r10 | ||
74 | #define base r16 | ||
75 | #define orig r17 | ||
76 | #define saved_pr r18 | ||
77 | #define src r19 | ||
78 | #define mask r20 | ||
79 | #define val r21 | ||
80 | #define val1 r22 | ||
81 | #define val2 r23 | ||
82 | |||
// __strlen_user(in0): returns in ret0 the string length plus one (the NUL is counted),
// or 0 when the exit at .Lexit1 is taken.
// The main loop checks two 8-byte words per iteration using speculative loads only;
// .recover rescans from the aligned start (base) with normal loads wrapped in EX().
// NOTE(review): EX() is defined outside this file; presumably it registers .Lexit1 as the
// continuation for a faulting load — confirm against asm/asmmacro.h.
83 | GLOBAL_ENTRY(__strlen_user) | ||
84 | .prologue | ||
85 | .save ar.pfs, saved_pfs | ||
86 | alloc saved_pfs=ar.pfs,11,0,0,8 | ||
87 | |||
88 | .rotr v[2], w[2] // declares our 4 aliases | ||
89 | |||
90 | extr.u tmp=in0,0,3 // tmp=least significant 3 bits | ||
91 | mov orig=in0 // keep track of initial byte address | ||
92 | dep src=0,in0,0,3 // src=8byte-aligned in0 address | ||
93 | .save pr, saved_pr | ||
94 | mov saved_pr=pr // preserve predicates (rotation) | ||
95 | ;; | ||
96 | |||
97 | .body | ||
98 | |||
99 | ld8.s v[1]=[src],8 // load the initial 8bytes (must speculate) | ||
100 | shl tmp=tmp,3 // multiply by 8bits/byte | ||
101 | mov mask=-1 // our mask | ||
102 | ;; | ||
103 | ld8.s w[1]=[src],8 // load next 8 bytes in 2nd pipeline | ||
104 | cmp.eq p6,p0=r0,r0 // sets p6 (required because of // cmp.and) | ||
105 | sub tmp=64,tmp // how many bits to shift our mask on the right | ||
106 | ;; | ||
107 | shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part | ||
108 | mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs) | ||
109 | ;; | ||
110 | add base=-16,src // keep track of aligned base | ||
111 | chk.s v[1], .recover // if already NaT, then directly skip to recover | ||
112 | or v[1]=v[1],mask // now we have a safe initial byte pattern | ||
113 | ;; | ||
114 | 1: | ||
115 | ld8.s v[0]=[src],8 // speculatively load next | ||
116 | czx1.r val1=v[1] // search 0 byte from right | ||
117 | czx1.r val2=w[1] // search 0 byte from right following 8bytes | ||
118 | ;; | ||
119 | ld8.s w[0]=[src],8 // speculatively load next to next | ||
120 | cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8 | ||
121 | cmp.eq.and p6,p0=8,val2 // p6 = p6 and val2==8 | ||
122 | (p6) br.wtop.dptk.few 1b // loop until p6 == 0 | ||
123 | ;; | ||
124 | // | ||
125 | // We must try the recovery code iff | ||
126 | // val1_is_nat || (val1==8 && val2_is_nat) | ||
127 | // | ||
128 | // XXX Fixme | ||
129 | // - there must be a better way of doing the test | ||
130 | // | ||
131 | cmp.eq p8,p9=8,val1 // p8 = val1 had no zero byte (disambiguate) | ||
132 | tnat.nz p6,p7=val1 // test NaT on val1 | ||
133 | (p6) br.cond.spnt .recover // jump to recovery if val1 is NaT | ||
134 | ;; | ||
135 | // | ||
136 | // if we come here p7 is true, i.e., initialized for // cmp | ||
137 | // | ||
138 | cmp.eq.and p7,p0=8,val1// val1==8? | ||
139 | tnat.nz.and p7,p0=val2 // test NaT on val2 | ||
140 | (p7) br.cond.spnt .recover // jump to recovery if val2 is NaT | ||
141 | ;; | ||
142 | (p8) mov val1=val2 // val2 contains the value | ||
143 | (p8) adds src=-16,src // correct position when 3 ahead | ||
144 | (p9) adds src=-24,src // correct position when 4 ahead | ||
145 | ;; | ||
146 | sub ret0=src,orig // distance from origin | ||
147 | sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 | ||
148 | mov pr=saved_pr,0xffffffffffff0000 | ||
149 | ;; | ||
150 | sub ret0=ret0,tmp // length=now - back -1 | ||
151 | mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what | ||
152 | br.ret.sptk.many rp // end of normal execution | ||
153 | |||
154 | // | ||
155 | // Outlined recovery code when speculation failed | ||
156 | // | ||
157 | // This time we don't use speculation and rely on the normal exception | ||
158 | // mechanism. that's why the loop is not as good as the previous one | ||
159 | // because read ahead is not possible | ||
160 | // | ||
161 | // XXX Fixme | ||
162 | // - today we restart from the beginning of the string instead | ||
163 | // of trying to continue where we left off. | ||
164 | // | ||
165 | .recover: | ||
166 | EX(.Lexit1, ld8 val=[base],8) // load the initial bytes | ||
167 | ;; | ||
168 | or val=val,mask // remask first bytes | ||
169 | cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop | ||
170 | ;; | ||
171 | // | ||
172 | // ar.ec is still zero here | ||
173 | // | ||
174 | 2: | ||
175 | EX(.Lexit1, (p6) ld8 val=[base],8) | ||
176 | ;; | ||
177 | czx1.r val1=val // search 0 byte from right | ||
178 | ;; | ||
179 | cmp.eq p6,p0=8,val1 // val1==8 ? | ||
180 | (p6) br.wtop.dptk.few 2b // loop until p6 == 0 | ||
181 | ;; | ||
182 | sub ret0=base,orig // distance from base | ||
183 | sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 | ||
184 | mov pr=saved_pr,0xffffffffffff0000 | ||
185 | ;; | ||
186 | sub ret0=ret0,tmp // length=now - back -1 | ||
187 | mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what | ||
188 | br.ret.sptk.many rp // end of successful recovery code | ||
189 | |||
190 | // | ||
191 | // We failed even on the normal load (called from exception handler) | ||
192 | // | ||
193 | .Lexit1: | ||
194 | mov ret0=0 | ||
195 | mov pr=saved_pr,0xffffffffffff0000 | ||
196 | mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what | ||
197 | br.ret.sptk.many rp | ||
198 | END(__strlen_user) | ||
diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S new file mode 100644 index 000000000000..a504381f31eb --- /dev/null +++ b/arch/ia64/lib/strncpy_from_user.S | |||
@@ -0,0 +1,44 @@ | |||
1 | /* | ||
2 | * Just like strncpy() except that if a fault occurs during copying, | ||
3 | * -EFAULT is returned. | ||
4 | * | ||
5 | * Inputs: | ||
6 | * in0: address of destination buffer | ||
7 | * in1: address of string to be copied | ||
8 | * in2: length of buffer in bytes | ||
9 | * Outputs: | ||
10 | * r8: -EFAULT in case of fault or number of bytes copied if no fault | ||
11 | * | ||
12 | * Copyright (C) 1998-2001 Hewlett-Packard Co | ||
13 | * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com> | ||
14 | * | ||
15 | * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by | ||
16 | * Andreas Schwab <schwab@suse.de>). | ||
17 | */ | ||
18 | |||
19 | #include <asm/asmmacro.h> | ||
20 | |||
// __strncpy_from_user(in0=dst, in1=src, in2=len): byte-wise copy from src to dst that stops
// after storing a NUL or after len bytes, whichever comes first.
// r8 on return: 0 if len == 0; the number of bytes copied before the NUL if one was stored;
// len if the buffer filled up without a NUL.
// NOTE(review): EX() is defined outside this file; per the file header a fault yields
// -EFAULT, presumably set by the exception fixup before resuming at .Lexit — confirm.
21 | GLOBAL_ENTRY(__strncpy_from_user) | ||
22 | alloc r2=ar.pfs,3,0,0,0 | ||
// r8 = 0 is the result for the len == 0 early return; r9 remembers the start of the source.
23 | mov r8=0 | ||
24 | mov r9=in1 | ||
25 | ;; | ||
// r10 = source address reached once len bytes have been read.
26 | add r10=in1,in2 | ||
27 | cmp.eq p6,p0=r0,in2 | ||
28 | (p6) br.ret.spnt.many rp | ||
29 | |||
30 | // XXX braindead copy loop---this needs to be optimized | ||
31 | .Loop1: | ||
32 | EX(.Lexit, ld1 r8=[in1],1) | ||
33 | ;; | ||
34 | EX(.Lexit, st1 [in0]=r8,1) | ||
// p6 = byte was not NUL, p7 = byte was NUL (the NUL has already been stored).
35 | cmp.ne p6,p7=r8,r0 | ||
36 | ;; | ||
// Keep looping only while the byte was non-NUL and the source has not reached r10.
37 | (p6) cmp.ne.unc p8,p0=in1,r10 | ||
38 | (p8) br.cond.dpnt.few .Loop1 | ||
39 | ;; | ||
40 | (p6) mov r8=in2 // buffer filled up---return buffer length | ||
41 | (p7) sub r8=in1,r9,1 // return string length (excluding NUL character) | ||
// NOTE(review): the bracketed label tags the br.ret below; presumably this is where a
// faulting EX() access resumes — confirm.
42 | [.Lexit:] | ||
43 | br.ret.sptk.many rp | ||
44 | END(__strncpy_from_user) | ||
diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S new file mode 100644 index 000000000000..d09066b1e49d --- /dev/null +++ b/arch/ia64/lib/strnlen_user.S | |||
@@ -0,0 +1,45 @@ | |||
1 | /* | ||
2 | * Returns 0 if exception before NUL or reaching the supplied limit (N), | ||
3 | * a value greater than N if the string is longer than the limit, else | ||
4 | * strlen. | ||
5 | * | ||
6 | * Inputs: | ||
7 | * in0: address of buffer | ||
8 | * in1: string length limit N | ||
9 | * Outputs: | ||
10 | * r8: 0 in case of fault, strlen(buffer)+1 otherwise | ||
11 | * | ||
12 | * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com> | ||
13 | */ | ||
14 | |||
15 | #include <asm/asmmacro.h> | ||
16 | |||
GLOBAL_ENTRY(__strnlen_user)
	.prologue
	alloc r2=ar.pfs,2,0,0,0
	.save ar.lc, r16
	mov r16=ar.lc			// preserve ar.lc

	.body

	// The loop below is driven by br.cloop, so ar.lc is loaded with N-1.
	// NOTE(review): N == 0 would load -1 into ar.lc; callers presumably
	// never pass a zero limit -- confirm.
	add r3=-1,in1
	;;
	mov ar.lc=r3
	mov r9=0			// r9 = number of bytes examined so far
	;;
	// XXX braindead strlen loop---this needs to be optimized
.Loop1:
	// NOTE(review): EXCLR() comes from <asm/asmmacro.h>; presumably it makes
	// .Lexit the fault-recovery target for the load -- confirm there.
	EXCLR(.Lexit, ld1 r8=[in0],1)	// load next byte, post-increment in0
	add r9=1,r9			// count this byte (a terminating NUL is counted too)
	;;
	cmp.eq p6,p0=r8,r0		// p6 = byte is NUL
(p6)	br.cond.dpnt .Lexit
	br.cloop.dptk.few .Loop1

	add r9=1,in1			// NUL not found---return N+1
	;;
.Lexit:
	mov r8=r9			// return value is taken from r9
	mov ar.lc=r16			// restore ar.lc
	br.ret.sptk.many rp
END(__strnlen_user)
diff --git a/arch/ia64/lib/swiotlb.c b/arch/ia64/lib/swiotlb.c new file mode 100644 index 000000000000..ab7b3ad99a7f --- /dev/null +++ b/arch/ia64/lib/swiotlb.c | |||
@@ -0,0 +1,658 @@ | |||
1 | /* | ||
2 | * Dynamic DMA mapping support. | ||
3 | * | ||
4 | * This implementation is for IA-64 platforms that do not support | ||
5 | * I/O TLBs (aka DMA address translation hardware). | ||
6 | * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com> | ||
7 | * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com> | ||
8 | * Copyright (C) 2000, 2003 Hewlett-Packard Co | ||
9 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
10 | * | ||
11 | * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. | ||
12 | * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid | ||
13 | * unnecessary i-cache flushing. | ||
14 | * 04/07/.. ak Better overflow handling. Assorted fixes. | ||
15 | */ | ||
16 | |||
17 | #include <linux/cache.h> | ||
18 | #include <linux/mm.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/pci.h> | ||
21 | #include <linux/spinlock.h> | ||
22 | #include <linux/string.h> | ||
23 | #include <linux/types.h> | ||
24 | #include <linux/ctype.h> | ||
25 | |||
26 | #include <asm/io.h> | ||
27 | #include <asm/pci.h> | ||
28 | #include <asm/dma.h> | ||
29 | |||
30 | #include <linux/init.h> | ||
31 | #include <linux/bootmem.h> | ||
32 | |||
/*
 * Offset of 'val' within an 'align'-sized unit.  'align' is used as a bit
 * mask, so it has to be a power of 2.
 */
#define OFFSET(val,align) ((unsigned long)	\
	                   ( (val) & ( (align) - 1)))

/* Kernel virtual / physical address of the data a scatterlist entry describes. */
#define SG_ENT_VIRT_ADDRESS(sg)	(page_address((sg)->page) + (sg)->offset)
#define SG_ENT_PHYS_ADDRESS(SG)	virt_to_phys(SG_ENT_VIRT_ADDRESS(SG))

/*
 * Maximum allowable number of contiguous slabs to map,
 * must be a power of 2.  What is the appropriate value ?
 * The complexity of {map,unmap}_single is linearly dependent on this value.
 */
#define IO_TLB_SEGSIZE	128

/*
 * log of the size of each IO TLB slab.  The number of slabs is command line
 * controllable.
 */
#define IO_TLB_SHIFT 11

/* Set by the "swiotlb=...,force" boot option: bounce every mapping. */
int swiotlb_force;

/*
 * Used to do a quick range check in swiotlb_unmap_single and
 * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
 * API.
 */
static char *io_tlb_start, *io_tlb_end;

/*
 * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
 * io_tlb_end.  This is command line adjustable via setup_io_tlb_npages.
 */
static unsigned long io_tlb_nslabs;

/*
 * When the IOMMU overflows we return a fallback buffer. This sets the size.
 */
static unsigned long io_tlb_overflow = 32*1024;

/* Fallback buffer handed out when the pool is exhausted (see above). */
void *io_tlb_overflow_buffer;

/*
 * This is a free list describing the number of free entries available from
 * each index
 */
static unsigned int *io_tlb_list;
/* Slab index at which the next search for free slots starts. */
static unsigned int io_tlb_index;

/*
 * We need to save away the original address corresponding to a mapped entry
 * for the sync operations.
 */
static unsigned char **io_tlb_orig_addr;

/*
 * Protect the above data structures in the map and unmap calls
 */
static DEFINE_SPINLOCK(io_tlb_lock);
91 | |||
92 | static int __init | ||
93 | setup_io_tlb_npages(char *str) | ||
94 | { | ||
95 | if (isdigit(*str)) { | ||
96 | io_tlb_nslabs = simple_strtoul(str, &str, 0) << | ||
97 | (PAGE_SHIFT - IO_TLB_SHIFT); | ||
98 | /* avoid tail segment of size < IO_TLB_SEGSIZE */ | ||
99 | io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); | ||
100 | } | ||
101 | if (*str == ',') | ||
102 | ++str; | ||
103 | if (!strcmp(str, "force")) | ||
104 | swiotlb_force = 1; | ||
105 | return 1; | ||
106 | } | ||
107 | __setup("swiotlb=", setup_io_tlb_npages); | ||
108 | /* make io_tlb_overflow tunable too? */ | ||
109 | |||
110 | /* | ||
111 | * Statically reserve bounce buffer space and initialize bounce buffer data | ||
112 | * structures for the software IO TLB used to implement the PCI DMA API. | ||
113 | */ | ||
114 | void | ||
115 | swiotlb_init_with_default_size (size_t default_size) | ||
116 | { | ||
117 | unsigned long i; | ||
118 | |||
119 | if (!io_tlb_nslabs) { | ||
120 | io_tlb_nslabs = (default_size >> PAGE_SHIFT); | ||
121 | io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Get IO TLB memory from the low pages | ||
126 | */ | ||
127 | io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs * | ||
128 | (1 << IO_TLB_SHIFT)); | ||
129 | if (!io_tlb_start) | ||
130 | panic("Cannot allocate SWIOTLB buffer"); | ||
131 | io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT); | ||
132 | |||
133 | /* | ||
134 | * Allocate and initialize the free list array. This array is used | ||
135 | * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE | ||
136 | * between io_tlb_start and io_tlb_end. | ||
137 | */ | ||
138 | io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); | ||
139 | for (i = 0; i < io_tlb_nslabs; i++) | ||
140 | io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); | ||
141 | io_tlb_index = 0; | ||
142 | io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(char *)); | ||
143 | |||
144 | /* | ||
145 | * Get the overflow emergency buffer | ||
146 | */ | ||
147 | io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); | ||
148 | printk(KERN_INFO "Placing software IO TLB between 0x%lx - 0x%lx\n", | ||
149 | virt_to_phys(io_tlb_start), virt_to_phys(io_tlb_end)); | ||
150 | } | ||
151 | |||
152 | void | ||
153 | swiotlb_init (void) | ||
154 | { | ||
155 | swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */ | ||
156 | } | ||
157 | |||
158 | static inline int | ||
159 | address_needs_mapping(struct device *hwdev, dma_addr_t addr) | ||
160 | { | ||
161 | dma_addr_t mask = 0xffffffff; | ||
162 | /* If the device has a mask, use it, otherwise default to 32 bits */ | ||
163 | if (hwdev && hwdev->dma_mask) | ||
164 | mask = *hwdev->dma_mask; | ||
165 | return (addr & ~mask) != 0; | ||
166 | } | ||
167 | |||
168 | /* | ||
169 | * Allocates bounce buffer and returns its kernel virtual address. | ||
170 | */ | ||
171 | static void * | ||
172 | map_single(struct device *hwdev, char *buffer, size_t size, int dir) | ||
173 | { | ||
174 | unsigned long flags; | ||
175 | char *dma_addr; | ||
176 | unsigned int nslots, stride, index, wrap; | ||
177 | int i; | ||
178 | |||
179 | /* | ||
180 | * For mappings greater than a page, we limit the stride (and | ||
181 | * hence alignment) to a page size. | ||
182 | */ | ||
183 | nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; | ||
184 | if (size > PAGE_SIZE) | ||
185 | stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); | ||
186 | else | ||
187 | stride = 1; | ||
188 | |||
189 | if (!nslots) | ||
190 | BUG(); | ||
191 | |||
192 | /* | ||
193 | * Find suitable number of IO TLB entries size that will fit this | ||
194 | * request and allocate a buffer from that IO TLB pool. | ||
195 | */ | ||
196 | spin_lock_irqsave(&io_tlb_lock, flags); | ||
197 | { | ||
198 | wrap = index = ALIGN(io_tlb_index, stride); | ||
199 | |||
200 | if (index >= io_tlb_nslabs) | ||
201 | wrap = index = 0; | ||
202 | |||
203 | do { | ||
204 | /* | ||
205 | * If we find a slot that indicates we have 'nslots' | ||
206 | * number of contiguous buffers, we allocate the | ||
207 | * buffers from that slot and mark the entries as '0' | ||
208 | * indicating unavailable. | ||
209 | */ | ||
210 | if (io_tlb_list[index] >= nslots) { | ||
211 | int count = 0; | ||
212 | |||
213 | for (i = index; i < (int) (index + nslots); i++) | ||
214 | io_tlb_list[i] = 0; | ||
215 | for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) | ||
216 | io_tlb_list[i] = ++count; | ||
217 | dma_addr = io_tlb_start + (index << IO_TLB_SHIFT); | ||
218 | |||
219 | /* | ||
220 | * Update the indices to avoid searching in | ||
221 | * the next round. | ||
222 | */ | ||
223 | io_tlb_index = ((index + nslots) < io_tlb_nslabs | ||
224 | ? (index + nslots) : 0); | ||
225 | |||
226 | goto found; | ||
227 | } | ||
228 | index += stride; | ||
229 | if (index >= io_tlb_nslabs) | ||
230 | index = 0; | ||
231 | } while (index != wrap); | ||
232 | |||
233 | spin_unlock_irqrestore(&io_tlb_lock, flags); | ||
234 | return NULL; | ||
235 | } | ||
236 | found: | ||
237 | spin_unlock_irqrestore(&io_tlb_lock, flags); | ||
238 | |||
239 | /* | ||
240 | * Save away the mapping from the original address to the DMA address. | ||
241 | * This is needed when we sync the memory. Then we sync the buffer if | ||
242 | * needed. | ||
243 | */ | ||
244 | io_tlb_orig_addr[index] = buffer; | ||
245 | if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) | ||
246 | memcpy(dma_addr, buffer, size); | ||
247 | |||
248 | return dma_addr; | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * dma_addr is the kernel virtual address of the bounce buffer to unmap. | ||
253 | */ | ||
254 | static void | ||
255 | unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) | ||
256 | { | ||
257 | unsigned long flags; | ||
258 | int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; | ||
259 | int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; | ||
260 | char *buffer = io_tlb_orig_addr[index]; | ||
261 | |||
262 | /* | ||
263 | * First, sync the memory before unmapping the entry | ||
264 | */ | ||
265 | if (buffer && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) | ||
266 | /* | ||
267 | * bounce... copy the data back into the original buffer * and | ||
268 | * delete the bounce buffer. | ||
269 | */ | ||
270 | memcpy(buffer, dma_addr, size); | ||
271 | |||
272 | /* | ||
273 | * Return the buffer to the free list by setting the corresponding | ||
274 | * entries to indicate the number of contigous entries available. | ||
275 | * While returning the entries to the free list, we merge the entries | ||
276 | * with slots below and above the pool being returned. | ||
277 | */ | ||
278 | spin_lock_irqsave(&io_tlb_lock, flags); | ||
279 | { | ||
280 | count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? | ||
281 | io_tlb_list[index + nslots] : 0); | ||
282 | /* | ||
283 | * Step 1: return the slots to the free list, merging the | ||
284 | * slots with superceeding slots | ||
285 | */ | ||
286 | for (i = index + nslots - 1; i >= index; i--) | ||
287 | io_tlb_list[i] = ++count; | ||
288 | /* | ||
289 | * Step 2: merge the returned slots with the preceding slots, | ||
290 | * if available (non zero) | ||
291 | */ | ||
292 | for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) | ||
293 | io_tlb_list[i] = ++count; | ||
294 | } | ||
295 | spin_unlock_irqrestore(&io_tlb_lock, flags); | ||
296 | } | ||
297 | |||
298 | static void | ||
299 | sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir) | ||
300 | { | ||
301 | int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; | ||
302 | char *buffer = io_tlb_orig_addr[index]; | ||
303 | |||
304 | /* | ||
305 | * bounce... copy the data back into/from the original buffer | ||
306 | * XXX How do you handle DMA_BIDIRECTIONAL here ? | ||
307 | */ | ||
308 | if (dir == DMA_FROM_DEVICE) | ||
309 | memcpy(buffer, dma_addr, size); | ||
310 | else if (dir == DMA_TO_DEVICE) | ||
311 | memcpy(dma_addr, buffer, size); | ||
312 | else | ||
313 | BUG(); | ||
314 | } | ||
315 | |||
316 | void * | ||
317 | swiotlb_alloc_coherent(struct device *hwdev, size_t size, | ||
318 | dma_addr_t *dma_handle, int flags) | ||
319 | { | ||
320 | unsigned long dev_addr; | ||
321 | void *ret; | ||
322 | int order = get_order(size); | ||
323 | |||
324 | /* | ||
325 | * XXX fix me: the DMA API should pass us an explicit DMA mask | ||
326 | * instead, or use ZONE_DMA32 (ia64 overloads ZONE_DMA to be a ~32 | ||
327 | * bit range instead of a 16MB one). | ||
328 | */ | ||
329 | flags |= GFP_DMA; | ||
330 | |||
331 | ret = (void *)__get_free_pages(flags, order); | ||
332 | if (ret && address_needs_mapping(hwdev, virt_to_phys(ret))) { | ||
333 | /* | ||
334 | * The allocated memory isn't reachable by the device. | ||
335 | * Fall back on swiotlb_map_single(). | ||
336 | */ | ||
337 | free_pages((unsigned long) ret, order); | ||
338 | ret = NULL; | ||
339 | } | ||
340 | if (!ret) { | ||
341 | /* | ||
342 | * We are either out of memory or the device can't DMA | ||
343 | * to GFP_DMA memory; fall back on | ||
344 | * swiotlb_map_single(), which will grab memory from | ||
345 | * the lowest available address range. | ||
346 | */ | ||
347 | dma_addr_t handle; | ||
348 | handle = swiotlb_map_single(NULL, NULL, size, DMA_FROM_DEVICE); | ||
349 | if (dma_mapping_error(handle)) | ||
350 | return NULL; | ||
351 | |||
352 | ret = phys_to_virt(handle); | ||
353 | } | ||
354 | |||
355 | memset(ret, 0, size); | ||
356 | dev_addr = virt_to_phys(ret); | ||
357 | |||
358 | /* Confirm address can be DMA'd by device */ | ||
359 | if (address_needs_mapping(hwdev, dev_addr)) { | ||
360 | printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016lx\n", | ||
361 | (unsigned long long)*hwdev->dma_mask, dev_addr); | ||
362 | panic("swiotlb_alloc_coherent: allocated memory is out of " | ||
363 | "range for device"); | ||
364 | } | ||
365 | *dma_handle = dev_addr; | ||
366 | return ret; | ||
367 | } | ||
368 | |||
369 | void | ||
370 | swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, | ||
371 | dma_addr_t dma_handle) | ||
372 | { | ||
373 | if (!(vaddr >= (void *)io_tlb_start | ||
374 | && vaddr < (void *)io_tlb_end)) | ||
375 | free_pages((unsigned long) vaddr, get_order(size)); | ||
376 | else | ||
377 | /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ | ||
378 | swiotlb_unmap_single (hwdev, dma_handle, size, DMA_TO_DEVICE); | ||
379 | } | ||
380 | |||
381 | static void | ||
382 | swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) | ||
383 | { | ||
384 | /* | ||
385 | * Ran out of IOMMU space for this operation. This is very bad. | ||
386 | * Unfortunately the drivers cannot handle this operation properly. | ||
387 | * unless they check for pci_dma_mapping_error (most don't) | ||
388 | * When the mapping is small enough return a static buffer to limit | ||
389 | * the damage, or panic when the transfer is too big. | ||
390 | */ | ||
391 | printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at " | ||
392 | "device %s\n", size, dev ? dev->bus_id : "?"); | ||
393 | |||
394 | if (size > io_tlb_overflow && do_panic) { | ||
395 | if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) | ||
396 | panic("PCI-DMA: Memory would be corrupted\n"); | ||
397 | if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) | ||
398 | panic("PCI-DMA: Random memory would be DMAed\n"); | ||
399 | } | ||
400 | } | ||
401 | |||
402 | /* | ||
403 | * Map a single buffer of the indicated size for DMA in streaming mode. The | ||
404 | * PCI address to use is returned. | ||
405 | * | ||
406 | * Once the device is given the dma address, the device owns this memory until | ||
407 | * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. | ||
408 | */ | ||
409 | dma_addr_t | ||
410 | swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir) | ||
411 | { | ||
412 | unsigned long dev_addr = virt_to_phys(ptr); | ||
413 | void *map; | ||
414 | |||
415 | if (dir == DMA_NONE) | ||
416 | BUG(); | ||
417 | /* | ||
418 | * If the pointer passed in happens to be in the device's DMA window, | ||
419 | * we can safely return the device addr and not worry about bounce | ||
420 | * buffering it. | ||
421 | */ | ||
422 | if (!address_needs_mapping(hwdev, dev_addr) && !swiotlb_force) | ||
423 | return dev_addr; | ||
424 | |||
425 | /* | ||
426 | * Oh well, have to allocate and map a bounce buffer. | ||
427 | */ | ||
428 | map = map_single(hwdev, ptr, size, dir); | ||
429 | if (!map) { | ||
430 | swiotlb_full(hwdev, size, dir, 1); | ||
431 | map = io_tlb_overflow_buffer; | ||
432 | } | ||
433 | |||
434 | dev_addr = virt_to_phys(map); | ||
435 | |||
436 | /* | ||
437 | * Ensure that the address returned is DMA'ble | ||
438 | */ | ||
439 | if (address_needs_mapping(hwdev, dev_addr)) | ||
440 | panic("map_single: bounce buffer is not DMA'ble"); | ||
441 | |||
442 | return dev_addr; | ||
443 | } | ||
444 | |||
445 | /* | ||
446 | * Since DMA is i-cache coherent, any (complete) pages that were written via | ||
447 | * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to | ||
448 | * flush them when they get mapped into an executable vm-area. | ||
449 | */ | ||
450 | static void | ||
451 | mark_clean(void *addr, size_t size) | ||
452 | { | ||
453 | unsigned long pg_addr, end; | ||
454 | |||
455 | pg_addr = PAGE_ALIGN((unsigned long) addr); | ||
456 | end = (unsigned long) addr + size; | ||
457 | while (pg_addr + PAGE_SIZE <= end) { | ||
458 | struct page *page = virt_to_page(pg_addr); | ||
459 | set_bit(PG_arch_1, &page->flags); | ||
460 | pg_addr += PAGE_SIZE; | ||
461 | } | ||
462 | } | ||
463 | |||
464 | /* | ||
465 | * Unmap a single streaming mode DMA translation. The dma_addr and size must | ||
466 | * match what was provided for in a previous swiotlb_map_single call. All | ||
467 | * other usages are undefined. | ||
468 | * | ||
469 | * After this call, reads by the cpu to the buffer are guaranteed to see | ||
470 | * whatever the device wrote there. | ||
471 | */ | ||
472 | void | ||
473 | swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, | ||
474 | int dir) | ||
475 | { | ||
476 | char *dma_addr = phys_to_virt(dev_addr); | ||
477 | |||
478 | if (dir == DMA_NONE) | ||
479 | BUG(); | ||
480 | if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) | ||
481 | unmap_single(hwdev, dma_addr, size, dir); | ||
482 | else if (dir == DMA_FROM_DEVICE) | ||
483 | mark_clean(dma_addr, size); | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * Make physical memory consistent for a single streaming mode DMA translation | ||
488 | * after a transfer. | ||
489 | * | ||
490 | * If you perform a swiotlb_map_single() but wish to interrogate the buffer | ||
491 | * using the cpu, yet do not wish to teardown the PCI dma mapping, you must | ||
492 | * call this function before doing so. At the next point you give the PCI dma | ||
493 | * address back to the card, you must first perform a | ||
494 | * swiotlb_dma_sync_for_device, and then the device again owns the buffer | ||
495 | */ | ||
496 | void | ||
497 | swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, | ||
498 | size_t size, int dir) | ||
499 | { | ||
500 | char *dma_addr = phys_to_virt(dev_addr); | ||
501 | |||
502 | if (dir == DMA_NONE) | ||
503 | BUG(); | ||
504 | if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) | ||
505 | sync_single(hwdev, dma_addr, size, dir); | ||
506 | else if (dir == DMA_FROM_DEVICE) | ||
507 | mark_clean(dma_addr, size); | ||
508 | } | ||
509 | |||
510 | void | ||
511 | swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, | ||
512 | size_t size, int dir) | ||
513 | { | ||
514 | char *dma_addr = phys_to_virt(dev_addr); | ||
515 | |||
516 | if (dir == DMA_NONE) | ||
517 | BUG(); | ||
518 | if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) | ||
519 | sync_single(hwdev, dma_addr, size, dir); | ||
520 | else if (dir == DMA_FROM_DEVICE) | ||
521 | mark_clean(dma_addr, size); | ||
522 | } | ||
523 | |||
524 | /* | ||
525 | * Map a set of buffers described by scatterlist in streaming mode for DMA. | ||
526 | * This is the scatter-gather version of the above swiotlb_map_single | ||
527 | * interface. Here the scatter gather list elements are each tagged with the | ||
528 | * appropriate dma address and length. They are obtained via | ||
529 | * sg_dma_{address,length}(SG). | ||
530 | * | ||
531 | * NOTE: An implementation may be able to use a smaller number of | ||
532 | * DMA address/length pairs than there are SG table elements. | ||
533 | * (for example via virtual mapping capabilities) | ||
534 | * The routine returns the number of addr/length pairs actually | ||
535 | * used, at most nents. | ||
536 | * | ||
537 | * Device ownership issues as mentioned above for swiotlb_map_single are the | ||
538 | * same here. | ||
539 | */ | ||
540 | int | ||
541 | swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems, | ||
542 | int dir) | ||
543 | { | ||
544 | void *addr; | ||
545 | unsigned long dev_addr; | ||
546 | int i; | ||
547 | |||
548 | if (dir == DMA_NONE) | ||
549 | BUG(); | ||
550 | |||
551 | for (i = 0; i < nelems; i++, sg++) { | ||
552 | addr = SG_ENT_VIRT_ADDRESS(sg); | ||
553 | dev_addr = virt_to_phys(addr); | ||
554 | if (swiotlb_force || address_needs_mapping(hwdev, dev_addr)) { | ||
555 | sg->dma_address = (dma_addr_t) virt_to_phys(map_single(hwdev, addr, sg->length, dir)); | ||
556 | if (!sg->dma_address) { | ||
557 | /* Don't panic here, we expect map_sg users | ||
558 | to do proper error handling. */ | ||
559 | swiotlb_full(hwdev, sg->length, dir, 0); | ||
560 | swiotlb_unmap_sg(hwdev, sg - i, i, dir); | ||
561 | sg[0].dma_length = 0; | ||
562 | return 0; | ||
563 | } | ||
564 | } else | ||
565 | sg->dma_address = dev_addr; | ||
566 | sg->dma_length = sg->length; | ||
567 | } | ||
568 | return nelems; | ||
569 | } | ||
570 | |||
571 | /* | ||
572 | * Unmap a set of streaming mode DMA translations. Again, cpu read rules | ||
573 | * concerning calls here are the same as for swiotlb_unmap_single() above. | ||
574 | */ | ||
575 | void | ||
576 | swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems, | ||
577 | int dir) | ||
578 | { | ||
579 | int i; | ||
580 | |||
581 | if (dir == DMA_NONE) | ||
582 | BUG(); | ||
583 | |||
584 | for (i = 0; i < nelems; i++, sg++) | ||
585 | if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) | ||
586 | unmap_single(hwdev, (void *) phys_to_virt(sg->dma_address), sg->dma_length, dir); | ||
587 | else if (dir == DMA_FROM_DEVICE) | ||
588 | mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length); | ||
589 | } | ||
590 | |||
591 | /* | ||
592 | * Make physical memory consistent for a set of streaming mode DMA translations | ||
593 | * after a transfer. | ||
594 | * | ||
595 | * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules | ||
596 | * and usage. | ||
597 | */ | ||
598 | void | ||
599 | swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, | ||
600 | int nelems, int dir) | ||
601 | { | ||
602 | int i; | ||
603 | |||
604 | if (dir == DMA_NONE) | ||
605 | BUG(); | ||
606 | |||
607 | for (i = 0; i < nelems; i++, sg++) | ||
608 | if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) | ||
609 | sync_single(hwdev, (void *) sg->dma_address, | ||
610 | sg->dma_length, dir); | ||
611 | } | ||
612 | |||
613 | void | ||
614 | swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, | ||
615 | int nelems, int dir) | ||
616 | { | ||
617 | int i; | ||
618 | |||
619 | if (dir == DMA_NONE) | ||
620 | BUG(); | ||
621 | |||
622 | for (i = 0; i < nelems; i++, sg++) | ||
623 | if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) | ||
624 | sync_single(hwdev, (void *) sg->dma_address, | ||
625 | sg->dma_length, dir); | ||
626 | } | ||
627 | |||
628 | int | ||
629 | swiotlb_dma_mapping_error(dma_addr_t dma_addr) | ||
630 | { | ||
631 | return (dma_addr == virt_to_phys(io_tlb_overflow_buffer)); | ||
632 | } | ||
633 | |||
634 | /* | ||
635 | * Return whether the given PCI device DMA address mask can be supported | ||
636 | * properly. For example, if your device can only drive the low 24-bits | ||
637 | * during PCI bus mastering, then you would pass 0x00ffffff as the mask to | ||
638 | * this function. | ||
639 | */ | ||
640 | int | ||
641 | swiotlb_dma_supported (struct device *hwdev, u64 mask) | ||
642 | { | ||
643 | return (virt_to_phys (io_tlb_end) - 1) <= mask; | ||
644 | } | ||
645 | |||
/* Entry points of the software IO TLB that are exported from this file. */
EXPORT_SYMBOL(swiotlb_init);
EXPORT_SYMBOL(swiotlb_map_single);
EXPORT_SYMBOL(swiotlb_unmap_single);
EXPORT_SYMBOL(swiotlb_map_sg);
EXPORT_SYMBOL(swiotlb_unmap_sg);
EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
EXPORT_SYMBOL(swiotlb_sync_single_for_device);
EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
EXPORT_SYMBOL(swiotlb_dma_mapping_error);
EXPORT_SYMBOL(swiotlb_alloc_coherent);
EXPORT_SYMBOL(swiotlb_free_coherent);
EXPORT_SYMBOL(swiotlb_dma_supported);
diff --git a/arch/ia64/lib/xor.S b/arch/ia64/lib/xor.S new file mode 100644 index 000000000000..54e3f7eab8e9 --- /dev/null +++ b/arch/ia64/lib/xor.S | |||
@@ -0,0 +1,184 @@ | |||
1 | /* | ||
2 | * arch/ia64/lib/xor.S | ||
3 | * | ||
4 | * Optimized RAID-5 checksumming functions for IA-64. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2, or (at your option) | ||
9 | * any later version. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
13 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
14 | */ | ||
15 | |||
16 | #include <asm/asmmacro.h> | ||
17 | |||
/*
 * XOR two buffers, 8 bytes at a time, storing the result over the first.
 *
 *	in0:	length; shifted right by 3 below, i.e. used as a byte count
 *		giving the number of 8-byte words
 *	in1:	first source, also the destination
 *	in2:	second source
 *
 * The loop uses rotating registers/predicates with br.ctop: loads are issued
 * in stage 0, the xor in stage 6 and the store in stage 7.
 */
GLOBAL_ENTRY(xor_ia64_2)
	.prologue
	.fframe 0
	.save ar.pfs, r31
	alloc r31 = ar.pfs, 3, 0, 13, 16
	.save ar.lc, r30
	mov r30 = ar.lc			// saved; restored before returning
	.save pr, r29
	mov r29 = pr			// saved; restored before returning
	;;
	.body
	mov r8 = in1			// r8 = store pointer (destination)
	mov ar.ec = 6 + 2		// epilogue count = number of stages
	shr in0 = in0, 3		// length / 8
	;;
	adds in0 = -1, in0		// loop count register wants iterations - 1
	mov r16 = in1			// r16 = load pointer for source 1
	mov r17 = in2			// r17 = load pointer for source 2
	;;
	mov ar.lc = in0
	mov pr.rot = 1 << 16		// only the stage-0 predicate (p16) is set
	;;
	.rotr s1[6+1], s2[6+1], d[2]
	.rotp p[6+2]
0:
(p[0])	ld8.nta s1[0] = [r16], 8
(p[0])	ld8.nta s2[0] = [r17], 8
(p[6])	xor d[0] = s1[6], s2[6]
(p[6+1])st8.nta [r8] = d[1], 8	// store the xor computed one rotation earlier
	nop.f 0
	br.ctop.dptk.few 0b
	;;
	mov ar.lc = r30
	mov pr = r29, -1
	br.ret.sptk.few rp
END(xor_ia64_2)
54 | |||
/*
 * XOR three buffers, 8 bytes at a time, storing the result over the first.
 *
 *	in0:	length; shifted right by 3 below, i.e. used as a byte count
 *		giving the number of 8-byte words
 *	in1:	first source, also the destination
 *	in2:	second source
 *	in3:	third source
 *
 * Same rotating-register loop structure as xor_ia64_2.
 */
GLOBAL_ENTRY(xor_ia64_3)
	.prologue
	.fframe 0
	.save ar.pfs, r31
	alloc r31 = ar.pfs, 4, 0, 20, 24
	.save ar.lc, r30
	mov r30 = ar.lc			// saved; restored before returning
	.save pr, r29
	mov r29 = pr			// saved; restored before returning
	;;
	.body
	mov r8 = in1			// r8 = store pointer (destination)
	mov ar.ec = 6 + 2		// epilogue count = number of stages
	shr in0 = in0, 3		// length / 8
	;;
	adds in0 = -1, in0		// loop count register wants iterations - 1
	mov r16 = in1			// load pointers for the three sources
	mov r17 = in2
	;;
	mov r18 = in3
	mov ar.lc = in0
	mov pr.rot = 1 << 16		// only the stage-0 predicate (p16) is set
	;;
	.rotr s1[6+1], s2[6+1], s3[6+1], d[2]
	.rotp p[6+2]
0:
(p[0])	ld8.nta s1[0] = [r16], 8
(p[0])	ld8.nta s2[0] = [r17], 8
(p[6])	xor d[0] = s1[6], s2[6]
	;;
(p[0])	ld8.nta s3[0] = [r18], 8
(p[6+1])st8.nta [r8] = d[1], 8	// store the xor computed one rotation earlier
(p[6])	xor d[0] = d[0], s3[6]	// fold in the third source
	br.ctop.dptk.few 0b
	;;
	mov ar.lc = r30
	mov pr = r29, -1
	br.ret.sptk.few rp
END(xor_ia64_3)
94 | |||
/*
 * XOR four buffers, 8 bytes at a time, storing the result over the first.
 *
 *	in0:	length; shifted right by 3 below, i.e. used as a byte count
 *		giving the number of 8-byte words
 *	in1:	first source, also the destination
 *	in2-in4: remaining sources
 *
 * Same rotating-register loop structure as xor_ia64_2; r20 holds the
 * intermediate s3 ^ s4.
 */
GLOBAL_ENTRY(xor_ia64_4)
	.prologue
	.fframe 0
	.save ar.pfs, r31
	alloc r31 = ar.pfs, 5, 0, 27, 32
	.save ar.lc, r30
	mov r30 = ar.lc			// saved; restored before returning
	.save pr, r29
	mov r29 = pr			// saved; restored before returning
	;;
	.body
	mov r8 = in1			// r8 = store pointer (destination)
	mov ar.ec = 6 + 2		// epilogue count = number of stages
	shr in0 = in0, 3		// length / 8
	;;
	adds in0 = -1, in0		// loop count register wants iterations - 1
	mov r16 = in1			// load pointers for the four sources
	mov r17 = in2
	;;
	mov r18 = in3
	mov ar.lc = in0
	mov pr.rot = 1 << 16		// only the stage-0 predicate (p16) is set
	mov r19 = in4
	;;
	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
	.rotp p[6+2]
0:
(p[0])	ld8.nta s1[0] = [r16], 8
(p[0])	ld8.nta s2[0] = [r17], 8
(p[6])	xor d[0] = s1[6], s2[6]
(p[0])	ld8.nta s3[0] = [r18], 8
(p[0])	ld8.nta s4[0] = [r19], 8
(p[6])	xor r20 = s3[6], s4[6]
	;;
(p[6+1])st8.nta [r8] = d[1], 8	// store the xor computed one rotation earlier
(p[6])	xor d[0] = d[0], r20	// combine the two partial results
	br.ctop.dptk.few 0b
	;;
	mov ar.lc = r30
	mov pr = r29, -1
	br.ret.sptk.few rp
END(xor_ia64_4)
137 | |||
/*
 * XOR five buffers, 8 bytes at a time, storing the result over the first.
 *
 *	in0:	length; shifted right by 3 below, i.e. used as a byte count
 *		giving the number of 8-byte words
 *	in1:	first source, also the destination
 *	in2-in5: remaining sources
 *
 * Same rotating-register loop structure as xor_ia64_2; r21 holds the
 * intermediate s3 ^ s4.
 */
GLOBAL_ENTRY(xor_ia64_5)
	.prologue
	.fframe 0
	.save ar.pfs, r31
	alloc r31 = ar.pfs, 6, 0, 34, 40
	.save ar.lc, r30
	mov r30 = ar.lc			// saved; restored before returning
	.save pr, r29
	mov r29 = pr			// saved; restored before returning
	;;
	.body
	mov r8 = in1			// r8 = store pointer (destination)
	mov ar.ec = 6 + 2		// epilogue count = number of stages
	shr in0 = in0, 3		// length / 8
	;;
	adds in0 = -1, in0		// loop count register wants iterations - 1
	mov r16 = in1			// load pointers for the five sources
	mov r17 = in2
	;;
	mov r18 = in3
	mov ar.lc = in0
	mov pr.rot = 1 << 16		// only the stage-0 predicate (p16) is set
	mov r19 = in4
	mov r20 = in5
	;;
	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
	.rotp p[6+2]
0:
(p[0])	ld8.nta s1[0] = [r16], 8
(p[0])	ld8.nta s2[0] = [r17], 8
(p[6])	xor d[0] = s1[6], s2[6]
(p[0])	ld8.nta s3[0] = [r18], 8
(p[0])	ld8.nta s4[0] = [r19], 8
(p[6])	xor r21 = s3[6], s4[6]
	;;
(p[0])	ld8.nta s5[0] = [r20], 8
(p[6+1])st8.nta [r8] = d[1], 8	// store the xor computed one rotation earlier
(p[6])	xor d[0] = d[0], r21	// combine the two partial results
	;;
(p[6])	xor d[0] = d[0], s5[6]	// fold in the fifth source
	nop.f 0
	br.ctop.dptk.few 0b
	;;
	mov ar.lc = r30
	mov pr = r29, -1
	br.ret.sptk.few rp
END(xor_ia64_5)