author     Linus Torvalds <torvalds@ppc970.osdl.org>   2005-04-16 18:20:36 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>   2005-04-16 18:20:36 -0400
commit     1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree       0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ia64/lib

Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'arch/ia64/lib')
-rw-r--r--  arch/ia64/lib/Makefile               52
-rw-r--r--  arch/ia64/lib/bitop.c                88
-rw-r--r--  arch/ia64/lib/carta_random.S         54
-rw-r--r--  arch/ia64/lib/checksum.c            102
-rw-r--r--  arch/ia64/lib/clear_page.S           77
-rw-r--r--  arch/ia64/lib/clear_user.S          209
-rw-r--r--  arch/ia64/lib/copy_page.S            98
-rw-r--r--  arch/ia64/lib/copy_page_mck.S       185
-rw-r--r--  arch/ia64/lib/copy_user.S           610
-rw-r--r--  arch/ia64/lib/csum_partial_copy.c   151
-rw-r--r--  arch/ia64/lib/dec_and_lock.c         42
-rw-r--r--  arch/ia64/lib/do_csum.S             323
-rw-r--r--  arch/ia64/lib/flush.S                39
-rw-r--r--  arch/ia64/lib/idiv32.S               83
-rw-r--r--  arch/ia64/lib/idiv64.S               80
-rw-r--r--  arch/ia64/lib/io.c                  165
-rw-r--r--  arch/ia64/lib/ip_fast_csum.S         90
-rw-r--r--  arch/ia64/lib/memcpy.S              301
-rw-r--r--  arch/ia64/lib/memcpy_mck.S          661
-rw-r--r--  arch/ia64/lib/memset.S              362
-rw-r--r--  arch/ia64/lib/strlen.S              192
-rw-r--r--  arch/ia64/lib/strlen_user.S         198
-rw-r--r--  arch/ia64/lib/strncpy_from_user.S    44
-rw-r--r--  arch/ia64/lib/strnlen_user.S         45
-rw-r--r--  arch/ia64/lib/swiotlb.c             658
-rw-r--r--  arch/ia64/lib/xor.S                 184
26 files changed, 5093 insertions, 0 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
new file mode 100644
index 000000000000..1902c3c2ef92
--- /dev/null
+++ b/arch/ia64/lib/Makefile
@@ -0,0 +1,52 @@
1#
2# Makefile for ia64-specific library routines..
3#
4
5obj-y := io.o
6
7lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
8 __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \
9 bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \
10 clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \
11 flush.o ip_fast_csum.o do_csum.o \
12 memset.o strlen.o swiotlb.o
13
14lib-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o
15lib-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o
16lib-$(CONFIG_PERFMON) += carta_random.o
17lib-$(CONFIG_MD_RAID5) += xor.o
18lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
19
20AFLAGS___divdi3.o =
21AFLAGS___udivdi3.o = -DUNSIGNED
22AFLAGS___moddi3.o = -DMODULO
23AFLAGS___umoddi3.o = -DUNSIGNED -DMODULO
24
25AFLAGS___divsi3.o =
26AFLAGS___udivsi3.o = -DUNSIGNED
27AFLAGS___modsi3.o = -DMODULO
28AFLAGS___umodsi3.o = -DUNSIGNED -DMODULO
29
30$(obj)/__divdi3.o: $(src)/idiv64.S FORCE
31 $(call if_changed_dep,as_o_S)
32
33$(obj)/__udivdi3.o: $(src)/idiv64.S FORCE
34 $(call if_changed_dep,as_o_S)
35
36$(obj)/__moddi3.o: $(src)/idiv64.S FORCE
37 $(call if_changed_dep,as_o_S)
38
39$(obj)/__umoddi3.o: $(src)/idiv64.S FORCE
40 $(call if_changed_dep,as_o_S)
41
42$(obj)/__divsi3.o: $(src)/idiv32.S FORCE
43 $(call if_changed_dep,as_o_S)
44
45$(obj)/__udivsi3.o: $(src)/idiv32.S FORCE
46 $(call if_changed_dep,as_o_S)
47
48$(obj)/__modsi3.o: $(src)/idiv32.S FORCE
49 $(call if_changed_dep,as_o_S)
50
51$(obj)/__umodsi3.o: $(src)/idiv32.S FORCE
52 $(call if_changed_dep,as_o_S)
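The eight division/modulo objects above are built by assembling the same two sources (idiv64.S and idiv32.S) with the -DUNSIGNED/-DMODULO flag combinations set via AFLAGS. They provide libgcc-style helpers that the compiler may emit calls to, since ia64 has no integer divide instruction. A hedged C sketch of how such a call arises (the prototypes below are the conventional libgcc signatures, assumed rather than taken from this tree):

/* Hedged sketch: conventional libgcc-style prototypes for the objects the
 * Makefile above builds from idiv64.S (the idiv32.S variants are analogous). */
extern long __divdi3(long a, long b);
extern unsigned long __umoddi3(unsigned long a, unsigned long b);

long avg_bytes(long total_bytes, long nr_samples)
{
	return total_bytes / nr_samples;	/* may compile to a __divdi3() call */
}

unsigned long ring_index(unsigned long pos, unsigned long ring_size)
{
	return pos % ring_size;			/* may compile to a __umoddi3() call */
}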
diff --git a/arch/ia64/lib/bitop.c b/arch/ia64/lib/bitop.c
new file mode 100644
index 000000000000..82e299c8464e
--- /dev/null
+++ b/arch/ia64/lib/bitop.c
@@ -0,0 +1,88 @@
1#include <linux/compiler.h>
2#include <linux/types.h>
3#include <asm/intrinsics.h>
4#include <linux/module.h>
5#include <linux/bitops.h>
6
7/*
8 * Find next zero bit in a bitmap reasonably efficiently..
9 */
10
11int __find_next_zero_bit (const void *addr, unsigned long size, unsigned long offset)
12{
13 unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
14 unsigned long result = offset & ~63UL;
15 unsigned long tmp;
16
17 if (offset >= size)
18 return size;
19 size -= result;
20 offset &= 63UL;
21 if (offset) {
22 tmp = *(p++);
23 tmp |= ~0UL >> (64-offset);
24 if (size < 64)
25 goto found_first;
26 if (~tmp)
27 goto found_middle;
28 size -= 64;
29 result += 64;
30 }
31 while (size & ~63UL) {
32 if (~(tmp = *(p++)))
33 goto found_middle;
34 result += 64;
35 size -= 64;
36 }
37 if (!size)
38 return result;
39 tmp = *p;
40found_first:
41 tmp |= ~0UL << size;
42 if (tmp == ~0UL) /* any bits zero? */
43 return result + size; /* nope */
44found_middle:
45 return result + ffz(tmp);
46}
47EXPORT_SYMBOL(__find_next_zero_bit);
48
49/*
50 * Find next bit in a bitmap reasonably efficiently..
51 */
52int __find_next_bit(const void *addr, unsigned long size, unsigned long offset)
53{
54 unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
55 unsigned long result = offset & ~63UL;
56 unsigned long tmp;
57
58 if (offset >= size)
59 return size;
60 size -= result;
61 offset &= 63UL;
62 if (offset) {
63 tmp = *(p++);
64 tmp &= ~0UL << offset;
65 if (size < 64)
66 goto found_first;
67 if (tmp)
68 goto found_middle;
69 size -= 64;
70 result += 64;
71 }
72 while (size & ~63UL) {
73 if ((tmp = *(p++)))
74 goto found_middle;
75 result += 64;
76 size -= 64;
77 }
78 if (!size)
79 return result;
80 tmp = *p;
81 found_first:
82 tmp &= ~0UL >> (64-size);
83 if (tmp == 0UL) /* Are any bits set? */
84 return result + size; /* Nope. */
85 found_middle:
86 return result + __ffs(tmp);
87}
88EXPORT_SYMBOL(__find_next_bit);
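As a cross-check on the word-at-a-time scans above, here is a naive bit-by-bit version with the same contract as __find_next_zero_bit() (return the index of the first zero bit at or after offset, or size if there is none). It is only a reference sketch, not part of the kernel API:

/* Naive reference with the same semantics as __find_next_zero_bit() above:
 * scan one bit at a time; return 'size' when no zero bit is found. */
static unsigned long find_next_zero_bit_ref(const unsigned long *addr,
					    unsigned long size,
					    unsigned long offset)
{
	unsigned long i;

	for (i = offset; i < size; i++)
		if (!(addr[i >> 6] & (1UL << (i & 63))))
			return i;
	return size;
}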
diff --git a/arch/ia64/lib/carta_random.S b/arch/ia64/lib/carta_random.S
new file mode 100644
index 000000000000..d0674c360364
--- /dev/null
+++ b/arch/ia64/lib/carta_random.S
@@ -0,0 +1,54 @@
1/*
2 * Fast, simple, yet decent quality random number generator based on
3 * a paper by David G. Carta ("Two Fast Implementations of the
4 * `Minimal Standard' Random Number Generator," Communications of the
5 * ACM, January, 1990).
6 *
7 * Copyright (C) 2002 Hewlett-Packard Co
8 * David Mosberger-Tang <davidm@hpl.hp.com>
9 */
10
11#include <asm/asmmacro.h>
12
13#define a r2
14#define m r3
15#define lo r8
16#define hi r9
17#define t0 r16
18#define t1 r17
19#define seed r32
20
21GLOBAL_ENTRY(carta_random32)
22 movl a = (16807 << 16) | 16807
23 ;;
24 pmpyshr2.u t0 = a, seed, 0
25 pmpyshr2.u t1 = a, seed, 16
26 ;;
27 unpack2.l t0 = t1, t0
28 dep m = -1, r0, 0, 31
29 ;;
30 zxt4 lo = t0
31 shr.u hi = t0, 32
32 ;;
33 dep t0 = 0, hi, 15, 49 // t0 = (hi & 0x7fff)
34 ;;
35 shl t0 = t0, 16 // t0 = (hi & 0x7fff) << 16
36 shr t1 = hi, 15 // t1 = (hi >> 15)
37 ;;
38 add lo = lo, t0
39 ;;
40 cmp.gtu p6, p0 = lo, m
41 ;;
42(p6) and lo = lo, m
43 ;;
44(p6) add lo = 1, lo
45 ;;
46 add lo = lo, t1
47 ;;
48 cmp.gtu p6, p0 = lo, m
49 ;;
50(p6) and lo = lo, m
51 ;;
52(p6) add lo = 1, lo
53 br.ret.sptk.many rp
54END(carta_random32)
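The routine implements the Park/Miller "minimal standard" generator, new_seed = (16807 * seed) mod (2^31 - 1), using Carta's divide-free reduction; the assembly forms the product with parallel 16-bit multiplies. A scalar C sketch of the same scheme (a reference for the algorithm, not a transcription of the code above):

/* Carta's reduction: since 2^31 == 1 (mod 2^31 - 1), the high bits of the
 * product can simply be added back into its low 31 bits.  Assumes a 64-bit
 * unsigned long and a seed in [1, 2^31 - 2]. */
static unsigned long carta_random32_ref(unsigned long seed)
{
	unsigned long p  = 16807UL * seed;	/* at most a 46-bit product */
	unsigned long lo = p & 0x7fffffffUL;	/* low 31 bits */
	unsigned long hi = p >> 31;		/* remaining high bits */
	unsigned long x  = lo + hi;

	if (x > 0x7fffffffUL)			/* fold the wrap-around */
		x = (x & 0x7fffffffUL) + 1;
	return x;
}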
diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c
new file mode 100644
index 000000000000..beb11721d9f5
--- /dev/null
+++ b/arch/ia64/lib/checksum.c
@@ -0,0 +1,102 @@
1/*
2 * Network checksum routines
3 *
4 * Copyright (C) 1999, 2003 Hewlett-Packard Co
5 * Stephane Eranian <eranian@hpl.hp.com>
6 *
7 * Most of the code coming from arch/alpha/lib/checksum.c
8 *
9 * This file contains network checksum routines that are better done
10 * in an architecture-specific manner due to speed..
11 */
12
13#include <linux/module.h>
14#include <linux/string.h>
15
16#include <asm/byteorder.h>
17
18static inline unsigned short
19from64to16 (unsigned long x)
20{
21 /* add up 32-bit words for 33 bits */
22 x = (x & 0xffffffff) + (x >> 32);
23 /* add up 16-bit and 17-bit words for 17+c bits */
24 x = (x & 0xffff) + (x >> 16);
25 /* add up 16-bit and 2-bit for 16+c bit */
26 x = (x & 0xffff) + (x >> 16);
27 /* add up carry.. */
28 x = (x & 0xffff) + (x >> 16);
29 return x;
30}
31
32/*
33 * computes the checksum of the TCP/UDP pseudo-header
34 * returns a 16-bit checksum, already complemented.
35 */
36unsigned short int
37csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len,
38 unsigned short proto, unsigned int sum)
39{
40 return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) +
41 ((unsigned long) proto << 8));
42}
43
44EXPORT_SYMBOL(csum_tcpudp_magic);
45
46unsigned int
47csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len,
48 unsigned short proto, unsigned int sum)
49{
50 unsigned long result;
51
52 result = (saddr + daddr + sum +
53 ((unsigned long) ntohs(len) << 16) +
54 ((unsigned long) proto << 8));
55
56 /* Fold down to 32-bits so we don't lose in the typedef-less network stack. */
57 /* 64 to 33 */
58 result = (result & 0xffffffff) + (result >> 32);
59 /* 33 to 32 */
60 result = (result & 0xffffffff) + (result >> 32);
61 return result;
62}
63
64extern unsigned long do_csum (const unsigned char *, long);
65
66/*
67 * computes the checksum of a memory block at buff, length len,
68 * and adds in "sum" (32-bit)
69 *
70 * returns a 32-bit number suitable for feeding into itself
71 * or csum_tcpudp_magic
72 *
73 * this function must be called with even lengths, except
74 * for the last fragment, which may be odd
75 *
76 * it's best to have buff aligned on a 32-bit boundary
77 */
78unsigned int
79csum_partial (const unsigned char * buff, int len, unsigned int sum)
80{
81 unsigned long result = do_csum(buff, len);
82
83 /* add in old sum, and carry.. */
84 result += sum;
85 /* 32+c bits -> 32 bits */
86 result = (result & 0xffffffff) + (result >> 32);
87 return result;
88}
89
90EXPORT_SYMBOL(csum_partial);
91
92/*
93 * this routine is used for miscellaneous IP-like checksums, mainly
94 * in icmp.c
95 */
96unsigned short
97ip_compute_csum (unsigned char * buff, int len)
98{
99 return ~do_csum(buff,len);
100}
101
102EXPORT_SYMBOL(ip_compute_csum);
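The key step in all of these routines is the fold done by from64to16(): the 64-bit ones'-complement sum is repeatedly split and re-added so no carry is lost. A small worked example (the input value is arbitrary):

/* Worked example of the from64to16() folding above. */
static unsigned short fold_example(void)
{
	unsigned long x = 0x00000001fffe0003UL;	/* some 64-bit running sum */

	x = (x & 0xffffffff) + (x >> 32);	/* -> 0x00000000fffe0004 */
	x = (x & 0xffff) + (x >> 16);		/* -> 0x0000000000010002 */
	x = (x & 0xffff) + (x >> 16);		/* -> 0x0000000000000003 */
	x = (x & 0xffff) + (x >> 16);		/* -> 0x0000000000000003 */
	return x;				/* 16-bit folded sum */
}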
diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S
new file mode 100644
index 000000000000..d4987061dda7
--- /dev/null
+++ b/arch/ia64/lib/clear_page.S
@@ -0,0 +1,77 @@
1/*
2 * Copyright (C) 1999-2002 Hewlett-Packard Co
3 * Stephane Eranian <eranian@hpl.hp.com>
4 * David Mosberger-Tang <davidm@hpl.hp.com>
5 * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
6 *
7 * 1/06/01 davidm Tuned for Itanium.
8 * 2/12/02 kchen Tuned for both Itanium and McKinley
9 * 3/08/02 davidm Some more tweaking
10 */
11#include <linux/config.h>
12
13#include <asm/asmmacro.h>
14#include <asm/page.h>
15
16#ifdef CONFIG_ITANIUM
17# define L3_LINE_SIZE 64 // Itanium L3 line size
18# define PREFETCH_LINES 9 // magic number
19#else
20# define L3_LINE_SIZE 128 // McKinley L3 line size
21# define PREFETCH_LINES 12 // magic number
22#endif
23
24#define saved_lc r2
25#define dst_fetch r3
26#define dst1 r8
27#define dst2 r9
28#define dst3 r10
29#define dst4 r11
30
31#define dst_last r31
32
33GLOBAL_ENTRY(clear_page)
34 .prologue
35 .regstk 1,0,0,0
36 mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until
37 .save ar.lc, saved_lc
38 mov saved_lc = ar.lc
39
40 .body
41 mov ar.lc = (PREFETCH_LINES - 1)
42 mov dst_fetch = in0
43 adds dst1 = 16, in0
44 adds dst2 = 32, in0
45 ;;
46.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
47 adds dst3 = 48, in0 // executing this multiple times is harmless
48 br.cloop.sptk.few .fetch
49 ;;
50 addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
51 mov ar.lc = r16 // one L3 line per iteration
52 adds dst4 = 64, in0
53 ;;
54#ifdef CONFIG_ITANIUM
55 // Optimized for Itanium
561: stf.spill.nta [dst1] = f0, 64
57 stf.spill.nta [dst2] = f0, 64
58 cmp.lt p8,p0=dst_fetch, dst_last
59 ;;
60#else
61 // Optimized for McKinley
621: stf.spill.nta [dst1] = f0, 64
63 stf.spill.nta [dst2] = f0, 64
64 stf.spill.nta [dst3] = f0, 64
65 stf.spill.nta [dst4] = f0, 128
66 cmp.lt p8,p0=dst_fetch, dst_last
67 ;;
68 stf.spill.nta [dst1] = f0, 64
69 stf.spill.nta [dst2] = f0, 64
70#endif
71 stf.spill.nta [dst3] = f0, 64
72(p8) stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
73 br.cloop.sptk.few 1b
74 ;;
75 mov ar.lc = saved_lc // restore lc
76 br.ret.sptk.many rp
77END(clear_page)
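Functionally the routine simply zeroes one page; what the assembly adds is that the zeroing store used as a "prefetch" runs PREFETCH_LINES L3 lines ahead of the per-line loop, so the stores never stall on line allocation. A behavioural C sketch only (PAGE_SIZE as in <asm/page.h>):

/* Behavioural equivalent of clear_page() above; the prefetch-ahead
 * structure and the stf.spill stores are what the assembly adds on top. */
static void clear_page_sketch(void *page)
{
	unsigned long *p = page;
	unsigned long i;

	for (i = 0; i < PAGE_SIZE / sizeof(unsigned long); i++)
		p[i] = 0;
}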
diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S
new file mode 100644
index 000000000000..eecd8577b209
--- /dev/null
+++ b/arch/ia64/lib/clear_user.S
@@ -0,0 +1,209 @@
1/*
2 * This routine clears to zero a linear memory buffer in user space.
3 *
4 * Inputs:
5 * in0: address of buffer
6 * in1: length of buffer in bytes
7 * Outputs:
8 * r8: number of bytes that didn't get cleared due to a fault
9 *
10 * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
11 * Stephane Eranian <eranian@hpl.hp.com>
12 */
13
14#include <asm/asmmacro.h>
15
16//
17// arguments
18//
19#define buf r32
20#define len r33
21
22//
23// local registers
24//
25#define cnt r16
26#define buf2 r17
27#define saved_lc r18
28#define saved_pfs r19
29#define tmp r20
30#define len2 r21
31#define len3 r22
32
33//
34// Theory of operations:
35 // - we check whether or not the buffer is small, i.e., less than 17 bytes,
36// in which case we do the byte by byte loop.
37//
38// - Otherwise we go progressively from 1 byte store to 8byte store in
39 // the head part, the body is a 16byte store loop and we finish with the
40// tail for the last 15 bytes.
41// The good point about this breakdown is that the long buffer handling
42// contains only 2 branches.
43//
44// The reason for not using shifting & masking for both the head and the
45// tail is to stay semantically correct. This routine is not supposed
46// to write bytes outside of the buffer. While most of the time this would
47// be ok, we can't tolerate a mistake. A classical example is the case
48 // of multithreaded code where the extra bytes touched are actually owned
49 // by another thread running concurrently with ours. Another, less likely,
50// example is with device drivers where reading an I/O mapped location may
51// have side effects (same thing for writing).
52//
53
54GLOBAL_ENTRY(__do_clear_user)
55 .prologue
56 .save ar.pfs, saved_pfs
57 alloc saved_pfs=ar.pfs,2,0,0,0
58 cmp.eq p6,p0=r0,len // check for zero length
59 .save ar.lc, saved_lc
60 mov saved_lc=ar.lc // preserve ar.lc (slow)
61 .body
62 ;; // avoid WAW on CFM
63 adds tmp=-1,len // br.ctop is repeat/until
64 mov ret0=len // return value is length at this point
65(p6) br.ret.spnt.many rp
66 ;;
67 cmp.lt p6,p0=16,len // if len > 16 then long memset
68 mov ar.lc=tmp // initialize lc for small count
69(p6) br.cond.dptk .long_do_clear
70 ;; // WAR on ar.lc
71 //
72 // worst case 16 iterations, avg 8 iterations
73 //
74 // We could have played with the predicates to use the extra
75 // M slot for 2 stores/iteration but the cost of initializing
76 // the various counters, compared to how long the loop is supposed
77 // to last on average, does not make this solution viable.
78 //
791:
80 EX( .Lexit1, st1 [buf]=r0,1 )
81 adds len=-1,len // countdown length using len
82 br.cloop.dptk 1b
83 ;; // avoid RAW on ar.lc
84 //
85 // .Lexit4: comes from byte by byte loop
86 // len contains bytes left
87.Lexit1:
88 mov ret0=len // faster than using ar.lc
89 mov ar.lc=saved_lc
90 br.ret.sptk.many rp // end of short clear_user
91
92
93 //
94 // At this point we know we have more than 16 bytes to copy
95 // so we focus on alignment (no branches required)
96 //
97 // The use of len/len2 for countdown of the number of bytes left
98 // instead of ret0 is due to the fact that the exception code
99 // changes the values of r8.
100 //
101.long_do_clear:
102 tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear)
103 ;;
104 EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned
105(p6) adds len=-1,len;; // sync because buf is modified
106 tbit.nz p6,p0=buf,1
107 ;;
108 EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned
109(p6) adds len=-2,len;;
110 tbit.nz p6,p0=buf,2
111 ;;
112 EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned
113(p6) adds len=-4,len;;
114 tbit.nz p6,p0=buf,3
115 ;;
116 EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned
117(p6) adds len=-8,len;;
118 shr.u cnt=len,4 // number of 128-bit (2x64bit) words
119 ;;
120 cmp.eq p6,p0=r0,cnt
121 adds tmp=-1,cnt
122(p6) br.cond.dpnt .dotail // we have less than 16 bytes left
123 ;;
124 adds buf2=8,buf // setup second base pointer
125 mov ar.lc=tmp
126 ;;
127
128 //
129 // 16bytes/iteration core loop
130 //
131 // The second store can never generate a fault because
132 // we come into the loop only when we are 16-byte aligned.
133 // This means that if we cross a page then it will always be
134 // in the first store and never in the second.
135 //
136 //
137 // We need to keep track of the remaining length. A possible (optimistic)
138 // way would be to use ar.lc and derive how many bytes were left by
139 // doing : left= 16*ar.lc + 16. this would avoid the addition at
140 // every iteration.
141 // However we need to keep the synchronization point. A template
142 // M;;MB does not exist and thus we can keep the addition at no
143 // extra cycle cost (use a nop slot anyway). It also simplifies the
144 // (unlikely) error recovery code
145 //
146
1472: EX(.Lexit3, st8 [buf]=r0,16 )
148 ;; // needed to get len correct when error
149 st8 [buf2]=r0,16
150 adds len=-16,len
151 br.cloop.dptk 2b
152 ;;
153 mov ar.lc=saved_lc
154 //
155 // tail correction based on len only
156 //
157 // We alternate the use of len3,len2 to allow parallelism and correct
158 // error handling. We also reuse p6/p7 to return correct value.
159 // The addition of len2/len3 does not cost anything more compared to
160 // the regular memset as we had empty slots.
161 //
162.dotail:
163 mov len2=len // for parallelization of error handling
164 mov len3=len
165 tbit.nz p6,p0=len,3
166 ;;
167 EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes
168(p6) adds len3=-8,len2
169 tbit.nz p7,p6=len,2
170 ;;
171 EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes
172(p7) adds len2=-4,len3
173 tbit.nz p6,p7=len,1
174 ;;
175 EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes
176(p6) adds len3=-2,len2
177 tbit.nz p7,p6=len,0
178 ;;
179 EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left
180 mov ret0=r0 // success
181 br.ret.sptk.many rp // end of most likely path
182
183 //
184 // Outlined error handling code
185 //
186
187 //
188 // .Lexit3: comes from core loop, need restore pr/lc
189 // len contains bytes left
190 //
191 //
192 // .Lexit2:
193 // if p6 -> coming from st8 or st2 : len2 contains what's left
194 // if p7 -> coming from st4 or st1 : len3 contains what's left
195 // We must restore lc/pr even though might not have been used.
196.Lexit2:
197 .pred.rel "mutex", p6, p7
198(p6) mov len=len2
199(p7) mov len=len3
200 ;;
201 //
202 // .Lexit4: comes from head, need not restore pr/lc
203 // len contains bytes left
204 //
205.Lexit3:
206 mov ret0=len
207 mov ar.lc=saved_lc
208 br.ret.sptk.many rp
209END(__do_clear_user)
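Ignoring fault handling (the EX() annotations), the head/body/tail breakdown described in the comment block at the top of this file looks roughly like this in C; a hedged sketch, not a drop-in replacement:

/* Sketch of __do_clear_user()'s structure: byte loop for <= 16 bytes,
 * otherwise align the head with 1/2/4/8-byte stores, clear 16 bytes per
 * iteration in the body, then finish the <16-byte tail.  Returns the
 * number of bytes NOT cleared (always 0 here, since faults are not
 * modelled). */
static unsigned long do_clear_user_sketch(char *buf, unsigned long len)
{
	if (len <= 16) {
		while (len--)
			*buf++ = 0;
		return 0;
	}
	if ((unsigned long)buf & 1) { *buf = 0;          buf += 1; len -= 1; }
	if ((unsigned long)buf & 2) { *(short *)buf = 0; buf += 2; len -= 2; }
	if ((unsigned long)buf & 4) { *(int *)buf = 0;   buf += 4; len -= 4; }
	if ((unsigned long)buf & 8) { *(long *)buf = 0;  buf += 8; len -= 8; }
	while (len >= 16) {			/* 2 x st8 per iteration */
		((long *)buf)[0] = 0;
		((long *)buf)[1] = 0;
		buf += 16;
		len -= 16;
	}
	if (len & 8) { *(long *)buf = 0;  buf += 8; }
	if (len & 4) { *(int *)buf = 0;   buf += 4; }
	if (len & 2) { *(short *)buf = 0; buf += 2; }
	if (len & 1) { *buf = 0; }
	return 0;
}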
diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S
new file mode 100644
index 000000000000..127d1d050d78
--- /dev/null
+++ b/arch/ia64/lib/copy_page.S
@@ -0,0 +1,98 @@
1/*
2 *
3 * Optimized version of the standard copy_page() function
4 *
5 * Inputs:
6 * in0: address of target page
7 * in1: address of source page
8 * Output:
9 * no return value
10 *
11 * Copyright (C) 1999, 2001 Hewlett-Packard Co
12 * Stephane Eranian <eranian@hpl.hp.com>
13 * David Mosberger <davidm@hpl.hp.com>
14 *
15 * 4/06/01 davidm Tuned to make it perform well both for cached and uncached copies.
16 */
17#include <asm/asmmacro.h>
18#include <asm/page.h>
19
20#define PIPE_DEPTH 3
21#define EPI p[PIPE_DEPTH-1]
22
23#define lcount r16
24#define saved_pr r17
25#define saved_lc r18
26#define saved_pfs r19
27#define src1 r20
28#define src2 r21
29#define tgt1 r22
30#define tgt2 r23
31#define srcf r24
32#define tgtf r25
33#define tgt_last r26
34
35#define Nrot ((8*PIPE_DEPTH+7)&~7)
36
37GLOBAL_ENTRY(copy_page)
38 .prologue
39 .save ar.pfs, saved_pfs
40 alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
41
42 .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
43 t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
44 .rotp p[PIPE_DEPTH]
45
46 .save ar.lc, saved_lc
47 mov saved_lc=ar.lc
48 mov ar.ec=PIPE_DEPTH
49
50 mov lcount=PAGE_SIZE/64-1
51 .save pr, saved_pr
52 mov saved_pr=pr
53 mov pr.rot=1<<16
54
55 .body
56
57 mov src1=in1
58 adds src2=8,in1
59 mov tgt_last = PAGE_SIZE
60 ;;
61 adds tgt2=8,in0
62 add srcf=512,in1
63 mov ar.lc=lcount
64 mov tgt1=in0
65 add tgtf=512,in0
66 add tgt_last = tgt_last, in0
67 ;;
681:
69(p[0]) ld8 t1[0]=[src1],16
70(EPI) st8 [tgt1]=t1[PIPE_DEPTH-1],16
71(p[0]) ld8 t2[0]=[src2],16
72(EPI) st8 [tgt2]=t2[PIPE_DEPTH-1],16
73 cmp.ltu p6,p0 = tgtf, tgt_last
74 ;;
75(p[0]) ld8 t3[0]=[src1],16
76(EPI) st8 [tgt1]=t3[PIPE_DEPTH-1],16
77(p[0]) ld8 t4[0]=[src2],16
78(EPI) st8 [tgt2]=t4[PIPE_DEPTH-1],16
79 ;;
80(p[0]) ld8 t5[0]=[src1],16
81(EPI) st8 [tgt1]=t5[PIPE_DEPTH-1],16
82(p[0]) ld8 t6[0]=[src2],16
83(EPI) st8 [tgt2]=t6[PIPE_DEPTH-1],16
84 ;;
85(p[0]) ld8 t7[0]=[src1],16
86(EPI) st8 [tgt1]=t7[PIPE_DEPTH-1],16
87(p[0]) ld8 t8[0]=[src2],16
88(EPI) st8 [tgt2]=t8[PIPE_DEPTH-1],16
89
90(p6) lfetch [srcf], 64
91(p6) lfetch [tgtf], 64
92 br.ctop.sptk.few 1b
93 ;;
94 mov pr=saved_pr,0xffffffffffff0000 // restore predicates
95 mov ar.pfs=saved_pfs
96 mov ar.lc=saved_lc
97 br.ret.sptk.many rp
98END(copy_page)
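Stripped of the rotating registers that pipeline the loads ahead of the stores, the loop above copies 64 bytes per iteration through two pointers while lfetch pulls data 512 bytes ahead. A plain-C rendering of the data movement only:

/* Data movement of copy_page() above, without the software pipelining and
 * lfetch hints (PAGE_SIZE as in <asm/page.h>). */
static void copy_page_sketch(void *to, void *from)
{
	unsigned long *dst = to;
	unsigned long *src = from;
	unsigned long i, j;

	for (i = 0; i < PAGE_SIZE / 64; i++) {
		for (j = 0; j < 8; j++)		/* 8 x ld8/st8 = 64 bytes */
			dst[j] = src[j];
		dst += 8;
		src += 8;
	}
}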
diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S
new file mode 100644
index 000000000000..3c45d60a81b4
--- /dev/null
+++ b/arch/ia64/lib/copy_page_mck.S
@@ -0,0 +1,185 @@
1/*
2 * McKinley-optimized version of copy_page().
3 *
4 * Copyright (C) 2002 Hewlett-Packard Co
5 * David Mosberger <davidm@hpl.hp.com>
6 *
7 * Inputs:
8 * in0: address of target page
9 * in1: address of source page
10 * Output:
11 * no return value
12 *
13 * General idea:
14 * - use regular loads and stores to prefetch data to avoid consuming M-slot just for
15 * lfetches => good for in-cache performance
16 * - avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
17 * cycle
18 *
19 * Principle of operation:
20 * First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
21 * To avoid secondary misses in L2, we prefetch both source and destination with a line-size
22 * of 128 bytes. When both of these lines are in the L2 and the first half of the
23 * source line is in L1, we start copying the remaining words. The second half of the
24 * source line is prefetched in an earlier iteration, so that by the time we start
25 * accessing it, it's also present in the L1.
26 *
27 * We use a software-pipelined loop to control the overall operation. The pipeline
28 * has 2*PREFETCH_DIST+K stages. The first PREFETCH_DIST stages are used for prefetching
29 * source cache-lines. The second PREFETCH_DIST stages are used for prefetching destination
30 * cache-lines, the last K stages are used to copy the cache-line words not copied by
31 * the prefetches. The four relevant points in the pipeline are called A, B, C, D:
32 * p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
33 * should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
34 * into L1D and p[D] is TRUE if a cacheline needs to be copied.
35 *
36 * This all sounds very complicated, but thanks to the modulo-scheduled loop support,
37 * the resulting code is very regular and quite easy to follow (once you get the idea).
38 *
39 * As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
40 * as the separate .prefetch_loop. Logically, this loop performs exactly like the
41 * main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
42 * so that each loop iteration is faster (again, good for cached case).
43 *
44 * When reading the code, it helps to keep the following picture in mind:
45 *
46 * word 0 word 1
47 * +------+------+---
48 * | v[x] | t1 | ^
49 * | t2 | t3 | |
50 * | t4 | t5 | |
51 * | t6 | t7 | | 128 bytes
52 * | n[y] | t9 | | (L2 cache line)
53 * | t10 | t11 | |
54 * | t12 | t13 | |
55 * | t14 | t15 | v
56 * +------+------+---
57 *
58 * Here, v[x] is copied by the (memory) prefetch. n[y] is loaded at p[C]
59 * to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
60 * an order that avoids bank conflicts.
61 */
62#include <asm/asmmacro.h>
63#include <asm/page.h>
64
65#define PREFETCH_DIST 8 // McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
66
67#define src0 r2
68#define src1 r3
69#define dst0 r9
70#define dst1 r10
71#define src_pre_mem r11
72#define dst_pre_mem r14
73#define src_pre_l2 r15
74#define dst_pre_l2 r16
75#define t1 r17
76#define t2 r18
77#define t3 r19
78#define t4 r20
79#define t5 t1 // alias!
80#define t6 t2 // alias!
81#define t7 t3 // alias!
82#define t9 t5 // alias!
83#define t10 t4 // alias!
84#define t11 t7 // alias!
85#define t12 t6 // alias!
86#define t14 t10 // alias!
87#define t13 r21
88#define t15 r22
89
90#define saved_lc r23
91#define saved_pr r24
92
93#define A 0
94#define B (PREFETCH_DIST)
95#define C (B + PREFETCH_DIST)
96#define D (C + 3)
97#define N (D + 1)
98#define Nrot ((N + 7) & ~7)
99
100GLOBAL_ENTRY(copy_page)
101 .prologue
102 alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
103
104 .rotr v[2*PREFETCH_DIST], n[D-C+1]
105 .rotp p[N]
106
107 .save ar.lc, saved_lc
108 mov saved_lc = ar.lc
109 .save pr, saved_pr
110 mov saved_pr = pr
111 .body
112
113 mov src_pre_mem = in1
114 mov pr.rot = 0x10000
115 mov ar.ec = 1 // special unrolled loop
116
117 mov dst_pre_mem = in0
118 mov ar.lc = 2*PREFETCH_DIST - 1
119
120 add src_pre_l2 = 8*8, in1
121 add dst_pre_l2 = 8*8, in0
122 add src0 = 8, in1 // first t1 src
123 add src1 = 3*8, in1 // first t3 src
124 add dst0 = 8, in0 // first t1 dst
125 add dst1 = 3*8, in0 // first t3 dst
126 mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
127 nop.m 0
128 nop.i 0
129 ;;
130 // same as .line_copy loop, but with all predicated-off instructions removed:
131.prefetch_loop:
132(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0
133(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2
134 br.ctop.sptk .prefetch_loop
135 ;;
136 cmp.eq p16, p0 = r0, r0 // reset p16 to 1 (br.ctop cleared it to zero)
137 mov ar.lc = t1 // with 64KB pages, t1 is too big to fit in 8 bits!
138 mov ar.ec = N // # of stages in pipeline
139 ;;
140.line_copy:
141(p[D]) ld8 t2 = [src0], 3*8 // M0
142(p[D]) ld8 t4 = [src1], 3*8 // M1
143(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 prefetch dst from memory
144(p[D]) st8 [dst_pre_l2] = n[D-C], 128 // M3 prefetch dst from L2
145 ;;
146(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 prefetch src from memory
147(p[C]) ld8 n[0] = [src_pre_l2], 128 // M1 prefetch src from L2
148(p[D]) st8 [dst0] = t1, 8 // M2
149(p[D]) st8 [dst1] = t3, 8 // M3
150 ;;
151(p[D]) ld8 t5 = [src0], 8
152(p[D]) ld8 t7 = [src1], 3*8
153(p[D]) st8 [dst0] = t2, 3*8
154(p[D]) st8 [dst1] = t4, 3*8
155 ;;
156(p[D]) ld8 t6 = [src0], 3*8
157(p[D]) ld8 t10 = [src1], 8
158(p[D]) st8 [dst0] = t5, 8
159(p[D]) st8 [dst1] = t7, 3*8
160 ;;
161(p[D]) ld8 t9 = [src0], 3*8
162(p[D]) ld8 t11 = [src1], 3*8
163(p[D]) st8 [dst0] = t6, 3*8
164(p[D]) st8 [dst1] = t10, 8
165 ;;
166(p[D]) ld8 t12 = [src0], 8
167(p[D]) ld8 t14 = [src1], 8
168(p[D]) st8 [dst0] = t9, 3*8
169(p[D]) st8 [dst1] = t11, 3*8
170 ;;
171(p[D]) ld8 t13 = [src0], 4*8
172(p[D]) ld8 t15 = [src1], 4*8
173(p[D]) st8 [dst0] = t12, 8
174(p[D]) st8 [dst1] = t14, 8
175 ;;
176(p[D-1])ld8 t1 = [src0], 8
177(p[D-1])ld8 t3 = [src1], 8
178(p[D]) st8 [dst0] = t13, 4*8
179(p[D]) st8 [dst1] = t15, 4*8
180 br.ctop.sptk .line_copy
181 ;;
182 mov ar.lc = saved_lc
183 mov pr = saved_pr, -1
184 br.ret.sptk.many rp
185END(copy_page)
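The A/B/C/D staging described above only changes when each line is touched: by the time stage D copies a 128-byte line, stages A/B (and C for the second half) have already pulled it into L2/L1D, 2*PREFETCH_DIST iterations earlier. The underlying data movement is just a line-by-line copy; a conceptual C sketch:

/* Conceptual sketch of the D stage of the McKinley copy_page() above: one
 * 128-byte L2 line (16 x 8-byte words) per iteration.  The real loop
 * interleaves the A/B/C prefetch stages so these words are already cached. */
static void copy_page_mck_sketch(void *to, void *from)
{
	unsigned long *dst = to;
	unsigned long *src = from;
	unsigned long line, w;

	for (line = 0; line < PAGE_SIZE / 128; line++) {
		for (w = 0; w < 16; w++)
			dst[w] = src[w];
		dst += 16;
		src += 16;
	}
}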
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S
new file mode 100644
index 000000000000..c952bdc6a093
--- /dev/null
+++ b/arch/ia64/lib/copy_user.S
@@ -0,0 +1,610 @@
1/*
2 *
3 * Optimized version of the copy_user() routine.
4 * It is used to copy data across the kernel/user boundary.
5 *
6 * The source and destination are always on opposite sides of
7 * the boundary. When reading from user space we must catch
8 * faults on loads. When writing to user space we must catch
9 * errors on stores. Note that because of the nature of the copy
10 * we don't need to worry about overlapping regions.
11 *
12 *
13 * Inputs:
14 * in0 address of source buffer
15 * in1 address of destination buffer
16 * in2 number of bytes to copy
17 *
18 * Outputs:
19 * ret0 0 in case of success. The number of bytes NOT copied in
20 * case of error.
21 *
22 * Copyright (C) 2000-2001 Hewlett-Packard Co
23 * Stephane Eranian <eranian@hpl.hp.com>
24 *
25 * Fixme:
26 * - handle the case where we have more than 16 bytes and the alignments
27 * are different.
28 * - more benchmarking
29 * - fix extraneous stop bit introduced by the EX() macro.
30 */
31
32#include <asm/asmmacro.h>
33
34//
35// Tuneable parameters
36//
37#define COPY_BREAK 16 // we do byte copy below (must be >=16)
38#define PIPE_DEPTH 21 // pipe depth
39
40#define EPI p[PIPE_DEPTH-1]
41
42//
43// arguments
44//
45#define dst in0
46#define src in1
47#define len in2
48
49//
50// local registers
51//
52#define t1 r2 // rshift in bytes
53#define t2 r3 // lshift in bytes
54#define rshift r14 // right shift in bits
55#define lshift r15 // left shift in bits
56#define word1 r16
57#define word2 r17
58#define cnt r18
59#define len2 r19
60#define saved_lc r20
61#define saved_pr r21
62#define tmp r22
63#define val r23
64#define src1 r24
65#define dst1 r25
66#define src2 r26
67#define dst2 r27
68#define len1 r28
69#define enddst r29
70#define endsrc r30
71#define saved_pfs r31
72
73GLOBAL_ENTRY(__copy_user)
74 .prologue
75 .save ar.pfs, saved_pfs
76 alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
77
78 .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
79 .rotp p[PIPE_DEPTH]
80
81 adds len2=-1,len // br.ctop is repeat/until
82 mov ret0=r0
83
84 ;; // RAW of cfm when len=0
85 cmp.eq p8,p0=r0,len // check for zero length
86 .save ar.lc, saved_lc
87 mov saved_lc=ar.lc // preserve ar.lc (slow)
88(p8) br.ret.spnt.many rp // empty memcpy()
89 ;;
90 add enddst=dst,len // first byte after end of destination
91 add endsrc=src,len // first byte after end of source
92 .save pr, saved_pr
93 mov saved_pr=pr // preserve predicates
94
95 .body
96
97 mov dst1=dst // copy because of rotation
98 mov ar.ec=PIPE_DEPTH
99 mov pr.rot=1<<16 // p16=true all others are false
100
101 mov src1=src // copy because of rotation
102 mov ar.lc=len2 // initialize lc for small count
103 cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy
104
105 xor tmp=src,dst // same alignment test prepare
106(p10) br.cond.dptk .long_copy_user
107 ;; // RAW pr.rot/p16 ?
108 //
109 // Now we do the byte by byte loop with software pipeline
110 //
111 // p7 is necessarily false by now
1121:
113 EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
114 EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
115 br.ctop.dptk.few 1b
116 ;;
117 mov ar.lc=saved_lc
118 mov pr=saved_pr,0xffffffffffff0000
119 mov ar.pfs=saved_pfs // restore ar.ec
120 br.ret.sptk.many rp // end of short memcpy
121
122 //
123 // Not 8-byte aligned
124 //
125.diff_align_copy_user:
126 // At this point we know we have more than 16 bytes to copy
127 // and also that src and dest do _not_ have the same alignment.
128 and src2=0x7,src1 // src offset
129 and dst2=0x7,dst1 // dst offset
130 ;;
131 // The basic idea is that we copy byte-by-byte at the head so
132 // that we can reach 8-byte alignment for both src1 and dst1.
133 // Then copy the body using software pipelined 8-byte copy,
134 // shifting the two back-to-back words right and left, then copy
135 // the tail by copying byte-by-byte.
136 //
137 // Fault handling. If the byte-by-byte at the head fails on the
139 // load, then restart and finish the pipeline by copying zeros
139 // to the dst1. Then copy zeros for the rest of dst1.
140 // If 8-byte software pipeline fails on the load, do the same as
141 // failure_in3 does. If the byte-by-byte at the tail fails, it is
142 // handled simply by failure_in_pipe1.
143 //
144 // The p14 case means the source has more bytes in the
145 // first word (by the shifted part), whereas the p15 case needs to
146 // copy some bytes from the 2nd word of the source, which holds the
147 // tail of the 1st word of the destination.
148 //
149
150 //
151 // Optimization. If dst1 is 8-byte aligned (quite common), we don't need
152 // to copy the head to dst1, to start 8-byte copy software pipeline.
153 // We know src1 is not 8-byte aligned in this case.
154 //
155 cmp.eq p14,p15=r0,dst2
156(p15) br.cond.spnt 1f
157 ;;
158 sub t1=8,src2
159 mov t2=src2
160 ;;
161 shl rshift=t2,3
162 sub len1=len,t1 // set len1
163 ;;
164 sub lshift=64,rshift
165 ;;
166 br.cond.spnt .word_copy_user
167 ;;
1681:
169 cmp.leu p14,p15=src2,dst2
170 sub t1=dst2,src2
171 ;;
172 .pred.rel "mutex", p14, p15
173(p14) sub word1=8,src2 // (8 - src offset)
174(p15) sub t1=r0,t1 // absolute value
175(p15) sub word1=8,dst2 // (8 - dst offset)
176 ;;
177 // For the case p14, we don't need to copy the shifted part to
178 // the 1st word of destination.
179 sub t2=8,t1
180(p14) sub word1=word1,t1
181 ;;
182 sub len1=len,word1 // resulting len
183(p15) shl rshift=t1,3 // in bits
184(p14) shl rshift=t2,3
185 ;;
186(p14) sub len1=len1,t1
187 adds cnt=-1,word1
188 ;;
189 sub lshift=64,rshift
190 mov ar.ec=PIPE_DEPTH
191 mov pr.rot=1<<16 // p16=true all others are false
192 mov ar.lc=cnt
193 ;;
1942:
195 EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
196 EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
197 br.ctop.dptk.few 2b
198 ;;
199 clrrrb
200 ;;
201.word_copy_user:
202 cmp.gtu p9,p0=16,len1
203(p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy
204 ;;
205 shr.u cnt=len1,3 // number of 64-bit words
206 ;;
207 adds cnt=-1,cnt
208 ;;
209 .pred.rel "mutex", p14, p15
210(p14) sub src1=src1,t2
211(p15) sub src1=src1,t1
212 //
213 // Now both src1 and dst1 point to an 8-byte aligned address. And
214 // we have more than 8 bytes to copy.
215 //
216 mov ar.lc=cnt
217 mov ar.ec=PIPE_DEPTH
218 mov pr.rot=1<<16 // p16=true all others are false
219 ;;
2203:
221 //
222 // The pipeline consists of 3 stages:
223 // 1 (p16): Load a word from src1
224 // 2 (EPI_1): Shift right pair, saving to tmp
225 // 3 (EPI): Store tmp to dst1
226 //
227 // To make it simple, use at least 2 (p16) loops to set up val1[n]
228 // because we need 2 back-to-back val1[] to get tmp.
229 // Note that this implies EPI_2 must be p18 or greater.
230 //
231
232#define EPI_1 p[PIPE_DEPTH-2]
233#define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift
234#define CASE(pred, shift) \
235 (pred) br.cond.spnt .copy_user_bit##shift
236#define BODY(rshift) \
237.copy_user_bit##rshift: \
2381: \
239 EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \
240(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
241 EX(3f,(p16) ld8 val1[1]=[src1],8); \
242(p16) mov val1[0]=r0; \
243 br.ctop.dptk 1b; \
244 ;; \
245 br.cond.sptk.many .diff_align_do_tail; \
2462: \
247(EPI) st8 [dst1]=tmp,8; \
248(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
2493: \
250(p16) mov val1[1]=r0; \
251(p16) mov val1[0]=r0; \
252 br.ctop.dptk 2b; \
253 ;; \
254 br.cond.sptk.many .failure_in2
255
256 //
257 // Since the instruction 'shrp' requires a fixed 128-bit value
258 // specifying the bits to shift, we need to provide 7 cases
259 // below.
260 //
261 SWITCH(p6, 8)
262 SWITCH(p7, 16)
263 SWITCH(p8, 24)
264 SWITCH(p9, 32)
265 SWITCH(p10, 40)
266 SWITCH(p11, 48)
267 SWITCH(p12, 56)
268 ;;
269 CASE(p6, 8)
270 CASE(p7, 16)
271 CASE(p8, 24)
272 CASE(p9, 32)
273 CASE(p10, 40)
274 CASE(p11, 48)
275 CASE(p12, 56)
276 ;;
277 BODY(8)
278 BODY(16)
279 BODY(24)
280 BODY(32)
281 BODY(40)
282 BODY(48)
283 BODY(56)
284 ;;
285.diff_align_do_tail:
286 .pred.rel "mutex", p14, p15
287(p14) sub src1=src1,t1
288(p14) adds dst1=-8,dst1
289(p15) sub dst1=dst1,t1
290 ;;
2914:
292 // Tail correction.
293 //
294 // The problem with this pipelined loop is that the last word is not
295 // loaded and thus part of the last word written is not correct.
296 // To fix that, we simply copy the tail byte by byte.
297
298 sub len1=endsrc,src1,1
299 clrrrb
300 ;;
301 mov ar.ec=PIPE_DEPTH
302 mov pr.rot=1<<16 // p16=true all others are false
303 mov ar.lc=len1
304 ;;
3055:
306 EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
307 EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
308 br.ctop.dptk.few 5b
309 ;;
310 mov ar.lc=saved_lc
311 mov pr=saved_pr,0xffffffffffff0000
312 mov ar.pfs=saved_pfs
313 br.ret.sptk.many rp
314
315 //
316 // Beginning of long memcpy (i.e. > 16 bytes)
317 //
318.long_copy_user:
319 tbit.nz p6,p7=src1,0 // odd alignment
320 and tmp=7,tmp
321 ;;
322 cmp.eq p10,p8=r0,tmp
323 mov len1=len // copy because of rotation
324(p8) br.cond.dpnt .diff_align_copy_user
325 ;;
326 // At this point we know we have more than 16 bytes to copy
327 // and also that both src and dest have the same alignment
328 // which may not be the one we want. So for now we must move
329 // forward slowly until we reach 16byte alignment: no need to
330 // worry about reaching the end of buffer.
331 //
332 EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned
333(p6) adds len1=-1,len1;;
334 tbit.nz p7,p0=src1,1
335 ;;
336 EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned
337(p7) adds len1=-2,len1;;
338 tbit.nz p8,p0=src1,2
339 ;;
340 //
341 // Stop bit not required after ld4 because if we fail on ld4
342 // we have never executed the ld1, therefore st1 is not executed.
343 //
344 EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned
345 ;;
346 EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
347 tbit.nz p9,p0=src1,3
348 ;;
349 //
350 // Stop bit not required after ld8 because if we fail on ld8
351 // we have never executed the ld2, therefore st2 is not executed.
352 //
353 EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned
354 EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
355(p8) adds len1=-4,len1
356 ;;
357 EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
358(p9) adds len1=-8,len1;;
359 shr.u cnt=len1,4 // number of 128-bit (2x64bit) words
360 ;;
361 EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
362 tbit.nz p6,p0=len1,3
363 cmp.eq p7,p0=r0,cnt
364 adds tmp=-1,cnt // br.ctop is repeat/until
365(p7) br.cond.dpnt .dotail // we have less than 16 bytes left
366 ;;
367 adds src2=8,src1
368 adds dst2=8,dst1
369 mov ar.lc=tmp
370 ;;
371 //
372 // 16bytes/iteration
373 //
3742:
375 EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
376(p16) ld8 val2[0]=[src2],16
377
378 EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
379(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
380 br.ctop.dptk 2b
381 ;; // RAW on src1 when fall through from loop
382 //
383 // Tail correction based on len only
384 //
385 // No matter where we come from (loop or test) the src1 pointer
386 // is 16 byte aligned AND we have less than 16 bytes to copy.
387 //
388.dotail:
389 EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes
390 tbit.nz p7,p0=len1,2
391 ;;
392 EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes
393 tbit.nz p8,p0=len1,1
394 ;;
395 EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes
396 tbit.nz p9,p0=len1,0
397 ;;
398 EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
399 ;;
400 EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left
401 mov ar.lc=saved_lc
402 ;;
403 EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
404 mov pr=saved_pr,0xffffffffffff0000
405 ;;
406 EX(.failure_out, (p8) st2 [dst1]=val2[0],2)
407 mov ar.pfs=saved_pfs
408 ;;
409 EX(.failure_out, (p9) st1 [dst1]=val2[1])
410 br.ret.sptk.many rp
411
412
413 //
414 // Here we handle the case where the byte by byte copy fails
415 // on the load.
416 // Several factors make the zeroing of the rest of the buffer kind of
417 // tricky:
418 // - the pipeline: loads/stores are not in sync (pipeline)
419 //
420 // In the same loop iteration, the dst1 pointer does not directly
421 // reflect where the faulty load was.
422 //
423 // - pipeline effect
424 // When you get a fault on a load, you may have valid data from
425 // previous loads still in transit, not yet stored. Such data must be
426 // stored normally before moving on to zeroing the rest.
427 //
428 // - single/multi dispersal independence.
429 //
430 // solution:
431 // - we don't disrupt the pipeline, i.e. data in transit in
432 // the software pipeline will eventually be moved to memory.
433 // We simply replace the load with a simple mov and keep the
434 // pipeline going. We can't really do this inline because
435 // p16 is always reset to 1 when lc > 0.
436 //
437.failure_in_pipe1:
438 sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
4391:
440(p16) mov val1[0]=r0
441(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
442 br.ctop.dptk 1b
443 ;;
444 mov pr=saved_pr,0xffffffffffff0000
445 mov ar.lc=saved_lc
446 mov ar.pfs=saved_pfs
447 br.ret.sptk.many rp
448
449 //
450 // This is the case where the byte by byte copy fails on the load
451 // when we copy the head. We need to finish the pipeline and copy
452 // zeros for the rest of the destination. Since this happens
453 // at the top we still need to fill the body and tail.
454.failure_in_pipe2:
455 sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
4562:
457(p16) mov val1[0]=r0
458(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
459 br.ctop.dptk 2b
460 ;;
461 sub len=enddst,dst1,1 // precompute len
462 br.cond.dptk.many .failure_in1bis
463 ;;
464
465 //
466 // Here we handle the head & tail part when we check for alignment.
467 // The following code handles only the load failures. The
468 // main difficulty comes from the fact that loads/stores are
469 // scheduled. So when you fail on a load, the stores corresponding
470 // to previous successful loads must be executed.
471 //
472 // However some simplifications are possible given the way
473 // things work.
474 //
475 // 1) HEAD
476 // Theory of operation:
477 //
478 // Page A | Page B
479 // ---------|-----
480 // 1|8 x
481 // 1 2|8 x
482 // 4|8 x
483 // 1 4|8 x
484 // 2 4|8 x
485 // 1 2 4|8 x
486 // |1
487 // |2 x
488 // |4 x
489 //
490 // page_size >= 4k (2^12). (x means 4, 2, 1)
491 // Here we suppose Page A exists and Page B does not.
492 //
493 // As we move towards eight byte alignment we may encounter faults.
494 // The numbers on each page show the size of the load (current alignment).
495 //
496 // Key point:
497 // - if you fail on 1, 2, 4 then you have never executed any smaller
498 // size loads, e.g. failing ld4 means no ld1 nor ld2 executed
499 // before.
500 //
501 // This allows us to simplify the cleanup code, because basically you
502 // only have to worry about "pending" stores in the case of a failing
503 // ld8(). Given the way the code is written today, this means only
504 // worry about st2, st4. There we can use the information encapsulated
505 // into the predicates.
506 //
507 // Other key point:
508 // - if you fail on the ld8 in the head, it means you went straight
509 // to it, i.e. 8byte alignment within a nonexistent page.
510 // Again this comes from the fact that if you crossed just for the ld8 then
511 // you are 8byte aligned but also 16byte aligned, therefore you would
512 // either go for the 16byte copy loop OR the ld8 in the tail part.
513 // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
514 // because it would mean you had 15bytes to copy in which case you
515 // would have defaulted to the byte by byte copy.
516 //
517 //
518 // 2) TAIL
519 // Here we know we have less than 16 bytes AND we are either 8 or 16 byte
520 // aligned.
521 //
522 // Key point:
523 // This means that we either:
524 // - are right on a page boundary
525 // OR
526 // - are at more than 16 bytes from a page boundary with
527 // at most 15 bytes to copy: no chance of crossing.
528 //
529 // This allows us to assume that if we fail on a load we haven't possibly
530 // executed any of the previous (tail) ones, so we don't need to do
531 // any stores. For instance, if we fail on ld2, this means we had
532 // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
533 //
534 // This means that we are in a situation similar to a fault in the
535 // head part. That's nice!
536 //
537.failure_in1:
538 sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
539 sub len=endsrc,src1,1
540 //
541 // we know that ret0 can never be zero at this point
542 // because we failed while trying to do a load, i.e. there is still
543 // some work to do.
544 // The failure_in1bis and length problem is taken care of at the
545 // calling side.
546 //
547 ;;
548.failure_in1bis: // from (.failure_in3)
549 mov ar.lc=len // Continue with a stupid byte store.
550 ;;
5515:
552 st1 [dst1]=r0,1
553 br.cloop.dptk 5b
554 ;;
555 mov pr=saved_pr,0xffffffffffff0000
556 mov ar.lc=saved_lc
557 mov ar.pfs=saved_pfs
558 br.ret.sptk.many rp
559
560 //
561 // Here we simply restart the loop but instead
562 // of doing loads we fill the pipeline with zeroes
563 // We can't simply store r0 because we may have valid
564 // data in transit in the pipeline.
565 // ar.lc and ar.ec are setup correctly at this point
566 //
567 // we MUST use src1/endsrc here and not dst1/enddst because
568 // of the pipeline effect.
569 //
570.failure_in3:
571 sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
572 ;;
5732:
574(p16) mov val1[0]=r0
575(p16) mov val2[0]=r0
576(EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16
577(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
578 br.ctop.dptk 2b
579 ;;
580 cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
581 sub len=enddst,dst1,1 // precompute len
582(p6) br.cond.dptk .failure_in1bis
583 ;;
584 mov pr=saved_pr,0xffffffffffff0000
585 mov ar.lc=saved_lc
586 mov ar.pfs=saved_pfs
587 br.ret.sptk.many rp
588
589.failure_in2:
590 sub ret0=endsrc,src1
591 cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
592 sub len=enddst,dst1,1 // precompute len
593(p6) br.cond.dptk .failure_in1bis
594 ;;
595 mov pr=saved_pr,0xffffffffffff0000
596 mov ar.lc=saved_lc
597 mov ar.pfs=saved_pfs
598 br.ret.sptk.many rp
599
600 //
601 // handling of failures on stores: that's the easy part
602 //
603.failure_out:
604 sub ret0=enddst,dst1
605 mov pr=saved_pr,0xffffffffffff0000
606 mov ar.lc=saved_lc
607
608 mov ar.pfs=saved_pfs
609 br.ret.sptk.many rp
610END(__copy_user)
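The seven .copy_user_bitN loops generated by BODY() all do the same thing for different byte offsets: build each destination word from two consecutive source words with a single shrp. A hedged C sketch of that inner idea (rshift is in bits, a multiple of 8 between 8 and 56 here; note that the loop reads one word past the last full source word, which is why the tail is then redone byte by byte):

/* Core of the mismatched-alignment body: dst[i] is assembled from the pair
 * (src[i], src[i+1]) shifted by the byte offset, mirroring the shrp of two
 * adjacent pipeline registers in the macro above. */
static void shifted_copy_sketch(unsigned long *dst, const unsigned long *src,
				long nwords, unsigned int rshift)
{
	unsigned int lshift = 64 - rshift;	/* rshift is 8..56 here */
	long i;

	for (i = 0; i < nwords; i++)
		dst[i] = (src[i] >> rshift) | (src[i + 1] << lshift);
}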
diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c
new file mode 100644
index 000000000000..36866e8a5d2b
--- /dev/null
+++ b/arch/ia64/lib/csum_partial_copy.c
@@ -0,0 +1,151 @@
1/*
2 * Network Checksum & Copy routine
3 *
4 * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
5 * Stephane Eranian <eranian@hpl.hp.com>
6 *
7 * Most of the code has been imported from Linux/Alpha
8 */
9
10#include <linux/module.h>
11#include <linux/types.h>
12#include <linux/string.h>
13
14#include <asm/uaccess.h>
15
16/*
17 * XXX Fixme: those 2 inlines are meant for debugging and will go away
18 */
19static inline unsigned
20short from64to16(unsigned long x)
21{
22 /* add up 32-bit words for 33 bits */
23 x = (x & 0xffffffff) + (x >> 32);
24 /* add up 16-bit and 17-bit words for 17+c bits */
25 x = (x & 0xffff) + (x >> 16);
26 /* add up 16-bit and 2-bit for 16+c bit */
27 x = (x & 0xffff) + (x >> 16);
28 /* add up carry.. */
29 x = (x & 0xffff) + (x >> 16);
30 return x;
31}
32
33static inline
34unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
35{
36 int odd, count;
37 unsigned long result = (unsigned long)psum;
38
39 if (len <= 0)
40 goto out;
41 odd = 1 & (unsigned long) buff;
42 if (odd) {
43 result = *buff << 8;
44 len--;
45 buff++;
46 }
47 count = len >> 1; /* nr of 16-bit words.. */
48 if (count) {
49 if (2 & (unsigned long) buff) {
50 result += *(unsigned short *) buff;
51 count--;
52 len -= 2;
53 buff += 2;
54 }
55 count >>= 1; /* nr of 32-bit words.. */
56 if (count) {
57 if (4 & (unsigned long) buff) {
58 result += *(unsigned int *) buff;
59 count--;
60 len -= 4;
61 buff += 4;
62 }
63 count >>= 1; /* nr of 64-bit words.. */
64 if (count) {
65 unsigned long carry = 0;
66 do {
67 unsigned long w = *(unsigned long *) buff;
68 count--;
69 buff += 8;
70 result += carry;
71 result += w;
72 carry = (w > result);
73 } while (count);
74 result += carry;
75 result = (result & 0xffffffff) + (result >> 32);
76 }
77 if (len & 4) {
78 result += *(unsigned int *) buff;
79 buff += 4;
80 }
81 }
82 if (len & 2) {
83 result += *(unsigned short *) buff;
84 buff += 2;
85 }
86 }
87 if (len & 1)
88 result += *buff;
89
90 result = from64to16(result);
91
92 if (odd)
93 result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
94
95out:
96 return result;
97}
98
99/*
100 * XXX Fixme
101 *
102 * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS.
103 * But it's very tricky to get right even in C.
104 */
105extern unsigned long do_csum(const unsigned char *, long);
106
107static unsigned int
108do_csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
109 int len, unsigned int psum, int *errp)
110{
111 unsigned long result;
112
113 /* XXX Fixme
114 * for now we separate the copy from checksum for obvious
115 * alignment difficulties. Look at the Alpha code and you'll be
116 * scared.
117 */
118
119 if (__copy_from_user(dst, src, len) != 0 && errp)
120 *errp = -EFAULT;
121
122 result = do_csum(dst, len);
123
124 /* add in old sum, and carry.. */
125 result += psum;
126 /* 32+c bits -> 32 bits */
127 result = (result & 0xffffffff) + (result >> 32);
128 return result;
129}
130
131unsigned int
132csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
133 int len, unsigned int sum, int *errp)
134{
135 if (!access_ok(VERIFY_READ, src, len)) {
136 *errp = -EFAULT;
137 memset(dst, 0, len);
138 return sum;
139 }
140
141 return do_csum_partial_copy_from_user(src, dst, len, sum, errp);
142}
143
144unsigned int
145csum_partial_copy_nocheck(const unsigned char __user *src, unsigned char *dst,
146 int len, unsigned int sum)
147{
148 return do_csum_partial_copy_from_user(src, dst, len, sum, NULL);
149}
150
151EXPORT_SYMBOL(csum_partial_copy_nocheck);
diff --git a/arch/ia64/lib/dec_and_lock.c b/arch/ia64/lib/dec_and_lock.c
new file mode 100644
index 000000000000..c7ce92f968f1
--- /dev/null
+++ b/arch/ia64/lib/dec_and_lock.c
@@ -0,0 +1,42 @@
1/*
2 * Copyright (C) 2003 Jerome Marchand, Bull S.A.
3 * Cleaned up by David Mosberger-Tang <davidm@hpl.hp.com>
4 *
5 * This file is released under the GPLv2, or at your option any later version.
6 *
7 * ia64 version of "atomic_dec_and_lock()" using the atomic "cmpxchg" instruction. This
8 * code is an adaptation of the x86 version of "atomic_dec_and_lock()".
9 */
10
11#include <linux/compiler.h>
12#include <linux/module.h>
13#include <linux/spinlock.h>
14#include <asm/atomic.h>
15
16/*
17 * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock. Both of these
18 * operations have to be done atomically, so that the count doesn't drop to zero without
19 * acquiring the spinlock first.
20 */
21int
22_atomic_dec_and_lock (atomic_t *refcount, spinlock_t *lock)
23{
24 int old, new;
25
26 do {
27 old = atomic_read(refcount);
28 new = old - 1;
29
30 if (unlikely (old == 1)) {
31 /* oops, we may be decrementing to zero, do it the slow way... */
32 spin_lock(lock);
33 if (atomic_dec_and_test(refcount))
34 return 1;
35 spin_unlock(lock);
36 return 0;
37 }
38 } while (cmpxchg(&refcount->counter, old, new) != old);
39 return 0;
40}
41
42EXPORT_SYMBOL(_atomic_dec_and_lock);
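A typical use of this helper is releasing an object that lives on a lock-protected list: drop the reference cheaply with cmpxchg, and only when it hits zero take the lock before unlinking and freeing. The sketch below is illustration only; the struct and field names are made up, and callers normally go through the atomic_dec_and_lock() wrapper rather than the exported symbol directly:

/* Hedged usage sketch: hypothetical object with a refcount and list linkage. */
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>

struct cached_obj {
	atomic_t refcount;
	struct list_head node;
};

static void cached_obj_put(struct cached_obj *obj, spinlock_t *list_lock)
{
	if (_atomic_dec_and_lock(&obj->refcount, list_lock)) {
		/* count hit zero and list_lock is now held */
		list_del(&obj->node);
		spin_unlock(list_lock);
		kfree(obj);
	}
}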
diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S
new file mode 100644
index 000000000000..6bec2fc9f5b2
--- /dev/null
+++ b/arch/ia64/lib/do_csum.S
@@ -0,0 +1,323 @@
1/*
2 *
3 * Optimized version of the standard do_csum() function
4 *
5 * Return: a 64bit quantity containing the 16bit Internet checksum
6 *
7 * Inputs:
8 * in0: address of buffer to checksum (char *)
9 * in1: length of the buffer (int)
10 *
11 * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
12 * Stephane Eranian <eranian@hpl.hp.com>
13 *
14 * 02/04/22 Ken Chen <kenneth.w.chen@intel.com>
15 * Data locality study on the checksum buffer.
16 * More optimization cleanup - remove excessive stop bits.
17 * 02/04/08 David Mosberger <davidm@hpl.hp.com>
18 * More cleanup and tuning.
19 * 01/04/18 Jun Nakajima <jun.nakajima@intel.com>
20 * Clean up and optimize the software pipeline, loading two
21 * back-to-back 8-byte words per loop. Clean up the initialization
22 * for the loop. Support the cases where load latency = 1 or 2.
23 * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
24 */
25
26#include <asm/asmmacro.h>
27
28//
29// Theory of operations:
30// The goal is to go as quickly as possible to the point where
31// we can checksum 16 bytes/loop. Before reaching that point we must
32// take care of incorrect alignment of first byte.
33//
34// The code hereafter also takes care of the "tail" part of the buffer
35// before entering the core loop, if any. The checksum is a sum so it
36// allows us to commute operations. So we do the "head" and "tail"
37// first to finish at full speed in the body. Once we get the head and
38// tail values, we feed them into the pipeline, very handy initialization.
39//
40// Of course we deal with the special case where the whole buffer fits
41// into one 8 byte word. In this case we have only one entry in the pipeline.
42//
43// We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
44// possible load latency and also to accommodate for head and tail.
45//
46// The end of the function deals with folding the checksum from 64bits
47// down to 16bits taking care of the carry.
48//
49// This version avoids synchronization in the core loop by also using a
50// pipeline for the accumulation of the checksum in resultx[] (x=1,2).
51//
52// wordx[] (x=1,2)
53// |---|
54// | | 0 : new value loaded in pipeline
55// |---|
56// | | - : in transit data
57// |---|
58// | | LOAD_LATENCY : current value to add to checksum
59// |---|
60// | | LOAD_LATENCY+1 : previous value added to checksum
61// |---| (previous iteration)
62//
63// resultx[] (x=1,2)
64// |---|
65// | | 0 : initial value
66// |---|
67// | | LOAD_LATENCY-1 : new checksum
68// |---|
69// | | LOAD_LATENCY : previous value of checksum
70// |---|
71// | | LOAD_LATENCY+1 : final checksum when out of the loop
72// |---|
73//
74//
75// See RFC1071 "Computing the Internet Checksum" for various techniques for
76// calculating the Internet checksum.
77//
78// NOT YET DONE:
79// - Maybe another algorithm which would take care of the folding at the
80// end in a different manner
81// - Work with people more knowledgeable than me on the network stack
82// to figure out if we could not split the function depending on the
83// type of packet or alignment we get. Like the ip_fast_csum() routine
84// where we know we have at least 20bytes worth of data to checksum.
85// - Do a better job of handling small packets.
86 // - Note on prefetching: it was found that under various loads, i.e. ftp read/write,
87// nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
88// on the data that buffer points to (partly because the checksum is often preceded by
89 // a copy_from_user()). This finding indicates that lfetch will not be beneficial since
90// the data is already in the cache.
91//
92
93#define saved_pfs r11
94#define hmask r16
95#define tmask r17
96#define first1 r18
97#define firstval r19
98#define firstoff r20
99#define last r21
100#define lastval r22
101#define lastoff r23
102#define saved_lc r24
103#define saved_pr r25
104#define tmp1 r26
105#define tmp2 r27
106#define tmp3 r28
107#define carry1 r29
108#define carry2 r30
109#define first2 r31
110
111#define buf in0
112#define len in1
113
114#define LOAD_LATENCY 2 // XXX fix me
115
116#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
117# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
118#endif
119
120#define PIPE_DEPTH (LOAD_LATENCY+2)
121#define ELD p[LOAD_LATENCY] // end of load
122#define ELD_1 p[LOAD_LATENCY+1] // and next stage
123
124// unsigned long do_csum(unsigned char *buf,long len)
125
126GLOBAL_ENTRY(do_csum)
127 .prologue
128 .save ar.pfs, saved_pfs
129 alloc saved_pfs=ar.pfs,2,16,0,16
130 .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
131 .rotp p[PIPE_DEPTH], pC1[2], pC2[2]
132 mov ret0=r0 // in case we have zero length
133 cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len)
134 ;;
135 add tmp1=buf,len // last byte's address
136 .save pr, saved_pr
137 mov saved_pr=pr // preserve predicates (rotation)
138(p6) br.ret.spnt.many rp // return if zero or negative length
139
140 mov hmask=-1 // initialize head mask
141 tbit.nz p15,p0=buf,0 // is buf an odd address?
142 and first1=-8,buf // 8-byte align down address of first1 element
143
144 and firstoff=7,buf // how many bytes off for first1 element
145 mov tmask=-1 // initialize tail mask
146
147 ;;
148 adds tmp2=-1,tmp1 // last-1
149 and lastoff=7,tmp1 // how many bytes off for last element
150 ;;
151 sub tmp1=8,lastoff // complement to lastoff
152 and last=-8,tmp2 // address of word containing last byte
153 ;;
154 sub tmp3=last,first1 // tmp3=distance from first1 to last
155 .save ar.lc, saved_lc
156 mov saved_lc=ar.lc // save lc
157 cmp.eq p8,p9=last,first1 // everything fits in one word ?
158
159 ld8 firstval=[first1],8 // load, ahead of time, "first1" word
160 and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
161 shl tmp2=firstoff,3 // number of bits
162 ;;
163(p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed
164 shl tmp1=tmp1,3 // number of bits
165(p9) adds tmp3=-8,tmp3 // effectively loaded
166 ;;
167(p8) mov lastval=r0 // we don't need lastval if first1==last
168 shl hmask=hmask,tmp2 // build head mask, mask off [0,first1off[
169 shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff]
170 ;;
171 .body
172#define count tmp3
173
174(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only
175(p9)	and	word2[0]=lastval,tmask	// mask lastval as appropriate
176 shr.u count=count,3 // how many 8-byte?
177 ;;
178 // If count is odd, finish this 8-byte word so that we can
179 // load two back-to-back 8-byte words per loop thereafter.
180 and word1[0]=firstval,hmask // and mask it as appropriate
181 tbit.nz p10,p11=count,0 // if (count is odd)
182 ;;
183(p8) mov result1[0]=word1[0]
184(p9) add result1[0]=word1[0],word2[0]
185 ;;
186 cmp.ltu p6,p0=result1[0],word1[0] // check the carry
187 cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte
188 ;;
189(p6) adds result1[0]=1,result1[0]
190(p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word)
191(p11) br.cond.dptk .do_csum16 // if (count is even)
192
193 // Here count is odd.
194 ld8 word1[1]=[first1],8 // load an 8-byte word
195 cmp.eq p9,p10=1,count // if (count == 1)
196 adds count=-1,count // loaded an 8-byte word
197 ;;
198 add result1[0]=result1[0],word1[1]
199 ;;
200 cmp.ltu p6,p0=result1[0],word1[1]
201 ;;
202(p6) adds result1[0]=1,result1[0]
203(p9) br.cond.sptk .do_csum_exit // if (count == 1) exit
204	// Fall through to calculate the checksum, feeding result1[0] as
205 // the initial value in result1[0].
206 //
207 // Calculate the checksum loading two 8-byte words per loop.
208 //
209.do_csum16:
210 add first2=8,first1
211 shr.u count=count,1 // we do 16 bytes per loop
212 ;;
213 adds count=-1,count
214 mov carry1=r0
215 mov carry2=r0
216 brp.loop.imp 1f,2f
217 ;;
218 mov ar.ec=PIPE_DEPTH
219 mov ar.lc=count // set lc
220 mov pr.rot=1<<16
221 // result1[0] must be initialized in advance.
222 mov result2[0]=r0
223 ;;
224 .align 32
2251:
226(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
227(pC1[1])adds carry1=1,carry1
228(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
229(pC2[1])adds carry2=1,carry2
230(ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
231(ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
2322:
233(p[0]) ld8 word1[0]=[first1],16
234(p[0]) ld8 word2[0]=[first2],16
235 br.ctop.sptk 1b
236 ;;
237 // Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
238(pC1[1])adds carry1=1,carry1 // since we miss the last one
239(pC2[1])adds carry2=1,carry2
240 ;;
241 add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
242 add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
243 ;;
244 cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
245 cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
246 ;;
247(p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
248(p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
249 ;;
250 add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
251 ;;
252 cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
253 ;;
254(p6) adds result1[0]=1,result1[0]
255 ;;
256.do_csum_exit:
257 //
258 // now fold 64 into 16 bits taking care of carry
259	// that's not very good because it is heavily sequential
260 //
261 mov tmp3=0xffff
262 zxt4 tmp1=result1[0]
263 shr.u tmp2=result1[0],32
264 ;;
265 add result1[0]=tmp1,tmp2
266 ;;
267 and tmp1=result1[0],tmp3
268 shr.u tmp2=result1[0],16
269 ;;
270 add result1[0]=tmp1,tmp2
271 ;;
272 and tmp1=result1[0],tmp3
273 shr.u tmp2=result1[0],16
274 ;;
275 add result1[0]=tmp1,tmp2
276 ;;
277 and tmp1=result1[0],tmp3
278 shr.u tmp2=result1[0],16
279 ;;
280 add ret0=tmp1,tmp2
281 mov pr=saved_pr,0xffffffffffff0000
282 ;;
283 // if buf was odd then swap bytes
284 mov ar.pfs=saved_pfs // restore ar.ec
285(p15) mux1 ret0=ret0,@rev // reverse word
286 ;;
287 mov ar.lc=saved_lc
288(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
289 br.ret.sptk.many rp
290
291// I (Jun Nakajima) wrote an equivalent version (see below), but it was
292// not much better than the original, so the original is kept here so that
293// someone else can take up the challenge.
294//
295// shr.u word1[0]=result1[0],32
296// zxt4 result1[0]=result1[0]
297// ;;
298// add result1[0]=result1[0],word1[0]
299// ;;
300// zxt2 result2[0]=result1[0]
301// extr.u word1[0]=result1[0],16,16
302// shr.u carry1=result1[0],32
303// ;;
304// add result2[0]=result2[0],word1[0]
305// ;;
306// add result2[0]=result2[0],carry1
307// ;;
308// extr.u ret0=result2[0],16,16
309// ;;
310// add ret0=ret0,result2[0]
311// ;;
312// zxt2 ret0=ret0
313// mov ar.pfs=saved_pfs // restore ar.ec
314// mov pr=saved_pr,0xffffffffffff0000
315// ;;
316// // if buf was odd then swap bytes
317// mov ar.lc=saved_lc
318//(p15) mux1 ret0=ret0,@rev // reverse word
319// ;;
320//(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
321// br.ret.sptk.many rp
322
323END(do_csum)
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S
new file mode 100644
index 000000000000..29c802b19669
--- /dev/null
+++ b/arch/ia64/lib/flush.S
@@ -0,0 +1,39 @@
1/*
2 * Cache flushing routines.
3 *
4 * Copyright (C) 1999-2001 Hewlett-Packard Co
5 * Copyright (C) 1999-2001 David Mosberger-Tang <davidm@hpl.hp.com>
6 */
7#include <asm/asmmacro.h>
8#include <asm/page.h>
9
10 /*
11 * flush_icache_range(start,end)
12 * Must flush range from start to end-1 but nothing else (need to
13 * be careful not to touch addresses that may be unmapped).
14 */
15GLOBAL_ENTRY(flush_icache_range)
16 .prologue
17 alloc r2=ar.pfs,2,0,0,0
18 sub r8=in1,in0,1
19 ;;
20 shr.u r8=r8,5 // we flush 32 bytes per iteration
21 .save ar.lc, r3
22 mov r3=ar.lc // save ar.lc
23 ;;
24
25 .body
26
27 mov ar.lc=r8
28 ;;
29.Loop: fc in0 // issuable on M0 only
30 add in0=32,in0
31 br.cloop.sptk.few .Loop
32 ;;
33 sync.i
34 ;;
35 srlz.i
36 ;;
37 mov ar.lc=r3 // restore ar.lc
38 br.ret.sptk.many rp
39END(flush_icache_range)
diff --git a/arch/ia64/lib/idiv32.S b/arch/ia64/lib/idiv32.S
new file mode 100644
index 000000000000..2ac28bf0a662
--- /dev/null
+++ b/arch/ia64/lib/idiv32.S
@@ -0,0 +1,83 @@
1/*
2 * Copyright (C) 2000 Hewlett-Packard Co
3 * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
4 *
5 * 32-bit integer division.
6 *
7 * This code is based on the application note entitled "Divide, Square Root
8 * and Remainder Algorithms for the IA-64 Architecture". This document
9 * is available as Intel document number 248725-002 or via the web at
10 * http://developer.intel.com/software/opensource/numerics/
11 *
12 * For more details on the theory behind these algorithms, see "IA-64
13 * and Elementary Functions" by Peter Markstein; HP Professional Books
14 * (http://www.hp.com/go/retailbooks/)
15 */
16
17#include <asm/asmmacro.h>
18
19#ifdef MODULO
20# define OP mod
21#else
22# define OP div
23#endif
24
25#ifdef UNSIGNED
26# define SGN u
27# define EXTEND zxt4
28# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
29# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
30#else
31# define SGN
32# define EXTEND sxt4
33# define INT_TO_FP(a,b) fcvt.xf a=b
34# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
35#endif
36
37#define PASTE1(a,b) a##b
38#define PASTE(a,b) PASTE1(a,b)
39#define NAME PASTE(PASTE(__,SGN),PASTE(OP,si3))
40
41GLOBAL_ENTRY(NAME)
42 .regstk 2,0,0,0
43 // Transfer inputs to FP registers.
44 mov r2 = 0xffdd // r2 = -34 + 65535 (fp reg format bias)
45 EXTEND in0 = in0 // in0 = a
46 EXTEND in1 = in1 // in1 = b
47 ;;
48 setf.sig f8 = in0
49 setf.sig f9 = in1
50#ifdef MODULO
51 sub in1 = r0, in1 // in1 = -b
52#endif
53 ;;
54 // Convert the inputs to FP, to avoid FP software-assist faults.
55 INT_TO_FP(f8, f8)
56 INT_TO_FP(f9, f9)
57 ;;
58 setf.exp f7 = r2 // f7 = 2^-34
59 frcpa.s1 f6, p6 = f8, f9 // y0 = frcpa(b)
60 ;;
61(p6) fmpy.s1 f8 = f8, f6 // q0 = a*y0
62(p6) fnma.s1 f6 = f9, f6, f1 // e0 = -b*y0 + 1
63 ;;
64#ifdef MODULO
65 setf.sig f9 = in1 // f9 = -b
66#endif
67(p6) fma.s1 f8 = f6, f8, f8 // q1 = e0*q0 + q0
68(p6) fma.s1 f6 = f6, f6, f7 // e1 = e0*e0 + 2^-34
69 ;;
70#ifdef MODULO
71 setf.sig f7 = in0
72#endif
73(p6) fma.s1 f6 = f6, f8, f8 // q2 = e1*q1 + q1
74 ;;
75 FP_TO_INT(f6, f6) // q = trunc(q2)
76 ;;
77#ifdef MODULO
78 xma.l f6 = f6, f9, f7 // r = q*(-b) + a
79 ;;
80#endif
81 getf.sig r8 = f6 // transfer result to result register
82 br.ret.sptk.many rp
83END(NAME)
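
For orientation, the reciprocal-refinement scheme above can be mirrored in plain C. In this hedged sketch 1.0/b stands in for the ~8-bit frcpa seed, div_sketch is an invented name, and b is assumed nonzero (the real code skips the refinement when frcpa clears its predicate):

#include <stdint.h>

/* Signed 32-bit divide via Newton-Raphson refinement, following the same
 * sequence of fused steps as idiv32.S.  Illustration only. */
static int32_t div_sketch(int32_t a, int32_t b)
{
	double fa = a, fb = b;
	double y0 = 1.0 / fb;		/* stands in for frcpa(b)   */
	double q0 = fa * y0;		/* q0 = a*y0                */
	double e0 = 1.0 - fb * y0;	/* e0 = -b*y0 + 1           */
	double q1 = e0 * q0 + q0;	/* q1 = e0*q0 + q0          */
	double e1 = e0 * e0 + 0x1p-34;	/* e1 = e0*e0 + 2^-34       */
	double q2 = e1 * q1 + q1;	/* q2 = e1*q1 + q1          */
	return (int32_t)q2;		/* q  = trunc(q2)           */
}

The 2^-34 term mirrors the setf.exp f7 constant: it biases the last estimate upward just enough that truncation does not land one below the exact quotient. idiv64.S below follows the same pattern, with an extra refinement step and an exact residual correction (r = a - b*q2, then q3 = q2 + r*y2) before truncating.
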
diff --git a/arch/ia64/lib/idiv64.S b/arch/ia64/lib/idiv64.S
new file mode 100644
index 000000000000..f69bd2b0987a
--- /dev/null
+++ b/arch/ia64/lib/idiv64.S
@@ -0,0 +1,80 @@
1/*
2 * Copyright (C) 1999-2000 Hewlett-Packard Co
3 * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
4 *
5 * 64-bit integer division.
6 *
7 * This code is based on the application note entitled "Divide, Square Root
8 * and Remainder Algorithms for the IA-64 Architecture". This document
9 * is available as Intel document number 248725-002 or via the web at
10 * http://developer.intel.com/software/opensource/numerics/
11 *
12 * For more details on the theory behind these algorithms, see "IA-64
13 * and Elementary Functions" by Peter Markstein; HP Professional Books
14 * (http://www.hp.com/go/retailbooks/)
15 */
16
17#include <asm/asmmacro.h>
18
19#ifdef MODULO
20# define OP mod
21#else
22# define OP div
23#endif
24
25#ifdef UNSIGNED
26# define SGN u
27# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
28# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
29#else
30# define SGN
31# define INT_TO_FP(a,b) fcvt.xf a=b
32# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
33#endif
34
35#define PASTE1(a,b) a##b
36#define PASTE(a,b) PASTE1(a,b)
37#define NAME PASTE(PASTE(__,SGN),PASTE(OP,di3))
38
39GLOBAL_ENTRY(NAME)
40 .regstk 2,0,0,0
41 // Transfer inputs to FP registers.
42 setf.sig f8 = in0
43 setf.sig f9 = in1
44 ;;
45 // Convert the inputs to FP, to avoid FP software-assist faults.
46 INT_TO_FP(f8, f8)
47 INT_TO_FP(f9, f9)
48 ;;
49 frcpa.s1 f11, p6 = f8, f9 // y0 = frcpa(b)
50 ;;
51(p6) fmpy.s1 f7 = f8, f11 // q0 = a*y0
52(p6) fnma.s1 f6 = f9, f11, f1 // e0 = -b*y0 + 1
53 ;;
54(p6) fma.s1 f10 = f7, f6, f7 // q1 = q0*e0 + q0
55(p6) fmpy.s1 f7 = f6, f6 // e1 = e0*e0
56 ;;
57#ifdef MODULO
58 sub in1 = r0, in1 // in1 = -b
59#endif
60(p6) fma.s1 f10 = f10, f7, f10 // q2 = q1*e1 + q1
61(p6) fma.s1 f6 = f11, f6, f11 // y1 = y0*e0 + y0
62 ;;
63(p6) fma.s1 f6 = f6, f7, f6 // y2 = y1*e1 + y1
64(p6) fnma.s1 f7 = f9, f10, f8 // r = -b*q2 + a
65 ;;
66#ifdef MODULO
67 setf.sig f8 = in0 // f8 = a
68 setf.sig f9 = in1 // f9 = -b
69#endif
70(p6) fma.s1 f11 = f7, f6, f10 // q3 = r*y2 + q2
71 ;;
72 FP_TO_INT(f11, f11) // q = trunc(q3)
73 ;;
74#ifdef MODULO
75 xma.l f11 = f11, f9, f8 // r = q*(-b) + a
76 ;;
77#endif
78 getf.sig r8 = f11 // transfer result to result register
79 br.ret.sptk.many rp
80END(NAME)
diff --git a/arch/ia64/lib/io.c b/arch/ia64/lib/io.c
new file mode 100644
index 000000000000..8949e44091ac
--- /dev/null
+++ b/arch/ia64/lib/io.c
@@ -0,0 +1,165 @@
1#include <linux/config.h>
2#include <linux/module.h>
3#include <linux/types.h>
4
5#include <asm/io.h>
6
7/*
8 * Copy data from IO memory space to "real" memory space.
9 * This needs to be optimized.
10 */
11void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
12{
13 char *dst = to;
14
15 while (count) {
16 count--;
17 *dst++ = readb(from++);
18 }
19}
20EXPORT_SYMBOL(memcpy_fromio);
21
22/*
23 * Copy data from "real" memory space to IO memory space.
24 * This needs to be optimized.
25 */
26void memcpy_toio(volatile void __iomem *to, const void *from, long count)
27{
28 const char *src = from;
29
30 while (count) {
31 count--;
32 writeb(*src++, to++);
33 }
34}
35EXPORT_SYMBOL(memcpy_toio);
36
37/*
38 * "memset" on IO memory space.
39 * This needs to be optimized.
40 */
41void memset_io(volatile void __iomem *dst, int c, long count)
42{
43 unsigned char ch = (char)(c & 0xff);
44
45 while (count) {
46 count--;
47 writeb(ch, dst);
48 dst++;
49 }
50}
51EXPORT_SYMBOL(memset_io);
52
53#ifdef CONFIG_IA64_GENERIC
54
55#undef __ia64_inb
56#undef __ia64_inw
57#undef __ia64_inl
58#undef __ia64_outb
59#undef __ia64_outw
60#undef __ia64_outl
61#undef __ia64_readb
62#undef __ia64_readw
63#undef __ia64_readl
64#undef __ia64_readq
65#undef __ia64_readb_relaxed
66#undef __ia64_readw_relaxed
67#undef __ia64_readl_relaxed
68#undef __ia64_readq_relaxed
69#undef __ia64_writeb
70#undef __ia64_writew
71#undef __ia64_writel
72#undef __ia64_writeq
73#undef __ia64_mmiowb
74
75unsigned int
76__ia64_inb (unsigned long port)
77{
78 return ___ia64_inb(port);
79}
80
81unsigned int
82__ia64_inw (unsigned long port)
83{
84 return ___ia64_inw(port);
85}
86
87unsigned int
88__ia64_inl (unsigned long port)
89{
90 return ___ia64_inl(port);
91}
92
93void
94__ia64_outb (unsigned char val, unsigned long port)
95{
96 ___ia64_outb(val, port);
97}
98
99void
100__ia64_outw (unsigned short val, unsigned long port)
101{
102 ___ia64_outw(val, port);
103}
104
105void
106__ia64_outl (unsigned int val, unsigned long port)
107{
108 ___ia64_outl(val, port);
109}
110
111unsigned char
112__ia64_readb (void __iomem *addr)
113{
114 return ___ia64_readb (addr);
115}
116
117unsigned short
118__ia64_readw (void __iomem *addr)
119{
120 return ___ia64_readw (addr);
121}
122
123unsigned int
124__ia64_readl (void __iomem *addr)
125{
126 return ___ia64_readl (addr);
127}
128
129unsigned long
130__ia64_readq (void __iomem *addr)
131{
132 return ___ia64_readq (addr);
133}
134
135unsigned char
136__ia64_readb_relaxed (void __iomem *addr)
137{
138 return ___ia64_readb (addr);
139}
140
141unsigned short
142__ia64_readw_relaxed (void __iomem *addr)
143{
144 return ___ia64_readw (addr);
145}
146
147unsigned int
148__ia64_readl_relaxed (void __iomem *addr)
149{
150 return ___ia64_readl (addr);
151}
152
153unsigned long
154__ia64_readq_relaxed (void __iomem *addr)
155{
156 return ___ia64_readq (addr);
157}
158
159void
160__ia64_mmiowb(void)
161{
162 ___ia64_mmiowb();
163}
164
165#endif /* CONFIG_IA64_GENERIC */
diff --git a/arch/ia64/lib/ip_fast_csum.S b/arch/ia64/lib/ip_fast_csum.S
new file mode 100644
index 000000000000..19674ca2acfc
--- /dev/null
+++ b/arch/ia64/lib/ip_fast_csum.S
@@ -0,0 +1,90 @@
1/*
2 * Optimized version of the ip_fast_csum() function
3 * Used for calculating IP header checksum
4 *
5 * Return: 16bit checksum, complemented
6 *
7 * Inputs:
8 * in0: address of buffer to checksum (char *)
9 * in1: length of the buffer (int)
10 *
11 * Copyright (C) 2002 Intel Corp.
12 * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
13 */
14
15#include <asm/asmmacro.h>
16
17/*
18 * Since we know that this function is most likely called with buf aligned
19 * on a 4-byte boundary and 20 bytes in length, we can execute rather quickly
20 * versus calling the generic version of do_csum, which has lots of overhead in
21 * handling various alignments and sizes. However, due to the lack of constraints
22 * put on the function input arguments, cases with alignment not on a 4-byte boundary or
23 * size not equal to 20 bytes will be handled by the generic do_csum function.
24 */
25
26#define in0 r32
27#define in1 r33
28#define ret0 r8
29
30GLOBAL_ENTRY(ip_fast_csum)
31 .prologue
32 .body
33 cmp.ne p6,p7=5,in1 // size other than 20 byte?
34 and r14=3,in0 // is it aligned on 4-byte?
35 add r15=4,in0 // second source pointer
36 ;;
37 cmp.ne.or.andcm p6,p7=r14,r0
38 ;;
39(p7) ld4 r20=[in0],8
40(p7) ld4 r21=[r15],8
41(p6) br.spnt .generic
42 ;;
43 ld4 r22=[in0],8
44 ld4 r23=[r15],8
45 ;;
46 ld4 r24=[in0]
47 add r20=r20,r21
48 add r22=r22,r23
49 ;;
50 add r20=r20,r22
51 ;;
52 add r20=r20,r24
53 ;;
54 shr.u ret0=r20,16 // now need to add the carry
55 zxt2 r20=r20
56 ;;
57 add r20=ret0,r20
58 ;;
59 shr.u ret0=r20,16 // add carry again
60 zxt2 r20=r20
61 ;;
62 add r20=ret0,r20
63 ;;
64 shr.u ret0=r20,16
65 zxt2 r20=r20
66 ;;
67 add r20=ret0,r20
68 ;;
69 andcm ret0=-1,r20
70 .restore sp // reset frame state
71 br.ret.sptk.many b0
72 ;;
73
74.generic:
75 .prologue
76 .save ar.pfs, r35
77 alloc r35=ar.pfs,2,2,2,0
78 .save rp, r34
79 mov r34=b0
80 .body
81 dep.z out1=in1,2,30
82 mov out0=in0
83 ;;
84 br.call.sptk.many b0=do_csum
85 ;;
86 andcm ret0=-1,ret0
87 mov ar.pfs=r35
88 mov b0=r34
89 br.ret.sptk.many b0
90END(ip_fast_csum)
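
The aligned 20-byte fast path above amounts to summing five 32-bit words into a wide accumulator, folding the carries into 16 bits, and complementing the result. A rough C sketch of that path (iph_csum_sketch is an invented name; byte-order details are glossed over):

#include <stdint.h>

/* One's-complement checksum of a 4-byte-aligned, 20-byte IP header. */
static uint16_t iph_csum_sketch(const uint32_t *iph)
{
	uint64_t sum = 0;
	int i;

	for (i = 0; i < 5; i++)		/* five 32-bit words = 20 bytes */
		sum += iph[i];
	while (sum >> 16)		/* fold carries into 16 bits    */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;		/* return the complement        */
}
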
diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S
new file mode 100644
index 000000000000..448908d80b69
--- /dev/null
+++ b/arch/ia64/lib/memcpy.S
@@ -0,0 +1,301 @@
1/*
2 *
3 * Optimized version of the standard memcpy() function
4 *
5 * Inputs:
6 * in0: destination address
7 * in1: source address
8 * in2: number of bytes to copy
9 * Output:
10 * no return value
11 *
12 * Copyright (C) 2000-2001 Hewlett-Packard Co
13 * Stephane Eranian <eranian@hpl.hp.com>
14 * David Mosberger-Tang <davidm@hpl.hp.com>
15 */
16#include <asm/asmmacro.h>
17
18GLOBAL_ENTRY(memcpy)
19
20# define MEM_LAT 21 /* latency to memory */
21
22# define dst r2
23# define src r3
24# define retval r8
25# define saved_pfs r9
26# define saved_lc r10
27# define saved_pr r11
28# define cnt r16
29# define src2 r17
30# define t0 r18
31# define t1 r19
32# define t2 r20
33# define t3 r21
34# define t4 r22
35# define src_end r23
36
37# define N (MEM_LAT + 4)
38# define Nrot ((N + 7) & ~7)
39
40 /*
41 * First, check if everything (src, dst, len) is a multiple of eight. If
42 * so, we handle everything with no taken branches (other than the loop
43 * itself) and a small icache footprint. Otherwise, we jump off to
44 * the more general copy routine handling arbitrary
45 * sizes/alignment etc.
46 */
47 .prologue
48 .save ar.pfs, saved_pfs
49 alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
50 .save ar.lc, saved_lc
51 mov saved_lc=ar.lc
52 or t0=in0,in1
53 ;;
54
55 or t0=t0,in2
56 .save pr, saved_pr
57 mov saved_pr=pr
58
59 .body
60
61 cmp.eq p6,p0=in2,r0 // zero length?
62 mov retval=in0 // return dst
63(p6) br.ret.spnt.many rp // zero length, return immediately
64 ;;
65
66 mov dst=in0 // copy because of rotation
67 shr.u cnt=in2,3 // number of 8-byte words to copy
68 mov pr.rot=1<<16
69 ;;
70
71 adds cnt=-1,cnt // br.ctop is repeat/until
72 cmp.gtu p7,p0=16,in2 // copying less than 16 bytes?
73 mov ar.ec=N
74 ;;
75
76 and t0=0x7,t0
77 mov ar.lc=cnt
78 ;;
79 cmp.ne p6,p0=t0,r0
80
81 mov src=in1 // copy because of rotation
82(p7) br.cond.spnt.few .memcpy_short
83(p6) br.cond.spnt.few .memcpy_long
84 ;;
85 nop.m 0
86 ;;
87 nop.m 0
88 nop.i 0
89 ;;
90 nop.m 0
91 ;;
92 .rotr val[N]
93 .rotp p[N]
94 .align 32
951: { .mib
96(p[0]) ld8 val[0]=[src],8
97 nop.i 0
98 brp.loop.imp 1b, 2f
99}
1002: { .mfb
101(p[N-1])st8 [dst]=val[N-1],8
102 nop.f 0
103 br.ctop.dptk.few 1b
104}
105 ;;
106 mov ar.lc=saved_lc
107 mov pr=saved_pr,-1
108 mov ar.pfs=saved_pfs
109 br.ret.sptk.many rp
110
111 /*
112 * Small (<16 bytes) unaligned copying is done via a simple byte-at-a-time
113 * copy loop. This performs relatively poorly on Itanium, but it doesn't
114 * get used very often (gcc inlines small copies) and due to atomicity
115 * issues, we want to avoid read-modify-write of entire words.
116 */
117 .align 32
118.memcpy_short:
119 adds cnt=-1,in2 // br.ctop is repeat/until
120 mov ar.ec=MEM_LAT
121 brp.loop.imp 1f, 2f
122 ;;
123 mov ar.lc=cnt
124 ;;
125 nop.m 0
126 ;;
127 nop.m 0
128 nop.i 0
129 ;;
130 nop.m 0
131 ;;
132 nop.m 0
133 ;;
134 /*
135 * It is faster to put a stop bit in the loop here because it makes
136 * the pipeline shorter (and latency is what matters on short copies).
137 */
138 .align 32
1391: { .mib
140(p[0]) ld1 val[0]=[src],1
141 nop.i 0
142 brp.loop.imp 1b, 2f
143} ;;
1442: { .mfb
145(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
146 nop.f 0
147 br.ctop.dptk.few 1b
148} ;;
149 mov ar.lc=saved_lc
150 mov pr=saved_pr,-1
151 mov ar.pfs=saved_pfs
152 br.ret.sptk.many rp
153
154 /*
155 * Large (>= 16 bytes) copying is done in a fancy way. Latency isn't
156 * an overriding concern here, but throughput is. We first do
157 * sub-word copying until the destination is aligned, then we check
158 * if the source is also aligned. If so, we do a simple load/store-loop
159 * until there are less than 8 bytes left over and then we do the tail,
160 * by storing the last few bytes using sub-word copying. If the source
161 * is not aligned, we branch off to the non-congruent loop.
162 *
163 * stage: op:
164 * 0 ld
165 * :
166 * MEM_LAT+3 shrp
167 * MEM_LAT+4 st
168 *
169 * On Itanium, the pipeline itself runs without stalls. However, br.ctop
170 * seems to introduce an unavoidable bubble in the pipeline so the overall
171 * latency is 2 cycles/iteration. This gives us a _copy_ throughput
172 * of 4 byte/cycle. Still not bad.
173 */
174# undef N
175# undef Nrot
176# define N (MEM_LAT + 5) /* number of stages */
177# define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */
178
179#define LOG_LOOP_SIZE 6
180
181.memcpy_long:
182 alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame
183 and t0=-8,src // t0 = src & ~7
184 and t2=7,src // t2 = src & 7
185 ;;
186 ld8 t0=[t0] // t0 = 1st source word
187 adds src2=7,src // src2 = (src + 7)
188 sub t4=r0,dst // t4 = -dst
189 ;;
190 and src2=-8,src2 // src2 = (src + 7) & ~7
191 shl t2=t2,3 // t2 = 8*(src & 7)
192 shl t4=t4,3 // t4 = 8*(dst & 7)
193 ;;
194 ld8 t1=[src2] // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
195 sub t3=64,t2 // t3 = 64-8*(src & 7)
196 shr.u t0=t0,t2
197 ;;
198 add src_end=src,in2
199 shl t1=t1,t3
200 mov pr=t4,0x38 // (p5,p4,p3)=(dst & 7)
201 ;;
202 or t0=t0,t1
203 mov cnt=r0
204 adds src_end=-1,src_end
205 ;;
206(p3) st1 [dst]=t0,1
207(p3) shr.u t0=t0,8
208(p3) adds cnt=1,cnt
209 ;;
210(p4) st2 [dst]=t0,2
211(p4) shr.u t0=t0,16
212(p4) adds cnt=2,cnt
213 ;;
214(p5) st4 [dst]=t0,4
215(p5) adds cnt=4,cnt
216 and src_end=-8,src_end // src_end = last word of source buffer
217 ;;
218
219	// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
220
2211:{ add src=cnt,src // make src point to remainder of source buffer
222 sub cnt=in2,cnt // cnt = number of bytes left to copy
223 mov t4=ip
224 } ;;
225 and src2=-8,src // align source pointer
226 adds t4=.memcpy_loops-1b,t4
227 mov ar.ec=N
228
229 and t0=7,src // t0 = src & 7
230 shr.u t2=cnt,3 // t2 = number of 8-byte words left to copy
231 shl cnt=cnt,3 // move bits 0-2 to 3-5
232 ;;
233
234 .rotr val[N+1], w[2]
235 .rotp p[N]
236
237 cmp.ne p6,p0=t0,r0 // is src aligned, too?
238 shl t0=t0,LOG_LOOP_SIZE // t0 = 8*(src & 7)
239 adds t2=-1,t2 // br.ctop is repeat/until
240 ;;
241 add t4=t0,t4
242	mov	pr=cnt,0x38		// set (p5,p4,p3) to # of last-word bytes to copy
243 mov ar.lc=t2
244 ;;
245 nop.m 0
246 ;;
247 nop.m 0
248 nop.i 0
249 ;;
250 nop.m 0
251 ;;
252(p6) ld8 val[1]=[src2],8 // prime the pump...
253 mov b6=t4
254 br.sptk.few b6
255 ;;
256
257.memcpy_tail:
258 // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
259 // less than 8) and t0 contains the last few bytes of the src buffer:
260(p5) st4 [dst]=t0,4
261(p5) shr.u t0=t0,32
262 mov ar.lc=saved_lc
263 ;;
264(p4) st2 [dst]=t0,2
265(p4) shr.u t0=t0,16
266 mov ar.pfs=saved_pfs
267 ;;
268(p3) st1 [dst]=t0
269 mov pr=saved_pr,-1
270 br.ret.sptk.many rp
271
272///////////////////////////////////////////////////////
273 .align 64
274
275#define COPY(shift,index) \
276 1: { .mib \
277 (p[0]) ld8 val[0]=[src2],8; \
278 (p[MEM_LAT+3]) shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \
279 brp.loop.imp 1b, 2f \
280 }; \
281 2: { .mfb \
282 (p[MEM_LAT+4]) st8 [dst]=w[1],8; \
283 nop.f 0; \
284 br.ctop.dptk.few 1b; \
285 }; \
286 ;; \
287 ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */ \
288 ;; \
289 shrp t0=val[N-1],val[N-index],shift; \
290 br .memcpy_tail
291.memcpy_loops:
292 COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
293 COPY(8, 0)
294 COPY(16, 0)
295 COPY(24, 0)
296 COPY(32, 0)
297 COPY(40, 0)
298 COPY(48, 0)
299 COPY(56, 0)
300
301END(memcpy)
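
The shrp step in the COPY macro above funnel-shifts two neighbouring aligned source words to synthesize one unaligned word before it is stored. A hedged little-endian C picture of that merge (funnel is an invented name; the zero-shift case is folded into the COPY(0, 1) table entry in the assembly, while C needs it special-cased because a 64-bit shift by 64 is undefined):

#include <stdint.h>

/* lo is the aligned word holding the unaligned source position, hi is the
 * next aligned word; shift = 8 * (src & 7).  Returns the 8 unaligned bytes
 * spanning the two words. */
static uint64_t funnel(uint64_t lo, uint64_t hi, unsigned int shift)
{
	if (shift == 0)
		return lo;
	return (lo >> shift) | (hi << (64 - shift));
}
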
diff --git a/arch/ia64/lib/memcpy_mck.S b/arch/ia64/lib/memcpy_mck.S
new file mode 100644
index 000000000000..6f26ef7cc236
--- /dev/null
+++ b/arch/ia64/lib/memcpy_mck.S
@@ -0,0 +1,661 @@
1/*
2 * Itanium 2-optimized version of memcpy and copy_user function
3 *
4 * Inputs:
5 * in0: destination address
6 * in1: source address
7 * in2: number of bytes to copy
8 * Output:
9 *	0 if success, or number of bytes NOT copied if an error occurred.
10 *
11 * Copyright (C) 2002 Intel Corp.
12 * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
13 */
14#include <linux/config.h>
15#include <asm/asmmacro.h>
16#include <asm/page.h>
17
18#define EK(y...) EX(y)
19
20/* McKinley specific optimization */
21
22#define retval r8
23#define saved_pfs r31
24#define saved_lc r10
25#define saved_pr r11
26#define saved_in0 r14
27#define saved_in1 r15
28#define saved_in2 r16
29
30#define src0 r2
31#define src1 r3
32#define dst0 r17
33#define dst1 r18
34#define cnt r9
35
36/* r19-r30 are temp for each code section */
37#define PREFETCH_DIST 8
38#define src_pre_mem r19
39#define dst_pre_mem r20
40#define src_pre_l2 r21
41#define dst_pre_l2 r22
42#define t1 r23
43#define t2 r24
44#define t3 r25
45#define t4 r26
46#define t5 t1 // alias!
47#define t6 t2 // alias!
48#define t7 t3 // alias!
49#define n8 r27
50#define t9 t5 // alias!
51#define t10 t4 // alias!
52#define t11 t7 // alias!
53#define t12 t6 // alias!
54#define t14 t10 // alias!
55#define t13 r28
56#define t15 r29
57#define tmp r30
58
59/* defines for long_copy block */
60#define A 0
61#define B (PREFETCH_DIST)
62#define C (B + PREFETCH_DIST)
63#define D (C + 1)
64#define N (D + 1)
65#define Nrot ((N + 7) & ~7)
66
67/* alias */
68#define in0 r32
69#define in1 r33
70#define in2 r34
71
72GLOBAL_ENTRY(memcpy)
73 and r28=0x7,in0
74 and r29=0x7,in1
75 mov f6=f0
76 br.cond.sptk .common_code
77 ;;
78GLOBAL_ENTRY(__copy_user)
79 .prologue
80// check dest alignment
81 and r28=0x7,in0
82 and r29=0x7,in1
83 mov f6=f1
84 mov saved_in0=in0 // save dest pointer
85 mov saved_in1=in1 // save src pointer
86 mov saved_in2=in2 // save len
87 ;;
88.common_code:
89 cmp.gt p15,p0=8,in2 // check for small size
90 cmp.ne p13,p0=0,r28 // check dest alignment
91 cmp.ne p14,p0=0,r29 // check src alignment
92 add src0=0,in1
93 sub r30=8,r28 // for .align_dest
94 mov retval=r0 // initialize return value
95 ;;
96 add dst0=0,in0
97 add dst1=1,in0 // dest odd index
98 cmp.le p6,p0 = 1,r30 // for .align_dest
99(p15) br.cond.dpnt .memcpy_short
100(p13) br.cond.dpnt .align_dest
101(p14) br.cond.dpnt .unaligned_src
102 ;;
103
104// both dest and src are aligned on 8-byte boundary
105.aligned_src:
106 .save ar.pfs, saved_pfs
107 alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
108 .save pr, saved_pr
109 mov saved_pr=pr
110
111	shr.u	cnt=in2,7	// this many cache lines
112 ;;
113 cmp.lt p6,p0=2*PREFETCH_DIST,cnt
114 cmp.lt p7,p8=1,cnt
115 .save ar.lc, saved_lc
116 mov saved_lc=ar.lc
117 .body
118 add cnt=-1,cnt
119 add src_pre_mem=0,in1 // prefetch src pointer
120 add dst_pre_mem=0,in0 // prefetch dest pointer
121 ;;
122(p7) mov ar.lc=cnt // prefetch count
123(p8) mov ar.lc=r0
124(p6) br.cond.dpnt .long_copy
125 ;;
126
127.prefetch:
128 lfetch.fault [src_pre_mem], 128
129 lfetch.fault.excl [dst_pre_mem], 128
130 br.cloop.dptk.few .prefetch
131 ;;
132
133.medium_copy:
134	and	tmp=31,in2	// copy length after the iterations
135	shr.u	r29=in2,5	// number of 32-byte iterations
136 add dst1=8,dst0 // 2nd dest pointer
137 ;;
138 add cnt=-1,r29 // ctop iteration adjustment
139 cmp.eq p10,p0=r29,r0 // do we really need to loop?
140 add src1=8,src0 // 2nd src pointer
141 cmp.le p6,p0=8,tmp
142 ;;
143 cmp.le p7,p0=16,tmp
144 mov ar.lc=cnt // loop setup
145 cmp.eq p16,p17 = r0,r0
146 mov ar.ec=2
147(p10) br.dpnt.few .aligned_src_tail
148 ;;
149 TEXT_ALIGN(32)
1501:
151EX(.ex_handler, (p16) ld8 r34=[src0],16)
152EK(.ex_handler, (p16) ld8 r38=[src1],16)
153EX(.ex_handler, (p17) st8 [dst0]=r33,16)
154EK(.ex_handler, (p17) st8 [dst1]=r37,16)
155 ;;
156EX(.ex_handler, (p16) ld8 r32=[src0],16)
157EK(.ex_handler, (p16) ld8 r36=[src1],16)
158EX(.ex_handler, (p16) st8 [dst0]=r34,16)
159EK(.ex_handler, (p16) st8 [dst1]=r38,16)
160 br.ctop.dptk.few 1b
161 ;;
162
163.aligned_src_tail:
164EX(.ex_handler, (p6) ld8 t1=[src0])
165 mov ar.lc=saved_lc
166 mov ar.pfs=saved_pfs
167EX(.ex_hndlr_s, (p7) ld8 t2=[src1],8)
168 cmp.le p8,p0=24,tmp
169 and r21=-8,tmp
170 ;;
171EX(.ex_hndlr_s, (p8) ld8 t3=[src1])
172EX(.ex_handler, (p6) st8 [dst0]=t1) // store byte 1
173 and in2=7,tmp // remaining length
174EX(.ex_hndlr_d, (p7) st8 [dst1]=t2,8) // store byte 2
175 add src0=src0,r21 // setting up src pointer
176 add dst0=dst0,r21 // setting up dest pointer
177 ;;
178EX(.ex_handler, (p8) st8 [dst1]=t3) // store byte 3
179 mov pr=saved_pr,-1
180 br.dptk.many .memcpy_short
181 ;;
182
183/* code taken from copy_page_mck */
184.long_copy:
185 .rotr v[2*PREFETCH_DIST]
186 .rotp p[N]
187
188 mov src_pre_mem = src0
189 mov pr.rot = 0x10000
190 mov ar.ec = 1 // special unrolled loop
191
192 mov dst_pre_mem = dst0
193
194 add src_pre_l2 = 8*8, src0
195 add dst_pre_l2 = 8*8, dst0
196 ;;
197 add src0 = 8, src_pre_mem // first t1 src
198 mov ar.lc = 2*PREFETCH_DIST - 1
199 shr.u cnt=in2,7 // number of lines
200 add src1 = 3*8, src_pre_mem // first t3 src
201 add dst0 = 8, dst_pre_mem // first t1 dst
202 add dst1 = 3*8, dst_pre_mem // first t3 dst
203 ;;
204 and tmp=127,in2 // remaining bytes after this block
205 add cnt = -(2*PREFETCH_DIST) - 1, cnt
206 // same as .line_copy loop, but with all predicated-off instructions removed:
207.prefetch_loop:
208EX(.ex_hndlr_lcpy_1, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0
209EK(.ex_hndlr_lcpy_1, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2
210 br.ctop.sptk .prefetch_loop
211 ;;
212 cmp.eq p16, p0 = r0, r0 // reset p16 to 1
213 mov ar.lc = cnt
214 mov ar.ec = N // # of stages in pipeline
215 ;;
216.line_copy:
217EX(.ex_handler, (p[D]) ld8 t2 = [src0], 3*8) // M0
218EK(.ex_handler, (p[D]) ld8 t4 = [src1], 3*8) // M1
219EX(.ex_handler_lcpy, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 prefetch dst from memory
220EK(.ex_handler_lcpy, (p[D]) st8 [dst_pre_l2] = n8, 128) // M3 prefetch dst from L2
221 ;;
222EX(.ex_handler_lcpy, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 prefetch src from memory
223EK(.ex_handler_lcpy, (p[C]) ld8 n8 = [src_pre_l2], 128) // M1 prefetch src from L2
224EX(.ex_handler, (p[D]) st8 [dst0] = t1, 8) // M2
225EK(.ex_handler, (p[D]) st8 [dst1] = t3, 8) // M3
226 ;;
227EX(.ex_handler, (p[D]) ld8 t5 = [src0], 8)
228EK(.ex_handler, (p[D]) ld8 t7 = [src1], 3*8)
229EX(.ex_handler, (p[D]) st8 [dst0] = t2, 3*8)
230EK(.ex_handler, (p[D]) st8 [dst1] = t4, 3*8)
231 ;;
232EX(.ex_handler, (p[D]) ld8 t6 = [src0], 3*8)
233EK(.ex_handler, (p[D]) ld8 t10 = [src1], 8)
234EX(.ex_handler, (p[D]) st8 [dst0] = t5, 8)
235EK(.ex_handler, (p[D]) st8 [dst1] = t7, 3*8)
236 ;;
237EX(.ex_handler, (p[D]) ld8 t9 = [src0], 3*8)
238EK(.ex_handler, (p[D]) ld8 t11 = [src1], 3*8)
239EX(.ex_handler, (p[D]) st8 [dst0] = t6, 3*8)
240EK(.ex_handler, (p[D]) st8 [dst1] = t10, 8)
241 ;;
242EX(.ex_handler, (p[D]) ld8 t12 = [src0], 8)
243EK(.ex_handler, (p[D]) ld8 t14 = [src1], 8)
244EX(.ex_handler, (p[D]) st8 [dst0] = t9, 3*8)
245EK(.ex_handler, (p[D]) st8 [dst1] = t11, 3*8)
246 ;;
247EX(.ex_handler, (p[D]) ld8 t13 = [src0], 4*8)
248EK(.ex_handler, (p[D]) ld8 t15 = [src1], 4*8)
249EX(.ex_handler, (p[D]) st8 [dst0] = t12, 8)
250EK(.ex_handler, (p[D]) st8 [dst1] = t14, 8)
251 ;;
252EX(.ex_handler, (p[C]) ld8 t1 = [src0], 8)
253EK(.ex_handler, (p[C]) ld8 t3 = [src1], 8)
254EX(.ex_handler, (p[D]) st8 [dst0] = t13, 4*8)
255EK(.ex_handler, (p[D]) st8 [dst1] = t15, 4*8)
256 br.ctop.sptk .line_copy
257 ;;
258
259 add dst0=-8,dst0
260 add src0=-8,src0
261 mov in2=tmp
262 .restore sp
263 br.sptk.many .medium_copy
264 ;;
265
266#define BLOCK_SIZE 128*32
267#define blocksize r23
268#define curlen r24
269
270// dest is on 8-byte boundary, src is not. We need to do
271// ld8-ld8, shrp, then st8. Max 8 byte copy per cycle.
272.unaligned_src:
273 .prologue
274 .save ar.pfs, saved_pfs
275 alloc saved_pfs=ar.pfs,3,5,0,8
276 .save ar.lc, saved_lc
277 mov saved_lc=ar.lc
278 .save pr, saved_pr
279 mov saved_pr=pr
280 .body
281.4k_block:
282 mov saved_in0=dst0 // need to save all input arguments
283 mov saved_in2=in2
284 mov blocksize=BLOCK_SIZE
285 ;;
286 cmp.lt p6,p7=blocksize,in2
287 mov saved_in1=src0
288 ;;
289(p6) mov in2=blocksize
290 ;;
291	shr.u	r21=in2,7	// this many cache lines
292	shr.u	r22=in2,4	// number of 16-byte iterations
293 and curlen=15,in2 // copy length after iteration
294 and r30=7,src0 // source alignment
295 ;;
296 cmp.lt p7,p8=1,r21
297 add cnt=-1,r21
298 ;;
299
300 add src_pre_mem=0,src0 // prefetch src pointer
301 add dst_pre_mem=0,dst0 // prefetch dest pointer
302 and src0=-8,src0 // 1st src pointer
303(p7) mov ar.lc = r21
304(p8) mov ar.lc = r0
305 ;;
306 TEXT_ALIGN(32)
3071: lfetch.fault [src_pre_mem], 128
308 lfetch.fault.excl [dst_pre_mem], 128
309 br.cloop.dptk.few 1b
310 ;;
311
312 shladd dst1=r22,3,dst0 // 2nd dest pointer
313 shladd src1=r22,3,src0 // 2nd src pointer
314 cmp.eq p8,p9=r22,r0 // do we really need to loop?
315 cmp.le p6,p7=8,curlen; // have at least 8 byte remaining?
316 add cnt=-1,r22 // ctop iteration adjustment
317 ;;
318EX(.ex_handler, (p9) ld8 r33=[src0],8) // loop primer
319EK(.ex_handler, (p9) ld8 r37=[src1],8)
320(p8) br.dpnt.few .noloop
321 ;;
322
323// The jump address is calculated based on src alignment. The COPYU
324// macro below needs to confine its size to a power of two, so an entry
325// can be calculated using shl instead of an expensive multiply. The
326// size is then hard coded by the following #define to match the
327// actual size. This makes it somewhat tedious when the COPYU macro gets
328// changed and this needs to be adjusted to match.
329#define LOOP_SIZE 6
3301:
331 mov r29=ip // jmp_table thread
332 mov ar.lc=cnt
333 ;;
334 add r29=.jump_table - 1b - (.jmp1-.jump_table), r29
335 shl r28=r30, LOOP_SIZE // jmp_table thread
336 mov ar.ec=2 // loop setup
337 ;;
338 add r29=r29,r28 // jmp_table thread
339 cmp.eq p16,p17=r0,r0
340 ;;
341 mov b6=r29 // jmp_table thread
342 ;;
343 br.cond.sptk.few b6
344
345// for 8-15 byte case
346// We will skip the loop, but need to replicate the side effect
347// that the loop produces.
348.noloop:
349EX(.ex_handler, (p6) ld8 r37=[src1],8)
350 add src0=8,src0
351(p6) shl r25=r30,3
352 ;;
353EX(.ex_handler, (p6) ld8 r27=[src1])
354(p6) shr.u r28=r37,r25
355(p6) sub r26=64,r25
356 ;;
357(p6) shl r27=r27,r26
358 ;;
359(p6) or r21=r28,r27
360
361.unaligned_src_tail:
362/* check if we have more than blocksize to copy, if so go back */
363 cmp.gt p8,p0=saved_in2,blocksize
364 ;;
365(p8) add dst0=saved_in0,blocksize
366(p8) add src0=saved_in1,blocksize
367(p8) sub in2=saved_in2,blocksize
368(p8) br.dpnt .4k_block
369 ;;
370
371/* we have up to 15 bytes to copy in the tail.
372 * part of the work is already done in the jump table code
373 * we are in the following state.
374 * src side:
375 *
376 * xxxxxx xx <----- r21 has xxxxxxxx already
377 * -------- -------- --------
378 * 0 8 16
379 * ^
380 * |
381 * src1
382 *
383 * dst
384 * -------- -------- --------
385 * ^
386 * |
387 * dst1
388 */
389EX(.ex_handler, (p6) st8 [dst1]=r21,8) // more than 8 byte to copy
390(p6) add curlen=-8,curlen // update length
391 mov ar.pfs=saved_pfs
392 ;;
393 mov ar.lc=saved_lc
394 mov pr=saved_pr,-1
395 mov in2=curlen // remaining length
396 mov dst0=dst1 // dest pointer
397 add src0=src1,r30 // forward by src alignment
398 ;;
399
400// 7 byte or smaller.
401.memcpy_short:
402 cmp.le p8,p9 = 1,in2
403 cmp.le p10,p11 = 2,in2
404 cmp.le p12,p13 = 3,in2
405 cmp.le p14,p15 = 4,in2
406 add src1=1,src0 // second src pointer
407 add dst1=1,dst0 // second dest pointer
408 ;;
409
410EX(.ex_handler_short, (p8) ld1 t1=[src0],2)
411EK(.ex_handler_short, (p10) ld1 t2=[src1],2)
412(p9) br.ret.dpnt rp // 0 byte copy
413 ;;
414
415EX(.ex_handler_short, (p8) st1 [dst0]=t1,2)
416EK(.ex_handler_short, (p10) st1 [dst1]=t2,2)
417(p11) br.ret.dpnt rp // 1 byte copy
418
419EX(.ex_handler_short, (p12) ld1 t3=[src0],2)
420EK(.ex_handler_short, (p14) ld1 t4=[src1],2)
421(p13) br.ret.dpnt rp // 2 byte copy
422 ;;
423
424 cmp.le p6,p7 = 5,in2
425 cmp.le p8,p9 = 6,in2
426 cmp.le p10,p11 = 7,in2
427
428EX(.ex_handler_short, (p12) st1 [dst0]=t3,2)
429EK(.ex_handler_short, (p14) st1 [dst1]=t4,2)
430(p15) br.ret.dpnt rp // 3 byte copy
431 ;;
432
433EX(.ex_handler_short, (p6) ld1 t5=[src0],2)
434EK(.ex_handler_short, (p8) ld1 t6=[src1],2)
435(p7) br.ret.dpnt rp // 4 byte copy
436 ;;
437
438EX(.ex_handler_short, (p6) st1 [dst0]=t5,2)
439EK(.ex_handler_short, (p8) st1 [dst1]=t6,2)
440(p9) br.ret.dptk rp // 5 byte copy
441
442EX(.ex_handler_short, (p10) ld1 t7=[src0],2)
443(p11) br.ret.dptk rp // 6 byte copy
444 ;;
445
446EX(.ex_handler_short, (p10) st1 [dst0]=t7,2)
447 br.ret.dptk rp // done all cases
448
449
450/* Align dest to the nearest 8-byte boundary. We know we have at
451 * least 7 bytes to copy, enough to crawl to an 8-byte boundary.
452 * The actual number of bytes to crawl depends on the dest alignment.
453 * 7 bytes or less are taken care of at .memcpy_short
454
455 * src0 - source even index
456 * src1 - source odd index
457 * dst0 - dest even index
458 * dst1 - dest odd index
459 * r30 - distance to 8-byte boundary
460 */
461
462.align_dest:
463 add src1=1,in1 // source odd index
464 cmp.le p7,p0 = 2,r30 // for .align_dest
465 cmp.le p8,p0 = 3,r30 // for .align_dest
466EX(.ex_handler_short, (p6) ld1 t1=[src0],2)
467 cmp.le p9,p0 = 4,r30 // for .align_dest
468 cmp.le p10,p0 = 5,r30
469 ;;
470EX(.ex_handler_short, (p7) ld1 t2=[src1],2)
471EK(.ex_handler_short, (p8) ld1 t3=[src0],2)
472 cmp.le p11,p0 = 6,r30
473EX(.ex_handler_short, (p6) st1 [dst0] = t1,2)
474 cmp.le p12,p0 = 7,r30
475 ;;
476EX(.ex_handler_short, (p9) ld1 t4=[src1],2)
477EK(.ex_handler_short, (p10) ld1 t5=[src0],2)
478EX(.ex_handler_short, (p7) st1 [dst1] = t2,2)
479EK(.ex_handler_short, (p8) st1 [dst0] = t3,2)
480 ;;
481EX(.ex_handler_short, (p11) ld1 t6=[src1],2)
482EK(.ex_handler_short, (p12) ld1 t7=[src0],2)
483 cmp.eq p6,p7=r28,r29
484EX(.ex_handler_short, (p9) st1 [dst1] = t4,2)
485EK(.ex_handler_short, (p10) st1 [dst0] = t5,2)
486 sub in2=in2,r30
487 ;;
488EX(.ex_handler_short, (p11) st1 [dst1] = t6,2)
489EK(.ex_handler_short, (p12) st1 [dst0] = t7)
490 add dst0=in0,r30 // setup arguments
491 add src0=in1,r30
492(p6) br.cond.dptk .aligned_src
493(p7) br.cond.dpnt .unaligned_src
494 ;;
495
496/* main loop body in jump table format */
497#define COPYU(shift) \
4981: \
499EX(.ex_handler, (p16) ld8 r32=[src0],8); /* 1 */ \
500EK(.ex_handler, (p16) ld8 r36=[src1],8); \
501 (p17) shrp r35=r33,r34,shift;; /* 1 */ \
502EX(.ex_handler, (p6) ld8 r22=[src1]); /* common, prime for tail section */ \
503 nop.m 0; \
504 (p16) shrp r38=r36,r37,shift; \
505EX(.ex_handler, (p17) st8 [dst0]=r35,8); /* 1 */ \
506EK(.ex_handler, (p17) st8 [dst1]=r39,8); \
507 br.ctop.dptk.few 1b;; \
508 (p7) add src1=-8,src1; /* back out for <8 byte case */ \
509 shrp r21=r22,r38,shift; /* speculative work */ \
510 br.sptk.few .unaligned_src_tail /* branch out of jump table */ \
511 ;;
512 TEXT_ALIGN(32)
513.jump_table:
514 COPYU(8) // unaligned cases
515.jmp1:
516 COPYU(16)
517 COPYU(24)
518 COPYU(32)
519 COPYU(40)
520 COPYU(48)
521 COPYU(56)
522
523#undef A
524#undef B
525#undef C
526#undef D
527END(memcpy)
528
529/*
530 * Due to lack of local tag support in the gcc 2.x assembler, it is not clear which
531 * instruction failed in the bundle. The exception algorithm is that we
532 * first figure out the faulting address, then detect whether any progress
533 * was made on the copy; if so, we redo the copy from the last known copied
534 * location up to the faulting address (exclusive). In the copy_from_user
535 * case, the remaining bytes in the kernel buffer will be zeroed.
536 *
537 * Take copy_from_user as an example: in the code there are multiple loads
538 * in a bundle, and those loads could span two pages, so the
539 * faulting address is calculated as page_round_down(max(src0, src1)).
540 * This is based on knowledge that if we can access one byte in a page, we
541 * can access any byte in that page.
542 *
543 * predicates used in the exception handler:
544 * p6-p7: direction
545 * p10-p11: src faulting addr calculation
546 * p12-p13: dst faulting addr calculation
547 */
548
549#define A r19
550#define B r20
551#define C r21
552#define D r22
553#define F r28
554
555#define memset_arg0 r32
556#define memset_arg2 r33
557
558#define saved_retval loc0
559#define saved_rtlink loc1
560#define saved_pfs_stack loc2
561
562.ex_hndlr_s:
563 add src0=8,src0
564 br.sptk .ex_handler
565 ;;
566.ex_hndlr_d:
567 add dst0=8,dst0
568 br.sptk .ex_handler
569 ;;
570.ex_hndlr_lcpy_1:
571 mov src1=src_pre_mem
572 mov dst1=dst_pre_mem
573 cmp.gtu p10,p11=src_pre_mem,saved_in1
574 cmp.gtu p12,p13=dst_pre_mem,saved_in0
575 ;;
576(p10) add src0=8,saved_in1
577(p11) mov src0=saved_in1
578(p12) add dst0=8,saved_in0
579(p13) mov dst0=saved_in0
580 br.sptk .ex_handler
581.ex_handler_lcpy:
582	// in the line_copy block, the preload addresses should always be ahead
583	// of the other two src/dst pointers. Furthermore, src1/dst1 should
584	// always be ahead of src0/dst0.
585 mov src1=src_pre_mem
586 mov dst1=dst_pre_mem
587.ex_handler:
588 mov pr=saved_pr,-1 // first restore pr, lc, and pfs
589 mov ar.lc=saved_lc
590 mov ar.pfs=saved_pfs
591 ;;
592.ex_handler_short: // faults that occurred in these sections didn't change pr, lc, pfs
593 cmp.ltu p6,p7=saved_in0, saved_in1 // get the copy direction
594 cmp.ltu p10,p11=src0,src1
595 cmp.ltu p12,p13=dst0,dst1
596 fcmp.eq p8,p0=f6,f0 // is it memcpy?
597 mov tmp = dst0
598 ;;
599(p11) mov src1 = src0 // pick the larger of the two
600(p13) mov dst0 = dst1 // make dst0 the smaller one
601(p13) mov dst1 = tmp // and dst1 the larger one
602 ;;
603(p6) dep F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
604(p7) dep F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
605 ;;
606(p6) cmp.le p14,p0=dst0,saved_in0 // no progress has been made on store
607(p7) cmp.le p14,p0=src0,saved_in1 // no progress has been made on load
608 mov retval=saved_in2
609(p8) ld1 tmp=[src1] // force an oops for memcpy call
610(p8) st1 [dst1]=r0 // force an oops for memcpy call
611(p14) br.ret.sptk.many rp
612
613/*
614 * The remaining byte to copy is calculated as:
615 *
616 * A = (faulting_addr - orig_src) -> len to faulting ld address
617 * or
618 * (faulting_addr - orig_dst) -> len to faulting st address
619 * B = (cur_dst - orig_dst) -> len copied so far
620 * C = A - B -> len need to be copied
621 * D = orig_len - A -> len need to be zeroed
622 */
623(p6) sub A = F, saved_in0
624(p7) sub A = F, saved_in1
625 clrrrb
626 ;;
627 alloc saved_pfs_stack=ar.pfs,3,3,3,0
628 sub B = dst0, saved_in0 // how many byte copied so far
629 ;;
630 sub C = A, B
631 sub D = saved_in2, A
632 ;;
633	cmp.gt	p8,p0=C,r0		// any bytes left to copy?
634 add memset_arg0=saved_in0, A
635(p6) mov memset_arg2=0 // copy_to_user should not call memset
636(p7) mov memset_arg2=D // copy_from_user need to have kbuf zeroed
637 mov r8=0
638 mov saved_retval = D
639 mov saved_rtlink = b0
640
641 add out0=saved_in0, B
642 add out1=saved_in1, B
643 mov out2=C
644(p8) br.call.sptk.few b0=__copy_user // recursive call
645 ;;
646
647 add saved_retval=saved_retval,r8 // above might return non-zero value
648	cmp.gt	p8,p0=memset_arg2,r0	// anything left to zero?
649 mov out0=memset_arg0 // *s
650 mov out1=r0 // c
651 mov out2=memset_arg2 // n
652(p8) br.call.sptk.few b0=memset
653 ;;
654
655 mov retval=saved_retval
656 mov ar.pfs=saved_pfs_stack
657 mov b0=saved_rtlink
658 br.ret.sptk.many rp
659
660/* end of McKinley specific optimization */
661END(__copy_user)
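
The A/B/C/D bookkeeping in the exception path above can be illustrated with a small C sketch for the load-fault (copy_from_user) direction; the names mirror the comment and are not real kernel symbols (for a store fault, A would be measured from the original destination instead):

/* Given the page-rounded faulting address, work out how many bytes can
 * still be copied and how many must be zeroed in the kernel buffer. */
struct recovery_plan {
	unsigned long recopy;	/* C: bytes to copy again, up to the fault */
	unsigned long zero;	/* D: bytes left to zero (copy_from_user)  */
};

static struct recovery_plan plan_recovery(unsigned long fault_addr,
					  unsigned long orig_src,
					  unsigned long orig_dst,
					  unsigned long cur_dst,
					  unsigned long orig_len)
{
	unsigned long A = fault_addr - orig_src;	/* len to faulting ld address */
	unsigned long B = cur_dst - orig_dst;		/* len copied so far          */
	struct recovery_plan r = { A - B, orig_len - A };	/* C and D            */
	return r;
}
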
diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S
new file mode 100644
index 000000000000..bd8cf907fe22
--- /dev/null
+++ b/arch/ia64/lib/memset.S
@@ -0,0 +1,362 @@
1/* Optimized version of the standard memset() function.
2
3 Copyright (c) 2002 Hewlett-Packard Co/CERN
4 Sverre Jarp <Sverre.Jarp@cern.ch>
5
6 Return: dest
7
8 Inputs:
9 in0: dest
10 in1: value
11 in2: count
12
13   The algorithm is fairly straightforward: set byte by byte until we
14   get to a 16B-aligned address, then loop on 128B chunks using an
15   early store as prefetching, then loop on 32B chunks, then clear remaining
16   words, finally clear remaining bytes.
17 Since a stf.spill f0 can store 16B in one go, we use this instruction
18 to get peak speed when value = 0. */
19
20#include <asm/asmmacro.h>
21#undef ret
22
23#define dest in0
24#define value in1
25#define cnt in2
26
27#define tmp r31
28#define save_lc r30
29#define ptr0 r29
30#define ptr1 r28
31#define ptr2 r27
32#define ptr3 r26
33#define ptr9 r24
34#define loopcnt r23
35#define linecnt r22
36#define bytecnt r21
37
38#define fvalue f6
39
40// This routine uses only scratch predicate registers (p6 - p15)
41#define p_scr p6 // default register for same-cycle branches
42#define p_nz p7
43#define p_zr p8
44#define p_unalgn p9
45#define p_y p11
46#define p_n p12
47#define p_yy p13
48#define p_nn p14
49
50#define MIN1 15
51#define MIN1P1HALF 8
52#define LINE_SIZE 128
53#define LSIZE_SH 7 // shift amount
54#define PREF_AHEAD 8
55
56GLOBAL_ENTRY(memset)
57{ .mmi
58 .prologue
59 alloc tmp = ar.pfs, 3, 0, 0, 0
60 .body
61 lfetch.nt1 [dest] //
62 .save ar.lc, save_lc
63 mov.i save_lc = ar.lc
64} { .mmi
65 mov ret0 = dest // return value
66 cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero
67 cmp.eq p_scr, p0 = cnt, r0
68;; }
69{ .mmi
70 and ptr2 = -(MIN1+1), dest // aligned address
71 and tmp = MIN1, dest // prepare to check for correct alignment
72 tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U)
73} { .mib
74 mov ptr1 = dest
75 mux1 value = value, @brcst // create 8 identical bytes in word
76(p_scr) br.ret.dpnt.many rp // return immediately if count = 0
77;; }
78{ .mib
79 cmp.ne p_unalgn, p0 = tmp, r0 //
80} { .mib
81 sub bytecnt = (MIN1+1), tmp // NB: # of bytes to move is 1 higher than loopcnt
82 cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task?
83(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
84;; }
85{ .mmi
86(p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment
87(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment
88(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ?
89;; }
90{ .mib
91(p_y) add cnt = -8, cnt //
92(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ?
93} { .mib
94(p_y) st8 [ptr2] = value,-4 //
95(p_n) add ptr2 = 4, ptr2 //
96;; }
97{ .mib
98(p_yy) add cnt = -4, cnt //
99(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ?
100} { .mib
101(p_yy) st4 [ptr2] = value,-2 //
102(p_nn) add ptr2 = 2, ptr2 //
103;; }
104{ .mmi
105 mov tmp = LINE_SIZE+1 // for compare
106(p_y) add cnt = -2, cnt //
107(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ?
108} { .mmi
109 setf.sig fvalue=value // transfer value to FLP side
110(p_y) st2 [ptr2] = value,-1 //
111(p_n) add ptr2 = 1, ptr2 //
112;; }
113
114{ .mmi
115(p_yy) st1 [ptr2] = value //
116 cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task?
117} { .mbb
118(p_yy) add cnt = -1, cnt //
119(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few
120;; }
121
122{ .mib
123 nop.m 0
124 shr.u linecnt = cnt, LSIZE_SH
125(p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill
126;; }
127
128 TEXT_ALIGN(32) // --------------------- // L1A: store ahead into cache lines; fill later
129{ .mmi
130 and tmp = -(LINE_SIZE), cnt // compute end of range
131 mov ptr9 = ptr1 // used for prefetching
132 and cnt = (LINE_SIZE-1), cnt // remainder
133} { .mmi
134 mov loopcnt = PREF_AHEAD-1 // default prefetch loop
135 cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
136;; }
137{ .mmi
138(p_scr) add loopcnt = -1, linecnt //
139 add ptr2 = 8, ptr1 // start of stores (beyond prefetch stores)
140 add ptr1 = tmp, ptr1 // first address beyond total range
141;; }
142{ .mmi
143 add tmp = -1, linecnt // next loop count
144 mov.i ar.lc = loopcnt //
145;; }
146.pref_l1a:
147{ .mib
148 stf8 [ptr9] = fvalue, 128 // Do stores one cache line apart
149 nop.i 0
150 br.cloop.dptk.few .pref_l1a
151;; }
152{ .mmi
153 add ptr0 = 16, ptr2 // Two stores in parallel
154 mov.i ar.lc = tmp //
155;; }
156.l1ax:
157 { .mmi
158 stf8 [ptr2] = fvalue, 8
159 stf8 [ptr0] = fvalue, 8
160 ;; }
161 { .mmi
162 stf8 [ptr2] = fvalue, 24
163 stf8 [ptr0] = fvalue, 24
164 ;; }
165 { .mmi
166 stf8 [ptr2] = fvalue, 8
167 stf8 [ptr0] = fvalue, 8
168 ;; }
169 { .mmi
170 stf8 [ptr2] = fvalue, 24
171 stf8 [ptr0] = fvalue, 24
172 ;; }
173 { .mmi
174 stf8 [ptr2] = fvalue, 8
175 stf8 [ptr0] = fvalue, 8
176 ;; }
177 { .mmi
178 stf8 [ptr2] = fvalue, 24
179 stf8 [ptr0] = fvalue, 24
180 ;; }
181 { .mmi
182 stf8 [ptr2] = fvalue, 8
183 stf8 [ptr0] = fvalue, 32
184 cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
185 ;; }
186{ .mmb
187 stf8 [ptr2] = fvalue, 24
188(p_scr) stf8 [ptr9] = fvalue, 128
189 br.cloop.dptk.few .l1ax
190;; }
191{ .mbb
192 cmp.le p_scr, p0 = 8, cnt // just a few bytes left ?
193(p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2
194 br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3
195;; }
196
197 TEXT_ALIGN(32)
198.l1b: // ------------------------------------ // L1B: store ahead into cache lines; fill later
199{ .mmi
200 and tmp = -(LINE_SIZE), cnt // compute end of range
201 mov ptr9 = ptr1 // used for prefetching
202 and cnt = (LINE_SIZE-1), cnt // remainder
203} { .mmi
204 mov loopcnt = PREF_AHEAD-1 // default prefetch loop
205 cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
206;; }
207{ .mmi
208(p_scr) add loopcnt = -1, linecnt
209 add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores)
210 add ptr1 = tmp, ptr1 // first address beyond total range
211;; }
212{ .mmi
213 add tmp = -1, linecnt // next loop count
214 mov.i ar.lc = loopcnt
215;; }
216.pref_l1b:
217{ .mib
218 stf.spill [ptr9] = f0, 128 // Do stores one cache line apart
219 nop.i 0
220 br.cloop.dptk.few .pref_l1b
221;; }
222{ .mmi
223 add ptr0 = 16, ptr2 // Two stores in parallel
224 mov.i ar.lc = tmp
225;; }
226.l1bx:
227 { .mmi
228 stf.spill [ptr2] = f0, 32
229 stf.spill [ptr0] = f0, 32
230 ;; }
231 { .mmi
232 stf.spill [ptr2] = f0, 32
233 stf.spill [ptr0] = f0, 32
234 ;; }
235 { .mmi
236 stf.spill [ptr2] = f0, 32
237 stf.spill [ptr0] = f0, 64
238 cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
239 ;; }
240{ .mmb
241 stf.spill [ptr2] = f0, 32
242(p_scr) stf.spill [ptr9] = f0, 128
243 br.cloop.dptk.few .l1bx
244;; }
245{ .mib
246 cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
247(p_scr) br.cond.dpnt.many .move_bytes_from_alignment //
248;; }
249
250.fraction_of_line:
251{ .mib
252 add ptr2 = 16, ptr1
253 shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32
254;; }
255{ .mib
256 cmp.eq p_scr, p0 = loopcnt, r0
257 add loopcnt = -1, loopcnt
258(p_scr) br.cond.dpnt.many .store_words
259;; }
260{ .mib
261 and cnt = 0x1f, cnt // compute the remaining cnt
262 mov.i ar.lc = loopcnt
263;; }
264 TEXT_ALIGN(32)
265.l2: // ------------------------------------ // L2A: store 32B in 2 cycles
266{ .mmb
267 stf8 [ptr1] = fvalue, 8
268 stf8 [ptr2] = fvalue, 8
269;; } { .mmb
270 stf8 [ptr1] = fvalue, 24
271 stf8 [ptr2] = fvalue, 24
272 br.cloop.dptk.many .l2
273;; }
274.store_words:
275{ .mib
276 cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
277(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch
278;; }
279
280{ .mmi
281 stf8 [ptr1] = fvalue, 8 // store
282 cmp.le p_y, p_n = 16, cnt
283 add cnt = -8, cnt // subtract
284;; }
285{ .mmi
286(p_y) stf8 [ptr1] = fvalue, 8 // store
287(p_y) cmp.le.unc p_yy, p_nn = 16, cnt
288(p_y) add cnt = -8, cnt // subtract
289;; }
290{ .mmi // store
291(p_yy) stf8 [ptr1] = fvalue, 8
292(p_yy) add cnt = -8, cnt // subtract
293;; }
294
295.move_bytes_from_alignment:
296{ .mib
297 cmp.eq p_scr, p0 = cnt, r0
298 tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ?
299(p_scr) br.cond.dpnt.few .restore_and_exit
300;; }
301{ .mib
302(p_y) st4 [ptr1] = value,4
303 tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ?
304;; }
305{ .mib
306(p_yy) st2 [ptr1] = value,2
307 tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ?
308;; }
309
310{ .mib
311(p_y) st1 [ptr1] = value
312;; }
313.restore_and_exit:
314{ .mib
315 nop.m 0
316 mov.i ar.lc = save_lc
317 br.ret.sptk.many rp
318;; }
319
320.move_bytes_unaligned:
321{ .mmi
322 .pred.rel "mutex",p_y, p_n
323 .pred.rel "mutex",p_yy, p_nn
324(p_n) cmp.le p_yy, p_nn = 4, cnt
325(p_y) cmp.le p_yy, p_nn = 5, cnt
326(p_n) add ptr2 = 2, ptr1
327} { .mmi
328(p_y) add ptr2 = 3, ptr1
329(p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte [15, 14 (or less) left]
330(p_y) add cnt = -1, cnt
331;; }
332{ .mmi
333(p_yy) cmp.le.unc p_y, p0 = 8, cnt
334 add ptr3 = ptr1, cnt // prepare last store
335 mov.i ar.lc = save_lc
336} { .mmi
337(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
338(p_yy) st2	[ptr2] = value, 4	// fill 2 (aligned) bytes [11, 10 (or less) left]
339(p_yy) add cnt = -4, cnt
340;; }
341{ .mmi
342(p_y) cmp.le.unc p_yy, p0 = 8, cnt
343 add ptr3 = -1, ptr3 // last store
344 tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ?
345} { .mmi
346(p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
347(p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [7, 6 (or less) left]
348(p_y) add cnt = -4, cnt
349;; }
350{ .mmi
351(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
352(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [3, 2 (or less) left]
353 tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ?
354} { .mmi
355(p_yy) add cnt = -4, cnt
356;; }
357{ .mmb
358(p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes
359(p_y) st1 [ptr3] = value // fill last byte (using ptr3)
360 br.ret.sptk.many rp
361}
362END(memset)
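
A scalar C outline of the store order described in the header comment above (crawl byte by byte to alignment, large chunks, 8-byte words, then tail bytes); the prefetch-ahead stores, the dual store pointers and the stf.spill zero path are deliberately omitted, and memset_sketch is an invented name:

#include <stddef.h>
#include <stdint.h>

/* Minimal sketch of memset.S's chunking order; not the optimized routine. */
static void *memset_sketch(void *dest, int value, size_t count)
{
	unsigned char *p = dest;
	uint64_t v = 0x0101010101010101ULL * (unsigned char)value; /* like mux1 @brcst */

	while (count && ((uintptr_t)p & 15)) {	/* byte by byte to 16B alignment */
		*p++ = (unsigned char)value;
		count--;
	}
	while (count >= 32) {			/* 32-byte chunks (128B lines in the asm) */
		uint64_t *q = (uint64_t *)p;
		q[0] = v; q[1] = v; q[2] = v; q[3] = v;
		p += 32;
		count -= 32;
	}
	while (count >= 8) {			/* remaining 8-byte words */
		*(uint64_t *)p = v;
		p += 8;
		count -= 8;
	}
	while (count--)				/* remaining bytes */
		*p++ = (unsigned char)value;
	return dest;
}
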
diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S
new file mode 100644
index 000000000000..e0cdac0a85b8
--- /dev/null
+++ b/arch/ia64/lib/strlen.S
@@ -0,0 +1,192 @@
1/*
2 *
3 * Optimized version of the standard strlen() function
4 *
5 *
6 * Inputs:
7 * in0 address of string
8 *
9 * Outputs:
10 * ret0 the number of characters in the string (0 if empty string)
11 * does not count the \0
12 *
13 * Copyright (C) 1999, 2001 Hewlett-Packard Co
14 * Stephane Eranian <eranian@hpl.hp.com>
15 *
16 * 09/24/99 S.Eranian add speculation recovery code
17 */
18
19#include <asm/asmmacro.h>
20
21//
22//
23// This is an enhanced version of the basic strlen. It includes a combination
24// of compute zero index (czx), parallel comparisons, speculative loads and
25// loop unrolling using rotating registers.
26//
27// General Ideas about the algorithm:
28// The goal is to look at the string in chunks of 8 bytes,
29// so we need to do a few extra checks at the beginning because the
30// string may not be 8-byte aligned. In this case we load the 8-byte
31// quantity which includes the start of the string and mask the unused
32// bytes with 0xff to avoid confusing czx.
33// We use speculative loads and software pipelining to hide memory
34// latency and do read ahead safely. This way we defer any exception.
35//
36// Because we don't want the kernel to be relying on particular
37// settings of the DCR register, we provide recovery code in case
38// speculation fails. The recovery code is going to "redo" the work using
39// only normal loads. If we still get a fault then we generate a
40// kernel panic. Otherwise we return the strlen as usual.
41//
42// The fact that speculation may fail can be caused, for instance, by
43// the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
44// a NaT bit will be set if the translation is not present. The normal
45// load, on the other hand, will cause the translation to be inserted
46// if the mapping exists.
47//
48// It should be noted that we execute recovery code only when we need
49// to use the data that has been speculatively loaded: we don't execute
50// recovery code on pure read ahead data.
51//
52// Remarks:
53// - the cmp r0,r0 is used as a fast way to initialize a predicate
54// register to 1. This is required to make sure that we get the parallel
55// compare correct.
56//
57// - we don't use the epilogue counter to exit the loop but we need to set
58// it to zero beforehand.
59//
60// - after the loop we must test for NaT values because neither the
61// czx nor cmp instruction raises a NaT consumption fault. We must be
62// careful not to look too far for a NaT we don't care about.
63// For instance we don't need to look at a NaT in val2 if the zero byte
64// was in val1.
65//
66// - Clearly performance tuning is required.
67//
68//
69//
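
The same chunked scan, minus the speculation and the rotating-register pipeline, can be sketched in standalone C. The bit trick below stands in for czx1.r; every name is illustrative only and nothing here is defined by the kernel.

	#include <stddef.h>
	#include <stdint.h>

	/* Illustrative sketch: scan aligned 8-byte words, masking the bytes
	 * that precede the string with 0xff so they cannot look like the
	 * terminating zero (little-endian byte order, as on ia64). */
	static size_t strlen_by_words(const char *s)
	{
		uintptr_t misalign = (uintptr_t)s & 7;
		const uint64_t *p = (const uint64_t *)((uintptr_t)s - misalign);
		uint64_t word = *p++;

		if (misalign)			/* plays the role of "or v[1]=v[1],mask" */
			word |= ~0ULL >> (64 - 8 * misalign);

		for (;;) {
			/* Zero-byte detector, standing in for czx1.r. */
			uint64_t zero = (word - 0x0101010101010101ULL) &
					~word & 0x8080808080808080ULL;
			if (zero) {
				size_t byte = (size_t)__builtin_ctzll(zero) >> 3;
				return (size_t)(((const char *)(p - 1) + byte) - s);
			}
			word = *p++;		/* stays within mapped, aligned words */
		}
	}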
70#define saved_pfs r11
71#define tmp r10
72#define base r16
73#define orig r17
74#define saved_pr r18
75#define src r19
76#define mask r20
77#define val r21
78#define val1 r22
79#define val2 r23
80
81GLOBAL_ENTRY(strlen)
82 .prologue
83 .save ar.pfs, saved_pfs
84 alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
85
86 .rotr v[2], w[2] // declares our 4 aliases
87
88 extr.u tmp=in0,0,3 // tmp=least significant 3 bits
89	mov orig=in0		// keep track of initial byte address
90	dep src=0,in0,0,3	// src=8-byte-aligned in0 address
91 .save pr, saved_pr
92 mov saved_pr=pr // preserve predicates (rotation)
93 ;;
94
95 .body
96
97 ld8 v[1]=[src],8 // must not speculate: can fail here
98 shl tmp=tmp,3 // multiply by 8bits/byte
99 mov mask=-1 // our mask
100 ;;
101 ld8.s w[1]=[src],8 // speculatively load next
102 cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and
103 sub tmp=64,tmp // how many bits to shift our mask on the right
104 ;;
105 shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part
106 mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs)
107 ;;
108 add base=-16,src // keep track of aligned base
109 or v[1]=v[1],mask // now we have a safe initial byte pattern
110 ;;
1111:
112 ld8.s v[0]=[src],8 // speculatively load next
113 czx1.r val1=v[1] // search 0 byte from right
114	czx1.r val2=w[1]	// search 0 byte from right in the following 8 bytes
115 ;;
116 ld8.s w[0]=[src],8 // speculatively load next to next
117 cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
118	cmp.eq.and p6,p0=8,val2	// p6 = p6 and val2==8
119(p6) br.wtop.dptk 1b // loop until p6 == 0
120 ;;
121 //
122	// We must try the recovery code iff
123 // val1_is_nat || (val1==8 && val2_is_nat)
124 //
125 // XXX Fixme
126 // - there must be a better way of doing the test
127 //
128	cmp.eq p8,p9=8,val1	// p8 = no zero byte in val1 (disambiguate)
129 tnat.nz p6,p7=val1 // test NaT on val1
130(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT
131 ;;
132 //
133	// if we come here, p7 is true, i.e., initialized for the parallel (//) compare
134 //
135	cmp.eq.and p7,p0=8,val1	// val1==8?
136	tnat.nz.and p7,p0=val2	// test NaT on val2
137(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT
138 ;;
139(p8) mov val1=val2 // the other test got us out of the loop
140(p8) adds src=-16,src // correct position when 3 ahead
141(p9) adds src=-24,src // correct position when 4 ahead
142 ;;
143	sub ret0=src,orig	// distance from origin
144 sub tmp=8,val1 // which byte in word
145 mov pr=saved_pr,0xffffffffffff0000
146 ;;
147 sub ret0=ret0,tmp // adjust
148 mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
149 br.ret.sptk.many rp // end of normal execution
150
151 //
152 // Outlined recovery code when speculation failed
153 //
154 // This time we don't use speculation and rely on the normal exception
155	// mechanism. That's why the loop is not as good as the previous one
156 // because read ahead is not possible
157 //
158 // IMPORTANT:
159 // Please note that in the case of strlen() as opposed to strlen_user()
160 // we don't use the exception mechanism, as this function is not
161 // supposed to fail. If that happens it means we have a bug and the
162	// code will cause a kernel fault.
163 //
164 // XXX Fixme
165 // - today we restart from the beginning of the string instead
166 // of trying to continue where we left off.
167 //
168.recover:
169 ld8 val=[base],8 // will fail if unrecoverable fault
170 ;;
171 or val=val,mask // remask first bytes
172 cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop
173 ;;
174 //
175 // ar.ec is still zero here
176 //
1772:
178(p6) ld8 val=[base],8 // will fail if unrecoverable fault
179 ;;
180 czx1.r val1=val // search 0 byte from right
181 ;;
182 cmp.eq p6,p0=8,val1 // val1==8 ?
183(p6) br.wtop.dptk 2b // loop until p6 == 0
184 ;; // (avoid WAW on p63)
185 sub ret0=base,orig // distance from base
186 sub tmp=8,val1
187 mov pr=saved_pr,0xffffffffffff0000
188 ;;
189 sub ret0=ret0,tmp // length=now - back -1
190 mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
191 br.ret.sptk.many rp // end of successful recovery code
192END(strlen)
diff --git a/arch/ia64/lib/strlen_user.S b/arch/ia64/lib/strlen_user.S
new file mode 100644
index 000000000000..c71eded4285e
--- /dev/null
+++ b/arch/ia64/lib/strlen_user.S
@@ -0,0 +1,198 @@
1/*
2 * Optimized version of the strlen_user() function
3 *
4 * Inputs:
5 * in0 address of buffer
6 *
7 * Outputs:
8 * ret0 0 in case of fault, strlen(buffer)+1 otherwise
9 *
10 * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
11 * David Mosberger-Tang <davidm@hpl.hp.com>
12 * Stephane Eranian <eranian@hpl.hp.com>
13 *
14 * 01/19/99 S.Eranian heavily enhanced version (see details below)
15 * 09/24/99 S.Eranian added speculation recovery code
16 */
17
18#include <asm/asmmacro.h>
19
20//
21// int strlen_user(char *)
22// ------------------------
23// Returns:
24// - length of string + 1
25// - 0 in case an exception is raised
26//
27// This is an enhanced version of the basic strlen_user. It includes a
28// combination of compute zero index (czx), parallel comparisons, speculative
29// loads and loop unroll using rotating registers.
30//
31// General Ideas about the algorithm:
32// The goal is to look at the string in chunks of 8 bytes,
33// so we need to do a few extra checks at the beginning because the
34// string may not be 8-byte aligned. In this case we load the 8-byte
35// quantity which includes the start of the string and mask the unused
36// bytes with 0xff to avoid confusing czx.
37// We use speculative loads and software pipelining to hide memory
38// latency and do read ahead safely. This way we defer any exception.
39//
40// Because we don't want the kernel to be relying on particular
41// settings of the DCR register, we provide recovery code in case
42// speculation fails. The recovery code is going to "redo" the work using
43// only normal loads. If we still get a fault then we return an
44// error (ret0=0). Otherwise we return the strlen+1 as usual.
45// The fact that speculation may fail can be caused, for instance, by
46// the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
47// a NaT bit will be set if the translation is not present. The normal
48// load, on the other hand, will cause the translation to be inserted
49// if the mapping exists.
50//
51// It should be noted that we execute recovery code only when we need
52// to use the data that has been speculatively loaded: we don't execute
53// recovery code on pure read ahead data.
54//
55// Remarks:
56// - the cmp r0,r0 is used as a fast way to initialize a predicate
57// register to 1. This is required to make sure that we get the parallel
58// compare correct.
59//
60// - we don't use the epilogue counter to exit the loop but we need to set
61// it to zero beforehand.
62//
63// - after the loop we must test for NaT values because neither the
64// czx nor cmp instruction raises a NaT consumption fault. We must be
65// careful not to look too far for a NaT we don't care about.
66// For instance we don't need to look at a NaT in val2 if the zero byte
67// was in val1.
68//
69// - Clearly performance tuning is required.
70//
71
72#define saved_pfs r11
73#define tmp r10
74#define base r16
75#define orig r17
76#define saved_pr r18
77#define src r19
78#define mask r20
79#define val r21
80#define val1 r22
81#define val2 r23
82
83GLOBAL_ENTRY(__strlen_user)
84 .prologue
85 .save ar.pfs, saved_pfs
86 alloc saved_pfs=ar.pfs,11,0,0,8
87
88 .rotr v[2], w[2] // declares our 4 aliases
89
90 extr.u tmp=in0,0,3 // tmp=least significant 3 bits
91	mov orig=in0		// keep track of initial byte address
92	dep src=0,in0,0,3	// src=8-byte-aligned in0 address
93 .save pr, saved_pr
94 mov saved_pr=pr // preserve predicates (rotation)
95 ;;
96
97 .body
98
99 ld8.s v[1]=[src],8 // load the initial 8bytes (must speculate)
100 shl tmp=tmp,3 // multiply by 8bits/byte
101 mov mask=-1 // our mask
102 ;;
103 ld8.s w[1]=[src],8 // load next 8 bytes in 2nd pipeline
104 cmp.eq p6,p0=r0,r0 // sets p6 (required because of // cmp.and)
105 sub tmp=64,tmp // how many bits to shift our mask on the right
106 ;;
107 shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part
108 mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs)
109 ;;
110 add base=-16,src // keep track of aligned base
111 chk.s v[1], .recover // if already NaT, then directly skip to recover
112 or v[1]=v[1],mask // now we have a safe initial byte pattern
113 ;;
1141:
115 ld8.s v[0]=[src],8 // speculatively load next
116 czx1.r val1=v[1] // search 0 byte from right
117	czx1.r val2=w[1]	// search 0 byte from right in the following 8 bytes
118 ;;
119 ld8.s w[0]=[src],8 // speculatively load next to next
120 cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
121	cmp.eq.and p6,p0=8,val2	// p6 = p6 and val2==8
122(p6) br.wtop.dptk.few 1b // loop until p6 == 0
123 ;;
124 //
125	// We must try the recovery code iff
126 // val1_is_nat || (val1==8 && val2_is_nat)
127 //
128 // XXX Fixme
129 // - there must be a better way of doing the test
130 //
131	cmp.eq p8,p9=8,val1	// p8 = no zero byte in val1 (disambiguate)
132 tnat.nz p6,p7=val1 // test NaT on val1
133(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT
134 ;;
135 //
136	// if we come here, p7 is true, i.e., initialized for the parallel (//) compare
137 //
138	cmp.eq.and p7,p0=8,val1	// val1==8?
139	tnat.nz.and p7,p0=val2	// test NaT on val2
140(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT
141 ;;
142(p8) mov val1=val2 // val2 contains the value
143(p8) adds src=-16,src // correct position when 3 ahead
144(p9) adds src=-24,src // correct position when 4 ahead
145 ;;
146 sub ret0=src,orig // distance from origin
147 sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1
148 mov pr=saved_pr,0xffffffffffff0000
149 ;;
150 sub ret0=ret0,tmp // length=now - back -1
151 mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
152 br.ret.sptk.many rp // end of normal execution
153
154 //
155 // Outlined recovery code when speculation failed
156 //
157 // This time we don't use speculation and rely on the normal exception
158	// mechanism. That's why the loop is not as good as the previous one
159 // because read ahead is not possible
160 //
161 // XXX Fixme
162 // - today we restart from the beginning of the string instead
163 // of trying to continue where we left off.
164 //
165.recover:
166 EX(.Lexit1, ld8 val=[base],8) // load the initial bytes
167 ;;
168 or val=val,mask // remask first bytes
169 cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop
170 ;;
171 //
172 // ar.ec is still zero here
173 //
1742:
175 EX(.Lexit1, (p6) ld8 val=[base],8)
176 ;;
177 czx1.r val1=val // search 0 byte from right
178 ;;
179 cmp.eq p6,p0=8,val1 // val1==8 ?
180(p6) br.wtop.dptk.few 2b // loop until p6 == 0
181 ;;
182 sub ret0=base,orig // distance from base
183 sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1
184 mov pr=saved_pr,0xffffffffffff0000
185 ;;
186 sub ret0=ret0,tmp // length=now - back -1
187 mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
188 br.ret.sptk.many rp // end of successful recovery code
189
190 //
191 // We failed even on the normal load (called from exception handler)
192 //
193.Lexit1:
194 mov ret0=0
195 mov pr=saved_pr,0xffffffffffff0000
196 mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
197 br.ret.sptk.many rp
198END(__strlen_user)
diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S
new file mode 100644
index 000000000000..a504381f31eb
--- /dev/null
+++ b/arch/ia64/lib/strncpy_from_user.S
@@ -0,0 +1,44 @@
1/*
2 * Just like strncpy() except that if a fault occurs during copying,
3 * -EFAULT is returned.
4 *
5 * Inputs:
6 * in0: address of destination buffer
7 * in1: address of string to be copied
8 * in2: length of buffer in bytes
9 * Outputs:
10 * r8: -EFAULT in case of fault or number of bytes copied if no fault
11 *
12 * Copyright (C) 1998-2001 Hewlett-Packard Co
13 * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com>
14 *
15 * 00/03/06 D. Mosberger Fixed to return proper return value (bug found
16 * by Andreas Schwab <schwab@suse.de>).
17 */
18
19#include <asm/asmmacro.h>
20
21GLOBAL_ENTRY(__strncpy_from_user)
22 alloc r2=ar.pfs,3,0,0,0
23 mov r8=0
24 mov r9=in1
25 ;;
26 add r10=in1,in2
27 cmp.eq p6,p0=r0,in2
28(p6) br.ret.spnt.many rp
29
30 // XXX braindead copy loop---this needs to be optimized
31.Loop1:
32 EX(.Lexit, ld1 r8=[in1],1)
33 ;;
34 EX(.Lexit, st1 [in0]=r8,1)
35 cmp.ne p6,p7=r8,r0
36 ;;
37(p6) cmp.ne.unc p8,p0=in1,r10
38(p8) br.cond.dpnt.few .Loop1
39 ;;
40(p6) mov r8=in2 // buffer filled up---return buffer length
41(p7) sub r8=in1,r9,1 // return string length (excluding NUL character)
42[.Lexit:]
43 br.ret.sptk.many rp
44END(__strncpy_from_user)
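
In plain C, with the fault handling hidden behind a hypothetical load_user_byte() helper (the EX()-protected ld1 plays that role in the assembly), the loop above amounts to this sketch:

	/* Hypothetical sketch only; load_user_byte() is a made-up stand-in
	 * for the EX()-protected user load and returns non-zero on a fault. */
	static long strncpy_from_user_sketch(char *dst, const char *src, long n)
	{
		long i;

		for (i = 0; i < n; i++) {
			char c;

			if (load_user_byte(&c, src + i))
				return -EFAULT;	/* fault while reading user memory */
			dst[i] = c;
			if (c == '\0')
				return i;	/* string length, excluding the NUL */
		}
		return n;			/* buffer filled up: return buffer length */
	}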
diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S
new file mode 100644
index 000000000000..d09066b1e49d
--- /dev/null
+++ b/arch/ia64/lib/strnlen_user.S
@@ -0,0 +1,45 @@
1/*
2 * Returns 0 if an exception occurs before the NUL or the supplied limit (N)
3 * is reached, a value greater than N if the string is longer than the limit,
4 * else strlen(buffer) + 1 (i.e., including the NUL).
5 *
6 * Inputs:
7 * in0: address of buffer
8 * in1: string length limit N
9 * Outputs:
10 * r8: 0 in case of fault, strlen(buffer)+1 otherwise
11 *
12 * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
13 */
14
15#include <asm/asmmacro.h>
16
17GLOBAL_ENTRY(__strnlen_user)
18 .prologue
19 alloc r2=ar.pfs,2,0,0,0
20 .save ar.lc, r16
21 mov r16=ar.lc // preserve ar.lc
22
23 .body
24
25 add r3=-1,in1
26 ;;
27 mov ar.lc=r3
28 mov r9=0
29 ;;
30 // XXX braindead strlen loop---this needs to be optimized
31.Loop1:
32 EXCLR(.Lexit, ld1 r8=[in0],1)
33 add r9=1,r9
34 ;;
35 cmp.eq p6,p0=r8,r0
36(p6) br.cond.dpnt .Lexit
37 br.cloop.dptk.few .Loop1
38
39 add r9=1,in1 // NUL not found---return N+1
40 ;;
41.Lexit:
42 mov r8=r9
43 mov ar.lc=r16 // restore ar.lc
44 br.ret.sptk.many rp
45END(__strnlen_user)
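
The return-value rules documented above can likewise be rendered as a small hypothetical C sketch (load_user_byte() again stands in for the EXCLR()-protected load and returns non-zero on a fault):

	/* Hypothetical sketch of the semantics above. */
	static long strnlen_user_sketch(const char *src, long n)
	{
		long len = 0;

		while (len < n) {
			char c;

			if (load_user_byte(&c, src + len))
				return 0;	/* fault: return 0 */
			len++;
			if (c == '\0')
				return len;	/* strlen() + 1, counting the NUL */
		}
		return n + 1;			/* no NUL within the limit */
	}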
diff --git a/arch/ia64/lib/swiotlb.c b/arch/ia64/lib/swiotlb.c
new file mode 100644
index 000000000000..ab7b3ad99a7f
--- /dev/null
+++ b/arch/ia64/lib/swiotlb.c
@@ -0,0 +1,658 @@
1/*
2 * Dynamic DMA mapping support.
3 *
4 * This implementation is for IA-64 platforms that do not support
5 * I/O TLBs (aka DMA address translation hardware).
6 * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
7 * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
8 * Copyright (C) 2000, 2003 Hewlett-Packard Co
9 * David Mosberger-Tang <davidm@hpl.hp.com>
10 *
11 * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API.
12 * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid
13 * unnecessary i-cache flushing.
14 * 04/07/.. ak Better overflow handling. Assorted fixes.
15 */
16
17#include <linux/cache.h>
18#include <linux/mm.h>
19#include <linux/module.h>
20#include <linux/pci.h>
21#include <linux/spinlock.h>
22#include <linux/string.h>
23#include <linux/types.h>
24#include <linux/ctype.h>
25
26#include <asm/io.h>
27#include <asm/pci.h>
28#include <asm/dma.h>
29
30#include <linux/init.h>
31#include <linux/bootmem.h>
32
33#define OFFSET(val,align) ((unsigned long) \
34 ( (val) & ( (align) - 1)))
35
36#define SG_ENT_VIRT_ADDRESS(sg) (page_address((sg)->page) + (sg)->offset)
37#define SG_ENT_PHYS_ADDRESS(SG) virt_to_phys(SG_ENT_VIRT_ADDRESS(SG))
38
39/*
40 * Maximum allowable number of contiguous slabs to map,
41 * must be a power of 2. What is the appropriate value ?
42 * The complexity of {map,unmap}_single is linearly dependent on this value.
43 */
44#define IO_TLB_SEGSIZE 128
45
46/*
47 * log of the size of each IO TLB slab. The number of slabs is command line
48 * controllable.
49 */
50#define IO_TLB_SHIFT 11
51
52int swiotlb_force;
53
54/*
55 * Used to do a quick range check in swiotlb_unmap_single and
56 * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
57 * API.
58 */
59static char *io_tlb_start, *io_tlb_end;
60
61/*
62 * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
63 * io_tlb_end. This is command line adjustable via setup_io_tlb_npages.
64 */
65static unsigned long io_tlb_nslabs;
66
67/*
68 * When the IOMMU overflows we return a fallback buffer. This sets the size.
69 */
70static unsigned long io_tlb_overflow = 32*1024;
71
72void *io_tlb_overflow_buffer;
73
74/*
75 * This is a free list describing the number of free entries available from
76 * each index
77 */
78static unsigned int *io_tlb_list;
79static unsigned int io_tlb_index;
80
81/*
82 * We need to save away the original address corresponding to a mapped entry
83 * for the sync operations.
84 */
85static unsigned char **io_tlb_orig_addr;
86
87/*
88 * Protect the above data structures in the map and unmap calls
89 */
90static DEFINE_SPINLOCK(io_tlb_lock);
91
92static int __init
93setup_io_tlb_npages(char *str)
94{
95 if (isdigit(*str)) {
96 io_tlb_nslabs = simple_strtoul(str, &str, 0) <<
97 (PAGE_SHIFT - IO_TLB_SHIFT);
98 /* avoid tail segment of size < IO_TLB_SEGSIZE */
99 io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
100 }
101 if (*str == ',')
102 ++str;
103 if (!strcmp(str, "force"))
104 swiotlb_force = 1;
105 return 1;
106}
107__setup("swiotlb=", setup_io_tlb_npages);
108/* make io_tlb_overflow tunable too? */
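
As a usage sketch (the numbers are only an example): booting with

	swiotlb=2048,force

reserves 2048 pages worth of bounce-buffer slabs (2048 << (PAGE_SHIFT - IO_TLB_SHIFT) slabs of 2 KB each, i.e. 32 MB with a 16 KB ia64 page size), and the optional "force" suffix makes every mapping go through the bounce buffers even when the device could address the memory directly.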
109
110/*
111 * Statically reserve bounce buffer space and initialize bounce buffer data
112 * structures for the software IO TLB used to implement the PCI DMA API.
113 */
114void
115swiotlb_init_with_default_size (size_t default_size)
116{
117 unsigned long i;
118
119 if (!io_tlb_nslabs) {
120 io_tlb_nslabs = (default_size >> PAGE_SHIFT);
121 io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
122 }
123
124 /*
125 * Get IO TLB memory from the low pages
126 */
127 io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs *
128 (1 << IO_TLB_SHIFT));
129 if (!io_tlb_start)
130 panic("Cannot allocate SWIOTLB buffer");
131 io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT);
132
133 /*
134 * Allocate and initialize the free list array. This array is used
135 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
136 * between io_tlb_start and io_tlb_end.
137 */
138 io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
139 for (i = 0; i < io_tlb_nslabs; i++)
140 io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
141 io_tlb_index = 0;
142 io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(char *));
143
144 /*
145 * Get the overflow emergency buffer
146 */
147 io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
148 printk(KERN_INFO "Placing software IO TLB between 0x%lx - 0x%lx\n",
149 virt_to_phys(io_tlb_start), virt_to_phys(io_tlb_end));
150}
151
152void
153swiotlb_init (void)
154{
155 swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */
156}
157
158static inline int
159address_needs_mapping(struct device *hwdev, dma_addr_t addr)
160{
161 dma_addr_t mask = 0xffffffff;
162 /* If the device has a mask, use it, otherwise default to 32 bits */
163 if (hwdev && hwdev->dma_mask)
164 mask = *hwdev->dma_mask;
165 return (addr & ~mask) != 0;
166}
167
168/*
169 * Allocates bounce buffer and returns its kernel virtual address.
170 */
171static void *
172map_single(struct device *hwdev, char *buffer, size_t size, int dir)
173{
174 unsigned long flags;
175 char *dma_addr;
176 unsigned int nslots, stride, index, wrap;
177 int i;
178
179 /*
180 * For mappings greater than a page, we limit the stride (and
181 * hence alignment) to a page size.
182 */
183 nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
184 if (size > PAGE_SIZE)
185 stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
186 else
187 stride = 1;
188
189 if (!nslots)
190 BUG();
191
192 /*
193 * Find suitable number of IO TLB entries size that will fit this
194 * request and allocate a buffer from that IO TLB pool.
195 */
196 spin_lock_irqsave(&io_tlb_lock, flags);
197 {
198 wrap = index = ALIGN(io_tlb_index, stride);
199
200 if (index >= io_tlb_nslabs)
201 wrap = index = 0;
202
203 do {
204 /*
205 * If we find a slot that indicates we have 'nslots'
206 * number of contiguous buffers, we allocate the
207 * buffers from that slot and mark the entries as '0'
208 * indicating unavailable.
209 */
210 if (io_tlb_list[index] >= nslots) {
211 int count = 0;
212
213 for (i = index; i < (int) (index + nslots); i++)
214 io_tlb_list[i] = 0;
215 for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
216 io_tlb_list[i] = ++count;
217 dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
218
219 /*
220 * Update the indices to avoid searching in
221 * the next round.
222 */
223 io_tlb_index = ((index + nslots) < io_tlb_nslabs
224 ? (index + nslots) : 0);
225
226 goto found;
227 }
228 index += stride;
229 if (index >= io_tlb_nslabs)
230 index = 0;
231 } while (index != wrap);
232
233 spin_unlock_irqrestore(&io_tlb_lock, flags);
234 return NULL;
235 }
236 found:
237 spin_unlock_irqrestore(&io_tlb_lock, flags);
238
239 /*
240 * Save away the mapping from the original address to the DMA address.
241 * This is needed when we sync the memory. Then we sync the buffer if
242 * needed.
243 */
244 io_tlb_orig_addr[index] = buffer;
245 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
246 memcpy(dma_addr, buffer, size);
247
248 return dma_addr;
249}
250
251/*
252 * dma_addr is the kernel virtual address of the bounce buffer to unmap.
253 */
254static void
255unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
256{
257 unsigned long flags;
258 int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
259 int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
260 char *buffer = io_tlb_orig_addr[index];
261
262 /*
263 * First, sync the memory before unmapping the entry
264 */
265 if (buffer && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
266 /*
267		 * bounce... copy the data back into the original buffer and
268 * delete the bounce buffer.
269 */
270 memcpy(buffer, dma_addr, size);
271
272 /*
273 * Return the buffer to the free list by setting the corresponding
274	 * entries to indicate the number of contiguous entries available.
275 * While returning the entries to the free list, we merge the entries
276 * with slots below and above the pool being returned.
277 */
278 spin_lock_irqsave(&io_tlb_lock, flags);
279 {
280 count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
281 io_tlb_list[index + nslots] : 0);
282 /*
283 * Step 1: return the slots to the free list, merging the
284		 * slots with the succeeding slots
285 */
286 for (i = index + nslots - 1; i >= index; i--)
287 io_tlb_list[i] = ++count;
288 /*
289 * Step 2: merge the returned slots with the preceding slots,
290 * if available (non zero)
291 */
292 for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
293 io_tlb_list[i] = ++count;
294 }
295 spin_unlock_irqrestore(&io_tlb_lock, flags);
296}
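
To make the free-list counters concrete, here is a small standalone model (scaled down to an 8-slot segment; none of this is kernel code) of what map_single() does to io_tlb_list when it claims three slots at index 2:

	#include <stdio.h>

	#define SEGSIZE	8	/* scaled-down stand-in for IO_TLB_SEGSIZE */

	int main(void)
	{
		unsigned int list[SEGSIZE];
		int i, count = 0;

		/* Initial state, as in swiotlb_init_with_default_size(): entry i
		 * counts the free slots from i up to the next segment boundary. */
		for (i = 0; i < SEGSIZE; i++)
			list[i] = SEGSIZE - (i & (SEGSIZE - 1));

		/* map_single() claiming nslots == 3 at index == 2 ... */
		for (i = 2; i < 2 + 3; i++)
			list[i] = 0;
		/* ... then trimming the counts of the slots just before the claim
		 * (this mirrors the OFFSET() loop in map_single()). */
		for (i = 2 - 1; (i & (SEGSIZE - 1)) != SEGSIZE - 1 && list[i]; i--)
			list[i] = ++count;

		for (i = 0; i < SEGSIZE; i++)
			printf("%u ", list[i]);	/* prints: 2 1 0 0 0 3 2 1 */
		printf("\n");
		return 0;
	}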
297
298static void
299sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
300{
301 int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
302 char *buffer = io_tlb_orig_addr[index];
303
304 /*
305 * bounce... copy the data back into/from the original buffer
306 * XXX How do you handle DMA_BIDIRECTIONAL here ?
307 */
308 if (dir == DMA_FROM_DEVICE)
309 memcpy(buffer, dma_addr, size);
310 else if (dir == DMA_TO_DEVICE)
311 memcpy(dma_addr, buffer, size);
312 else
313 BUG();
314}
315
316void *
317swiotlb_alloc_coherent(struct device *hwdev, size_t size,
318 dma_addr_t *dma_handle, int flags)
319{
320 unsigned long dev_addr;
321 void *ret;
322 int order = get_order(size);
323
324 /*
325 * XXX fix me: the DMA API should pass us an explicit DMA mask
326 * instead, or use ZONE_DMA32 (ia64 overloads ZONE_DMA to be a ~32
327 * bit range instead of a 16MB one).
328 */
329 flags |= GFP_DMA;
330
331 ret = (void *)__get_free_pages(flags, order);
332 if (ret && address_needs_mapping(hwdev, virt_to_phys(ret))) {
333 /*
334 * The allocated memory isn't reachable by the device.
335 * Fall back on swiotlb_map_single().
336 */
337 free_pages((unsigned long) ret, order);
338 ret = NULL;
339 }
340 if (!ret) {
341 /*
342 * We are either out of memory or the device can't DMA
343 * to GFP_DMA memory; fall back on
344 * swiotlb_map_single(), which will grab memory from
345 * the lowest available address range.
346 */
347 dma_addr_t handle;
348 handle = swiotlb_map_single(NULL, NULL, size, DMA_FROM_DEVICE);
349 if (dma_mapping_error(handle))
350 return NULL;
351
352 ret = phys_to_virt(handle);
353 }
354
355 memset(ret, 0, size);
356 dev_addr = virt_to_phys(ret);
357
358 /* Confirm address can be DMA'd by device */
359 if (address_needs_mapping(hwdev, dev_addr)) {
360 printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016lx\n",
361 (unsigned long long)*hwdev->dma_mask, dev_addr);
362 panic("swiotlb_alloc_coherent: allocated memory is out of "
363 "range for device");
364 }
365 *dma_handle = dev_addr;
366 return ret;
367}
368
369void
370swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
371 dma_addr_t dma_handle)
372{
373 if (!(vaddr >= (void *)io_tlb_start
374 && vaddr < (void *)io_tlb_end))
375 free_pages((unsigned long) vaddr, get_order(size));
376 else
377 /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
378 swiotlb_unmap_single (hwdev, dma_handle, size, DMA_TO_DEVICE);
379}
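
A minimal usage sketch of the coherent pair above (the device pointer and size are placeholders; real drivers normally reach this code through the generic DMA API):

	/* Hypothetical example: allocate a 4 KB coherent buffer, hand its bus
	 * address to the device, then release it. */
	static int coherent_example(struct device *dev)
	{
		dma_addr_t bus;
		void *cpu = swiotlb_alloc_coherent(dev, 4096, &bus, GFP_KERNEL);

		if (!cpu)
			return -ENOMEM;

		/* ... program the device with 'bus', touch the buffer via 'cpu' ... */

		swiotlb_free_coherent(dev, 4096, cpu, bus);
		return 0;
	}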
380
381static void
382swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
383{
384 /*
385 * Ran out of IOMMU space for this operation. This is very bad.
386	 * Unfortunately the drivers cannot handle this operation properly
387	 * unless they check for pci_dma_mapping_error (most don't).
388	 * When the mapping is small enough, return a static buffer to limit
389 * the damage, or panic when the transfer is too big.
390 */
391 printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at "
392 "device %s\n", size, dev ? dev->bus_id : "?");
393
394 if (size > io_tlb_overflow && do_panic) {
395 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
396 panic("PCI-DMA: Memory would be corrupted\n");
397 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
398 panic("PCI-DMA: Random memory would be DMAed\n");
399 }
400}
401
402/*
403 * Map a single buffer of the indicated size for DMA in streaming mode. The
404 * PCI address to use is returned.
405 *
406 * Once the device is given the dma address, the device owns this memory until
407 * either swiotlb_unmap_single or swiotlb_sync_single_for_* is performed.
408 */
409dma_addr_t
410swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
411{
412 unsigned long dev_addr = virt_to_phys(ptr);
413 void *map;
414
415 if (dir == DMA_NONE)
416 BUG();
417 /*
418 * If the pointer passed in happens to be in the device's DMA window,
419 * we can safely return the device addr and not worry about bounce
420 * buffering it.
421 */
422 if (!address_needs_mapping(hwdev, dev_addr) && !swiotlb_force)
423 return dev_addr;
424
425 /*
426 * Oh well, have to allocate and map a bounce buffer.
427 */
428 map = map_single(hwdev, ptr, size, dir);
429 if (!map) {
430 swiotlb_full(hwdev, size, dir, 1);
431 map = io_tlb_overflow_buffer;
432 }
433
434 dev_addr = virt_to_phys(map);
435
436 /*
437 * Ensure that the address returned is DMA'ble
438 */
439 if (address_needs_mapping(hwdev, dev_addr))
440 panic("map_single: bounce buffer is not DMA'ble");
441
442 return dev_addr;
443}
444
445/*
446 * Since DMA is i-cache coherent, any (complete) pages that were written via
447 * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to
448 * flush them when they get mapped into an executable vm-area.
449 */
450static void
451mark_clean(void *addr, size_t size)
452{
453 unsigned long pg_addr, end;
454
455 pg_addr = PAGE_ALIGN((unsigned long) addr);
456 end = (unsigned long) addr + size;
457 while (pg_addr + PAGE_SIZE <= end) {
458 struct page *page = virt_to_page(pg_addr);
459 set_bit(PG_arch_1, &page->flags);
460 pg_addr += PAGE_SIZE;
461 }
462}
463
464/*
465 * Unmap a single streaming mode DMA translation. The dma_addr and size must
466 * match what was provided for in a previous swiotlb_map_single call. All
467 * other usages are undefined.
468 *
469 * After this call, reads by the cpu to the buffer are guaranteed to see
470 * whatever the device wrote there.
471 */
472void
473swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
474 int dir)
475{
476 char *dma_addr = phys_to_virt(dev_addr);
477
478 if (dir == DMA_NONE)
479 BUG();
480 if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
481 unmap_single(hwdev, dma_addr, size, dir);
482 else if (dir == DMA_FROM_DEVICE)
483 mark_clean(dma_addr, size);
484}
485
486/*
487 * Make physical memory consistent for a single streaming mode DMA translation
488 * after a transfer.
489 *
490 * If you perform a swiotlb_map_single() but wish to interrogate the buffer
491 * using the cpu, yet do not wish to tear down the PCI dma mapping, you must
492 * call this function before doing so. At the next point you give the PCI dma
493 * address back to the card, you must first perform a
494 * swiotlb_sync_single_for_device(), and then the device again owns the buffer.
495 */
496void
497swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
498 size_t size, int dir)
499{
500 char *dma_addr = phys_to_virt(dev_addr);
501
502 if (dir == DMA_NONE)
503 BUG();
504 if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
505 sync_single(hwdev, dma_addr, size, dir);
506 else if (dir == DMA_FROM_DEVICE)
507 mark_clean(dma_addr, size);
508}
509
510void
511swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
512 size_t size, int dir)
513{
514 char *dma_addr = phys_to_virt(dev_addr);
515
516 if (dir == DMA_NONE)
517 BUG();
518 if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
519 sync_single(hwdev, dma_addr, size, dir);
520 else if (dir == DMA_FROM_DEVICE)
521 mark_clean(dma_addr, size);
522}
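
Putting the streaming calls above together, a minimal (hypothetical) receive path looks like the sketch below; swiotlb_dma_mapping_error() is defined further down in this file, and drivers would normally go through dma_map_single() and friends instead of calling these directly.

	/* Hypothetical example: let a device DMA 'len' bytes into 'buf',
	 * then hand the data back to the CPU. */
	static void streaming_example(struct device *dev, void *buf, size_t len)
	{
		dma_addr_t bus = swiotlb_map_single(dev, buf, len, DMA_FROM_DEVICE);

		if (swiotlb_dma_mapping_error(bus))
			return;			/* mapping fell back to the overflow buffer */

		/* ... program the device to DMA 'len' bytes to 'bus' and wait ... */

		swiotlb_sync_single_for_cpu(dev, bus, len, DMA_FROM_DEVICE);
		/* The CPU may now inspect 'buf'. */
		swiotlb_unmap_single(dev, bus, len, DMA_FROM_DEVICE);
	}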
523
524/*
525 * Map a set of buffers described by scatterlist in streaming mode for DMA.
526 * This is the scatter-gather version of the above swiotlb_map_single
527 * interface. Here the scatter gather list elements are each tagged with the
528 * appropriate dma address and length. They are obtained via
529 * sg_dma_{address,length}(SG).
530 *
531 * NOTE: An implementation may be able to use a smaller number of
532 * DMA address/length pairs than there are SG table elements.
533 * (for example via virtual mapping capabilities)
534 * The routine returns the number of addr/length pairs actually
535 * used, at most nents.
536 *
537 * Device ownership issues as mentioned above for swiotlb_map_single are the
538 * same here.
539 */
540int
541swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
542 int dir)
543{
544 void *addr;
545 unsigned long dev_addr;
546 int i;
547
548 if (dir == DMA_NONE)
549 BUG();
550
551 for (i = 0; i < nelems; i++, sg++) {
552 addr = SG_ENT_VIRT_ADDRESS(sg);
553 dev_addr = virt_to_phys(addr);
554 if (swiotlb_force || address_needs_mapping(hwdev, dev_addr)) {
555 sg->dma_address = (dma_addr_t) virt_to_phys(map_single(hwdev, addr, sg->length, dir));
556 if (!sg->dma_address) {
557 /* Don't panic here, we expect map_sg users
558 to do proper error handling. */
559 swiotlb_full(hwdev, sg->length, dir, 0);
560 swiotlb_unmap_sg(hwdev, sg - i, i, dir);
561 sg[0].dma_length = 0;
562 return 0;
563 }
564 } else
565 sg->dma_address = dev_addr;
566 sg->dma_length = sg->length;
567 }
568 return nelems;
569}
570
571/*
572 * Unmap a set of streaming mode DMA translations. Again, cpu read rules
573 * concerning calls here are the same as for swiotlb_unmap_single() above.
574 */
575void
576swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
577 int dir)
578{
579 int i;
580
581 if (dir == DMA_NONE)
582 BUG();
583
584 for (i = 0; i < nelems; i++, sg++)
585 if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
586 unmap_single(hwdev, (void *) phys_to_virt(sg->dma_address), sg->dma_length, dir);
587 else if (dir == DMA_FROM_DEVICE)
588 mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length);
589}
590
591/*
592 * Make physical memory consistent for a set of streaming mode DMA translations
593 * after a transfer.
594 *
595 * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
596 * and usage.
597 */
598void
599swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
600 int nelems, int dir)
601{
602 int i;
603
604 if (dir == DMA_NONE)
605 BUG();
606
607 for (i = 0; i < nelems; i++, sg++)
608 if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
609 sync_single(hwdev, (void *) sg->dma_address,
610 sg->dma_length, dir);
611}
612
613void
614swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
615 int nelems, int dir)
616{
617 int i;
618
619 if (dir == DMA_NONE)
620 BUG();
621
622 for (i = 0; i < nelems; i++, sg++)
623 if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
624 sync_single(hwdev, (void *) sg->dma_address,
625 sg->dma_length, dir);
626}
627
628int
629swiotlb_dma_mapping_error(dma_addr_t dma_addr)
630{
631 return (dma_addr == virt_to_phys(io_tlb_overflow_buffer));
632}
633
634/*
635 * Return whether the given PCI device DMA address mask can be supported
636 * properly. For example, if your device can only drive the low 24-bits
637 * during PCI bus mastering, then you would pass 0x00ffffff as the mask to
638 * this function.
639 */
640int
641swiotlb_dma_supported (struct device *hwdev, u64 mask)
642{
643 return (virt_to_phys (io_tlb_end) - 1) <= mask;
644}
645
646EXPORT_SYMBOL(swiotlb_init);
647EXPORT_SYMBOL(swiotlb_map_single);
648EXPORT_SYMBOL(swiotlb_unmap_single);
649EXPORT_SYMBOL(swiotlb_map_sg);
650EXPORT_SYMBOL(swiotlb_unmap_sg);
651EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
652EXPORT_SYMBOL(swiotlb_sync_single_for_device);
653EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
654EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
655EXPORT_SYMBOL(swiotlb_dma_mapping_error);
656EXPORT_SYMBOL(swiotlb_alloc_coherent);
657EXPORT_SYMBOL(swiotlb_free_coherent);
658EXPORT_SYMBOL(swiotlb_dma_supported);
diff --git a/arch/ia64/lib/xor.S b/arch/ia64/lib/xor.S
new file mode 100644
index 000000000000..54e3f7eab8e9
--- /dev/null
+++ b/arch/ia64/lib/xor.S
@@ -0,0 +1,184 @@
1/*
2 * arch/ia64/lib/xor.S
3 *
4 * Optimized RAID-5 checksumming functions for IA-64.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2, or (at your option)
9 * any later version.
10 *
11 * You should have received a copy of the GNU General Public License
12 * (for example /usr/src/linux/COPYING); if not, write to the Free
13 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
14 */
15
16#include <asm/asmmacro.h>
17
18GLOBAL_ENTRY(xor_ia64_2)
19 .prologue
20 .fframe 0
21 .save ar.pfs, r31
22 alloc r31 = ar.pfs, 3, 0, 13, 16
23 .save ar.lc, r30
24 mov r30 = ar.lc
25 .save pr, r29
26 mov r29 = pr
27 ;;
28 .body
29 mov r8 = in1
30 mov ar.ec = 6 + 2
31 shr in0 = in0, 3
32 ;;
33 adds in0 = -1, in0
34 mov r16 = in1
35 mov r17 = in2
36 ;;
37 mov ar.lc = in0
38 mov pr.rot = 1 << 16
39 ;;
40 .rotr s1[6+1], s2[6+1], d[2]
41 .rotp p[6+2]
420:
43(p[0]) ld8.nta s1[0] = [r16], 8
44(p[0]) ld8.nta s2[0] = [r17], 8
45(p[6]) xor d[0] = s1[6], s2[6]
46(p[6+1])st8.nta [r8] = d[1], 8
47 nop.f 0
48 br.ctop.dptk.few 0b
49 ;;
50 mov ar.lc = r30
51 mov pr = r29, -1
52 br.ret.sptk.few rp
53END(xor_ia64_2)
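
Functionally (ignoring the software pipelining and the .nta cache hints), xor_ia64_2 behaves like the C sketch below; the 3-, 4- and 5-way variants that follow simply take more source pointers.

	/* Hypothetical C equivalent: XOR 'bytes' bytes of p2 into p1, one
	 * doubleword at a time ('bytes' is a multiple of 8 in the RAID code). */
	static void xor_2_sketch(unsigned long bytes, unsigned long *p1,
				 const unsigned long *p2)
	{
		unsigned long i, words = bytes >> 3;

		for (i = 0; i < words; i++)
			p1[i] ^= p2[i];
	}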
54
55GLOBAL_ENTRY(xor_ia64_3)
56 .prologue
57 .fframe 0
58 .save ar.pfs, r31
59 alloc r31 = ar.pfs, 4, 0, 20, 24
60 .save ar.lc, r30
61 mov r30 = ar.lc
62 .save pr, r29
63 mov r29 = pr
64 ;;
65 .body
66 mov r8 = in1
67 mov ar.ec = 6 + 2
68 shr in0 = in0, 3
69 ;;
70 adds in0 = -1, in0
71 mov r16 = in1
72 mov r17 = in2
73 ;;
74 mov r18 = in3
75 mov ar.lc = in0
76 mov pr.rot = 1 << 16
77 ;;
78 .rotr s1[6+1], s2[6+1], s3[6+1], d[2]
79 .rotp p[6+2]
800:
81(p[0]) ld8.nta s1[0] = [r16], 8
82(p[0]) ld8.nta s2[0] = [r17], 8
83(p[6]) xor d[0] = s1[6], s2[6]
84 ;;
85(p[0]) ld8.nta s3[0] = [r18], 8
86(p[6+1])st8.nta [r8] = d[1], 8
87(p[6]) xor d[0] = d[0], s3[6]
88 br.ctop.dptk.few 0b
89 ;;
90 mov ar.lc = r30
91 mov pr = r29, -1
92 br.ret.sptk.few rp
93END(xor_ia64_3)
94
95GLOBAL_ENTRY(xor_ia64_4)
96 .prologue
97 .fframe 0
98 .save ar.pfs, r31
99 alloc r31 = ar.pfs, 5, 0, 27, 32
100 .save ar.lc, r30
101 mov r30 = ar.lc
102 .save pr, r29
103 mov r29 = pr
104 ;;
105 .body
106 mov r8 = in1
107 mov ar.ec = 6 + 2
108 shr in0 = in0, 3
109 ;;
110 adds in0 = -1, in0
111 mov r16 = in1
112 mov r17 = in2
113 ;;
114 mov r18 = in3
115 mov ar.lc = in0
116 mov pr.rot = 1 << 16
117 mov r19 = in4
118 ;;
119 .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
120 .rotp p[6+2]
1210:
122(p[0]) ld8.nta s1[0] = [r16], 8
123(p[0]) ld8.nta s2[0] = [r17], 8
124(p[6]) xor d[0] = s1[6], s2[6]
125(p[0]) ld8.nta s3[0] = [r18], 8
126(p[0]) ld8.nta s4[0] = [r19], 8
127(p[6]) xor r20 = s3[6], s4[6]
128 ;;
129(p[6+1])st8.nta [r8] = d[1], 8
130(p[6]) xor d[0] = d[0], r20
131 br.ctop.dptk.few 0b
132 ;;
133 mov ar.lc = r30
134 mov pr = r29, -1
135 br.ret.sptk.few rp
136END(xor_ia64_4)
137
138GLOBAL_ENTRY(xor_ia64_5)
139 .prologue
140 .fframe 0
141 .save ar.pfs, r31
142 alloc r31 = ar.pfs, 6, 0, 34, 40
143 .save ar.lc, r30
144 mov r30 = ar.lc
145 .save pr, r29
146 mov r29 = pr
147 ;;
148 .body
149 mov r8 = in1
150 mov ar.ec = 6 + 2
151 shr in0 = in0, 3
152 ;;
153 adds in0 = -1, in0
154 mov r16 = in1
155 mov r17 = in2
156 ;;
157 mov r18 = in3
158 mov ar.lc = in0
159 mov pr.rot = 1 << 16
160 mov r19 = in4
161 mov r20 = in5
162 ;;
163 .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
164 .rotp p[6+2]
1650:
166(p[0]) ld8.nta s1[0] = [r16], 8
167(p[0]) ld8.nta s2[0] = [r17], 8
168(p[6]) xor d[0] = s1[6], s2[6]
169(p[0]) ld8.nta s3[0] = [r18], 8
170(p[0]) ld8.nta s4[0] = [r19], 8
171(p[6]) xor r21 = s3[6], s4[6]
172 ;;
173(p[0]) ld8.nta s5[0] = [r20], 8
174(p[6+1])st8.nta [r8] = d[1], 8
175(p[6]) xor d[0] = d[0], r21
176 ;;
177(p[6]) xor d[0] = d[0], s5[6]
178 nop.f 0
179 br.ctop.dptk.few 0b
180 ;;
181 mov ar.lc = r30
182 mov pr = r29, -1
183 br.ret.sptk.few rp
184END(xor_ia64_5)