author     Linus Torvalds <torvalds@ppc970.osdl.org>    2005-04-16 18:20:36 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>    2005-04-16 18:20:36 -0400
commit     1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree       0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ia64/lib
tags       Linux-2.6.12-rc2, v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/ia64/lib')
-rw-r--r--   arch/ia64/lib/Makefile               52
-rw-r--r--   arch/ia64/lib/bitop.c                88
-rw-r--r--   arch/ia64/lib/carta_random.S         54
-rw-r--r--   arch/ia64/lib/checksum.c            102
-rw-r--r--   arch/ia64/lib/clear_page.S           77
-rw-r--r--   arch/ia64/lib/clear_user.S          209
-rw-r--r--   arch/ia64/lib/copy_page.S            98
-rw-r--r--   arch/ia64/lib/copy_page_mck.S       185
-rw-r--r--   arch/ia64/lib/copy_user.S           610
-rw-r--r--   arch/ia64/lib/csum_partial_copy.c   151
-rw-r--r--   arch/ia64/lib/dec_and_lock.c         42
-rw-r--r--   arch/ia64/lib/do_csum.S             323
-rw-r--r--   arch/ia64/lib/flush.S                39
-rw-r--r--   arch/ia64/lib/idiv32.S               83
-rw-r--r--   arch/ia64/lib/idiv64.S               80
-rw-r--r--   arch/ia64/lib/io.c                  165
-rw-r--r--   arch/ia64/lib/ip_fast_csum.S         90
-rw-r--r--   arch/ia64/lib/memcpy.S              301
-rw-r--r--   arch/ia64/lib/memcpy_mck.S          661
-rw-r--r--   arch/ia64/lib/memset.S              362
-rw-r--r--   arch/ia64/lib/strlen.S              192
-rw-r--r--   arch/ia64/lib/strlen_user.S         198
-rw-r--r--   arch/ia64/lib/strncpy_from_user.S    44
-rw-r--r--   arch/ia64/lib/strnlen_user.S         45
-rw-r--r--   arch/ia64/lib/swiotlb.c             658
-rw-r--r--   arch/ia64/lib/xor.S                 184
26 files changed, 5093 insertions, 0 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
new file mode 100644
index 000000000000..1902c3c2ef92
--- /dev/null
+++ b/arch/ia64/lib/Makefile
@@ -0,0 +1,52 @@
1 | # | ||
2 | # Makefile for ia64-specific library routines.. | ||
3 | # | ||
4 | |||
5 | obj-y := io.o | ||
6 | |||
7 | lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ | ||
8 | __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \ | ||
9 | bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \ | ||
10 | clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \ | ||
11 | flush.o ip_fast_csum.o do_csum.o \ | ||
12 | memset.o strlen.o swiotlb.o | ||
13 | |||
14 | lib-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o | ||
15 | lib-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o | ||
16 | lib-$(CONFIG_PERFMON) += carta_random.o | ||
17 | lib-$(CONFIG_MD_RAID5) += xor.o | ||
18 | lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o | ||
19 | |||
20 | AFLAGS___divdi3.o = | ||
21 | AFLAGS___udivdi3.o = -DUNSIGNED | ||
22 | AFLAGS___moddi3.o = -DMODULO | ||
23 | AFLAGS___umoddi3.o = -DUNSIGNED -DMODULO | ||
24 | |||
25 | AFLAGS___divsi3.o = | ||
26 | AFLAGS___udivsi3.o = -DUNSIGNED | ||
27 | AFLAGS___modsi3.o = -DMODULO | ||
28 | AFLAGS___umodsi3.o = -DUNSIGNED -DMODULO | ||
29 | |||
30 | $(obj)/__divdi3.o: $(src)/idiv64.S FORCE | ||
31 | $(call if_changed_dep,as_o_S) | ||
32 | |||
33 | $(obj)/__udivdi3.o: $(src)/idiv64.S FORCE | ||
34 | $(call if_changed_dep,as_o_S) | ||
35 | |||
36 | $(obj)/__moddi3.o: $(src)/idiv64.S FORCE | ||
37 | $(call if_changed_dep,as_o_S) | ||
38 | |||
39 | $(obj)/__umoddi3.o: $(src)/idiv64.S FORCE | ||
40 | $(call if_changed_dep,as_o_S) | ||
41 | |||
42 | $(obj)/__divsi3.o: $(src)/idiv32.S FORCE | ||
43 | $(call if_changed_dep,as_o_S) | ||
44 | |||
45 | $(obj)/__udivsi3.o: $(src)/idiv32.S FORCE | ||
46 | $(call if_changed_dep,as_o_S) | ||
47 | |||
48 | $(obj)/__modsi3.o: $(src)/idiv32.S FORCE | ||
49 | $(call if_changed_dep,as_o_S) | ||
50 | |||
51 | $(obj)/__umodsi3.o: $(src)/idiv32.S FORCE | ||
52 | $(call if_changed_dep,as_o_S) | ||
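Note: the eight libgcc-style division helpers above come from only two assembly sources; idiv64.S and idiv32.S are each assembled four times, with the AFLAGS_* lines adding -DUNSIGNED and/or -DMODULO to select the variant. Roughly the same single-source trick in C (illustrative only; idiv64_model and the macro names here are not from the kernel):

	#ifdef UNSIGNED
	typedef unsigned long long op_t;	/* __udivdi3 / __umoddi3 builds */
	#else
	typedef long long op_t;			/* __divdi3 / __moddi3 builds */
	#endif

	#ifdef MODULO
	#define OP(a, b) ((a) % (b))		/* remainder flavour */
	#else
	#define OP(a, b) ((a) / (b))		/* quotient flavour */
	#endif

	op_t idiv64_model(op_t a, op_t b)
	{
		/* the real idiv64.S computes this without a hardware divide */
		return OP(a, b);
	}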
diff --git a/arch/ia64/lib/bitop.c b/arch/ia64/lib/bitop.c
new file mode 100644
index 000000000000..82e299c8464e
--- /dev/null
+++ b/arch/ia64/lib/bitop.c
@@ -0,0 +1,88 @@
1 | #include <linux/compiler.h> | ||
2 | #include <linux/types.h> | ||
3 | #include <asm/intrinsics.h> | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/bitops.h> | ||
6 | |||
7 | /* | ||
8 | * Find next zero bit in a bitmap reasonably efficiently.. | ||
9 | */ | ||
10 | |||
11 | int __find_next_zero_bit (const void *addr, unsigned long size, unsigned long offset) | ||
12 | { | ||
13 | unsigned long *p = ((unsigned long *) addr) + (offset >> 6); | ||
14 | unsigned long result = offset & ~63UL; | ||
15 | unsigned long tmp; | ||
16 | |||
17 | if (offset >= size) | ||
18 | return size; | ||
19 | size -= result; | ||
20 | offset &= 63UL; | ||
21 | if (offset) { | ||
22 | tmp = *(p++); | ||
23 | tmp |= ~0UL >> (64-offset); | ||
24 | if (size < 64) | ||
25 | goto found_first; | ||
26 | if (~tmp) | ||
27 | goto found_middle; | ||
28 | size -= 64; | ||
29 | result += 64; | ||
30 | } | ||
31 | while (size & ~63UL) { | ||
32 | if (~(tmp = *(p++))) | ||
33 | goto found_middle; | ||
34 | result += 64; | ||
35 | size -= 64; | ||
36 | } | ||
37 | if (!size) | ||
38 | return result; | ||
39 | tmp = *p; | ||
40 | found_first: | ||
41 | tmp |= ~0UL << size; | ||
42 | if (tmp == ~0UL) /* any bits zero? */ | ||
43 | return result + size; /* nope */ | ||
44 | found_middle: | ||
45 | return result + ffz(tmp); | ||
46 | } | ||
47 | EXPORT_SYMBOL(__find_next_zero_bit); | ||
48 | |||
49 | /* | ||
50 | * Find next bit in a bitmap reasonably efficiently.. | ||
51 | */ | ||
52 | int __find_next_bit(const void *addr, unsigned long size, unsigned long offset) | ||
53 | { | ||
54 | unsigned long *p = ((unsigned long *) addr) + (offset >> 6); | ||
55 | unsigned long result = offset & ~63UL; | ||
56 | unsigned long tmp; | ||
57 | |||
58 | if (offset >= size) | ||
59 | return size; | ||
60 | size -= result; | ||
61 | offset &= 63UL; | ||
62 | if (offset) { | ||
63 | tmp = *(p++); | ||
64 | tmp &= ~0UL << offset; | ||
65 | if (size < 64) | ||
66 | goto found_first; | ||
67 | if (tmp) | ||
68 | goto found_middle; | ||
69 | size -= 64; | ||
70 | result += 64; | ||
71 | } | ||
72 | while (size & ~63UL) { | ||
73 | if ((tmp = *(p++))) | ||
74 | goto found_middle; | ||
75 | result += 64; | ||
76 | size -= 64; | ||
77 | } | ||
78 | if (!size) | ||
79 | return result; | ||
80 | tmp = *p; | ||
81 | found_first: | ||
82 | tmp &= ~0UL >> (64-size); | ||
83 | if (tmp == 0UL) /* Are any bits set? */ | ||
84 | return result + size; /* Nope. */ | ||
85 | found_middle: | ||
86 | return result + __ffs(tmp); | ||
87 | } | ||
88 | EXPORT_SYMBOL(__find_next_bit); | ||
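Illustrative only: a hypothetical caller exercising the two helpers above on a two-word bitmap (in practice they are normally reached through the generic find_next_bit()/find_next_zero_bit() wrappers in the bitops headers):

	static int bitop_example(void)
	{
		/* bits 4..7 set in the first word, bit 64 set in the second */
		unsigned long map[2] = { 0xf0UL, 0x1UL };

		int a = __find_next_bit(map, 128, 0);		/* finds bit 4 */
		int b = __find_next_bit(map, 128, 8);		/* finds bit 64 */
		int c = __find_next_zero_bit(map, 128, 4);	/* finds bit 8 */

		return a + b + c;				/* 4 + 64 + 8 = 76 */
	}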
diff --git a/arch/ia64/lib/carta_random.S b/arch/ia64/lib/carta_random.S
new file mode 100644
index 000000000000..d0674c360364
--- /dev/null
+++ b/arch/ia64/lib/carta_random.S
@@ -0,0 +1,54 @@
1 | /* | ||
2 | * Fast, simple, yet decent quality random number generator based on | ||
3 | * a paper by David G. Carta ("Two Fast Implementations of the | ||
4 | * `Minimal Standard' Random Number Generator," Communications of the | ||
5 | * ACM, January, 1990). | ||
6 | * | ||
7 | * Copyright (C) 2002 Hewlett-Packard Co | ||
8 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
9 | */ | ||
10 | |||
11 | #include <asm/asmmacro.h> | ||
12 | |||
13 | #define a r2 | ||
14 | #define m r3 | ||
15 | #define lo r8 | ||
16 | #define hi r9 | ||
17 | #define t0 r16 | ||
18 | #define t1 r17 | ||
19 | #define seed r32 | ||
20 | |||
21 | GLOBAL_ENTRY(carta_random32) | ||
22 | movl a = (16807 << 16) | 16807 | ||
23 | ;; | ||
24 | pmpyshr2.u t0 = a, seed, 0 | ||
25 | pmpyshr2.u t1 = a, seed, 16 | ||
26 | ;; | ||
27 | unpack2.l t0 = t1, t0 | ||
28 | dep m = -1, r0, 0, 31 | ||
29 | ;; | ||
30 | zxt4 lo = t0 | ||
31 | shr.u hi = t0, 32 | ||
32 | ;; | ||
33 | dep t0 = 0, hi, 15, 49 // t0 = (hi & 0x7fff) | ||
34 | ;; | ||
35 | shl t0 = t0, 16 // t0 = (hi & 0x7fff) << 16 | ||
36 | shr t1 = hi, 15 // t1 = (hi >> 15) | ||
37 | ;; | ||
38 | add lo = lo, t0 | ||
39 | ;; | ||
40 | cmp.gtu p6, p0 = lo, m | ||
41 | ;; | ||
42 | (p6) and lo = lo, m | ||
43 | ;; | ||
44 | (p6) add lo = 1, lo | ||
45 | ;; | ||
46 | add lo = lo, t1 | ||
47 | ;; | ||
48 | cmp.gtu p6, p0 = lo, m | ||
49 | ;; | ||
50 | (p6) and lo = lo, m | ||
51 | ;; | ||
52 | (p6) add lo = 1, lo | ||
53 | br.ret.sptk.many rp | ||
54 | END(carta_random32) | ||
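For reference, a C model of the recurrence the assembly implements: Carta's division-free evaluation of the "minimal standard" generator, seed' = 16807 * seed mod (2^31 - 1). This sketches the arithmetic only and is not a bit-exact transcription of the IA-64 code above (carta_random32_model is a made-up name):

	unsigned long carta_random32_model(unsigned long seed)
	{
		unsigned long long prod = 16807ULL * seed;	/* at most 46 bits */
		unsigned long lo = prod & 0x7fffffffUL;		/* low 31 bits */
		unsigned long hi = (unsigned long)(prod >> 31);	/* high bits */

		/* 2^31 == 1 (mod 2^31 - 1), so the high part folds straight back in */
		lo += hi;
		if (lo > 0x7fffffffUL)
			lo -= 0x7fffffffUL;
		return lo;
	}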
diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c
new file mode 100644
index 000000000000..beb11721d9f5
--- /dev/null
+++ b/arch/ia64/lib/checksum.c
@@ -0,0 +1,102 @@
1 | /* | ||
2 | * Network checksum routines | ||
3 | * | ||
4 | * Copyright (C) 1999, 2003 Hewlett-Packard Co | ||
5 | * Stephane Eranian <eranian@hpl.hp.com> | ||
6 | * | ||
7 | * Most of the code coming from arch/alpha/lib/checksum.c | ||
8 | * | ||
9 | * This file contains network checksum routines that are better done | ||
10 | * in an architecture-specific manner due to speed.. | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/string.h> | ||
15 | |||
16 | #include <asm/byteorder.h> | ||
17 | |||
18 | static inline unsigned short | ||
19 | from64to16 (unsigned long x) | ||
20 | { | ||
21 | /* add up 32-bit words for 33 bits */ | ||
22 | x = (x & 0xffffffff) + (x >> 32); | ||
23 | /* add up 16-bit and 17-bit words for 17+c bits */ | ||
24 | x = (x & 0xffff) + (x >> 16); | ||
25 | /* add up 16-bit and 2-bit for 16+c bit */ | ||
26 | x = (x & 0xffff) + (x >> 16); | ||
27 | /* add up carry.. */ | ||
28 | x = (x & 0xffff) + (x >> 16); | ||
29 | return x; | ||
30 | } | ||
31 | |||
32 | /* | ||
33 | * computes the checksum of the TCP/UDP pseudo-header | ||
34 | * returns a 16-bit checksum, already complemented. | ||
35 | */ | ||
36 | unsigned short int | ||
37 | csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len, | ||
38 | unsigned short proto, unsigned int sum) | ||
39 | { | ||
40 | return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) + | ||
41 | ((unsigned long) proto << 8)); | ||
42 | } | ||
43 | |||
44 | EXPORT_SYMBOL(csum_tcpudp_magic); | ||
45 | |||
46 | unsigned int | ||
47 | csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len, | ||
48 | unsigned short proto, unsigned int sum) | ||
49 | { | ||
50 | unsigned long result; | ||
51 | |||
52 | result = (saddr + daddr + sum + | ||
53 | ((unsigned long) ntohs(len) << 16) + | ||
54 | ((unsigned long) proto << 8)); | ||
55 | |||
56 | /* Fold down to 32-bits so we don't lose in the typedef-less network stack. */ | ||
57 | /* 64 to 33 */ | ||
58 | result = (result & 0xffffffff) + (result >> 32); | ||
59 | /* 33 to 32 */ | ||
60 | result = (result & 0xffffffff) + (result >> 32); | ||
61 | return result; | ||
62 | } | ||
63 | |||
64 | extern unsigned long do_csum (const unsigned char *, long); | ||
65 | |||
66 | /* | ||
67 | * computes the checksum of a memory block at buff, length len, | ||
68 | * and adds in "sum" (32-bit) | ||
69 | * | ||
70 | * returns a 32-bit number suitable for feeding into itself | ||
71 | * or csum_tcpudp_magic | ||
72 | * | ||
73 | * this function must be called with even lengths, except | ||
74 | * for the last fragment, which may be odd | ||
75 | * | ||
76 | * it's best to have buff aligned on a 32-bit boundary | ||
77 | */ | ||
78 | unsigned int | ||
79 | csum_partial (const unsigned char * buff, int len, unsigned int sum) | ||
80 | { | ||
81 | unsigned long result = do_csum(buff, len); | ||
82 | |||
83 | /* add in old sum, and carry.. */ | ||
84 | result += sum; | ||
85 | /* 32+c bits -> 32 bits */ | ||
86 | result = (result & 0xffffffff) + (result >> 32); | ||
87 | return result; | ||
88 | } | ||
89 | |||
90 | EXPORT_SYMBOL(csum_partial); | ||
91 | |||
92 | /* | ||
93 | * this routine is used for miscellaneous IP-like checksums, mainly | ||
94 | * in icmp.c | ||
95 | */ | ||
96 | unsigned short | ||
97 | ip_compute_csum (unsigned char * buff, int len) | ||
98 | { | ||
99 | return ~do_csum(buff,len); | ||
100 | } | ||
101 | |||
102 | EXPORT_SYMBOL(ip_compute_csum); | ||
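Illustrative only: the repeated folding in from64to16() is a ones'-complement reduction, i.e. the 64-bit running sum taken modulo 0xffff (a non-zero multiple of 0xffff folds to 0xffff rather than 0). A stand-alone user-space check of that equivalence:

	#include <stdio.h>

	static unsigned short fold64to16(unsigned long long x)
	{
		x = (x & 0xffffffffULL) + (x >> 32);	/* 64 -> 33 bits */
		x = (x & 0xffff) + (x >> 16);		/* 33 -> 17+c bits */
		x = (x & 0xffff) + (x >> 16);		/* 17 -> 16+c bits */
		x = (x & 0xffff) + (x >> 16);		/* absorb the last carry */
		return (unsigned short)x;
	}

	int main(void)
	{
		unsigned long long sum = 0x0123456789abcdefULL;

		/* both lines print 9e25 */
		printf("%x\n", (unsigned)fold64to16(sum));
		printf("%llx\n", sum % 0xffff);
		return 0;
	}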
diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S
new file mode 100644
index 000000000000..d4987061dda7
--- /dev/null
+++ b/arch/ia64/lib/clear_page.S
@@ -0,0 +1,77 @@
1 | /* | ||
2 | * Copyright (C) 1999-2002 Hewlett-Packard Co | ||
3 | * Stephane Eranian <eranian@hpl.hp.com> | ||
4 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
5 | * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> | ||
6 | * | ||
7 | * 1/06/01 davidm Tuned for Itanium. | ||
8 | * 2/12/02 kchen Tuned for both Itanium and McKinley | ||
9 | * 3/08/02 davidm Some more tweaking | ||
10 | */ | ||
11 | #include <linux/config.h> | ||
12 | |||
13 | #include <asm/asmmacro.h> | ||
14 | #include <asm/page.h> | ||
15 | |||
16 | #ifdef CONFIG_ITANIUM | ||
17 | # define L3_LINE_SIZE 64 // Itanium L3 line size | ||
18 | # define PREFETCH_LINES 9 // magic number | ||
19 | #else | ||
20 | # define L3_LINE_SIZE 128 // McKinley L3 line size | ||
21 | # define PREFETCH_LINES 12 // magic number | ||
22 | #endif | ||
23 | |||
24 | #define saved_lc r2 | ||
25 | #define dst_fetch r3 | ||
26 | #define dst1 r8 | ||
27 | #define dst2 r9 | ||
28 | #define dst3 r10 | ||
29 | #define dst4 r11 | ||
30 | |||
31 | #define dst_last r31 | ||
32 | |||
33 | GLOBAL_ENTRY(clear_page) | ||
34 | .prologue | ||
35 | .regstk 1,0,0,0 | ||
36 | mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until | ||
37 | .save ar.lc, saved_lc | ||
38 | mov saved_lc = ar.lc | ||
39 | |||
40 | .body | ||
41 | mov ar.lc = (PREFETCH_LINES - 1) | ||
42 | mov dst_fetch = in0 | ||
43 | adds dst1 = 16, in0 | ||
44 | adds dst2 = 32, in0 | ||
45 | ;; | ||
46 | .fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE | ||
47 | adds dst3 = 48, in0 // executing this multiple times is harmless | ||
48 | br.cloop.sptk.few .fetch | ||
49 | ;; | ||
50 | addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch | ||
51 | mov ar.lc = r16 // one L3 line per iteration | ||
52 | adds dst4 = 64, in0 | ||
53 | ;; | ||
54 | #ifdef CONFIG_ITANIUM | ||
55 | // Optimized for Itanium | ||
56 | 1: stf.spill.nta [dst1] = f0, 64 | ||
57 | stf.spill.nta [dst2] = f0, 64 | ||
58 | cmp.lt p8,p0=dst_fetch, dst_last | ||
59 | ;; | ||
60 | #else | ||
61 | // Optimized for McKinley | ||
62 | 1: stf.spill.nta [dst1] = f0, 64 | ||
63 | stf.spill.nta [dst2] = f0, 64 | ||
64 | stf.spill.nta [dst3] = f0, 64 | ||
65 | stf.spill.nta [dst4] = f0, 128 | ||
66 | cmp.lt p8,p0=dst_fetch, dst_last | ||
67 | ;; | ||
68 | stf.spill.nta [dst1] = f0, 64 | ||
69 | stf.spill.nta [dst2] = f0, 64 | ||
70 | #endif | ||
71 | stf.spill.nta [dst3] = f0, 64 | ||
72 | (p8) stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE | ||
73 | br.cloop.sptk.few 1b | ||
74 | ;; | ||
75 | mov ar.lc = saved_lc // restore lc | ||
76 | br.ret.sptk.many rp | ||
77 | END(clear_page) | ||
diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S
new file mode 100644
index 000000000000..eecd8577b209
--- /dev/null
+++ b/arch/ia64/lib/clear_user.S
@@ -0,0 +1,209 @@
1 | /* | ||
2 | * This routine clears to zero a linear memory buffer in user space. | ||
3 | * | ||
4 | * Inputs: | ||
5 | * in0: address of buffer | ||
6 | * in1: length of buffer in bytes | ||
7 | * Outputs: | ||
8 | * r8: number of bytes that didn't get cleared due to a fault | ||
9 | * | ||
10 | * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co | ||
11 | * Stephane Eranian <eranian@hpl.hp.com> | ||
12 | */ | ||
13 | |||
14 | #include <asm/asmmacro.h> | ||
15 | |||
16 | // | ||
17 | // arguments | ||
18 | // | ||
19 | #define buf r32 | ||
20 | #define len r33 | ||
21 | |||
22 | // | ||
23 | // local registers | ||
24 | // | ||
25 | #define cnt r16 | ||
26 | #define buf2 r17 | ||
27 | #define saved_lc r18 | ||
28 | #define saved_pfs r19 | ||
29 | #define tmp r20 | ||
30 | #define len2 r21 | ||
31 | #define len3 r22 | ||
32 | |||
33 | // | ||
34 | // Theory of operations: | ||
35 | // - we check whether or not the buffer is small, i.e., less than 17 | ||
36 | // in which case we do the byte by byte loop. | ||
37 | // | ||
38 | // - Otherwise we go progressively from 1 byte store to 8byte store in | ||
39 | // the head part, the body is a 16byte store loop and we finish with the | ||
40 | // tail for the last 15 bytes. | ||
41 | // The good point about this breakdown is that the long buffer handling | ||
42 | // contains only 2 branches. | ||
43 | // | ||
44 | // The reason for not using shifting & masking for both the head and the | ||
45 | // tail is to stay semantically correct. This routine is not supposed | ||
46 | // to write bytes outside of the buffer. While most of the time this would | ||
47 | // be ok, we can't tolerate a mistake. A classical example is the case | ||
48 | // of multithreaded code where the extra bytes touched are actually owned | ||
49 | // by another thread which runs concurrently to ours. Another, less likely, | ||
50 | // example is with device drivers where reading an I/O mapped location may | ||
51 | // have side effects (same thing for writing). | ||
52 | // | ||
53 | |||
54 | GLOBAL_ENTRY(__do_clear_user) | ||
55 | .prologue | ||
56 | .save ar.pfs, saved_pfs | ||
57 | alloc saved_pfs=ar.pfs,2,0,0,0 | ||
58 | cmp.eq p6,p0=r0,len // check for zero length | ||
59 | .save ar.lc, saved_lc | ||
60 | mov saved_lc=ar.lc // preserve ar.lc (slow) | ||
61 | .body | ||
62 | ;; // avoid WAW on CFM | ||
63 | adds tmp=-1,len // br.ctop is repeat/until | ||
64 | mov ret0=len // return value is length at this point | ||
65 | (p6) br.ret.spnt.many rp | ||
66 | ;; | ||
67 | cmp.lt p6,p0=16,len // if len > 16 then long memset | ||
68 | mov ar.lc=tmp // initialize lc for small count | ||
69 | (p6) br.cond.dptk .long_do_clear | ||
70 | ;; // WAR on ar.lc | ||
71 | // | ||
72 | // worst case 16 iterations, avg 8 iterations | ||
73 | // | ||
74 | // We could have played with the predicates to use the extra | ||
75 | // M slot for 2 stores/iteration but the cost the initialization | ||
76 | // the various counters compared to how long the loop is supposed | ||
77 | // to last on average does not make this solution viable. | ||
78 | // | ||
79 | 1: | ||
80 | EX( .Lexit1, st1 [buf]=r0,1 ) | ||
81 | adds len=-1,len // countdown length using len | ||
82 | br.cloop.dptk 1b | ||
83 | ;; // avoid RAW on ar.lc | ||
84 | // | ||
85 | // .Lexit1: comes from the byte by byte loop | ||
86 | // len contains bytes left | ||
87 | .Lexit1: | ||
88 | mov ret0=len // faster than using ar.lc | ||
89 | mov ar.lc=saved_lc | ||
90 | br.ret.sptk.many rp // end of short clear_user | ||
91 | |||
92 | |||
93 | // | ||
94 | // At this point we know we have more than 16 bytes to copy | ||
95 | // so we focus on alignment (no branches required) | ||
96 | // | ||
97 | // The use of len/len2 for countdown of the number of bytes left | ||
98 | // instead of ret0 is due to the fact that the exception code | ||
99 | // changes the values of r8. | ||
100 | // | ||
101 | .long_do_clear: | ||
102 | tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear) | ||
103 | ;; | ||
104 | EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned | ||
105 | (p6) adds len=-1,len;; // sync because buf is modified | ||
106 | tbit.nz p6,p0=buf,1 | ||
107 | ;; | ||
108 | EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned | ||
109 | (p6) adds len=-2,len;; | ||
110 | tbit.nz p6,p0=buf,2 | ||
111 | ;; | ||
112 | EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned | ||
113 | (p6) adds len=-4,len;; | ||
114 | tbit.nz p6,p0=buf,3 | ||
115 | ;; | ||
116 | EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned | ||
117 | (p6) adds len=-8,len;; | ||
118 | shr.u cnt=len,4 // number of 128-bit (2x64bit) words | ||
119 | ;; | ||
120 | cmp.eq p6,p0=r0,cnt | ||
121 | adds tmp=-1,cnt | ||
122 | (p6) br.cond.dpnt .dotail // we have less than 16 bytes left | ||
123 | ;; | ||
124 | adds buf2=8,buf // setup second base pointer | ||
125 | mov ar.lc=tmp | ||
126 | ;; | ||
127 | |||
128 | // | ||
129 | // 16bytes/iteration core loop | ||
130 | // | ||
131 | // The second store can never generate a fault because | ||
132 | // we come into the loop only when we are 16-byte aligned. | ||
133 | // This means that if we cross a page then it will always be | ||
134 | // in the first store and never in the second. | ||
135 | // | ||
136 | // | ||
137 | // We need to keep track of the remaining length. A possible (optimistic) | ||
138 | // way would be to use ar.lc and derive how many bytes were left by | ||
139 | // doing : left= 16*ar.lc + 16. this would avoid the addition at | ||
140 | // every iteration. | ||
141 | // However we need to keep the synchronization point. A template | ||
142 | // M;;MB does not exist and thus we can keep the addition at no | ||
143 | // extra cycle cost (use a nop slot anyway). It also simplifies the | ||
144 | // (unlikely) error recovery code | ||
145 | // | ||
146 | |||
147 | 2: EX(.Lexit3, st8 [buf]=r0,16 ) | ||
148 | ;; // needed to get len correct when error | ||
149 | st8 [buf2]=r0,16 | ||
150 | adds len=-16,len | ||
151 | br.cloop.dptk 2b | ||
152 | ;; | ||
153 | mov ar.lc=saved_lc | ||
154 | // | ||
155 | // tail correction based on len only | ||
156 | // | ||
157 | // We alternate the use of len3,len2 to allow parallelism and correct | ||
158 | // error handling. We also reuse p6/p7 to return correct value. | ||
159 | // The addition of len2/len3 does not cost anything more compared to | ||
160 | // the regular memset as we had empty slots. | ||
161 | // | ||
162 | .dotail: | ||
163 | mov len2=len // for parallelization of error handling | ||
164 | mov len3=len | ||
165 | tbit.nz p6,p0=len,3 | ||
166 | ;; | ||
167 | EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes | ||
168 | (p6) adds len3=-8,len2 | ||
169 | tbit.nz p7,p6=len,2 | ||
170 | ;; | ||
171 | EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes | ||
172 | (p7) adds len2=-4,len3 | ||
173 | tbit.nz p6,p7=len,1 | ||
174 | ;; | ||
175 | EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes | ||
176 | (p6) adds len3=-2,len2 | ||
177 | tbit.nz p7,p6=len,0 | ||
178 | ;; | ||
179 | EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left | ||
180 | mov ret0=r0 // success | ||
181 | br.ret.sptk.many rp // end of most likely path | ||
182 | |||
183 | // | ||
184 | // Outlined error handling code | ||
185 | // | ||
186 | |||
187 | // | ||
188 | // .Lexit3: comes from core loop, need restore pr/lc | ||
189 | // len contains bytes left | ||
190 | // | ||
191 | // | ||
192 | // .Lexit2: | ||
193 | // if p6 -> coming from st8 or st2 : len2 contains what's left | ||
194 | // if p7 -> coming from st4 or st1 : len3 contains what's left | ||
195 | // We must restore lc/pr even though might not have been used. | ||
196 | .Lexit2: | ||
197 | .pred.rel "mutex", p6, p7 | ||
198 | (p6) mov len=len2 | ||
199 | (p7) mov len=len3 | ||
200 | ;; | ||
201 | // | ||
202 | // .Lexit3: comes from the head, need not restore pr/lc | ||
203 | // len contains bytes left | ||
204 | // | ||
205 | .Lexit3: | ||
206 | mov ret0=len | ||
207 | mov ar.lc=saved_lc | ||
208 | br.ret.sptk.many rp | ||
209 | END(__do_clear_user) | ||
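Illustrative only: the C-level contract the routine above implements, ignoring the alignment head/body/tail split; clear len bytes of a user buffer and report how many were left untouched when a store faults. put_user_byte() is a hypothetical stand-in for the EX()-protected st1/st2/st4/st8 stores:

	unsigned long do_clear_user_model(char __user *buf, unsigned long len)
	{
		unsigned long done;

		for (done = 0; done < len; done++)
			if (put_user_byte(buf + done, 0))	/* hypothetical, may fault */
				break;

		return len - done;				/* 0 on full success */
	}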
diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S
new file mode 100644
index 000000000000..127d1d050d78
--- /dev/null
+++ b/arch/ia64/lib/copy_page.S
@@ -0,0 +1,98 @@
1 | /* | ||
2 | * | ||
3 | * Optimized version of the standard copy_page() function | ||
4 | * | ||
5 | * Inputs: | ||
6 | * in0: address of target page | ||
7 | * in1: address of source page | ||
8 | * Output: | ||
9 | * no return value | ||
10 | * | ||
11 | * Copyright (C) 1999, 2001 Hewlett-Packard Co | ||
12 | * Stephane Eranian <eranian@hpl.hp.com> | ||
13 | * David Mosberger <davidm@hpl.hp.com> | ||
14 | * | ||
15 | * 4/06/01 davidm Tuned to make it perform well both for cached and uncached copies. | ||
16 | */ | ||
17 | #include <asm/asmmacro.h> | ||
18 | #include <asm/page.h> | ||
19 | |||
20 | #define PIPE_DEPTH 3 | ||
21 | #define EPI p[PIPE_DEPTH-1] | ||
22 | |||
23 | #define lcount r16 | ||
24 | #define saved_pr r17 | ||
25 | #define saved_lc r18 | ||
26 | #define saved_pfs r19 | ||
27 | #define src1 r20 | ||
28 | #define src2 r21 | ||
29 | #define tgt1 r22 | ||
30 | #define tgt2 r23 | ||
31 | #define srcf r24 | ||
32 | #define tgtf r25 | ||
33 | #define tgt_last r26 | ||
34 | |||
35 | #define Nrot ((8*PIPE_DEPTH+7)&~7) | ||
36 | |||
37 | GLOBAL_ENTRY(copy_page) | ||
38 | .prologue | ||
39 | .save ar.pfs, saved_pfs | ||
40 | alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot | ||
41 | |||
42 | .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \ | ||
43 | t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH] | ||
44 | .rotp p[PIPE_DEPTH] | ||
45 | |||
46 | .save ar.lc, saved_lc | ||
47 | mov saved_lc=ar.lc | ||
48 | mov ar.ec=PIPE_DEPTH | ||
49 | |||
50 | mov lcount=PAGE_SIZE/64-1 | ||
51 | .save pr, saved_pr | ||
52 | mov saved_pr=pr | ||
53 | mov pr.rot=1<<16 | ||
54 | |||
55 | .body | ||
56 | |||
57 | mov src1=in1 | ||
58 | adds src2=8,in1 | ||
59 | mov tgt_last = PAGE_SIZE | ||
60 | ;; | ||
61 | adds tgt2=8,in0 | ||
62 | add srcf=512,in1 | ||
63 | mov ar.lc=lcount | ||
64 | mov tgt1=in0 | ||
65 | add tgtf=512,in0 | ||
66 | add tgt_last = tgt_last, in0 | ||
67 | ;; | ||
68 | 1: | ||
69 | (p[0]) ld8 t1[0]=[src1],16 | ||
70 | (EPI) st8 [tgt1]=t1[PIPE_DEPTH-1],16 | ||
71 | (p[0]) ld8 t2[0]=[src2],16 | ||
72 | (EPI) st8 [tgt2]=t2[PIPE_DEPTH-1],16 | ||
73 | cmp.ltu p6,p0 = tgtf, tgt_last | ||
74 | ;; | ||
75 | (p[0]) ld8 t3[0]=[src1],16 | ||
76 | (EPI) st8 [tgt1]=t3[PIPE_DEPTH-1],16 | ||
77 | (p[0]) ld8 t4[0]=[src2],16 | ||
78 | (EPI) st8 [tgt2]=t4[PIPE_DEPTH-1],16 | ||
79 | ;; | ||
80 | (p[0]) ld8 t5[0]=[src1],16 | ||
81 | (EPI) st8 [tgt1]=t5[PIPE_DEPTH-1],16 | ||
82 | (p[0]) ld8 t6[0]=[src2],16 | ||
83 | (EPI) st8 [tgt2]=t6[PIPE_DEPTH-1],16 | ||
84 | ;; | ||
85 | (p[0]) ld8 t7[0]=[src1],16 | ||
86 | (EPI) st8 [tgt1]=t7[PIPE_DEPTH-1],16 | ||
87 | (p[0]) ld8 t8[0]=[src2],16 | ||
88 | (EPI) st8 [tgt2]=t8[PIPE_DEPTH-1],16 | ||
89 | |||
90 | (p6) lfetch [srcf], 64 | ||
91 | (p6) lfetch [tgtf], 64 | ||
92 | br.ctop.sptk.few 1b | ||
93 | ;; | ||
94 | mov pr=saved_pr,0xffffffffffff0000 // restore predicates | ||
95 | mov ar.pfs=saved_pfs | ||
96 | mov ar.lc=saved_lc | ||
97 | br.ret.sptk.many rp | ||
98 | END(copy_page) | ||
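Illustrative only: stripped of the rotating registers, predicate rotation and lfetch hints, the loop above amounts to this plain page copy, with two independent 8-byte streams per iteration matching the src1/tgt1 and src2/tgt2 pointer pairs:

	void copy_page_model(void *to, const void *from)
	{
		unsigned long *d = to;
		const unsigned long *s = from;
		unsigned long i;

		for (i = 0; i < PAGE_SIZE / sizeof(unsigned long); i += 2) {
			d[i]     = s[i];	/* the src1/tgt1 stream */
			d[i + 1] = s[i + 1];	/* the src2/tgt2 stream */
		}
	}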
diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S
new file mode 100644
index 000000000000..3c45d60a81b4
--- /dev/null
+++ b/arch/ia64/lib/copy_page_mck.S
@@ -0,0 +1,185 @@
1 | /* | ||
2 | * McKinley-optimized version of copy_page(). | ||
3 | * | ||
4 | * Copyright (C) 2002 Hewlett-Packard Co | ||
5 | * David Mosberger <davidm@hpl.hp.com> | ||
6 | * | ||
7 | * Inputs: | ||
8 | * in0: address of target page | ||
9 | * in1: address of source page | ||
10 | * Output: | ||
11 | * no return value | ||
12 | * | ||
13 | * General idea: | ||
14 | * - use regular loads and stores to prefetch data to avoid consuming M-slot just for | ||
15 | * lfetches => good for in-cache performance | ||
16 | * - avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single | ||
17 | * cycle | ||
18 | * | ||
19 | * Principle of operation: | ||
20 | * First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes. | ||
21 | * To avoid secondary misses in L2, we prefetch both source and destination with a line-size | ||
22 | * of 128 bytes. When both of these lines are in the L2 and the first half of the | ||
23 | * source line is in L1, we start copying the remaining words. The second half of the | ||
24 | * source line is prefetched in an earlier iteration, so that by the time we start | ||
25 | * accessing it, it's also present in the L1. | ||
26 | * | ||
27 | * We use a software-pipelined loop to control the overall operation. The pipeline | ||
28 | * has 2*PREFETCH_DIST+K stages. The first PREFETCH_DIST stages are used for prefetching | ||
29 | * source cache-lines. The second PREFETCH_DIST stages are used for prefetching destination | ||
30 | * cache-lines, the last K stages are used to copy the cache-line words not copied by | ||
31 | * the prefetches. The four relevant points in the pipeline are called A, B, C, D: | ||
32 | * p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line | ||
33 | * should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought | ||
34 | * into L1D and p[D] is TRUE if a cacheline needs to be copied. | ||
35 | * | ||
36 | * This all sounds very complicated, but thanks to the modulo-scheduled loop support, | ||
37 | * the resulting code is very regular and quite easy to follow (once you get the idea). | ||
38 | * | ||
39 | * As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented | ||
40 | * as the separate .prefetch_loop. Logically, this loop performs exactly like the | ||
41 | * main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed, | ||
42 | * so that each loop iteration is faster (again, good for cached case). | ||
43 | * | ||
44 | * When reading the code, it helps to keep the following picture in mind: | ||
45 | * | ||
46 | * word 0 word 1 | ||
47 | * +------+------+--- | ||
48 | * | v[x] | t1 | ^ | ||
49 | * | t2 | t3 | | | ||
50 | * | t4 | t5 | | | ||
51 | * | t6 | t7 | | 128 bytes | ||
52 | * | n[y] | t9 | | (L2 cache line) | ||
53 | * | t10 | t11 | | | ||
54 | * | t12 | t13 | | | ||
55 | * | t14 | t15 | v | ||
56 | * +------+------+--- | ||
57 | * | ||
58 | * Here, v[x] is copied by the (memory) prefetch. n[y] is loaded at p[C] | ||
59 | * to fetch the second-half of the L2 cache line into L1, and the tX words are copied in | ||
60 | * an order that avoids bank conflicts. | ||
61 | */ | ||
62 | #include <asm/asmmacro.h> | ||
63 | #include <asm/page.h> | ||
64 | |||
65 | #define PREFETCH_DIST 8 // McKinley sustains 16 outstanding L2 misses (8 ld, 8 st) | ||
66 | |||
67 | #define src0 r2 | ||
68 | #define src1 r3 | ||
69 | #define dst0 r9 | ||
70 | #define dst1 r10 | ||
71 | #define src_pre_mem r11 | ||
72 | #define dst_pre_mem r14 | ||
73 | #define src_pre_l2 r15 | ||
74 | #define dst_pre_l2 r16 | ||
75 | #define t1 r17 | ||
76 | #define t2 r18 | ||
77 | #define t3 r19 | ||
78 | #define t4 r20 | ||
79 | #define t5 t1 // alias! | ||
80 | #define t6 t2 // alias! | ||
81 | #define t7 t3 // alias! | ||
82 | #define t9 t5 // alias! | ||
83 | #define t10 t4 // alias! | ||
84 | #define t11 t7 // alias! | ||
85 | #define t12 t6 // alias! | ||
86 | #define t14 t10 // alias! | ||
87 | #define t13 r21 | ||
88 | #define t15 r22 | ||
89 | |||
90 | #define saved_lc r23 | ||
91 | #define saved_pr r24 | ||
92 | |||
93 | #define A 0 | ||
94 | #define B (PREFETCH_DIST) | ||
95 | #define C (B + PREFETCH_DIST) | ||
96 | #define D (C + 3) | ||
97 | #define N (D + 1) | ||
98 | #define Nrot ((N + 7) & ~7) | ||
99 | |||
100 | GLOBAL_ENTRY(copy_page) | ||
101 | .prologue | ||
102 | alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot | ||
103 | |||
104 | .rotr v[2*PREFETCH_DIST], n[D-C+1] | ||
105 | .rotp p[N] | ||
106 | |||
107 | .save ar.lc, saved_lc | ||
108 | mov saved_lc = ar.lc | ||
109 | .save pr, saved_pr | ||
110 | mov saved_pr = pr | ||
111 | .body | ||
112 | |||
113 | mov src_pre_mem = in1 | ||
114 | mov pr.rot = 0x10000 | ||
115 | mov ar.ec = 1 // special unrolled loop | ||
116 | |||
117 | mov dst_pre_mem = in0 | ||
118 | mov ar.lc = 2*PREFETCH_DIST - 1 | ||
119 | |||
120 | add src_pre_l2 = 8*8, in1 | ||
121 | add dst_pre_l2 = 8*8, in0 | ||
122 | add src0 = 8, in1 // first t1 src | ||
123 | add src1 = 3*8, in1 // first t3 src | ||
124 | add dst0 = 8, in0 // first t1 dst | ||
125 | add dst1 = 3*8, in0 // first t3 dst | ||
126 | mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1 | ||
127 | nop.m 0 | ||
128 | nop.i 0 | ||
129 | ;; | ||
130 | // same as .line_copy loop, but with all predicated-off instructions removed: | ||
131 | .prefetch_loop: | ||
132 | (p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 | ||
133 | (p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 | ||
134 | br.ctop.sptk .prefetch_loop | ||
135 | ;; | ||
136 | cmp.eq p16, p0 = r0, r0 // reset p16 to 1 (br.ctop cleared it to zero) | ||
137 | mov ar.lc = t1 // with 64KB pages, t1 is too big to fit in 8 bits! | ||
138 | mov ar.ec = N // # of stages in pipeline | ||
139 | ;; | ||
140 | .line_copy: | ||
141 | (p[D]) ld8 t2 = [src0], 3*8 // M0 | ||
142 | (p[D]) ld8 t4 = [src1], 3*8 // M1 | ||
143 | (p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 prefetch dst from memory | ||
144 | (p[D]) st8 [dst_pre_l2] = n[D-C], 128 // M3 prefetch dst from L2 | ||
145 | ;; | ||
146 | (p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 prefetch src from memory | ||
147 | (p[C]) ld8 n[0] = [src_pre_l2], 128 // M1 prefetch src from L2 | ||
148 | (p[D]) st8 [dst0] = t1, 8 // M2 | ||
149 | (p[D]) st8 [dst1] = t3, 8 // M3 | ||
150 | ;; | ||
151 | (p[D]) ld8 t5 = [src0], 8 | ||
152 | (p[D]) ld8 t7 = [src1], 3*8 | ||
153 | (p[D]) st8 [dst0] = t2, 3*8 | ||
154 | (p[D]) st8 [dst1] = t4, 3*8 | ||
155 | ;; | ||
156 | (p[D]) ld8 t6 = [src0], 3*8 | ||
157 | (p[D]) ld8 t10 = [src1], 8 | ||
158 | (p[D]) st8 [dst0] = t5, 8 | ||
159 | (p[D]) st8 [dst1] = t7, 3*8 | ||
160 | ;; | ||
161 | (p[D]) ld8 t9 = [src0], 3*8 | ||
162 | (p[D]) ld8 t11 = [src1], 3*8 | ||
163 | (p[D]) st8 [dst0] = t6, 3*8 | ||
164 | (p[D]) st8 [dst1] = t10, 8 | ||
165 | ;; | ||
166 | (p[D]) ld8 t12 = [src0], 8 | ||
167 | (p[D]) ld8 t14 = [src1], 8 | ||
168 | (p[D]) st8 [dst0] = t9, 3*8 | ||
169 | (p[D]) st8 [dst1] = t11, 3*8 | ||
170 | ;; | ||
171 | (p[D]) ld8 t13 = [src0], 4*8 | ||
172 | (p[D]) ld8 t15 = [src1], 4*8 | ||
173 | (p[D]) st8 [dst0] = t12, 8 | ||
174 | (p[D]) st8 [dst1] = t14, 8 | ||
175 | ;; | ||
176 | (p[D-1])ld8 t1 = [src0], 8 | ||
177 | (p[D-1])ld8 t3 = [src1], 8 | ||
178 | (p[D]) st8 [dst0] = t13, 4*8 | ||
179 | (p[D]) st8 [dst1] = t15, 4*8 | ||
180 | br.ctop.sptk .line_copy | ||
181 | ;; | ||
182 | mov ar.lc = saved_lc | ||
183 | mov pr = saved_pr, -1 | ||
184 | br.ret.sptk.many rp | ||
185 | END(copy_page) | ||
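Illustrative only: the staging from the header comment written as a C model. Stage A touches a source line, stage B touches the matching destination line PREFETCH_DIST iterations later, and stage D copies the line a few iterations after that, by which point both lines should be cached; stage C (pulling the second half of the source line into L1) is folded into the copy here. touch_line() is a hypothetical stand-in for the ld8/st8 "prefetch" accesses, not a real kernel helper:

	#define DIST	8	/* PREFETCH_DIST */

	void copy_page_mck_model(void *to, const void *from)
	{
		const char *s = from;
		char *d = to;
		long i, lines = PAGE_SIZE / 128, last = lines + 2 * DIST + 3;

		for (i = 0; i < last; i++) {
			if (i < lines)
				touch_line(s + i * 128);		/* stage A: source prefetch */
			if (i >= DIST && i - DIST < lines)
				touch_line(d + (i - DIST) * 128);	/* stage B: dest prefetch */
			if (i >= 2 * DIST + 3)				/* stage D: the actual copy */
				memcpy(d + (i - 2 * DIST - 3) * 128,
				       s + (i - 2 * DIST - 3) * 128, 128);
		}
	}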
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S
new file mode 100644
index 000000000000..c952bdc6a093
--- /dev/null
+++ b/arch/ia64/lib/copy_user.S
@@ -0,0 +1,610 @@
1 | /* | ||
2 | * | ||
3 | * Optimized version of the copy_user() routine. | ||
4 | * It is used to copy data across the kernel/user boundary. | ||
5 | * | ||
6 | * The source and destination are always on opposite side of | ||
7 | * the boundary. When reading from user space we must catch | ||
8 | * faults on loads. When writing to user space we must catch | ||
9 | * errors on stores. Note that because of the nature of the copy | ||
10 | * we don't need to worry about overlapping regions. | ||
11 | * | ||
12 | * | ||
13 | * Inputs: | ||
14 | * in0 address of source buffer | ||
15 | * in1 address of destination buffer | ||
16 | * in2 number of bytes to copy | ||
17 | * | ||
18 | * Outputs: | ||
19 | * ret0 0 in case of success. The number of bytes NOT copied in | ||
20 | * case of error. | ||
21 | * | ||
22 | * Copyright (C) 2000-2001 Hewlett-Packard Co | ||
23 | * Stephane Eranian <eranian@hpl.hp.com> | ||
24 | * | ||
25 | * Fixme: | ||
26 | * - handle the case where we have more than 16 bytes and the alignments | ||
27 | * are different. | ||
28 | * - more benchmarking | ||
29 | * - fix extraneous stop bit introduced by the EX() macro. | ||
30 | */ | ||
31 | |||
32 | #include <asm/asmmacro.h> | ||
33 | |||
34 | // | ||
35 | // Tuneable parameters | ||
36 | // | ||
37 | #define COPY_BREAK 16 // we do byte copy below (must be >=16) | ||
38 | #define PIPE_DEPTH 21 // pipe depth | ||
39 | |||
40 | #define EPI p[PIPE_DEPTH-1] | ||
41 | |||
42 | // | ||
43 | // arguments | ||
44 | // | ||
45 | #define dst in0 | ||
46 | #define src in1 | ||
47 | #define len in2 | ||
48 | |||
49 | // | ||
50 | // local registers | ||
51 | // | ||
52 | #define t1 r2 // rshift in bytes | ||
53 | #define t2 r3 // lshift in bytes | ||
54 | #define rshift r14 // right shift in bits | ||
55 | #define lshift r15 // left shift in bits | ||
56 | #define word1 r16 | ||
57 | #define word2 r17 | ||
58 | #define cnt r18 | ||
59 | #define len2 r19 | ||
60 | #define saved_lc r20 | ||
61 | #define saved_pr r21 | ||
62 | #define tmp r22 | ||
63 | #define val r23 | ||
64 | #define src1 r24 | ||
65 | #define dst1 r25 | ||
66 | #define src2 r26 | ||
67 | #define dst2 r27 | ||
68 | #define len1 r28 | ||
69 | #define enddst r29 | ||
70 | #define endsrc r30 | ||
71 | #define saved_pfs r31 | ||
72 | |||
73 | GLOBAL_ENTRY(__copy_user) | ||
74 | .prologue | ||
75 | .save ar.pfs, saved_pfs | ||
76 | alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7) | ||
77 | |||
78 | .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH] | ||
79 | .rotp p[PIPE_DEPTH] | ||
80 | |||
81 | adds len2=-1,len // br.ctop is repeat/until | ||
82 | mov ret0=r0 | ||
83 | |||
84 | ;; // RAW of cfm when len=0 | ||
85 | cmp.eq p8,p0=r0,len // check for zero length | ||
86 | .save ar.lc, saved_lc | ||
87 | mov saved_lc=ar.lc // preserve ar.lc (slow) | ||
88 | (p8) br.ret.spnt.many rp // empty memcpy() | ||
89 | ;; | ||
90 | add enddst=dst,len // first byte after end of destination | ||
91 | add endsrc=src,len // first byte after end of source | ||
92 | .save pr, saved_pr | ||
93 | mov saved_pr=pr // preserve predicates | ||
94 | |||
95 | .body | ||
96 | |||
97 | mov dst1=dst // copy because of rotation | ||
98 | mov ar.ec=PIPE_DEPTH | ||
99 | mov pr.rot=1<<16 // p16=true all others are false | ||
100 | |||
101 | mov src1=src // copy because of rotation | ||
102 | mov ar.lc=len2 // initialize lc for small count | ||
103 | cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy | ||
104 | |||
105 | xor tmp=src,dst // same alignment test prepare | ||
106 | (p10) br.cond.dptk .long_copy_user | ||
107 | ;; // RAW pr.rot/p16 ? | ||
108 | // | ||
109 | // Now we do the byte by byte loop with software pipeline | ||
110 | // | ||
111 | // p7 is necessarily false by now | ||
112 | 1: | ||
113 | EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1) | ||
114 | EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) | ||
115 | br.ctop.dptk.few 1b | ||
116 | ;; | ||
117 | mov ar.lc=saved_lc | ||
118 | mov pr=saved_pr,0xffffffffffff0000 | ||
119 | mov ar.pfs=saved_pfs // restore ar.ec | ||
120 | br.ret.sptk.many rp // end of short memcpy | ||
121 | |||
122 | // | ||
123 | // Not 8-byte aligned | ||
124 | // | ||
125 | .diff_align_copy_user: | ||
126 | // At this point we know we have more than 16 bytes to copy | ||
127 | // and also that src and dest do _not_ have the same alignment. | ||
128 | and src2=0x7,src1 // src offset | ||
129 | and dst2=0x7,dst1 // dst offset | ||
130 | ;; | ||
131 | // The basic idea is that we copy byte-by-byte at the head so | ||
132 | // that we can reach 8-byte alignment for both src1 and dst1. | ||
133 | // Then copy the body using software pipelined 8-byte copy, | ||
134 | // shifting the two back-to-back words right and left, then copy | ||
135 | // the tail by copying byte-by-byte. | ||
136 | // | ||
137 | // Fault handling. If the byte-by-byte at the head fails on the | ||
138 | // load, then restart and finish the pipeline by copying zeros | ||
139 | // to the dst1. Then copy zeros for the rest of dst1. | ||
140 | // If 8-byte software pipeline fails on the load, do the same as | ||
141 | // failure_in3 does. If the byte-by-byte at the tail fails, it is | ||
142 | // handled simply by failure_in_pipe1. | ||
143 | // | ||
144 | // The case p14 means the source has more bytes in | ||
145 | // the first word (by the shifted part), whereas the p15 needs to | ||
146 | // copy some bytes from the 2nd word of the source that has the | ||
147 | // tail of the 1st of the destination. | ||
148 | // | ||
149 | |||
150 | // | ||
151 | // Optimization. If dst1 is 8-byte aligned (quite common), we don't need | ||
152 | // to copy the head to dst1, to start 8-byte copy software pipeline. | ||
153 | // We know src1 is not 8-byte aligned in this case. | ||
154 | // | ||
155 | cmp.eq p14,p15=r0,dst2 | ||
156 | (p15) br.cond.spnt 1f | ||
157 | ;; | ||
158 | sub t1=8,src2 | ||
159 | mov t2=src2 | ||
160 | ;; | ||
161 | shl rshift=t2,3 | ||
162 | sub len1=len,t1 // set len1 | ||
163 | ;; | ||
164 | sub lshift=64,rshift | ||
165 | ;; | ||
166 | br.cond.spnt .word_copy_user | ||
167 | ;; | ||
168 | 1: | ||
169 | cmp.leu p14,p15=src2,dst2 | ||
170 | sub t1=dst2,src2 | ||
171 | ;; | ||
172 | .pred.rel "mutex", p14, p15 | ||
173 | (p14) sub word1=8,src2 // (8 - src offset) | ||
174 | (p15) sub t1=r0,t1 // absolute value | ||
175 | (p15) sub word1=8,dst2 // (8 - dst offset) | ||
176 | ;; | ||
177 | // For the case p14, we don't need to copy the shifted part to | ||
178 | // the 1st word of destination. | ||
179 | sub t2=8,t1 | ||
180 | (p14) sub word1=word1,t1 | ||
181 | ;; | ||
182 | sub len1=len,word1 // resulting len | ||
183 | (p15) shl rshift=t1,3 // in bits | ||
184 | (p14) shl rshift=t2,3 | ||
185 | ;; | ||
186 | (p14) sub len1=len1,t1 | ||
187 | adds cnt=-1,word1 | ||
188 | ;; | ||
189 | sub lshift=64,rshift | ||
190 | mov ar.ec=PIPE_DEPTH | ||
191 | mov pr.rot=1<<16 // p16=true all others are false | ||
192 | mov ar.lc=cnt | ||
193 | ;; | ||
194 | 2: | ||
195 | EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1) | ||
196 | EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) | ||
197 | br.ctop.dptk.few 2b | ||
198 | ;; | ||
199 | clrrrb | ||
200 | ;; | ||
201 | .word_copy_user: | ||
202 | cmp.gtu p9,p0=16,len1 | ||
203 | (p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy | ||
204 | ;; | ||
205 | shr.u cnt=len1,3 // number of 64-bit words | ||
206 | ;; | ||
207 | adds cnt=-1,cnt | ||
208 | ;; | ||
209 | .pred.rel "mutex", p14, p15 | ||
210 | (p14) sub src1=src1,t2 | ||
211 | (p15) sub src1=src1,t1 | ||
212 | // | ||
213 | // Now both src1 and dst1 point to an 8-byte aligned address. And | ||
214 | // we have more than 8 bytes to copy. | ||
215 | // | ||
216 | mov ar.lc=cnt | ||
217 | mov ar.ec=PIPE_DEPTH | ||
218 | mov pr.rot=1<<16 // p16=true all others are false | ||
219 | ;; | ||
220 | 3: | ||
221 | // | ||
222 | // The pipeline consists of 3 stages: | ||
223 | // 1 (p16): Load a word from src1 | ||
224 | // 2 (EPI_1): Shift right pair, saving to tmp | ||
225 | // 3 (EPI): Store tmp to dst1 | ||
226 | // | ||
227 | // To make it simple, use at least 2 (p16) loops to set up val1[n] | ||
228 | // because we need 2 back-to-back val1[] to get tmp. | ||
229 | // Note that this implies EPI_2 must be p18 or greater. | ||
230 | // | ||
231 | |||
232 | #define EPI_1 p[PIPE_DEPTH-2] | ||
233 | #define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift | ||
234 | #define CASE(pred, shift) \ | ||
235 | (pred) br.cond.spnt .copy_user_bit##shift | ||
236 | #define BODY(rshift) \ | ||
237 | .copy_user_bit##rshift: \ | ||
238 | 1: \ | ||
239 | EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \ | ||
240 | (EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \ | ||
241 | EX(3f,(p16) ld8 val1[1]=[src1],8); \ | ||
242 | (p16) mov val1[0]=r0; \ | ||
243 | br.ctop.dptk 1b; \ | ||
244 | ;; \ | ||
245 | br.cond.sptk.many .diff_align_do_tail; \ | ||
246 | 2: \ | ||
247 | (EPI) st8 [dst1]=tmp,8; \ | ||
248 | (EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \ | ||
249 | 3: \ | ||
250 | (p16) mov val1[1]=r0; \ | ||
251 | (p16) mov val1[0]=r0; \ | ||
252 | br.ctop.dptk 2b; \ | ||
253 | ;; \ | ||
254 | br.cond.sptk.many .failure_in2 | ||
255 | |||
256 | // | ||
257 | // Since the instruction 'shrp' requires a fixed 128-bit value | ||
258 | // specifying the bits to shift, we need to provide 7 cases | ||
259 | // below. | ||
260 | // | ||
261 | SWITCH(p6, 8) | ||
262 | SWITCH(p7, 16) | ||
263 | SWITCH(p8, 24) | ||
264 | SWITCH(p9, 32) | ||
265 | SWITCH(p10, 40) | ||
266 | SWITCH(p11, 48) | ||
267 | SWITCH(p12, 56) | ||
268 | ;; | ||
269 | CASE(p6, 8) | ||
270 | CASE(p7, 16) | ||
271 | CASE(p8, 24) | ||
272 | CASE(p9, 32) | ||
273 | CASE(p10, 40) | ||
274 | CASE(p11, 48) | ||
275 | CASE(p12, 56) | ||
276 | ;; | ||
277 | BODY(8) | ||
278 | BODY(16) | ||
279 | BODY(24) | ||
280 | BODY(32) | ||
281 | BODY(40) | ||
282 | BODY(48) | ||
283 | BODY(56) | ||
284 | ;; | ||
285 | .diff_align_do_tail: | ||
286 | .pred.rel "mutex", p14, p15 | ||
287 | (p14) sub src1=src1,t1 | ||
288 | (p14) adds dst1=-8,dst1 | ||
289 | (p15) sub dst1=dst1,t1 | ||
290 | ;; | ||
291 | 4: | ||
292 | // Tail correction. | ||
293 | // | ||
294 | // The problem with this pipelined loop is that the last word is not | ||
295 | // loaded and thus part of the last word written is not correct. | ||
296 | // To fix that, we simply copy the tail byte by byte. | ||
297 | |||
298 | sub len1=endsrc,src1,1 | ||
299 | clrrrb | ||
300 | ;; | ||
301 | mov ar.ec=PIPE_DEPTH | ||
302 | mov pr.rot=1<<16 // p16=true all others are false | ||
303 | mov ar.lc=len1 | ||
304 | ;; | ||
305 | 5: | ||
306 | EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1) | ||
307 | EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) | ||
308 | br.ctop.dptk.few 5b | ||
309 | ;; | ||
310 | mov ar.lc=saved_lc | ||
311 | mov pr=saved_pr,0xffffffffffff0000 | ||
312 | mov ar.pfs=saved_pfs | ||
313 | br.ret.sptk.many rp | ||
314 | |||
315 | // | ||
316 | // Beginning of long memcpy (i.e. > 16 bytes) | ||
317 | // | ||
318 | .long_copy_user: | ||
319 | tbit.nz p6,p7=src1,0 // odd alignment | ||
320 | and tmp=7,tmp | ||
321 | ;; | ||
322 | cmp.eq p10,p8=r0,tmp | ||
323 | mov len1=len // copy because of rotation | ||
324 | (p8) br.cond.dpnt .diff_align_copy_user | ||
325 | ;; | ||
326 | // At this point we know we have more than 16 bytes to copy | ||
327 | // and also that both src and dest have the same alignment | ||
328 | // which may not be the one we want. So for now we must move | ||
329 | // forward slowly until we reach 16byte alignment: no need to | ||
330 | // worry about reaching the end of buffer. | ||
331 | // | ||
332 | EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned | ||
333 | (p6) adds len1=-1,len1;; | ||
334 | tbit.nz p7,p0=src1,1 | ||
335 | ;; | ||
336 | EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned | ||
337 | (p7) adds len1=-2,len1;; | ||
338 | tbit.nz p8,p0=src1,2 | ||
339 | ;; | ||
340 | // | ||
341 | // Stop bit not required after ld4 because if we fail on ld4 | ||
342 | // we have never executed the ld1, therefore st1 is not executed. | ||
343 | // | ||
344 | EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned | ||
345 | ;; | ||
346 | EX(.failure_out,(p6) st1 [dst1]=val1[0],1) | ||
347 | tbit.nz p9,p0=src1,3 | ||
348 | ;; | ||
349 | // | ||
350 | // Stop bit not required after ld8 because if we fail on ld8 | ||
351 | // we have never executed the ld2, therefore st2 is not executed. | ||
352 | // | ||
353 | EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned | ||
354 | EX(.failure_out,(p7) st2 [dst1]=val1[1],2) | ||
355 | (p8) adds len1=-4,len1 | ||
356 | ;; | ||
357 | EX(.failure_out, (p8) st4 [dst1]=val2[0],4) | ||
358 | (p9) adds len1=-8,len1;; | ||
359 | shr.u cnt=len1,4 // number of 128-bit (2x64bit) words | ||
360 | ;; | ||
361 | EX(.failure_out, (p9) st8 [dst1]=val2[1],8) | ||
362 | tbit.nz p6,p0=len1,3 | ||
363 | cmp.eq p7,p0=r0,cnt | ||
364 | adds tmp=-1,cnt // br.ctop is repeat/until | ||
365 | (p7) br.cond.dpnt .dotail // we have less than 16 bytes left | ||
366 | ;; | ||
367 | adds src2=8,src1 | ||
368 | adds dst2=8,dst1 | ||
369 | mov ar.lc=tmp | ||
370 | ;; | ||
371 | // | ||
372 | // 16bytes/iteration | ||
373 | // | ||
374 | 2: | ||
375 | EX(.failure_in3,(p16) ld8 val1[0]=[src1],16) | ||
376 | (p16) ld8 val2[0]=[src2],16 | ||
377 | |||
378 | EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16) | ||
379 | (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16 | ||
380 | br.ctop.dptk 2b | ||
381 | ;; // RAW on src1 when fall through from loop | ||
382 | // | ||
383 | // Tail correction based on len only | ||
384 | // | ||
385 | // No matter where we come from (loop or test) the src1 pointer | ||
386 | // is 16 byte aligned AND we have less than 16 bytes to copy. | ||
387 | // | ||
388 | .dotail: | ||
389 | EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes | ||
390 | tbit.nz p7,p0=len1,2 | ||
391 | ;; | ||
392 | EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes | ||
393 | tbit.nz p8,p0=len1,1 | ||
394 | ;; | ||
395 | EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes | ||
396 | tbit.nz p9,p0=len1,0 | ||
397 | ;; | ||
398 | EX(.failure_out, (p6) st8 [dst1]=val1[0],8) | ||
399 | ;; | ||
400 | EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left | ||
401 | mov ar.lc=saved_lc | ||
402 | ;; | ||
403 | EX(.failure_out,(p7) st4 [dst1]=val1[1],4) | ||
404 | mov pr=saved_pr,0xffffffffffff0000 | ||
405 | ;; | ||
406 | EX(.failure_out, (p8) st2 [dst1]=val2[0],2) | ||
407 | mov ar.pfs=saved_pfs | ||
408 | ;; | ||
409 | EX(.failure_out, (p9) st1 [dst1]=val2[1]) | ||
410 | br.ret.sptk.many rp | ||
411 | |||
412 | |||
413 | // | ||
414 | // Here we handle the case where the byte by byte copy fails | ||
415 | // on the load. | ||
416 | // Several factors make the zeroing of the rest of the buffer kind of | ||
417 | // tricky: | ||
418 | // - the pipeline: loads/stores are not in sync (pipeline) | ||
419 | // | ||
420 | // In the same loop iteration, the dst1 pointer does not directly | ||
421 | // reflect where the faulty load was. | ||
422 | // | ||
423 | // - pipeline effect | ||
424 | // When you get a fault on load, you may have valid data from | ||
425 | // previous loads, not yet stored, still in transit. Such data must be | ||
426 | // stored normally before moving on to zeroing the rest. | ||
427 | // | ||
428 | // - single/multi dispersal independence. | ||
429 | // | ||
430 | // solution: | ||
431 | // - we don't disrupt the pipeline, i.e. data in transit in | ||
432 | // the software pipeline will eventually be moved to memory. | ||
433 | // We simply replace the load with a simple mov and keep the | ||
434 | // pipeline going. We can't really do this inline because | ||
435 | // p16 is always reset to 1 when lc > 0. | ||
436 | // | ||
437 | .failure_in_pipe1: | ||
438 | sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied | ||
439 | 1: | ||
440 | (p16) mov val1[0]=r0 | ||
441 | (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1 | ||
442 | br.ctop.dptk 1b | ||
443 | ;; | ||
444 | mov pr=saved_pr,0xffffffffffff0000 | ||
445 | mov ar.lc=saved_lc | ||
446 | mov ar.pfs=saved_pfs | ||
447 | br.ret.sptk.many rp | ||
448 | |||
449 | // | ||
450 | // This is the case where the byte by byte copy fails on the load | ||
451 | // when we copy the head. We need to finish the pipeline and copy | ||
452 | // zeros for the rest of the destination. Since this happens | ||
453 | // at the top we still need to fill the body and tail. | ||
454 | .failure_in_pipe2: | ||
455 | sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied | ||
456 | 2: | ||
457 | (p16) mov val1[0]=r0 | ||
458 | (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1 | ||
459 | br.ctop.dptk 2b | ||
460 | ;; | ||
461 | sub len=enddst,dst1,1 // precompute len | ||
462 | br.cond.dptk.many .failure_in1bis | ||
463 | ;; | ||
464 | |||
465 | // | ||
466 | // Here we handle the head & tail part when we check for alignment. | ||
467 | // The following code handles only the load failures. The | ||
468 | // main difficulty comes from the fact that loads/stores are | ||
469 | // scheduled. So when you fail on a load, the stores corresponding | ||
470 | // to previous successful loads must be executed. | ||
471 | // | ||
472 | // However some simplifications are possible given the way | ||
473 | // things work. | ||
474 | // | ||
475 | // 1) HEAD | ||
476 | // Theory of operation: | ||
477 | // | ||
478 | // Page A | Page B | ||
479 | // ---------|----- | ||
480 | // 1|8 x | ||
481 | // 1 2|8 x | ||
482 | // 4|8 x | ||
483 | // 1 4|8 x | ||
484 | // 2 4|8 x | ||
485 | // 1 2 4|8 x | ||
486 | // |1 | ||
487 | // |2 x | ||
488 | // |4 x | ||
489 | // | ||
490 | // page_size >= 4k (2^12). (x means 4, 2, 1) | ||
491 | // Here we suppose Page A exists and Page B does not. | ||
492 | // | ||
493 | // As we move towards eight byte alignment we may encounter faults. | ||
494 | // The numbers on each page show the size of the load (current alignment). | ||
495 | // | ||
496 | // Key point: | ||
497 | // - if you fail on 1, 2, 4 then you have never executed any smaller | ||
498 | // size loads, e.g. failing ld4 means no ld1 nor ld2 executed | ||
499 | // before. | ||
500 | // | ||
501 | // This allows us to simplify the cleanup code, because basically you | ||
502 | // only have to worry about "pending" stores in the case of a failing | ||
503 | // ld8(). Given the way the code is written today, this means only | ||
504 | // worry about st2, st4. There we can use the information encapsulated | ||
505 | // into the predicates. | ||
506 | // | ||
507 | // Other key point: | ||
508 | // - if you fail on the ld8 in the head, it means you went straight | ||
509 | // to it, i.e. 8byte alignment within a nonexistent page. | ||
510 | // Again this comes from the fact that if you crossed just for the ld8 then | ||
511 | // you are 8byte aligned but also 16byte aligned, therefore you would | ||
512 | // either go for the 16byte copy loop OR the ld8 in the tail part. | ||
513 | // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible | ||
514 | // because it would mean you had 15bytes to copy in which case you | ||
515 | // would have defaulted to the byte by byte copy. | ||
516 | // | ||
517 | // | ||
518 | // 2) TAIL | ||
519 | // Here we know we have less than 16 bytes AND we are either 8 or 16 byte | ||
520 | // aligned. | ||
521 | // | ||
522 | // Key point: | ||
523 | // This means that we either: | ||
524 | // - are right on a page boundary | ||
525 | // OR | ||
526 | // - are at more than 16 bytes from a page boundary with | ||
527 | // at most 15 bytes to copy: no chance of crossing. | ||
528 | // | ||
529 | // This allows us to assume that if we fail on a load we haven't possibly | ||
530 | // executed any of the previous (tail) ones, so we don't need to do | ||
531 | // any stores. For instance, if we fail on ld2, this means we had | ||
532 | // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4. | ||
533 | // | ||
534 | // This means that we are in a situation similar to a fault in the | ||
535 | // head part. That's nice! | ||
536 | // | ||
537 | .failure_in1: | ||
538 | sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied | ||
539 | sub len=endsrc,src1,1 | ||
540 | // | ||
541 | // we know that ret0 can never be zero at this point | ||
542 | // because we failed while trying to do a load, i.e. there is still | ||
543 | // some work to do. | ||
544 | // The failure_in1bis and length problem is taken care of at the | ||
545 | // calling side. | ||
546 | // | ||
547 | ;; | ||
548 | .failure_in1bis: // from (.failure_in3) | ||
549 | mov ar.lc=len // Continue with a stupid byte store. | ||
550 | ;; | ||
551 | 5: | ||
552 | st1 [dst1]=r0,1 | ||
553 | br.cloop.dptk 5b | ||
554 | ;; | ||
555 | mov pr=saved_pr,0xffffffffffff0000 | ||
556 | mov ar.lc=saved_lc | ||
557 | mov ar.pfs=saved_pfs | ||
558 | br.ret.sptk.many rp | ||
559 | |||
560 | // | ||
561 | // Here we simply restart the loop but instead | ||
562 | // of doing loads we fill the pipeline with zeroes | ||
563 | // We can't simply store r0 because we may have valid | ||
564 | // data in transit in the pipeline. | ||
565 | // ar.lc and ar.ec are setup correctly at this point | ||
566 | // | ||
567 | // we MUST use src1/endsrc here and not dst1/enddst because | ||
568 | // of the pipeline effect. | ||
569 | // | ||
570 | .failure_in3: | ||
571 | sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied | ||
572 | ;; | ||
573 | 2: | ||
574 | (p16) mov val1[0]=r0 | ||
575 | (p16) mov val2[0]=r0 | ||
576 | (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16 | ||
577 | (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16 | ||
578 | br.ctop.dptk 2b | ||
579 | ;; | ||
580 | cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ? | ||
581 | sub len=enddst,dst1,1 // precompute len | ||
582 | (p6) br.cond.dptk .failure_in1bis | ||
583 | ;; | ||
584 | mov pr=saved_pr,0xffffffffffff0000 | ||
585 | mov ar.lc=saved_lc | ||
586 | mov ar.pfs=saved_pfs | ||
587 | br.ret.sptk.many rp | ||
588 | |||
589 | .failure_in2: | ||
590 | sub ret0=endsrc,src1 | ||
591 | cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ? | ||
592 | sub len=enddst,dst1,1 // precompute len | ||
593 | (p6) br.cond.dptk .failure_in1bis | ||
594 | ;; | ||
595 | mov pr=saved_pr,0xffffffffffff0000 | ||
596 | mov ar.lc=saved_lc | ||
597 | mov ar.pfs=saved_pfs | ||
598 | br.ret.sptk.many rp | ||
599 | |||
600 | // | ||
601 | // handling of failures on stores: that's the easy part | ||
602 | // | ||
603 | .failure_out: | ||
604 | sub ret0=enddst,dst1 | ||
605 | mov pr=saved_pr,0xffffffffffff0000 | ||
606 | mov ar.lc=saved_lc | ||
607 | |||
608 | mov ar.pfs=saved_pfs | ||
609 | br.ret.sptk.many rp | ||
610 | END(__copy_user) | ||
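
A rough C model of the fault-handling contract described in the comments above (illustrative only, not the kernel routine: copy_user_model, fault_at and done are made-up names, and the page fault is simulated by a pointer comparison). On a load fault the remaining destination bytes are zeroed and the number of bytes NOT copied is returned, which is what the .failure_in* paths compute in ret0.

    #include <stddef.h>
    #include <string.h>

    /* Sketch of __copy_user's failure semantics; fault_at marks the first
     * unreadable source byte (a stand-in for a faulting page). */
    static size_t copy_user_model(unsigned char *dst, const unsigned char *src,
                                  size_t len, const unsigned char *fault_at)
    {
            size_t done;

            for (done = 0; done < len; done++) {
                    if (src + done == fault_at) {
                            /* load faulted: zero what could not be copied,
                             * as .failure_in1bis does with st1 [dst1]=r0 */
                            memset(dst + done, 0, len - done);
                            return len - done;      /* bytes NOT copied (ret0) */
                    }
                    dst[done] = src[done];
            }
            return 0;                               /* everything copied */
    }
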
diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c new file mode 100644 index 000000000000..36866e8a5d2b --- /dev/null +++ b/arch/ia64/lib/csum_partial_copy.c | |||
@@ -0,0 +1,151 @@ | |||
1 | /* | ||
2 | * Network Checksum & Copy routine | ||
3 | * | ||
4 | * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co | ||
5 | * Stephane Eranian <eranian@hpl.hp.com> | ||
6 | * | ||
7 | * Most of the code has been imported from Linux/Alpha | ||
8 | */ | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/string.h> | ||
13 | |||
14 | #include <asm/uaccess.h> | ||
15 | |||
16 | /* | ||
17 | * XXX Fixme: those 2 inlines are meant for debugging and will go away | ||
18 | */ | ||
19 | static inline unsigned | ||
20 | short from64to16(unsigned long x) | ||
21 | { | ||
22 | /* add up 32-bit words for 33 bits */ | ||
23 | x = (x & 0xffffffff) + (x >> 32); | ||
24 | /* add up 16-bit and 17-bit words for 17+c bits */ | ||
25 | x = (x & 0xffff) + (x >> 16); | ||
26 | /* add up 16-bit and 2-bit for 16+c bit */ | ||
27 | x = (x & 0xffff) + (x >> 16); | ||
28 | /* add up carry.. */ | ||
29 | x = (x & 0xffff) + (x >> 16); | ||
30 | return x; | ||
31 | } | ||
32 | |||
33 | static inline | ||
34 | unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum) | ||
35 | { | ||
36 | int odd, count; | ||
37 | unsigned long result = (unsigned long)psum; | ||
38 | |||
39 | if (len <= 0) | ||
40 | goto out; | ||
41 | odd = 1 & (unsigned long) buff; | ||
42 | if (odd) { | ||
43 | result = *buff << 8; | ||
44 | len--; | ||
45 | buff++; | ||
46 | } | ||
47 | count = len >> 1; /* nr of 16-bit words.. */ | ||
48 | if (count) { | ||
49 | if (2 & (unsigned long) buff) { | ||
50 | result += *(unsigned short *) buff; | ||
51 | count--; | ||
52 | len -= 2; | ||
53 | buff += 2; | ||
54 | } | ||
55 | count >>= 1; /* nr of 32-bit words.. */ | ||
56 | if (count) { | ||
57 | if (4 & (unsigned long) buff) { | ||
58 | result += *(unsigned int *) buff; | ||
59 | count--; | ||
60 | len -= 4; | ||
61 | buff += 4; | ||
62 | } | ||
63 | count >>= 1; /* nr of 64-bit words.. */ | ||
64 | if (count) { | ||
65 | unsigned long carry = 0; | ||
66 | do { | ||
67 | unsigned long w = *(unsigned long *) buff; | ||
68 | count--; | ||
69 | buff += 8; | ||
70 | result += carry; | ||
71 | result += w; | ||
72 | carry = (w > result); | ||
73 | } while (count); | ||
74 | result += carry; | ||
75 | result = (result & 0xffffffff) + (result >> 32); | ||
76 | } | ||
77 | if (len & 4) { | ||
78 | result += *(unsigned int *) buff; | ||
79 | buff += 4; | ||
80 | } | ||
81 | } | ||
82 | if (len & 2) { | ||
83 | result += *(unsigned short *) buff; | ||
84 | buff += 2; | ||
85 | } | ||
86 | } | ||
87 | if (len & 1) | ||
88 | result += *buff; | ||
89 | |||
90 | result = from64to16(result); | ||
91 | |||
92 | if (odd) | ||
93 | result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); | ||
94 | |||
95 | out: | ||
96 | return result; | ||
97 | } | ||
98 | |||
99 | /* | ||
100 | * XXX Fixme | ||
101 | * | ||
102 | * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS. | ||
103 | * But it's very tricky to get right even in C. | ||
104 | */ | ||
105 | extern unsigned long do_csum(const unsigned char *, long); | ||
106 | |||
107 | static unsigned int | ||
108 | do_csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst, | ||
109 | int len, unsigned int psum, int *errp) | ||
110 | { | ||
111 | unsigned long result; | ||
112 | |||
113 | /* XXX Fixme | ||
114 | * for now we separate the copy from checksum for obvious | ||
115 | * alignment difficulties. Look at the Alpha code and you'll be | ||
116 | * scared. | ||
117 | */ | ||
118 | |||
119 | if (__copy_from_user(dst, src, len) != 0 && errp) | ||
120 | *errp = -EFAULT; | ||
121 | |||
122 | result = do_csum(dst, len); | ||
123 | |||
124 | /* add in old sum, and carry.. */ | ||
125 | result += psum; | ||
126 | /* 32+c bits -> 32 bits */ | ||
127 | result = (result & 0xffffffff) + (result >> 32); | ||
128 | return result; | ||
129 | } | ||
130 | |||
131 | unsigned int | ||
132 | csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst, | ||
133 | int len, unsigned int sum, int *errp) | ||
134 | { | ||
135 | if (!access_ok(VERIFY_READ, src, len)) { | ||
136 | *errp = -EFAULT; | ||
137 | memset(dst, 0, len); | ||
138 | return sum; | ||
139 | } | ||
140 | |||
141 | return do_csum_partial_copy_from_user(src, dst, len, sum, errp); | ||
142 | } | ||
143 | |||
144 | unsigned int | ||
145 | csum_partial_copy_nocheck(const unsigned char __user *src, unsigned char *dst, | ||
146 | int len, unsigned int sum) | ||
147 | { | ||
148 | return do_csum_partial_copy_from_user(src, dst, len, sum, NULL); | ||
149 | } | ||
150 | |||
151 | EXPORT_SYMBOL(csum_partial_copy_nocheck); | ||
diff --git a/arch/ia64/lib/dec_and_lock.c b/arch/ia64/lib/dec_and_lock.c new file mode 100644 index 000000000000..c7ce92f968f1 --- /dev/null +++ b/arch/ia64/lib/dec_and_lock.c | |||
@@ -0,0 +1,42 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Jerome Marchand, Bull S.A. | ||
3 | * Cleaned up by David Mosberger-Tang <davidm@hpl.hp.com> | ||
4 | * | ||
5 | * This file is released under the GPLv2, or at your option any later version. | ||
6 | * | ||
7 | * ia64 version of "atomic_dec_and_lock()" using the atomic "cmpxchg" instruction. This | ||
8 | * code is an adaptation of the x86 version of "atomic_dec_and_lock()". | ||
9 | */ | ||
10 | |||
11 | #include <linux/compiler.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/spinlock.h> | ||
14 | #include <asm/atomic.h> | ||
15 | |||
16 | /* | ||
17 | * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock. Both of these | ||
18 | * operations have to be done atomically, so that the count doesn't drop to zero without | ||
19 | * acquiring the spinlock first. | ||
20 | */ | ||
21 | int | ||
22 | _atomic_dec_and_lock (atomic_t *refcount, spinlock_t *lock) | ||
23 | { | ||
24 | int old, new; | ||
25 | |||
26 | do { | ||
27 | old = atomic_read(refcount); | ||
28 | new = old - 1; | ||
29 | |||
30 | if (unlikely (old == 1)) { | ||
31 | /* oops, we may be decrementing to zero, do it the slow way... */ | ||
32 | spin_lock(lock); | ||
33 | if (atomic_dec_and_test(refcount)) | ||
34 | return 1; | ||
35 | spin_unlock(lock); | ||
36 | return 0; | ||
37 | } | ||
38 | } while (cmpxchg(&refcount->counter, old, new) != old); | ||
39 | return 0; | ||
40 | } | ||
41 | |||
42 | EXPORT_SYMBOL(_atomic_dec_and_lock); | ||
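
A typical caller of the routine above looks like the following sketch (hypothetical: struct obj, obj_list_lock and obj_put() are illustrative names, not code from this tree). References are dropped with the fast cmpxchg path; only the final put takes the list lock, unlinks and frees the object.

    #include <linux/list.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>
    #include <asm/atomic.h>

    struct obj {
            atomic_t         refcount;
            struct list_head node;
    };

    static spinlock_t obj_list_lock = SPIN_LOCK_UNLOCKED;

    static void obj_put(struct obj *o)
    {
            if (atomic_dec_and_lock(&o->refcount, &obj_list_lock)) {
                    /* count reached zero and obj_list_lock is held */
                    list_del(&o->node);
                    spin_unlock(&obj_list_lock);
                    kfree(o);
            }
    }
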
diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S new file mode 100644 index 000000000000..6bec2fc9f5b2 --- /dev/null +++ b/arch/ia64/lib/do_csum.S | |||
@@ -0,0 +1,323 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Optimized version of the standard do_csum() function | ||
4 | * | ||
5 | * Return: a 64bit quantity containing the 16bit Internet checksum | ||
6 | * | ||
7 | * Inputs: | ||
8 | * in0: address of buffer to checksum (char *) | ||
9 | * in1: length of the buffer (int) | ||
10 | * | ||
11 | * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co | ||
12 | * Stephane Eranian <eranian@hpl.hp.com> | ||
13 | * | ||
14 | * 02/04/22 Ken Chen <kenneth.w.chen@intel.com> | ||
15 | * Data locality study on the checksum buffer. | ||
16 | * More optimization cleanup - remove excessive stop bits. | ||
17 | * 02/04/08 David Mosberger <davidm@hpl.hp.com> | ||
18 | * More cleanup and tuning. | ||
19 | * 01/04/18 Jun Nakajima <jun.nakajima@intel.com> | ||
20 | * Clean up and optimize the software pipeline, loading two | ||
21 | * back-to-back 8-byte words per loop. Clean up the initialization | ||
22 | * for the loop. Support the cases where load latency = 1 or 2. | ||
23 | * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default). | ||
24 | */ | ||
25 | |||
26 | #include <asm/asmmacro.h> | ||
27 | |||
28 | // | ||
29 | // Theory of operations: | ||
30 | // The goal is to go as quickly as possible to the point where | ||
31 | // we can checksum 16 bytes/loop. Before reaching that point we must | ||
32 | // take care of incorrect alignment of first byte. | ||
33 | // | ||
34 | // The code hereafter also takes care of the "tail" part of the buffer | ||
35 | // before entering the core loop, if any. The checksum is a sum so it | ||
36 | // allows us to commute operations. So we do the "head" and "tail" | ||
37 | // first to finish at full speed in the body. Once we get the head and | ||
38 | // tail values, we feed them into the pipeline, very handy initialization. | ||
39 | // | ||
40 | // Of course we deal with the special case where the whole buffer fits | ||
41 | // into one 8 byte word. In this case we have only one entry in the pipeline. | ||
42 | // | ||
43 | // We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for | ||
44 | // possible load latency and also to accommodate the head and tail. | ||
45 | // | ||
46 | // The end of the function deals with folding the checksum from 64bits | ||
47 | // down to 16bits taking care of the carry. | ||
48 | // | ||
49 | // This version avoids synchronization in the core loop by also using a | ||
50 | // pipeline for the accumulation of the checksum in resultx[] (x=1,2). | ||
51 | // | ||
52 | // wordx[] (x=1,2) | ||
53 | // |---| | ||
54 | // | | 0 : new value loaded in pipeline | ||
55 | // |---| | ||
56 | // | | - : in transit data | ||
57 | // |---| | ||
58 | // | | LOAD_LATENCY : current value to add to checksum | ||
59 | // |---| | ||
60 | // | | LOAD_LATENCY+1 : previous value added to checksum | ||
61 | // |---| (previous iteration) | ||
62 | // | ||
63 | // resultx[] (x=1,2) | ||
64 | // |---| | ||
65 | // | | 0 : initial value | ||
66 | // |---| | ||
67 | // | | LOAD_LATENCY-1 : new checksum | ||
68 | // |---| | ||
69 | // | | LOAD_LATENCY : previous value of checksum | ||
70 | // |---| | ||
71 | // | | LOAD_LATENCY+1 : final checksum when out of the loop | ||
72 | // |---| | ||
73 | // | ||
74 | // | ||
75 | // See RFC1071 "Computing the Internet Checksum" for various techniques for | ||
76 | // calculating the Internet checksum. | ||
77 | // | ||
78 | // NOT YET DONE: | ||
79 | // - Maybe another algorithm which would take care of the folding at the | ||
80 | // end in a different manner | ||
81 | // - Work with people more knowledgeable than me on the network stack | ||
82 | // to figure out if we could not split the function depending on the | ||
83 | // type of packet or alignment we get. Like the ip_fast_csum() routine | ||
84 | // where we know we have at least 20bytes worth of data to checksum. | ||
85 | // - Do a better job of handling small packets. | ||
86 | // - Note on prefetching: it was found that under various loads, i.e. ftp read/write, | ||
87 | // nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8% | ||
88 | // on the data that buffer points to (partly because the checksum is often preceded by | ||
89 | // a copy_from_user()). This finding indicates that lfetch will not be beneficial since | ||
90 | // the data is already in the cache. | ||
91 | // | ||
92 | |||
93 | #define saved_pfs r11 | ||
94 | #define hmask r16 | ||
95 | #define tmask r17 | ||
96 | #define first1 r18 | ||
97 | #define firstval r19 | ||
98 | #define firstoff r20 | ||
99 | #define last r21 | ||
100 | #define lastval r22 | ||
101 | #define lastoff r23 | ||
102 | #define saved_lc r24 | ||
103 | #define saved_pr r25 | ||
104 | #define tmp1 r26 | ||
105 | #define tmp2 r27 | ||
106 | #define tmp3 r28 | ||
107 | #define carry1 r29 | ||
108 | #define carry2 r30 | ||
109 | #define first2 r31 | ||
110 | |||
111 | #define buf in0 | ||
112 | #define len in1 | ||
113 | |||
114 | #define LOAD_LATENCY 2 // XXX fix me | ||
115 | |||
116 | #if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2) | ||
117 | # error "Only 1 or 2 is supported/tested for LOAD_LATENCY." | ||
118 | #endif | ||
119 | |||
120 | #define PIPE_DEPTH (LOAD_LATENCY+2) | ||
121 | #define ELD p[LOAD_LATENCY] // end of load | ||
122 | #define ELD_1 p[LOAD_LATENCY+1] // and next stage | ||
123 | |||
124 | // unsigned long do_csum(unsigned char *buf,long len) | ||
125 | |||
126 | GLOBAL_ENTRY(do_csum) | ||
127 | .prologue | ||
128 | .save ar.pfs, saved_pfs | ||
129 | alloc saved_pfs=ar.pfs,2,16,0,16 | ||
130 | .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2] | ||
131 | .rotp p[PIPE_DEPTH], pC1[2], pC2[2] | ||
132 | mov ret0=r0 // in case we have zero length | ||
133 | cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len) | ||
134 | ;; | ||
135 | add tmp1=buf,len // last byte's address | ||
136 | .save pr, saved_pr | ||
137 | mov saved_pr=pr // preserve predicates (rotation) | ||
138 | (p6) br.ret.spnt.many rp // return if zero or negative length | ||
139 | |||
140 | mov hmask=-1 // initialize head mask | ||
141 | tbit.nz p15,p0=buf,0 // is buf an odd address? | ||
142 | and first1=-8,buf // 8-byte align down address of first1 element | ||
143 | |||
144 | and firstoff=7,buf // how many bytes off for first1 element | ||
145 | mov tmask=-1 // initialize tail mask | ||
146 | |||
147 | ;; | ||
148 | adds tmp2=-1,tmp1 // last-1 | ||
149 | and lastoff=7,tmp1 // how many bytes off for last element | ||
150 | ;; | ||
151 | sub tmp1=8,lastoff // complement to lastoff | ||
152 | and last=-8,tmp2 // address of word containing last byte | ||
153 | ;; | ||
154 | sub tmp3=last,first1 // tmp3=distance from first1 to last | ||
155 | .save ar.lc, saved_lc | ||
156 | mov saved_lc=ar.lc // save lc | ||
157 | cmp.eq p8,p9=last,first1 // everything fits in one word ? | ||
158 | |||
159 | ld8 firstval=[first1],8 // load, ahead of time, "first1" word | ||
160 | and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0 | ||
161 | shl tmp2=firstoff,3 // number of bits | ||
162 | ;; | ||
163 | (p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed | ||
164 | shl tmp1=tmp1,3 // number of bits | ||
165 | (p9) adds tmp3=-8,tmp3 // effectively loaded | ||
166 | ;; | ||
167 | (p8) mov lastval=r0 // we don't need lastval if first1==last | ||
168 | shl hmask=hmask,tmp2 // build head mask, mask off [0,first1off[ | ||
169 | shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff] | ||
170 | ;; | ||
171 | .body | ||
172 | #define count tmp3 | ||
173 | |||
174 | (p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only | ||
175 | (p9) and word2[0]=lastval,tmask // mask the last word as appropriate | ||
176 | shr.u count=count,3 // how many 8-byte? | ||
177 | ;; | ||
178 | // If count is odd, finish this 8-byte word so that we can | ||
179 | // load two back-to-back 8-byte words per loop thereafter. | ||
180 | and word1[0]=firstval,hmask // and mask it as appropriate | ||
181 | tbit.nz p10,p11=count,0 // if (count is odd) | ||
182 | ;; | ||
183 | (p8) mov result1[0]=word1[0] | ||
184 | (p9) add result1[0]=word1[0],word2[0] | ||
185 | ;; | ||
186 | cmp.ltu p6,p0=result1[0],word1[0] // check the carry | ||
187 | cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte | ||
188 | ;; | ||
189 | (p6) adds result1[0]=1,result1[0] | ||
190 | (p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word) | ||
191 | (p11) br.cond.dptk .do_csum16 // if (count is even) | ||
192 | |||
193 | // Here count is odd. | ||
194 | ld8 word1[1]=[first1],8 // load an 8-byte word | ||
195 | cmp.eq p9,p10=1,count // if (count == 1) | ||
196 | adds count=-1,count // loaded an 8-byte word | ||
197 | ;; | ||
198 | add result1[0]=result1[0],word1[1] | ||
199 | ;; | ||
200 | cmp.ltu p6,p0=result1[0],word1[1] | ||
201 | ;; | ||
202 | (p6) adds result1[0]=1,result1[0] | ||
203 | (p9) br.cond.sptk .do_csum_exit // if (count == 1) exit | ||
204 | // Fall through to calculate the checksum, feeding result1[0] as | ||
205 | // the initial value in result1[0]. | ||
206 | // | ||
207 | // Calculate the checksum loading two 8-byte words per loop. | ||
208 | // | ||
209 | .do_csum16: | ||
210 | add first2=8,first1 | ||
211 | shr.u count=count,1 // we do 16 bytes per loop | ||
212 | ;; | ||
213 | adds count=-1,count | ||
214 | mov carry1=r0 | ||
215 | mov carry2=r0 | ||
216 | brp.loop.imp 1f,2f | ||
217 | ;; | ||
218 | mov ar.ec=PIPE_DEPTH | ||
219 | mov ar.lc=count // set lc | ||
220 | mov pr.rot=1<<16 | ||
221 | // result1[0] must be initialized in advance. | ||
222 | mov result2[0]=r0 | ||
223 | ;; | ||
224 | .align 32 | ||
225 | 1: | ||
226 | (ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1] | ||
227 | (pC1[1])adds carry1=1,carry1 | ||
228 | (ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1] | ||
229 | (pC2[1])adds carry2=1,carry2 | ||
230 | (ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY] | ||
231 | (ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY] | ||
232 | 2: | ||
233 | (p[0]) ld8 word1[0]=[first1],16 | ||
234 | (p[0]) ld8 word2[0]=[first2],16 | ||
235 | br.ctop.sptk 1b | ||
236 | ;; | ||
237 | // Since len is a 32-bit value, carry cannot be larger than a 64-bit value. | ||
238 | (pC1[1])adds carry1=1,carry1 // since we miss the last one | ||
239 | (pC2[1])adds carry2=1,carry2 | ||
240 | ;; | ||
241 | add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1 | ||
242 | add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2 | ||
243 | ;; | ||
244 | cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1 | ||
245 | cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2 | ||
246 | ;; | ||
247 | (p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1] | ||
248 | (p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1] | ||
249 | ;; | ||
250 | add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1] | ||
251 | ;; | ||
252 | cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1] | ||
253 | ;; | ||
254 | (p6) adds result1[0]=1,result1[0] | ||
255 | ;; | ||
256 | .do_csum_exit: | ||
257 | // | ||
258 | // now fold 64 into 16 bits taking care of carry | ||
259 | // that's not very good because it has lots of sequentiality | ||
260 | // | ||
261 | mov tmp3=0xffff | ||
262 | zxt4 tmp1=result1[0] | ||
263 | shr.u tmp2=result1[0],32 | ||
264 | ;; | ||
265 | add result1[0]=tmp1,tmp2 | ||
266 | ;; | ||
267 | and tmp1=result1[0],tmp3 | ||
268 | shr.u tmp2=result1[0],16 | ||
269 | ;; | ||
270 | add result1[0]=tmp1,tmp2 | ||
271 | ;; | ||
272 | and tmp1=result1[0],tmp3 | ||
273 | shr.u tmp2=result1[0],16 | ||
274 | ;; | ||
275 | add result1[0]=tmp1,tmp2 | ||
276 | ;; | ||
277 | and tmp1=result1[0],tmp3 | ||
278 | shr.u tmp2=result1[0],16 | ||
279 | ;; | ||
280 | add ret0=tmp1,tmp2 | ||
281 | mov pr=saved_pr,0xffffffffffff0000 | ||
282 | ;; | ||
283 | // if buf was odd then swap bytes | ||
284 | mov ar.pfs=saved_pfs // restore ar.ec | ||
285 | (p15) mux1 ret0=ret0,@rev // reverse word | ||
286 | ;; | ||
287 | mov ar.lc=saved_lc | ||
288 | (p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes | ||
289 | br.ret.sptk.many rp | ||
290 | |||
291 | // I (Jun Nakajima) wrote equivalent code (see below), but it was | ||
292 | // not much better than the original. So keep the original there so that | ||
293 | // someone else can challenge it. | ||
294 | // | ||
295 | // shr.u word1[0]=result1[0],32 | ||
296 | // zxt4 result1[0]=result1[0] | ||
297 | // ;; | ||
298 | // add result1[0]=result1[0],word1[0] | ||
299 | // ;; | ||
300 | // zxt2 result2[0]=result1[0] | ||
301 | // extr.u word1[0]=result1[0],16,16 | ||
302 | // shr.u carry1=result1[0],32 | ||
303 | // ;; | ||
304 | // add result2[0]=result2[0],word1[0] | ||
305 | // ;; | ||
306 | // add result2[0]=result2[0],carry1 | ||
307 | // ;; | ||
308 | // extr.u ret0=result2[0],16,16 | ||
309 | // ;; | ||
310 | // add ret0=ret0,result2[0] | ||
311 | // ;; | ||
312 | // zxt2 ret0=ret0 | ||
313 | // mov ar.pfs=saved_pfs // restore ar.ec | ||
314 | // mov pr=saved_pr,0xffffffffffff0000 | ||
315 | // ;; | ||
316 | // // if buf was odd then swap bytes | ||
317 | // mov ar.lc=saved_lc | ||
318 | //(p15) mux1 ret0=ret0,@rev // reverse word | ||
319 | // ;; | ||
320 | //(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes | ||
321 | // br.ret.sptk.many rp | ||
322 | |||
323 | END(do_csum) | ||
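
The head/tail masking in the "Theory of operations" block above can be modeled in C as below (illustrative only, little-endian view; head_mask and tail_mask are made-up names, not kernel API). The first and last 8-byte words are loaded whole and the bytes outside the buffer are cleared before they enter the sum; when the whole buffer fits in one word, the two masks are simply ANDed together, as the (p8) path does.

    #include <stdint.h>

    /* bytes below buf in the first aligned word are cleared */
    static uint64_t head_mask(uintptr_t buf)
    {
            unsigned firstoff = buf & 7;
            return ~UINT64_C(0) << (8 * firstoff);
    }

    /* bytes past buf+len-1 in the last aligned word are cleared */
    static uint64_t tail_mask(uintptr_t buf, unsigned long len)
    {
            unsigned lastoff = (buf + len) & 7;
            return ~UINT64_C(0) >> (8 * ((8 - lastoff) & 7));
    }
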
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S new file mode 100644 index 000000000000..29c802b19669 --- /dev/null +++ b/arch/ia64/lib/flush.S | |||
@@ -0,0 +1,39 @@ | |||
1 | /* | ||
2 | * Cache flushing routines. | ||
3 | * | ||
4 | * Copyright (C) 1999-2001 Hewlett-Packard Co | ||
5 | * Copyright (C) 1999-2001 David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | */ | ||
7 | #include <asm/asmmacro.h> | ||
8 | #include <asm/page.h> | ||
9 | |||
10 | /* | ||
11 | * flush_icache_range(start,end) | ||
12 | * Must flush range from start to end-1 but nothing else (need to | ||
13 | * be careful not to touch addresses that may be unmapped). | ||
14 | */ | ||
15 | GLOBAL_ENTRY(flush_icache_range) | ||
16 | .prologue | ||
17 | alloc r2=ar.pfs,2,0,0,0 | ||
18 | sub r8=in1,in0,1 | ||
19 | ;; | ||
20 | shr.u r8=r8,5 // we flush 32 bytes per iteration | ||
21 | .save ar.lc, r3 | ||
22 | mov r3=ar.lc // save ar.lc | ||
23 | ;; | ||
24 | |||
25 | .body | ||
26 | |||
27 | mov ar.lc=r8 | ||
28 | ;; | ||
29 | .Loop: fc in0 // issuable on M0 only | ||
30 | add in0=32,in0 | ||
31 | br.cloop.sptk.few .Loop | ||
32 | ;; | ||
33 | sync.i | ||
34 | ;; | ||
35 | srlz.i | ||
36 | ;; | ||
37 | mov ar.lc=r3 // restore ar.lc | ||
38 | br.ret.sptk.many rp | ||
39 | END(flush_icache_range) | ||
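
A small C model of the loop above (illustrative; ia64_fc_stub() is a placeholder for the fc instruction, not an interface in this tree): one fc covers 32 bytes, so the routine issues ((end - start - 1) >> 5) + 1 of them and then serializes with sync.i and srlz.i.

    /* placeholder for the fc instruction, for illustration only */
    static void ia64_fc_stub(unsigned long addr) { (void) addr; }

    static void model_flush_icache_range(unsigned long start, unsigned long end)
    {
            unsigned long count = ((end - start - 1) >> 5) + 1;  /* 32 bytes per fc */

            while (count--) {
                    ia64_fc_stub(start);
                    start += 32;
            }
            /* the real routine then executes sync.i and srlz.i */
    }
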
diff --git a/arch/ia64/lib/idiv32.S b/arch/ia64/lib/idiv32.S new file mode 100644 index 000000000000..2ac28bf0a662 --- /dev/null +++ b/arch/ia64/lib/idiv32.S | |||
@@ -0,0 +1,83 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2000 Hewlett-Packard Co | ||
3 | * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> | ||
4 | * | ||
5 | * 32-bit integer division. | ||
6 | * | ||
7 | * This code is based on the application note entitled "Divide, Square Root | ||
8 | * and Remainder Algorithms for the IA-64 Architecture". This document | ||
9 | * is available as Intel document number 248725-002 or via the web at | ||
10 | * http://developer.intel.com/software/opensource/numerics/ | ||
11 | * | ||
12 | * For more details on the theory behind these algorithms, see "IA-64 | ||
13 | * and Elementary Functions" by Peter Markstein; HP Professional Books | ||
14 | * (http://www.hp.com/go/retailbooks/) | ||
15 | */ | ||
16 | |||
17 | #include <asm/asmmacro.h> | ||
18 | |||
19 | #ifdef MODULO | ||
20 | # define OP mod | ||
21 | #else | ||
22 | # define OP div | ||
23 | #endif | ||
24 | |||
25 | #ifdef UNSIGNED | ||
26 | # define SGN u | ||
27 | # define EXTEND zxt4 | ||
28 | # define INT_TO_FP(a,b) fcvt.xuf.s1 a=b | ||
29 | # define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b | ||
30 | #else | ||
31 | # define SGN | ||
32 | # define EXTEND sxt4 | ||
33 | # define INT_TO_FP(a,b) fcvt.xf a=b | ||
34 | # define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b | ||
35 | #endif | ||
36 | |||
37 | #define PASTE1(a,b) a##b | ||
38 | #define PASTE(a,b) PASTE1(a,b) | ||
39 | #define NAME PASTE(PASTE(__,SGN),PASTE(OP,si3)) | ||
40 | |||
41 | GLOBAL_ENTRY(NAME) | ||
42 | .regstk 2,0,0,0 | ||
43 | // Transfer inputs to FP registers. | ||
44 | mov r2 = 0xffdd // r2 = -34 + 65535 (fp reg format bias) | ||
45 | EXTEND in0 = in0 // in0 = a | ||
46 | EXTEND in1 = in1 // in1 = b | ||
47 | ;; | ||
48 | setf.sig f8 = in0 | ||
49 | setf.sig f9 = in1 | ||
50 | #ifdef MODULO | ||
51 | sub in1 = r0, in1 // in1 = -b | ||
52 | #endif | ||
53 | ;; | ||
54 | // Convert the inputs to FP, to avoid FP software-assist faults. | ||
55 | INT_TO_FP(f8, f8) | ||
56 | INT_TO_FP(f9, f9) | ||
57 | ;; | ||
58 | setf.exp f7 = r2 // f7 = 2^-34 | ||
59 | frcpa.s1 f6, p6 = f8, f9 // y0 = frcpa(b) | ||
60 | ;; | ||
61 | (p6) fmpy.s1 f8 = f8, f6 // q0 = a*y0 | ||
62 | (p6) fnma.s1 f6 = f9, f6, f1 // e0 = -b*y0 + 1 | ||
63 | ;; | ||
64 | #ifdef MODULO | ||
65 | setf.sig f9 = in1 // f9 = -b | ||
66 | #endif | ||
67 | (p6) fma.s1 f8 = f6, f8, f8 // q1 = e0*q0 + q0 | ||
68 | (p6) fma.s1 f6 = f6, f6, f7 // e1 = e0*e0 + 2^-34 | ||
69 | ;; | ||
70 | #ifdef MODULO | ||
71 | setf.sig f7 = in0 | ||
72 | #endif | ||
73 | (p6) fma.s1 f6 = f6, f8, f8 // q2 = e1*q1 + q1 | ||
74 | ;; | ||
75 | FP_TO_INT(f6, f6) // q = trunc(q2) | ||
76 | ;; | ||
77 | #ifdef MODULO | ||
78 | xma.l f6 = f6, f9, f7 // r = q*(-b) + a | ||
79 | ;; | ||
80 | #endif | ||
81 | getf.sig r8 = f6 // transfer result to result register | ||
82 | br.ret.sptk.many rp | ||
83 | END(NAME) | ||
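
The quotient refinement above can be sketched in C as follows (structural model only: the real code runs in the 82-bit register-file format and y0 comes from frcpa's low-precision estimate, for which 1.0/b merely stands in here; model_divsi3 is a made-up name).

    #include <stdint.h>

    static int32_t model_divsi3(int32_t a, int32_t b)
    {
            double fa = (double) a, fb = (double) b;
            double y0 = 1.0 / fb;           /* stands in for frcpa's estimate */
            double q0 = fa * y0;            /* q0 = a*y0          */
            double e0 = 1.0 - fb * y0;      /* e0 = -b*y0 + 1     */
            double q1 = e0 * q0 + q0;       /* q1 = e0*q0 + q0    */
            double e1 = e0 * e0 + 0x1p-34;  /* e1 = e0*e0 + 2^-34 */
            double q2 = e1 * q1 + q1;       /* q2 = e1*q1 + q1    */
            return (int32_t) q2;            /* q  = trunc(q2)     */
    }
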
diff --git a/arch/ia64/lib/idiv64.S b/arch/ia64/lib/idiv64.S new file mode 100644 index 000000000000..f69bd2b0987a --- /dev/null +++ b/arch/ia64/lib/idiv64.S | |||
@@ -0,0 +1,80 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1999-2000 Hewlett-Packard Co | ||
3 | * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com> | ||
4 | * | ||
5 | * 64-bit integer division. | ||
6 | * | ||
7 | * This code is based on the application note entitled "Divide, Square Root | ||
8 | * and Remainder Algorithms for the IA-64 Architecture". This document | ||
9 | * is available as Intel document number 248725-002 or via the web at | ||
10 | * http://developer.intel.com/software/opensource/numerics/ | ||
11 | * | ||
12 | * For more details on the theory behind these algorithms, see "IA-64 | ||
13 | * and Elementary Functions" by Peter Markstein; HP Professional Books | ||
14 | * (http://www.hp.com/go/retailbooks/) | ||
15 | */ | ||
16 | |||
17 | #include <asm/asmmacro.h> | ||
18 | |||
19 | #ifdef MODULO | ||
20 | # define OP mod | ||
21 | #else | ||
22 | # define OP div | ||
23 | #endif | ||
24 | |||
25 | #ifdef UNSIGNED | ||
26 | # define SGN u | ||
27 | # define INT_TO_FP(a,b) fcvt.xuf.s1 a=b | ||
28 | # define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b | ||
29 | #else | ||
30 | # define SGN | ||
31 | # define INT_TO_FP(a,b) fcvt.xf a=b | ||
32 | # define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b | ||
33 | #endif | ||
34 | |||
35 | #define PASTE1(a,b) a##b | ||
36 | #define PASTE(a,b) PASTE1(a,b) | ||
37 | #define NAME PASTE(PASTE(__,SGN),PASTE(OP,di3)) | ||
38 | |||
39 | GLOBAL_ENTRY(NAME) | ||
40 | .regstk 2,0,0,0 | ||
41 | // Transfer inputs to FP registers. | ||
42 | setf.sig f8 = in0 | ||
43 | setf.sig f9 = in1 | ||
44 | ;; | ||
45 | // Convert the inputs to FP, to avoid FP software-assist faults. | ||
46 | INT_TO_FP(f8, f8) | ||
47 | INT_TO_FP(f9, f9) | ||
48 | ;; | ||
49 | frcpa.s1 f11, p6 = f8, f9 // y0 = frcpa(b) | ||
50 | ;; | ||
51 | (p6) fmpy.s1 f7 = f8, f11 // q0 = a*y0 | ||
52 | (p6) fnma.s1 f6 = f9, f11, f1 // e0 = -b*y0 + 1 | ||
53 | ;; | ||
54 | (p6) fma.s1 f10 = f7, f6, f7 // q1 = q0*e0 + q0 | ||
55 | (p6) fmpy.s1 f7 = f6, f6 // e1 = e0*e0 | ||
56 | ;; | ||
57 | #ifdef MODULO | ||
58 | sub in1 = r0, in1 // in1 = -b | ||
59 | #endif | ||
60 | (p6) fma.s1 f10 = f10, f7, f10 // q2 = q1*e1 + q1 | ||
61 | (p6) fma.s1 f6 = f11, f6, f11 // y1 = y0*e0 + y0 | ||
62 | ;; | ||
63 | (p6) fma.s1 f6 = f6, f7, f6 // y2 = y1*e1 + y1 | ||
64 | (p6) fnma.s1 f7 = f9, f10, f8 // r = -b*q2 + a | ||
65 | ;; | ||
66 | #ifdef MODULO | ||
67 | setf.sig f8 = in0 // f8 = a | ||
68 | setf.sig f9 = in1 // f9 = -b | ||
69 | #endif | ||
70 | (p6) fma.s1 f11 = f7, f6, f10 // q3 = r*y2 + q2 | ||
71 | ;; | ||
72 | FP_TO_INT(f11, f11) // q = trunc(q3) | ||
73 | ;; | ||
74 | #ifdef MODULO | ||
75 | xma.l f11 = f11, f9, f8 // r = q*(-b) + a | ||
76 | ;; | ||
77 | #endif | ||
78 | getf.sig r8 = f11 // transfer result to result register | ||
79 | br.ret.sptk.many rp | ||
80 | END(NAME) | ||
diff --git a/arch/ia64/lib/io.c b/arch/ia64/lib/io.c new file mode 100644 index 000000000000..8949e44091ac --- /dev/null +++ b/arch/ia64/lib/io.c | |||
@@ -0,0 +1,165 @@ | |||
1 | #include <linux/config.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/types.h> | ||
4 | |||
5 | #include <asm/io.h> | ||
6 | |||
7 | /* | ||
8 | * Copy data from IO memory space to "real" memory space. | ||
9 | * This needs to be optimized. | ||
10 | */ | ||
11 | void memcpy_fromio(void *to, const volatile void __iomem *from, long count) | ||
12 | { | ||
13 | char *dst = to; | ||
14 | |||
15 | while (count) { | ||
16 | count--; | ||
17 | *dst++ = readb(from++); | ||
18 | } | ||
19 | } | ||
20 | EXPORT_SYMBOL(memcpy_fromio); | ||
21 | |||
22 | /* | ||
23 | * Copy data from "real" memory space to IO memory space. | ||
24 | * This needs to be optimized. | ||
25 | */ | ||
26 | void memcpy_toio(volatile void __iomem *to, const void *from, long count) | ||
27 | { | ||
28 | const char *src = from; | ||
29 | |||
30 | while (count) { | ||
31 | count--; | ||
32 | writeb(*src++, to++); | ||
33 | } | ||
34 | } | ||
35 | EXPORT_SYMBOL(memcpy_toio); | ||
36 | |||
37 | /* | ||
38 | * "memset" on IO memory space. | ||
39 | * This needs to be optimized. | ||
40 | */ | ||
41 | void memset_io(volatile void __iomem *dst, int c, long count) | ||
42 | { | ||
43 | unsigned char ch = (char)(c & 0xff); | ||
44 | |||
45 | while (count) { | ||
46 | count--; | ||
47 | writeb(ch, dst); | ||
48 | dst++; | ||
49 | } | ||
50 | } | ||
51 | EXPORT_SYMBOL(memset_io); | ||
52 | |||
53 | #ifdef CONFIG_IA64_GENERIC | ||
54 | |||
55 | #undef __ia64_inb | ||
56 | #undef __ia64_inw | ||
57 | #undef __ia64_inl | ||
58 | #undef __ia64_outb | ||
59 | #undef __ia64_outw | ||
60 | #undef __ia64_outl | ||
61 | #undef __ia64_readb | ||
62 | #undef __ia64_readw | ||
63 | #undef __ia64_readl | ||
64 | #undef __ia64_readq | ||
65 | #undef __ia64_readb_relaxed | ||
66 | #undef __ia64_readw_relaxed | ||
67 | #undef __ia64_readl_relaxed | ||
68 | #undef __ia64_readq_relaxed | ||
69 | #undef __ia64_writeb | ||
70 | #undef __ia64_writew | ||
71 | #undef __ia64_writel | ||
72 | #undef __ia64_writeq | ||
73 | #undef __ia64_mmiowb | ||
74 | |||
75 | unsigned int | ||
76 | __ia64_inb (unsigned long port) | ||
77 | { | ||
78 | return ___ia64_inb(port); | ||
79 | } | ||
80 | |||
81 | unsigned int | ||
82 | __ia64_inw (unsigned long port) | ||
83 | { | ||
84 | return ___ia64_inw(port); | ||
85 | } | ||
86 | |||
87 | unsigned int | ||
88 | __ia64_inl (unsigned long port) | ||
89 | { | ||
90 | return ___ia64_inl(port); | ||
91 | } | ||
92 | |||
93 | void | ||
94 | __ia64_outb (unsigned char val, unsigned long port) | ||
95 | { | ||
96 | ___ia64_outb(val, port); | ||
97 | } | ||
98 | |||
99 | void | ||
100 | __ia64_outw (unsigned short val, unsigned long port) | ||
101 | { | ||
102 | ___ia64_outw(val, port); | ||
103 | } | ||
104 | |||
105 | void | ||
106 | __ia64_outl (unsigned int val, unsigned long port) | ||
107 | { | ||
108 | ___ia64_outl(val, port); | ||
109 | } | ||
110 | |||
111 | unsigned char | ||
112 | __ia64_readb (void __iomem *addr) | ||
113 | { | ||
114 | return ___ia64_readb (addr); | ||
115 | } | ||
116 | |||
117 | unsigned short | ||
118 | __ia64_readw (void __iomem *addr) | ||
119 | { | ||
120 | return ___ia64_readw (addr); | ||
121 | } | ||
122 | |||
123 | unsigned int | ||
124 | __ia64_readl (void __iomem *addr) | ||
125 | { | ||
126 | return ___ia64_readl (addr); | ||
127 | } | ||
128 | |||
129 | unsigned long | ||
130 | __ia64_readq (void __iomem *addr) | ||
131 | { | ||
132 | return ___ia64_readq (addr); | ||
133 | } | ||
134 | |||
135 | unsigned char | ||
136 | __ia64_readb_relaxed (void __iomem *addr) | ||
137 | { | ||
138 | return ___ia64_readb (addr); | ||
139 | } | ||
140 | |||
141 | unsigned short | ||
142 | __ia64_readw_relaxed (void __iomem *addr) | ||
143 | { | ||
144 | return ___ia64_readw (addr); | ||
145 | } | ||
146 | |||
147 | unsigned int | ||
148 | __ia64_readl_relaxed (void __iomem *addr) | ||
149 | { | ||
150 | return ___ia64_readl (addr); | ||
151 | } | ||
152 | |||
153 | unsigned long | ||
154 | __ia64_readq_relaxed (void __iomem *addr) | ||
155 | { | ||
156 | return ___ia64_readq (addr); | ||
157 | } | ||
158 | |||
159 | void | ||
160 | __ia64_mmiowb(void) | ||
161 | { | ||
162 | ___ia64_mmiowb(); | ||
163 | } | ||
164 | |||
165 | #endif /* CONFIG_IA64_GENERIC */ | ||
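
A hypothetical caller of the byte-wise helpers above (illustrative only: snapshot_device_window(), the physical address and the buffer are made up, not code from this tree): map a device window, copy it out through memcpy_fromio(), and unmap it.

    #include <linux/errno.h>
    #include <asm/io.h>

    static int snapshot_device_window(unsigned long phys, void *buf, long len)
    {
            void __iomem *regs = ioremap(phys, len);

            if (!regs)
                    return -ENOMEM;
            memcpy_fromio(buf, regs, len);  /* byte-wise readb() copy out of MMIO */
            iounmap(regs);
            return 0;
    }
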
diff --git a/arch/ia64/lib/ip_fast_csum.S b/arch/ia64/lib/ip_fast_csum.S new file mode 100644 index 000000000000..19674ca2acfc --- /dev/null +++ b/arch/ia64/lib/ip_fast_csum.S | |||
@@ -0,0 +1,90 @@ | |||
1 | /* | ||
2 | * Optimized version of the ip_fast_csum() function | ||
3 | * Used for calculating IP header checksum | ||
4 | * | ||
5 | * Return: 16bit checksum, complemented | ||
6 | * | ||
7 | * Inputs: | ||
8 | * in0: address of buffer to checksum (char *) | ||
9 | * in1: length of the buffer (int) | ||
10 | * | ||
11 | * Copyright (C) 2002 Intel Corp. | ||
12 | * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> | ||
13 | */ | ||
14 | |||
15 | #include <asm/asmmacro.h> | ||
16 | |||
17 | /* | ||
18 | * Since we know that most likely this function is called with buf aligned | ||
19 | * on a 4-byte boundary and 20 bytes in length, we can execute rather quickly | ||
20 | * versus calling the generic version of do_csum, which has lots of overhead in | ||
21 | * handling various alignments and sizes. However, due to the lack of constraints | ||
22 | * put on the function input arguments, cases with alignment not on a 4-byte boundary or | ||
23 | * size not equal to 20 bytes will be handled by the generic do_csum function. | ||
24 | */ | ||
25 | |||
26 | #define in0 r32 | ||
27 | #define in1 r33 | ||
28 | #define ret0 r8 | ||
29 | |||
30 | GLOBAL_ENTRY(ip_fast_csum) | ||
31 | .prologue | ||
32 | .body | ||
33 | cmp.ne p6,p7=5,in1 // size other than 20 byte? | ||
34 | and r14=3,in0 // is it aligned on 4-byte? | ||
35 | add r15=4,in0 // second source pointer | ||
36 | ;; | ||
37 | cmp.ne.or.andcm p6,p7=r14,r0 | ||
38 | ;; | ||
39 | (p7) ld4 r20=[in0],8 | ||
40 | (p7) ld4 r21=[r15],8 | ||
41 | (p6) br.spnt .generic | ||
42 | ;; | ||
43 | ld4 r22=[in0],8 | ||
44 | ld4 r23=[r15],8 | ||
45 | ;; | ||
46 | ld4 r24=[in0] | ||
47 | add r20=r20,r21 | ||
48 | add r22=r22,r23 | ||
49 | ;; | ||
50 | add r20=r20,r22 | ||
51 | ;; | ||
52 | add r20=r20,r24 | ||
53 | ;; | ||
54 | shr.u ret0=r20,16 // now need to add the carry | ||
55 | zxt2 r20=r20 | ||
56 | ;; | ||
57 | add r20=ret0,r20 | ||
58 | ;; | ||
59 | shr.u ret0=r20,16 // add carry again | ||
60 | zxt2 r20=r20 | ||
61 | ;; | ||
62 | add r20=ret0,r20 | ||
63 | ;; | ||
64 | shr.u ret0=r20,16 | ||
65 | zxt2 r20=r20 | ||
66 | ;; | ||
67 | add r20=ret0,r20 | ||
68 | ;; | ||
69 | andcm ret0=-1,r20 | ||
70 | .restore sp // reset frame state | ||
71 | br.ret.sptk.many b0 | ||
72 | ;; | ||
73 | |||
74 | .generic: | ||
75 | .prologue | ||
76 | .save ar.pfs, r35 | ||
77 | alloc r35=ar.pfs,2,2,2,0 | ||
78 | .save rp, r34 | ||
79 | mov r34=b0 | ||
80 | .body | ||
81 | dep.z out1=in1,2,30 | ||
82 | mov out0=in0 | ||
83 | ;; | ||
84 | br.call.sptk.many b0=do_csum | ||
85 | ;; | ||
86 | andcm ret0=-1,ret0 | ||
87 | mov ar.pfs=r35 | ||
88 | mov b0=r34 | ||
89 | br.ret.sptk.many b0 | ||
90 | END(ip_fast_csum) | ||
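
A C model of the fast path above (illustrative; model_ip_fast_csum() is a made-up name and assumes ihl == 5 and 4-byte alignment, i.e. the case the assembly handles without falling back to do_csum): five 32-bit words are summed into a 64-bit accumulator, the carries are folded into 16 bits in three rounds, and the complement is returned.

    #include <stdint.h>

    static uint16_t model_ip_fast_csum(const uint32_t *iph)
    {
            uint64_t sum = (uint64_t) iph[0] + iph[1] + iph[2] + iph[3] + iph[4];

            sum = (sum & 0xffff) + (sum >> 16);     /* fold carry */
            sum = (sum & 0xffff) + (sum >> 16);     /* fold again */
            sum = (sum & 0xffff) + (sum >> 16);     /* and again  */
            return (uint16_t) ~sum;                 /* complement, as andcm -1,r20 does */
    }
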
diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S new file mode 100644 index 000000000000..448908d80b69 --- /dev/null +++ b/arch/ia64/lib/memcpy.S | |||
@@ -0,0 +1,301 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Optimized version of the standard memcpy() function | ||
4 | * | ||
5 | * Inputs: | ||
6 | * in0: destination address | ||
7 | * in1: source address | ||
8 | * in2: number of bytes to copy | ||
9 | * Output: | ||
10 | * no return value | ||
11 | * | ||
12 | * Copyright (C) 2000-2001 Hewlett-Packard Co | ||
13 | * Stephane Eranian <eranian@hpl.hp.com> | ||
14 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
15 | */ | ||
16 | #include <asm/asmmacro.h> | ||
17 | |||
18 | GLOBAL_ENTRY(memcpy) | ||
19 | |||
20 | # define MEM_LAT 21 /* latency to memory */ | ||
21 | |||
22 | # define dst r2 | ||
23 | # define src r3 | ||
24 | # define retval r8 | ||
25 | # define saved_pfs r9 | ||
26 | # define saved_lc r10 | ||
27 | # define saved_pr r11 | ||
28 | # define cnt r16 | ||
29 | # define src2 r17 | ||
30 | # define t0 r18 | ||
31 | # define t1 r19 | ||
32 | # define t2 r20 | ||
33 | # define t3 r21 | ||
34 | # define t4 r22 | ||
35 | # define src_end r23 | ||
36 | |||
37 | # define N (MEM_LAT + 4) | ||
38 | # define Nrot ((N + 7) & ~7) | ||
39 | |||
40 | /* | ||
41 | * First, check if everything (src, dst, len) is a multiple of eight. If | ||
42 | * so, we handle everything with no taken branches (other than the loop | ||
43 | * itself) and a small icache footprint. Otherwise, we jump off to | ||
44 | * the more general copy routine handling arbitrary | ||
45 | * sizes/alignment etc. | ||
46 | */ | ||
47 | .prologue | ||
48 | .save ar.pfs, saved_pfs | ||
49 | alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot | ||
50 | .save ar.lc, saved_lc | ||
51 | mov saved_lc=ar.lc | ||
52 | or t0=in0,in1 | ||
53 | ;; | ||
54 | |||
55 | or t0=t0,in2 | ||
56 | .save pr, saved_pr | ||
57 | mov saved_pr=pr | ||
58 | |||
59 | .body | ||
60 | |||
61 | cmp.eq p6,p0=in2,r0 // zero length? | ||
62 | mov retval=in0 // return dst | ||
63 | (p6) br.ret.spnt.many rp // zero length, return immediately | ||
64 | ;; | ||
65 | |||
66 | mov dst=in0 // copy because of rotation | ||
67 | shr.u cnt=in2,3 // number of 8-byte words to copy | ||
68 | mov pr.rot=1<<16 | ||
69 | ;; | ||
70 | |||
71 | adds cnt=-1,cnt // br.ctop is repeat/until | ||
72 | cmp.gtu p7,p0=16,in2 // copying less than 16 bytes? | ||
73 | mov ar.ec=N | ||
74 | ;; | ||
75 | |||
76 | and t0=0x7,t0 | ||
77 | mov ar.lc=cnt | ||
78 | ;; | ||
79 | cmp.ne p6,p0=t0,r0 | ||
80 | |||
81 | mov src=in1 // copy because of rotation | ||
82 | (p7) br.cond.spnt.few .memcpy_short | ||
83 | (p6) br.cond.spnt.few .memcpy_long | ||
84 | ;; | ||
85 | nop.m 0 | ||
86 | ;; | ||
87 | nop.m 0 | ||
88 | nop.i 0 | ||
89 | ;; | ||
90 | nop.m 0 | ||
91 | ;; | ||
92 | .rotr val[N] | ||
93 | .rotp p[N] | ||
94 | .align 32 | ||
95 | 1: { .mib | ||
96 | (p[0]) ld8 val[0]=[src],8 | ||
97 | nop.i 0 | ||
98 | brp.loop.imp 1b, 2f | ||
99 | } | ||
100 | 2: { .mfb | ||
101 | (p[N-1])st8 [dst]=val[N-1],8 | ||
102 | nop.f 0 | ||
103 | br.ctop.dptk.few 1b | ||
104 | } | ||
105 | ;; | ||
106 | mov ar.lc=saved_lc | ||
107 | mov pr=saved_pr,-1 | ||
108 | mov ar.pfs=saved_pfs | ||
109 | br.ret.sptk.many rp | ||
110 | |||
111 | /* | ||
112 | * Small (<16 bytes) unaligned copying is done via a simple byte-at-a-time | ||
113 | * copy loop. This performs relatively poorly on Itanium, but it doesn't | ||
114 | * get used very often (gcc inlines small copies) and due to atomicity | ||
115 | * issues, we want to avoid read-modify-write of entire words. | ||
116 | */ | ||
117 | .align 32 | ||
118 | .memcpy_short: | ||
119 | adds cnt=-1,in2 // br.ctop is repeat/until | ||
120 | mov ar.ec=MEM_LAT | ||
121 | brp.loop.imp 1f, 2f | ||
122 | ;; | ||
123 | mov ar.lc=cnt | ||
124 | ;; | ||
125 | nop.m 0 | ||
126 | ;; | ||
127 | nop.m 0 | ||
128 | nop.i 0 | ||
129 | ;; | ||
130 | nop.m 0 | ||
131 | ;; | ||
132 | nop.m 0 | ||
133 | ;; | ||
134 | /* | ||
135 | * It is faster to put a stop bit in the loop here because it makes | ||
136 | * the pipeline shorter (and latency is what matters on short copies). | ||
137 | */ | ||
138 | .align 32 | ||
139 | 1: { .mib | ||
140 | (p[0]) ld1 val[0]=[src],1 | ||
141 | nop.i 0 | ||
142 | brp.loop.imp 1b, 2f | ||
143 | } ;; | ||
144 | 2: { .mfb | ||
145 | (p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1 | ||
146 | nop.f 0 | ||
147 | br.ctop.dptk.few 1b | ||
148 | } ;; | ||
149 | mov ar.lc=saved_lc | ||
150 | mov pr=saved_pr,-1 | ||
151 | mov ar.pfs=saved_pfs | ||
152 | br.ret.sptk.many rp | ||
153 | |||
154 | /* | ||
155 | * Large (>= 16 bytes) copying is done in a fancy way. Latency isn't | ||
156 | * an overriding concern here, but throughput is. We first do | ||
157 | * sub-word copying until the destination is aligned, then we check | ||
158 | * if the source is also aligned. If so, we do a simple load/store-loop | ||
159 | * until there are less than 8 bytes left over and then we do the tail, | ||
160 | * by storing the last few bytes using sub-word copying. If the source | ||
161 | * is not aligned, we branch off to the non-congruent loop. | ||
162 | * | ||
163 | * stage: op: | ||
164 | * 0 ld | ||
165 | * : | ||
166 | * MEM_LAT+3 shrp | ||
167 | * MEM_LAT+4 st | ||
168 | * | ||
169 | * On Itanium, the pipeline itself runs without stalls. However, br.ctop | ||
170 | * seems to introduce an unavoidable bubble in the pipeline so the overall | ||
171 | * latency is 2 cycles/iteration. This gives us a _copy_ throughput | ||
172 | * of 4 byte/cycle. Still not bad. | ||
173 | */ | ||
174 | # undef N | ||
175 | # undef Nrot | ||
176 | # define N (MEM_LAT + 5) /* number of stages */ | ||
177 | # define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */ | ||
178 | |||
179 | #define LOG_LOOP_SIZE 6 | ||
180 | |||
181 | .memcpy_long: | ||
182 | alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame | ||
183 | and t0=-8,src // t0 = src & ~7 | ||
184 | and t2=7,src // t2 = src & 7 | ||
185 | ;; | ||
186 | ld8 t0=[t0] // t0 = 1st source word | ||
187 | adds src2=7,src // src2 = (src + 7) | ||
188 | sub t4=r0,dst // t4 = -dst | ||
189 | ;; | ||
190 | and src2=-8,src2 // src2 = (src + 7) & ~7 | ||
191 | shl t2=t2,3 // t2 = 8*(src & 7) | ||
192 | shl t4=t4,3 // t4 = 8*(dst & 7) | ||
193 | ;; | ||
194 | ld8 t1=[src2] // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise | ||
195 | sub t3=64,t2 // t3 = 64-8*(src & 7) | ||
196 | shr.u t0=t0,t2 | ||
197 | ;; | ||
198 | add src_end=src,in2 | ||
199 | shl t1=t1,t3 | ||
200 | mov pr=t4,0x38 // (p5,p4,p3)=(dst & 7) | ||
201 | ;; | ||
202 | or t0=t0,t1 | ||
203 | mov cnt=r0 | ||
204 | adds src_end=-1,src_end | ||
205 | ;; | ||
206 | (p3) st1 [dst]=t0,1 | ||
207 | (p3) shr.u t0=t0,8 | ||
208 | (p3) adds cnt=1,cnt | ||
209 | ;; | ||
210 | (p4) st2 [dst]=t0,2 | ||
211 | (p4) shr.u t0=t0,16 | ||
212 | (p4) adds cnt=2,cnt | ||
213 | ;; | ||
214 | (p5) st4 [dst]=t0,4 | ||
215 | (p5) adds cnt=4,cnt | ||
216 | and src_end=-8,src_end // src_end = last word of source buffer | ||
217 | ;; | ||
218 | |||
219 | // At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy: | ||
220 | |||
221 | 1:{ add src=cnt,src // make src point to remainder of source buffer | ||
222 | sub cnt=in2,cnt // cnt = number of bytes left to copy | ||
223 | mov t4=ip | ||
224 | } ;; | ||
225 | and src2=-8,src // align source pointer | ||
226 | adds t4=.memcpy_loops-1b,t4 | ||
227 | mov ar.ec=N | ||
228 | |||
229 | and t0=7,src // t0 = src & 7 | ||
230 | shr.u t2=cnt,3 // t2 = number of 8-byte words left to copy | ||
231 | shl cnt=cnt,3 // move bits 0-2 to 3-5 | ||
232 | ;; | ||
233 | |||
234 | .rotr val[N+1], w[2] | ||
235 | .rotp p[N] | ||
236 | |||
237 | cmp.ne p6,p0=t0,r0 // is src aligned, too? | ||
238 | shl t0=t0,LOG_LOOP_SIZE // t0 = 8*(src & 7) | ||
239 | adds t2=-1,t2 // br.ctop is repeat/until | ||
240 | ;; | ||
241 | add t4=t0,t4 | ||
242 | mov pr=cnt,0x38 // set (p5,p4,p3) to # of last-word bytes to copy | ||
243 | mov ar.lc=t2 | ||
244 | ;; | ||
245 | nop.m 0 | ||
246 | ;; | ||
247 | nop.m 0 | ||
248 | nop.i 0 | ||
249 | ;; | ||
250 | nop.m 0 | ||
251 | ;; | ||
252 | (p6) ld8 val[1]=[src2],8 // prime the pump... | ||
253 | mov b6=t4 | ||
254 | br.sptk.few b6 | ||
255 | ;; | ||
256 | |||
257 | .memcpy_tail: | ||
258 | // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is | ||
259 | // less than 8) and t0 contains the last few bytes of the src buffer: | ||
260 | (p5) st4 [dst]=t0,4 | ||
261 | (p5) shr.u t0=t0,32 | ||
262 | mov ar.lc=saved_lc | ||
263 | ;; | ||
264 | (p4) st2 [dst]=t0,2 | ||
265 | (p4) shr.u t0=t0,16 | ||
266 | mov ar.pfs=saved_pfs | ||
267 | ;; | ||
268 | (p3) st1 [dst]=t0 | ||
269 | mov pr=saved_pr,-1 | ||
270 | br.ret.sptk.many rp | ||
271 | |||
272 | /////////////////////////////////////////////////////// | ||
273 | .align 64 | ||
274 | |||
275 | #define COPY(shift,index) \ | ||
276 | 1: { .mib \ | ||
277 | (p[0]) ld8 val[0]=[src2],8; \ | ||
278 | (p[MEM_LAT+3]) shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \ | ||
279 | brp.loop.imp 1b, 2f \ | ||
280 | }; \ | ||
281 | 2: { .mfb \ | ||
282 | (p[MEM_LAT+4]) st8 [dst]=w[1],8; \ | ||
283 | nop.f 0; \ | ||
284 | br.ctop.dptk.few 1b; \ | ||
285 | }; \ | ||
286 | ;; \ | ||
287 | ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */ \ | ||
288 | ;; \ | ||
289 | shrp t0=val[N-1],val[N-index],shift; \ | ||
290 | br .memcpy_tail | ||
291 | .memcpy_loops: | ||
292 | COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */ | ||
293 | COPY(8, 0) | ||
294 | COPY(16, 0) | ||
295 | COPY(24, 0) | ||
296 | COPY(32, 0) | ||
297 | COPY(40, 0) | ||
298 | COPY(48, 0) | ||
299 | COPY(56, 0) | ||
300 | |||
301 | END(memcpy) | ||
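
The merge step of the COPY() macro above amounts to the following C sketch (illustrative only, little-endian view; model_shrp_merge() is a made-up name): two consecutive aligned 8-byte loads are funnel-shifted into one destination word, which is what a single shrp does per iteration with shift = 8 * (src & 7). The shift == 0 case corresponds to the separately emitted COPY(0, 1) variant.

    #include <stdint.h>

    static uint64_t model_shrp_merge(uint64_t lo_word, uint64_t hi_word, unsigned shift)
    {
            if (shift == 0)                 /* source already 8-byte aligned */
                    return lo_word;
            /* low bytes come from the first (lower-address) word,
             * the remaining bytes from the next word */
            return (lo_word >> shift) | (hi_word << (64 - shift));
    }
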
diff --git a/arch/ia64/lib/memcpy_mck.S b/arch/ia64/lib/memcpy_mck.S new file mode 100644 index 000000000000..6f26ef7cc236 --- /dev/null +++ b/arch/ia64/lib/memcpy_mck.S | |||
@@ -0,0 +1,661 @@ | |||
1 | /* | ||
2 | * Itanium 2-optimized version of memcpy and copy_user function | ||
3 | * | ||
4 | * Inputs: | ||
5 | * in0: destination address | ||
6 | * in1: source address | ||
7 | * in2: number of bytes to copy | ||
8 | * Output: | ||
9 | * 0 if success, or number of bytes NOT copied if error occurred. | ||
10 | * | ||
11 | * Copyright (C) 2002 Intel Corp. | ||
12 | * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> | ||
13 | */ | ||
14 | #include <linux/config.h> | ||
15 | #include <asm/asmmacro.h> | ||
16 | #include <asm/page.h> | ||
17 | |||
18 | #define EK(y...) EX(y) | ||
19 | |||
20 | /* McKinley specific optimization */ | ||
21 | |||
22 | #define retval r8 | ||
23 | #define saved_pfs r31 | ||
24 | #define saved_lc r10 | ||
25 | #define saved_pr r11 | ||
26 | #define saved_in0 r14 | ||
27 | #define saved_in1 r15 | ||
28 | #define saved_in2 r16 | ||
29 | |||
30 | #define src0 r2 | ||
31 | #define src1 r3 | ||
32 | #define dst0 r17 | ||
33 | #define dst1 r18 | ||
34 | #define cnt r9 | ||
35 | |||
36 | /* r19-r30 are temp for each code section */ | ||
37 | #define PREFETCH_DIST 8 | ||
38 | #define src_pre_mem r19 | ||
39 | #define dst_pre_mem r20 | ||
40 | #define src_pre_l2 r21 | ||
41 | #define dst_pre_l2 r22 | ||
42 | #define t1 r23 | ||
43 | #define t2 r24 | ||
44 | #define t3 r25 | ||
45 | #define t4 r26 | ||
46 | #define t5 t1 // alias! | ||
47 | #define t6 t2 // alias! | ||
48 | #define t7 t3 // alias! | ||
49 | #define n8 r27 | ||
50 | #define t9 t5 // alias! | ||
51 | #define t10 t4 // alias! | ||
52 | #define t11 t7 // alias! | ||
53 | #define t12 t6 // alias! | ||
54 | #define t14 t10 // alias! | ||
55 | #define t13 r28 | ||
56 | #define t15 r29 | ||
57 | #define tmp r30 | ||
58 | |||
59 | /* defines for long_copy block */ | ||
60 | #define A 0 | ||
61 | #define B (PREFETCH_DIST) | ||
62 | #define C (B + PREFETCH_DIST) | ||
63 | #define D (C + 1) | ||
64 | #define N (D + 1) | ||
65 | #define Nrot ((N + 7) & ~7) | ||
66 | |||
67 | /* alias */ | ||
68 | #define in0 r32 | ||
69 | #define in1 r33 | ||
70 | #define in2 r34 | ||
71 | |||
72 | GLOBAL_ENTRY(memcpy) | ||
73 | and r28=0x7,in0 | ||
74 | and r29=0x7,in1 | ||
75 | mov f6=f0 | ||
76 | br.cond.sptk .common_code | ||
77 | ;; | ||
78 | GLOBAL_ENTRY(__copy_user) | ||
79 | .prologue | ||
80 | // check dest alignment | ||
81 | and r28=0x7,in0 | ||
82 | and r29=0x7,in1 | ||
83 | mov f6=f1 | ||
84 | mov saved_in0=in0 // save dest pointer | ||
85 | mov saved_in1=in1 // save src pointer | ||
86 | mov saved_in2=in2 // save len | ||
87 | ;; | ||
88 | .common_code: | ||
89 | cmp.gt p15,p0=8,in2 // check for small size | ||
90 | cmp.ne p13,p0=0,r28 // check dest alignment | ||
91 | cmp.ne p14,p0=0,r29 // check src alignment | ||
92 | add src0=0,in1 | ||
93 | sub r30=8,r28 // for .align_dest | ||
94 | mov retval=r0 // initialize return value | ||
95 | ;; | ||
96 | add dst0=0,in0 | ||
97 | add dst1=1,in0 // dest odd index | ||
98 | cmp.le p6,p0 = 1,r30 // for .align_dest | ||
99 | (p15) br.cond.dpnt .memcpy_short | ||
100 | (p13) br.cond.dpnt .align_dest | ||
101 | (p14) br.cond.dpnt .unaligned_src | ||
102 | ;; | ||
103 | |||
104 | // both dest and src are aligned on 8-byte boundary | ||
105 | .aligned_src: | ||
106 | .save ar.pfs, saved_pfs | ||
107 | alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot | ||
108 | .save pr, saved_pr | ||
109 | mov saved_pr=pr | ||
110 | |||
111 | shr.u cnt=in2,7 // this much cache line | ||
112 | ;; | ||
113 | cmp.lt p6,p0=2*PREFETCH_DIST,cnt | ||
114 | cmp.lt p7,p8=1,cnt | ||
115 | .save ar.lc, saved_lc | ||
116 | mov saved_lc=ar.lc | ||
117 | .body | ||
118 | add cnt=-1,cnt | ||
119 | add src_pre_mem=0,in1 // prefetch src pointer | ||
120 | add dst_pre_mem=0,in0 // prefetch dest pointer | ||
121 | ;; | ||
122 | (p7) mov ar.lc=cnt // prefetch count | ||
123 | (p8) mov ar.lc=r0 | ||
124 | (p6) br.cond.dpnt .long_copy | ||
125 | ;; | ||
126 | |||
127 | .prefetch: | ||
128 | lfetch.fault [src_pre_mem], 128 | ||
129 | lfetch.fault.excl [dst_pre_mem], 128 | ||
130 | br.cloop.dptk.few .prefetch | ||
131 | ;; | ||
132 | |||
133 | .medium_copy: | ||
134 | and tmp=31,in2 // copy length after iteration | ||
135 | shr.u r29=in2,5 // number of 32-byte iteration | ||
136 | add dst1=8,dst0 // 2nd dest pointer | ||
137 | ;; | ||
138 | add cnt=-1,r29 // ctop iteration adjustment | ||
139 | cmp.eq p10,p0=r29,r0 // do we really need to loop? | ||
140 | add src1=8,src0 // 2nd src pointer | ||
141 | cmp.le p6,p0=8,tmp | ||
142 | ;; | ||
143 | cmp.le p7,p0=16,tmp | ||
144 | mov ar.lc=cnt // loop setup | ||
145 | cmp.eq p16,p17 = r0,r0 | ||
146 | mov ar.ec=2 | ||
147 | (p10) br.dpnt.few .aligned_src_tail | ||
148 | ;; | ||
149 | TEXT_ALIGN(32) | ||
150 | 1: | ||
151 | EX(.ex_handler, (p16) ld8 r34=[src0],16) | ||
152 | EK(.ex_handler, (p16) ld8 r38=[src1],16) | ||
153 | EX(.ex_handler, (p17) st8 [dst0]=r33,16) | ||
154 | EK(.ex_handler, (p17) st8 [dst1]=r37,16) | ||
155 | ;; | ||
156 | EX(.ex_handler, (p16) ld8 r32=[src0],16) | ||
157 | EK(.ex_handler, (p16) ld8 r36=[src1],16) | ||
158 | EX(.ex_handler, (p16) st8 [dst0]=r34,16) | ||
159 | EK(.ex_handler, (p16) st8 [dst1]=r38,16) | ||
160 | br.ctop.dptk.few 1b | ||
161 | ;; | ||
162 | |||
163 | .aligned_src_tail: | ||
164 | EX(.ex_handler, (p6) ld8 t1=[src0]) | ||
165 | mov ar.lc=saved_lc | ||
166 | mov ar.pfs=saved_pfs | ||
167 | EX(.ex_hndlr_s, (p7) ld8 t2=[src1],8) | ||
168 | cmp.le p8,p0=24,tmp | ||
169 | and r21=-8,tmp | ||
170 | ;; | ||
171 | EX(.ex_hndlr_s, (p8) ld8 t3=[src1]) | ||
172 | EX(.ex_handler, (p6) st8 [dst0]=t1) // store byte 1 | ||
173 | and in2=7,tmp // remaining length | ||
174 | EX(.ex_hndlr_d, (p7) st8 [dst1]=t2,8) // store byte 2 | ||
175 | add src0=src0,r21 // setting up src pointer | ||
176 | add dst0=dst0,r21 // setting up dest pointer | ||
177 | ;; | ||
178 | EX(.ex_handler, (p8) st8 [dst1]=t3) // store byte 3 | ||
179 | mov pr=saved_pr,-1 | ||
180 | br.dptk.many .memcpy_short | ||
181 | ;; | ||
182 | |||
183 | /* code taken from copy_page_mck */ | ||
184 | .long_copy: | ||
185 | .rotr v[2*PREFETCH_DIST] | ||
186 | .rotp p[N] | ||
187 | |||
188 | mov src_pre_mem = src0 | ||
189 | mov pr.rot = 0x10000 | ||
190 | mov ar.ec = 1 // special unrolled loop | ||
191 | |||
192 | mov dst_pre_mem = dst0 | ||
193 | |||
194 | add src_pre_l2 = 8*8, src0 | ||
195 | add dst_pre_l2 = 8*8, dst0 | ||
196 | ;; | ||
197 | add src0 = 8, src_pre_mem // first t1 src | ||
198 | mov ar.lc = 2*PREFETCH_DIST - 1 | ||
199 | shr.u cnt=in2,7 // number of lines | ||
200 | add src1 = 3*8, src_pre_mem // first t3 src | ||
201 | add dst0 = 8, dst_pre_mem // first t1 dst | ||
202 | add dst1 = 3*8, dst_pre_mem // first t3 dst | ||
203 | ;; | ||
204 | and tmp=127,in2 // remaining bytes after this block | ||
205 | add cnt = -(2*PREFETCH_DIST) - 1, cnt | ||
206 | // same as .line_copy loop, but with all predicated-off instructions removed: | ||
207 | .prefetch_loop: | ||
208 | EX(.ex_hndlr_lcpy_1, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 | ||
209 | EK(.ex_hndlr_lcpy_1, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 | ||
210 | br.ctop.sptk .prefetch_loop | ||
211 | ;; | ||
212 | cmp.eq p16, p0 = r0, r0 // reset p16 to 1 | ||
213 | mov ar.lc = cnt | ||
214 | mov ar.ec = N // # of stages in pipeline | ||
215 | ;; | ||
216 | .line_copy: | ||
217 | EX(.ex_handler, (p[D]) ld8 t2 = [src0], 3*8) // M0 | ||
218 | EK(.ex_handler, (p[D]) ld8 t4 = [src1], 3*8) // M1 | ||
219 | EX(.ex_handler_lcpy, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 prefetch dst from memory | ||
220 | EK(.ex_handler_lcpy, (p[D]) st8 [dst_pre_l2] = n8, 128) // M3 prefetch dst from L2 | ||
221 | ;; | ||
222 | EX(.ex_handler_lcpy, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 prefetch src from memory | ||
223 | EK(.ex_handler_lcpy, (p[C]) ld8 n8 = [src_pre_l2], 128) // M1 prefetch src from L2 | ||
224 | EX(.ex_handler, (p[D]) st8 [dst0] = t1, 8) // M2 | ||
225 | EK(.ex_handler, (p[D]) st8 [dst1] = t3, 8) // M3 | ||
226 | ;; | ||
227 | EX(.ex_handler, (p[D]) ld8 t5 = [src0], 8) | ||
228 | EK(.ex_handler, (p[D]) ld8 t7 = [src1], 3*8) | ||
229 | EX(.ex_handler, (p[D]) st8 [dst0] = t2, 3*8) | ||
230 | EK(.ex_handler, (p[D]) st8 [dst1] = t4, 3*8) | ||
231 | ;; | ||
232 | EX(.ex_handler, (p[D]) ld8 t6 = [src0], 3*8) | ||
233 | EK(.ex_handler, (p[D]) ld8 t10 = [src1], 8) | ||
234 | EX(.ex_handler, (p[D]) st8 [dst0] = t5, 8) | ||
235 | EK(.ex_handler, (p[D]) st8 [dst1] = t7, 3*8) | ||
236 | ;; | ||
237 | EX(.ex_handler, (p[D]) ld8 t9 = [src0], 3*8) | ||
238 | EK(.ex_handler, (p[D]) ld8 t11 = [src1], 3*8) | ||
239 | EX(.ex_handler, (p[D]) st8 [dst0] = t6, 3*8) | ||
240 | EK(.ex_handler, (p[D]) st8 [dst1] = t10, 8) | ||
241 | ;; | ||
242 | EX(.ex_handler, (p[D]) ld8 t12 = [src0], 8) | ||
243 | EK(.ex_handler, (p[D]) ld8 t14 = [src1], 8) | ||
244 | EX(.ex_handler, (p[D]) st8 [dst0] = t9, 3*8) | ||
245 | EK(.ex_handler, (p[D]) st8 [dst1] = t11, 3*8) | ||
246 | ;; | ||
247 | EX(.ex_handler, (p[D]) ld8 t13 = [src0], 4*8) | ||
248 | EK(.ex_handler, (p[D]) ld8 t15 = [src1], 4*8) | ||
249 | EX(.ex_handler, (p[D]) st8 [dst0] = t12, 8) | ||
250 | EK(.ex_handler, (p[D]) st8 [dst1] = t14, 8) | ||
251 | ;; | ||
252 | EX(.ex_handler, (p[C]) ld8 t1 = [src0], 8) | ||
253 | EK(.ex_handler, (p[C]) ld8 t3 = [src1], 8) | ||
254 | EX(.ex_handler, (p[D]) st8 [dst0] = t13, 4*8) | ||
255 | EK(.ex_handler, (p[D]) st8 [dst1] = t15, 4*8) | ||
256 | br.ctop.sptk .line_copy | ||
257 | ;; | ||
258 | |||
259 | add dst0=-8,dst0 | ||
260 | add src0=-8,src0 | ||
261 | mov in2=tmp | ||
262 | .restore sp | ||
263 | br.sptk.many .medium_copy | ||
264 | ;; | ||
265 | |||
266 | #define BLOCK_SIZE 128*32 | ||
267 | #define blocksize r23 | ||
268 | #define curlen r24 | ||
269 | |||
270 | // dest is on 8-byte boundary, src is not. We need to do | ||
271 | // ld8-ld8, shrp, then st8. Max 8 byte copy per cycle. | ||
272 | .unaligned_src: | ||
273 | .prologue | ||
274 | .save ar.pfs, saved_pfs | ||
275 | alloc saved_pfs=ar.pfs,3,5,0,8 | ||
276 | .save ar.lc, saved_lc | ||
277 | mov saved_lc=ar.lc | ||
278 | .save pr, saved_pr | ||
279 | mov saved_pr=pr | ||
280 | .body | ||
281 | .4k_block: | ||
282 | mov saved_in0=dst0 // need to save all input arguments | ||
283 | mov saved_in2=in2 | ||
284 | mov blocksize=BLOCK_SIZE | ||
285 | ;; | ||
286 | cmp.lt p6,p7=blocksize,in2 | ||
287 | mov saved_in1=src0 | ||
288 | ;; | ||
289 | (p6) mov in2=blocksize | ||
290 | ;; | ||
291 | shr.u r21=in2,7 // this much cache line | ||
292 | shr.u r22=in2,4 // number of 16-byte iteration | ||
293 | and curlen=15,in2 // copy length after iteration | ||
294 | and r30=7,src0 // source alignment | ||
295 | ;; | ||
296 | cmp.lt p7,p8=1,r21 | ||
297 | add cnt=-1,r21 | ||
298 | ;; | ||
299 | |||
300 | add src_pre_mem=0,src0 // prefetch src pointer | ||
301 | add dst_pre_mem=0,dst0 // prefetch dest pointer | ||
302 | and src0=-8,src0 // 1st src pointer | ||
303 | (p7) mov ar.lc = r21 | ||
304 | (p8) mov ar.lc = r0 | ||
305 | ;; | ||
306 | TEXT_ALIGN(32) | ||
307 | 1: lfetch.fault [src_pre_mem], 128 | ||
308 | lfetch.fault.excl [dst_pre_mem], 128 | ||
309 | br.cloop.dptk.few 1b | ||
310 | ;; | ||
311 | |||
312 | shladd dst1=r22,3,dst0 // 2nd dest pointer | ||
313 | shladd src1=r22,3,src0 // 2nd src pointer | ||
314 | cmp.eq p8,p9=r22,r0 // do we really need to loop? | ||
315 | cmp.le p6,p7=8,curlen; // have at least 8 bytes remaining? | ||
316 | add cnt=-1,r22 // ctop iteration adjustment | ||
317 | ;; | ||
318 | EX(.ex_handler, (p9) ld8 r33=[src0],8) // loop primer | ||
319 | EK(.ex_handler, (p9) ld8 r37=[src1],8) | ||
320 | (p8) br.dpnt.few .noloop | ||
321 | ;; | ||
322 | |||
323 | // The jump address is calculated based on src alignment. The COPYU | ||
324 | // macro below needs to confine its size to a power of two, so an entry | ||
325 | // can be calculated using shl instead of an expensive multiply. The | ||
326 | // size is then hard coded by the following #define to match the | ||
327 | // actual size. This makes it somewhat tedious when the COPYU macro gets | ||
328 | // changed and this needs to be adjusted to match. | ||
329 | #define LOOP_SIZE 6 | ||
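A rough C equivalent of the entry-address computation performed just below (mov r29=ip through br.cond.sptk.few b6) may help; this is an illustrative sketch rather than kernel code, and it assumes, as the comment above requires, that each COPYU() expansion is exactly 1 << LOOP_SIZE bytes:

    /* illustrative only: map a source misalignment of 1..7 bytes to the
     * matching COPYU() entry; misalignment 0 never reaches this path
     * because the aligned copy handles it. */
    static unsigned long copyu_entry(unsigned long jump_table_addr,
                                     unsigned long src_misalignment)
    {
            unsigned long entry_size = 1UL << 6;    /* 1 << LOOP_SIZE */
            return jump_table_addr + (src_misalignment - 1) * entry_size;
    }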
330 | 1: | ||
331 | mov r29=ip // jmp_table thread | ||
332 | mov ar.lc=cnt | ||
333 | ;; | ||
334 | add r29=.jump_table - 1b - (.jmp1-.jump_table), r29 | ||
335 | shl r28=r30, LOOP_SIZE // jmp_table thread | ||
336 | mov ar.ec=2 // loop setup | ||
337 | ;; | ||
338 | add r29=r29,r28 // jmp_table thread | ||
339 | cmp.eq p16,p17=r0,r0 | ||
340 | ;; | ||
341 | mov b6=r29 // jmp_table thread | ||
342 | ;; | ||
343 | br.cond.sptk.few b6 | ||
344 | |||
345 | // for 8-15 byte case | ||
346 | // We will skip the loop, but need to replicate the side effect | ||
347 | // that the loop produces. | ||
348 | .noloop: | ||
349 | EX(.ex_handler, (p6) ld8 r37=[src1],8) | ||
350 | add src0=8,src0 | ||
351 | (p6) shl r25=r30,3 | ||
352 | ;; | ||
353 | EX(.ex_handler, (p6) ld8 r27=[src1]) | ||
354 | (p6) shr.u r28=r37,r25 | ||
355 | (p6) sub r26=64,r25 | ||
356 | ;; | ||
357 | (p6) shl r27=r27,r26 | ||
358 | ;; | ||
359 | (p6) or r21=r28,r27 | ||
360 | |||
361 | .unaligned_src_tail: | ||
362 | /* check if we have more than blocksize to copy; if so, go back */ | ||
363 | cmp.gt p8,p0=saved_in2,blocksize | ||
364 | ;; | ||
365 | (p8) add dst0=saved_in0,blocksize | ||
366 | (p8) add src0=saved_in1,blocksize | ||
367 | (p8) sub in2=saved_in2,blocksize | ||
368 | (p8) br.dpnt .4k_block | ||
369 | ;; | ||
370 | |||
371 | /* we have up to 15 bytes to copy in the tail. | ||
372 | * part of the work is already done in the jump table code; | ||
373 | * we are in the following state: | ||
374 | * src side: | ||
375 | * | ||
376 | * xxxxxx xx <----- r21 has xxxxxxxx already | ||
377 | * -------- -------- -------- | ||
378 | * 0 8 16 | ||
379 | * ^ | ||
380 | * | | ||
381 | * src1 | ||
382 | * | ||
383 | * dst | ||
384 | * -------- -------- -------- | ||
385 | * ^ | ||
386 | * | | ||
387 | * dst1 | ||
388 | */ | ||
389 | EX(.ex_handler, (p6) st8 [dst1]=r21,8) // more than 8 bytes to copy | ||
390 | (p6) add curlen=-8,curlen // update length | ||
391 | mov ar.pfs=saved_pfs | ||
392 | ;; | ||
393 | mov ar.lc=saved_lc | ||
394 | mov pr=saved_pr,-1 | ||
395 | mov in2=curlen // remaining length | ||
396 | mov dst0=dst1 // dest pointer | ||
397 | add src0=src1,r30 // forward by src alignment | ||
398 | ;; | ||
399 | |||
400 | // 7 bytes or smaller. | ||
401 | .memcpy_short: | ||
402 | cmp.le p8,p9 = 1,in2 | ||
403 | cmp.le p10,p11 = 2,in2 | ||
404 | cmp.le p12,p13 = 3,in2 | ||
405 | cmp.le p14,p15 = 4,in2 | ||
406 | add src1=1,src0 // second src pointer | ||
407 | add dst1=1,dst0 // second dest pointer | ||
408 | ;; | ||
409 | |||
410 | EX(.ex_handler_short, (p8) ld1 t1=[src0],2) | ||
411 | EK(.ex_handler_short, (p10) ld1 t2=[src1],2) | ||
412 | (p9) br.ret.dpnt rp // 0 byte copy | ||
413 | ;; | ||
414 | |||
415 | EX(.ex_handler_short, (p8) st1 [dst0]=t1,2) | ||
416 | EK(.ex_handler_short, (p10) st1 [dst1]=t2,2) | ||
417 | (p11) br.ret.dpnt rp // 1 byte copy | ||
418 | |||
419 | EX(.ex_handler_short, (p12) ld1 t3=[src0],2) | ||
420 | EK(.ex_handler_short, (p14) ld1 t4=[src1],2) | ||
421 | (p13) br.ret.dpnt rp // 2 byte copy | ||
422 | ;; | ||
423 | |||
424 | cmp.le p6,p7 = 5,in2 | ||
425 | cmp.le p8,p9 = 6,in2 | ||
426 | cmp.le p10,p11 = 7,in2 | ||
427 | |||
428 | EX(.ex_handler_short, (p12) st1 [dst0]=t3,2) | ||
429 | EK(.ex_handler_short, (p14) st1 [dst1]=t4,2) | ||
430 | (p15) br.ret.dpnt rp // 3 byte copy | ||
431 | ;; | ||
432 | |||
433 | EX(.ex_handler_short, (p6) ld1 t5=[src0],2) | ||
434 | EK(.ex_handler_short, (p8) ld1 t6=[src1],2) | ||
435 | (p7) br.ret.dpnt rp // 4 byte copy | ||
436 | ;; | ||
437 | |||
438 | EX(.ex_handler_short, (p6) st1 [dst0]=t5,2) | ||
439 | EK(.ex_handler_short, (p8) st1 [dst1]=t6,2) | ||
440 | (p9) br.ret.dptk rp // 5 byte copy | ||
441 | |||
442 | EX(.ex_handler_short, (p10) ld1 t7=[src0],2) | ||
443 | (p11) br.ret.dptk rp // 6 byte copy | ||
444 | ;; | ||
445 | |||
446 | EX(.ex_handler_short, (p10) st1 [dst0]=t7,2) | ||
447 | br.ret.dptk rp // done all cases | ||
448 | |||
449 | |||
450 | /* Align dest to the nearest 8-byte boundary. We know we have at | ||
451 | * least 7 bytes to copy, enough to crawl to an 8-byte boundary. | ||
452 | * The actual number of bytes to crawl depends on the dest alignment. | ||
453 | * 7 bytes or less is taken care of at .memcpy_short | ||
454 | |||
455 | * src0 - source even index | ||
456 | * src1 - source odd index | ||
457 | * dst0 - dest even index | ||
458 | * dst1 - dest odd index | ||
459 | * r30 - distance to 8-byte boundary | ||
460 | */ | ||
461 | |||
462 | .align_dest: | ||
463 | add src1=1,in1 // source odd index | ||
464 | cmp.le p7,p0 = 2,r30 // for .align_dest | ||
465 | cmp.le p8,p0 = 3,r30 // for .align_dest | ||
466 | EX(.ex_handler_short, (p6) ld1 t1=[src0],2) | ||
467 | cmp.le p9,p0 = 4,r30 // for .align_dest | ||
468 | cmp.le p10,p0 = 5,r30 | ||
469 | ;; | ||
470 | EX(.ex_handler_short, (p7) ld1 t2=[src1],2) | ||
471 | EK(.ex_handler_short, (p8) ld1 t3=[src0],2) | ||
472 | cmp.le p11,p0 = 6,r30 | ||
473 | EX(.ex_handler_short, (p6) st1 [dst0] = t1,2) | ||
474 | cmp.le p12,p0 = 7,r30 | ||
475 | ;; | ||
476 | EX(.ex_handler_short, (p9) ld1 t4=[src1],2) | ||
477 | EK(.ex_handler_short, (p10) ld1 t5=[src0],2) | ||
478 | EX(.ex_handler_short, (p7) st1 [dst1] = t2,2) | ||
479 | EK(.ex_handler_short, (p8) st1 [dst0] = t3,2) | ||
480 | ;; | ||
481 | EX(.ex_handler_short, (p11) ld1 t6=[src1],2) | ||
482 | EK(.ex_handler_short, (p12) ld1 t7=[src0],2) | ||
483 | cmp.eq p6,p7=r28,r29 | ||
484 | EX(.ex_handler_short, (p9) st1 [dst1] = t4,2) | ||
485 | EK(.ex_handler_short, (p10) st1 [dst0] = t5,2) | ||
486 | sub in2=in2,r30 | ||
487 | ;; | ||
488 | EX(.ex_handler_short, (p11) st1 [dst1] = t6,2) | ||
489 | EK(.ex_handler_short, (p12) st1 [dst0] = t7) | ||
490 | add dst0=in0,r30 // setup arguments | ||
491 | add src0=in1,r30 | ||
492 | (p6) br.cond.dptk .aligned_src | ||
493 | (p7) br.cond.dpnt .unaligned_src | ||
494 | ;; | ||
495 | |||
496 | /* main loop body in jump table format */ | ||
497 | #define COPYU(shift) \ | ||
498 | 1: \ | ||
499 | EX(.ex_handler, (p16) ld8 r32=[src0],8); /* 1 */ \ | ||
500 | EK(.ex_handler, (p16) ld8 r36=[src1],8); \ | ||
501 | (p17) shrp r35=r33,r34,shift;; /* 1 */ \ | ||
502 | EX(.ex_handler, (p6) ld8 r22=[src1]); /* common, prime for tail section */ \ | ||
503 | nop.m 0; \ | ||
504 | (p16) shrp r38=r36,r37,shift; \ | ||
505 | EX(.ex_handler, (p17) st8 [dst0]=r35,8); /* 1 */ \ | ||
506 | EK(.ex_handler, (p17) st8 [dst1]=r39,8); \ | ||
507 | br.ctop.dptk.few 1b;; \ | ||
508 | (p7) add src1=-8,src1; /* back out for <8 byte case */ \ | ||
509 | shrp r21=r22,r38,shift; /* speculative work */ \ | ||
510 | br.sptk.few .unaligned_src_tail /* branch out of jump table */ \ | ||
511 | ;; | ||
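A plain C sketch of what one COPYU() iteration accomplishes may help: every 8-byte destination word is stitched together from two adjacent aligned source words with a funnel shift, which is what a single shrp does per cycle. The sketch is illustrative only; the rotating-register pipelining, predication and exception hooks of the real macro are omitted.

    #include <stdint.h>
    #include <stddef.h>

    /* illustrative sketch, not kernel code */
    static void copy_from_unaligned_src(uint64_t *dst, const unsigned char *src,
                                        size_t nwords, unsigned misalign /* 1..7 */)
    {
            const uint64_t *s = (const uint64_t *)(src - misalign); /* round down */
            unsigned shift = misalign * 8;
            uint64_t prev = s[0];

            for (size_t i = 0; i < nwords; i++) {
                    uint64_t next = s[i + 1];
                    /* roughly: shrp dst_word = next, prev, shift */
                    dst[i] = (prev >> shift) | (next << (64 - shift));
                    prev = next;
            }
            /* note: like the assembly, whole aligned words are read, so a few
             * bytes beyond the last source byte may be touched. */
    }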
512 | TEXT_ALIGN(32) | ||
513 | .jump_table: | ||
514 | COPYU(8) // unaligned cases | ||
515 | .jmp1: | ||
516 | COPYU(16) | ||
517 | COPYU(24) | ||
518 | COPYU(32) | ||
519 | COPYU(40) | ||
520 | COPYU(48) | ||
521 | COPYU(56) | ||
522 | |||
523 | #undef A | ||
524 | #undef B | ||
525 | #undef C | ||
526 | #undef D | ||
527 | END(memcpy) | ||
528 | |||
529 | /* | ||
530 | * Due to lack of local tag support in the gcc 2.x assembler, it is not clear | ||
531 | * which instruction faulted in the bundle. The exception algorithm is that we | ||
532 | * first figure out the faulting address, then detect whether any progress | ||
533 | * has been made on the copy; if so, we redo the copy from the last known | ||
534 | * copied location up to the faulting address (exclusive). In the | ||
535 | * copy_from_user case, the remaining bytes in the kernel buffer are zeroed. | ||
536 | * | ||
537 | * Take copy_from_user as an example: there are multiple loads in a bundle, | ||
538 | * and those loads could span two pages, so the faulting address is | ||
539 | * calculated as page_round_down(max(src0, src1)). | ||
540 | * This is based on the knowledge that if we can access one byte in a page, | ||
541 | * we can access any byte in that page. | ||
542 | * | ||
543 | * predicates used in the exception handler: | ||
544 | * p6-p7: direction | ||
545 | * p10-p11: src faulting addr calculation | ||
546 | * p12-p13: dst faulting addr calculation | ||
547 | */ | ||
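In C terms, the faulting-address estimate described above amounts to the following; this is an illustrative sketch only, with page_size standing in for PAGE_SIZE:

    #include <stdint.h>

    /* page_round_down(max(a, b)): the page containing the larger of the two
     * in-flight pointers is the first page the copy could not access. */
    static uintptr_t fault_address_estimate(uintptr_t a, uintptr_t b,
                                            uintptr_t page_size)
    {
            uintptr_t p = (a > b) ? a : b;
            return p & ~(page_size - 1);
    }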
548 | |||
549 | #define A r19 | ||
550 | #define B r20 | ||
551 | #define C r21 | ||
552 | #define D r22 | ||
553 | #define F r28 | ||
554 | |||
555 | #define memset_arg0 r32 | ||
556 | #define memset_arg2 r33 | ||
557 | |||
558 | #define saved_retval loc0 | ||
559 | #define saved_rtlink loc1 | ||
560 | #define saved_pfs_stack loc2 | ||
561 | |||
562 | .ex_hndlr_s: | ||
563 | add src0=8,src0 | ||
564 | br.sptk .ex_handler | ||
565 | ;; | ||
566 | .ex_hndlr_d: | ||
567 | add dst0=8,dst0 | ||
568 | br.sptk .ex_handler | ||
569 | ;; | ||
570 | .ex_hndlr_lcpy_1: | ||
571 | mov src1=src_pre_mem | ||
572 | mov dst1=dst_pre_mem | ||
573 | cmp.gtu p10,p11=src_pre_mem,saved_in1 | ||
574 | cmp.gtu p12,p13=dst_pre_mem,saved_in0 | ||
575 | ;; | ||
576 | (p10) add src0=8,saved_in1 | ||
577 | (p11) mov src0=saved_in1 | ||
578 | (p12) add dst0=8,saved_in0 | ||
579 | (p13) mov dst0=saved_in0 | ||
580 | br.sptk .ex_handler | ||
581 | .ex_handler_lcpy: | ||
582 | // In the line_copy block, the preload addresses should always be ahead | ||
583 | // of the other two src/dst pointers. Furthermore, src1/dst1 should | ||
584 | // always be ahead of src0/dst0. | ||
585 | mov src1=src_pre_mem | ||
586 | mov dst1=dst_pre_mem | ||
587 | .ex_handler: | ||
588 | mov pr=saved_pr,-1 // first restore pr, lc, and pfs | ||
589 | mov ar.lc=saved_lc | ||
590 | mov ar.pfs=saved_pfs | ||
591 | ;; | ||
592 | .ex_handler_short: // fault occurred in sections that didn't change pr, lc, pfs | ||
593 | cmp.ltu p6,p7=saved_in0, saved_in1 // get the copy direction | ||
594 | cmp.ltu p10,p11=src0,src1 | ||
595 | cmp.ltu p12,p13=dst0,dst1 | ||
596 | fcmp.eq p8,p0=f6,f0 // is it memcpy? | ||
597 | mov tmp = dst0 | ||
598 | ;; | ||
599 | (p11) mov src1 = src0 // pick the larger of the two | ||
600 | (p13) mov dst0 = dst1 // make dst0 the smaller one | ||
601 | (p13) mov dst1 = tmp // and dst1 the larger one | ||
602 | ;; | ||
603 | (p6) dep F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary | ||
604 | (p7) dep F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary | ||
605 | ;; | ||
606 | (p6) cmp.le p14,p0=dst0,saved_in0 // no progress has been made on store | ||
607 | (p7) cmp.le p14,p0=src0,saved_in1 // no progress has been made on load | ||
608 | mov retval=saved_in2 | ||
609 | (p8) ld1 tmp=[src1] // force an oops for memcpy call | ||
610 | (p8) st1 [dst1]=r0 // force an oops for memcpy call | ||
611 | (p14) br.ret.sptk.many rp | ||
612 | |||
613 | /* | ||
614 | * The remaining bytes to copy are calculated as: | ||
615 | * | ||
616 | * A = (faulting_addr - orig_src) -> len up to the faulting ld address | ||
617 | * or | ||
618 | * (faulting_addr - orig_dst) -> len up to the faulting st address | ||
619 | * B = (cur_dst - orig_dst) -> len copied so far | ||
620 | * C = A - B -> len that still needs to be copied | ||
621 | * D = orig_len - A -> len that needs to be zeroed | ||
622 | */ | ||
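As a worked example with made-up numbers for the copy_from_user case: suppose orig_src = 0x1000, orig_dst = 0x5000, orig_len = 0x300, the stores have reached dst0 = 0x5180 and the load faults with F = 0x1200. Then A = 0x1200 - 0x1000 = 0x200, B = 0x5180 - 0x5000 = 0x180, C = 0x200 - 0x180 = 0x80 bytes are re-copied by the recursive __copy_user call below, and D = 0x300 - 0x200 = 0x100 bytes starting at orig_dst + A are zeroed by the memset call.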
623 | (p6) sub A = F, saved_in0 | ||
624 | (p7) sub A = F, saved_in1 | ||
625 | clrrrb | ||
626 | ;; | ||
627 | alloc saved_pfs_stack=ar.pfs,3,3,3,0 | ||
628 | sub B = dst0, saved_in0 // how many bytes copied so far | ||
629 | ;; | ||
630 | sub C = A, B | ||
631 | sub D = saved_in2, A | ||
632 | ;; | ||
633 | cmp.gt p8,p0=C,r0 // any bytes left to copy? | ||
634 | add memset_arg0=saved_in0, A | ||
635 | (p6) mov memset_arg2=0 // copy_to_user should not call memset | ||
636 | (p7) mov memset_arg2=D // copy_from_user needs to have kbuf zeroed | ||
637 | mov r8=0 | ||
638 | mov saved_retval = D | ||
639 | mov saved_rtlink = b0 | ||
640 | |||
641 | add out0=saved_in0, B | ||
642 | add out1=saved_in1, B | ||
643 | mov out2=C | ||
644 | (p8) br.call.sptk.few b0=__copy_user // recursive call | ||
645 | ;; | ||
646 | |||
647 | add saved_retval=saved_retval,r8 // above might return non-zero value | ||
648 | cmp.gt p8,p0=memset_arg2,r0 // any bytes to zero? | ||
649 | mov out0=memset_arg0 // *s | ||
650 | mov out1=r0 // c | ||
651 | mov out2=memset_arg2 // n | ||
652 | (p8) br.call.sptk.few b0=memset | ||
653 | ;; | ||
654 | |||
655 | mov retval=saved_retval | ||
656 | mov ar.pfs=saved_pfs_stack | ||
657 | mov b0=saved_rtlink | ||
658 | br.ret.sptk.many rp | ||
659 | |||
660 | /* end of McKinley specific optimization */ | ||
661 | END(__copy_user) | ||
diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S new file mode 100644 index 000000000000..bd8cf907fe22 --- /dev/null +++ b/arch/ia64/lib/memset.S | |||
@@ -0,0 +1,362 @@ | |||
1 | /* Optimized version of the standard memset() function. | ||
2 | |||
3 | Copyright (c) 2002 Hewlett-Packard Co/CERN | ||
4 | Sverre Jarp <Sverre.Jarp@cern.ch> | ||
5 | |||
6 | Return: dest | ||
7 | |||
8 | Inputs: | ||
9 | in0: dest | ||
10 | in1: value | ||
11 | in2: count | ||
12 | |||
13 | The algorithm is fairly straightforward: set byte by byte until we | ||
14 | get to a 16B-aligned address, then loop on 128B chunks using an | ||
15 | early store as prefetching, then loop on 32B chunks, then clear remaining | ||
16 | words, and finally clear remaining bytes. | ||
17 | Since a stf.spill f0 can store 16B in one go, we use this instruction | ||
18 | to get peak speed when value = 0. */ | ||
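For orientation, a plain C outline of those phases follows. It is an illustrative sketch only: the real routine implements the same structure with predication, rotating registers, stf8 stores and stf.spill f0 for the zero case rather than these loops.

    #include <stdint.h>
    #include <stddef.h>

    /* illustrative sketch, not the kernel routine */
    static void *memset_outline(void *dest, int value, size_t cnt)
    {
            unsigned char *p = dest;
            uint64_t v = (uint64_t)(unsigned char)value * 0x0101010101010101ULL;

            while (((uintptr_t)p & 15) && cnt) {       /* crawl to 16B alignment */
                    *p++ = (unsigned char)value;
                    cnt--;
            }
            while (cnt >= 128) {                       /* one cache line at a time */
                    for (int i = 0; i < 16; i++)
                            ((uint64_t *)p)[i] = v;
                    p += 128;
                    cnt -= 128;
            }
            while (cnt >= 32) {                        /* 32B chunks */
                    for (int i = 0; i < 4; i++)
                            ((uint64_t *)p)[i] = v;
                    p += 32;
                    cnt -= 32;
            }
            while (cnt >= 8) {                         /* remaining words */
                    *(uint64_t *)p = v;
                    p += 8;
                    cnt -= 8;
            }
            while (cnt--)                              /* remaining bytes */
                    *p++ = (unsigned char)value;
            return dest;
    }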
19 | |||
20 | #include <asm/asmmacro.h> | ||
21 | #undef ret | ||
22 | |||
23 | #define dest in0 | ||
24 | #define value in1 | ||
25 | #define cnt in2 | ||
26 | |||
27 | #define tmp r31 | ||
28 | #define save_lc r30 | ||
29 | #define ptr0 r29 | ||
30 | #define ptr1 r28 | ||
31 | #define ptr2 r27 | ||
32 | #define ptr3 r26 | ||
33 | #define ptr9 r24 | ||
34 | #define loopcnt r23 | ||
35 | #define linecnt r22 | ||
36 | #define bytecnt r21 | ||
37 | |||
38 | #define fvalue f6 | ||
39 | |||
40 | // This routine uses only scratch predicate registers (p6 - p15) | ||
41 | #define p_scr p6 // default register for same-cycle branches | ||
42 | #define p_nz p7 | ||
43 | #define p_zr p8 | ||
44 | #define p_unalgn p9 | ||
45 | #define p_y p11 | ||
46 | #define p_n p12 | ||
47 | #define p_yy p13 | ||
48 | #define p_nn p14 | ||
49 | |||
50 | #define MIN1 15 | ||
51 | #define MIN1P1HALF 8 | ||
52 | #define LINE_SIZE 128 | ||
53 | #define LSIZE_SH 7 // shift amount | ||
54 | #define PREF_AHEAD 8 | ||
55 | |||
56 | GLOBAL_ENTRY(memset) | ||
57 | { .mmi | ||
58 | .prologue | ||
59 | alloc tmp = ar.pfs, 3, 0, 0, 0 | ||
60 | .body | ||
61 | lfetch.nt1 [dest] // | ||
62 | .save ar.lc, save_lc | ||
63 | mov.i save_lc = ar.lc | ||
64 | } { .mmi | ||
65 | mov ret0 = dest // return value | ||
66 | cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero | ||
67 | cmp.eq p_scr, p0 = cnt, r0 | ||
68 | ;; } | ||
69 | { .mmi | ||
70 | and ptr2 = -(MIN1+1), dest // aligned address | ||
71 | and tmp = MIN1, dest // prepare to check for correct alignment | ||
72 | tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U) | ||
73 | } { .mib | ||
74 | mov ptr1 = dest | ||
75 | mux1 value = value, @brcst // create 8 identical bytes in word | ||
76 | (p_scr) br.ret.dpnt.many rp // return immediately if count = 0 | ||
77 | ;; } | ||
78 | { .mib | ||
79 | cmp.ne p_unalgn, p0 = tmp, r0 // | ||
80 | } { .mib | ||
81 | sub bytecnt = (MIN1+1), tmp // NB: # of bytes to move is 1 higher than loopcnt | ||
82 | cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task? | ||
83 | (p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U) | ||
84 | ;; } | ||
85 | { .mmi | ||
86 | (p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment | ||
87 | (p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment | ||
88 | (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ? | ||
89 | ;; } | ||
90 | { .mib | ||
91 | (p_y) add cnt = -8, cnt // | ||
92 | (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ? | ||
93 | } { .mib | ||
94 | (p_y) st8 [ptr2] = value,-4 // | ||
95 | (p_n) add ptr2 = 4, ptr2 // | ||
96 | ;; } | ||
97 | { .mib | ||
98 | (p_yy) add cnt = -4, cnt // | ||
99 | (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ? | ||
100 | } { .mib | ||
101 | (p_yy) st4 [ptr2] = value,-2 // | ||
102 | (p_nn) add ptr2 = 2, ptr2 // | ||
103 | ;; } | ||
104 | { .mmi | ||
105 | mov tmp = LINE_SIZE+1 // for compare | ||
106 | (p_y) add cnt = -2, cnt // | ||
107 | (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ? | ||
108 | } { .mmi | ||
109 | setf.sig fvalue=value // transfer value to FLP side | ||
110 | (p_y) st2 [ptr2] = value,-1 // | ||
111 | (p_n) add ptr2 = 1, ptr2 // | ||
112 | ;; } | ||
113 | |||
114 | { .mmi | ||
115 | (p_yy) st1 [ptr2] = value // | ||
116 | cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task? | ||
117 | } { .mbb | ||
118 | (p_yy) add cnt = -1, cnt // | ||
119 | (p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few | ||
120 | ;; } | ||
121 | |||
122 | { .mib | ||
123 | nop.m 0 | ||
124 | shr.u linecnt = cnt, LSIZE_SH | ||
125 | (p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill | ||
126 | ;; } | ||
127 | |||
128 | TEXT_ALIGN(32) // --------------------- // L1A: store ahead into cache lines; fill later | ||
129 | { .mmi | ||
130 | and tmp = -(LINE_SIZE), cnt // compute end of range | ||
131 | mov ptr9 = ptr1 // used for prefetching | ||
132 | and cnt = (LINE_SIZE-1), cnt // remainder | ||
133 | } { .mmi | ||
134 | mov loopcnt = PREF_AHEAD-1 // default prefetch loop | ||
135 | cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value | ||
136 | ;; } | ||
137 | { .mmi | ||
138 | (p_scr) add loopcnt = -1, linecnt // | ||
139 | add ptr2 = 8, ptr1 // start of stores (beyond prefetch stores) | ||
140 | add ptr1 = tmp, ptr1 // first address beyond total range | ||
141 | ;; } | ||
142 | { .mmi | ||
143 | add tmp = -1, linecnt // next loop count | ||
144 | mov.i ar.lc = loopcnt // | ||
145 | ;; } | ||
146 | .pref_l1a: | ||
147 | { .mib | ||
148 | stf8 [ptr9] = fvalue, 128 // Do stores one cache line apart | ||
149 | nop.i 0 | ||
150 | br.cloop.dptk.few .pref_l1a | ||
151 | ;; } | ||
152 | { .mmi | ||
153 | add ptr0 = 16, ptr2 // Two stores in parallel | ||
154 | mov.i ar.lc = tmp // | ||
155 | ;; } | ||
156 | .l1ax: | ||
157 | { .mmi | ||
158 | stf8 [ptr2] = fvalue, 8 | ||
159 | stf8 [ptr0] = fvalue, 8 | ||
160 | ;; } | ||
161 | { .mmi | ||
162 | stf8 [ptr2] = fvalue, 24 | ||
163 | stf8 [ptr0] = fvalue, 24 | ||
164 | ;; } | ||
165 | { .mmi | ||
166 | stf8 [ptr2] = fvalue, 8 | ||
167 | stf8 [ptr0] = fvalue, 8 | ||
168 | ;; } | ||
169 | { .mmi | ||
170 | stf8 [ptr2] = fvalue, 24 | ||
171 | stf8 [ptr0] = fvalue, 24 | ||
172 | ;; } | ||
173 | { .mmi | ||
174 | stf8 [ptr2] = fvalue, 8 | ||
175 | stf8 [ptr0] = fvalue, 8 | ||
176 | ;; } | ||
177 | { .mmi | ||
178 | stf8 [ptr2] = fvalue, 24 | ||
179 | stf8 [ptr0] = fvalue, 24 | ||
180 | ;; } | ||
181 | { .mmi | ||
182 | stf8 [ptr2] = fvalue, 8 | ||
183 | stf8 [ptr0] = fvalue, 32 | ||
184 | cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching? | ||
185 | ;; } | ||
186 | { .mmb | ||
187 | stf8 [ptr2] = fvalue, 24 | ||
188 | (p_scr) stf8 [ptr9] = fvalue, 128 | ||
189 | br.cloop.dptk.few .l1ax | ||
190 | ;; } | ||
191 | { .mbb | ||
192 | cmp.le p_scr, p0 = 8, cnt // just a few bytes left ? | ||
193 | (p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2 | ||
194 | br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3 | ||
195 | ;; } | ||
196 | |||
197 | TEXT_ALIGN(32) | ||
198 | .l1b: // ------------------------------------ // L1B: store ahead into cache lines; fill later | ||
199 | { .mmi | ||
200 | and tmp = -(LINE_SIZE), cnt // compute end of range | ||
201 | mov ptr9 = ptr1 // used for prefetching | ||
202 | and cnt = (LINE_SIZE-1), cnt // remainder | ||
203 | } { .mmi | ||
204 | mov loopcnt = PREF_AHEAD-1 // default prefetch loop | ||
205 | cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value | ||
206 | ;; } | ||
207 | { .mmi | ||
208 | (p_scr) add loopcnt = -1, linecnt | ||
209 | add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores) | ||
210 | add ptr1 = tmp, ptr1 // first address beyond total range | ||
211 | ;; } | ||
212 | { .mmi | ||
213 | add tmp = -1, linecnt // next loop count | ||
214 | mov.i ar.lc = loopcnt | ||
215 | ;; } | ||
216 | .pref_l1b: | ||
217 | { .mib | ||
218 | stf.spill [ptr9] = f0, 128 // Do stores one cache line apart | ||
219 | nop.i 0 | ||
220 | br.cloop.dptk.few .pref_l1b | ||
221 | ;; } | ||
222 | { .mmi | ||
223 | add ptr0 = 16, ptr2 // Two stores in parallel | ||
224 | mov.i ar.lc = tmp | ||
225 | ;; } | ||
226 | .l1bx: | ||
227 | { .mmi | ||
228 | stf.spill [ptr2] = f0, 32 | ||
229 | stf.spill [ptr0] = f0, 32 | ||
230 | ;; } | ||
231 | { .mmi | ||
232 | stf.spill [ptr2] = f0, 32 | ||
233 | stf.spill [ptr0] = f0, 32 | ||
234 | ;; } | ||
235 | { .mmi | ||
236 | stf.spill [ptr2] = f0, 32 | ||
237 | stf.spill [ptr0] = f0, 64 | ||
238 | cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching? | ||
239 | ;; } | ||
240 | { .mmb | ||
241 | stf.spill [ptr2] = f0, 32 | ||
242 | (p_scr) stf.spill [ptr9] = f0, 128 | ||
243 | br.cloop.dptk.few .l1bx | ||
244 | ;; } | ||
245 | { .mib | ||
246 | cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? | ||
247 | (p_scr) br.cond.dpnt.many .move_bytes_from_alignment // | ||
248 | ;; } | ||
249 | |||
250 | .fraction_of_line: | ||
251 | { .mib | ||
252 | add ptr2 = 16, ptr1 | ||
253 | shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32 | ||
254 | ;; } | ||
255 | { .mib | ||
256 | cmp.eq p_scr, p0 = loopcnt, r0 | ||
257 | add loopcnt = -1, loopcnt | ||
258 | (p_scr) br.cond.dpnt.many .store_words | ||
259 | ;; } | ||
260 | { .mib | ||
261 | and cnt = 0x1f, cnt // compute the remaining cnt | ||
262 | mov.i ar.lc = loopcnt | ||
263 | ;; } | ||
264 | TEXT_ALIGN(32) | ||
265 | .l2: // ------------------------------------ // L2A: store 32B in 2 cycles | ||
266 | { .mmb | ||
267 | stf8 [ptr1] = fvalue, 8 | ||
268 | stf8 [ptr2] = fvalue, 8 | ||
269 | ;; } { .mmb | ||
270 | stf8 [ptr1] = fvalue, 24 | ||
271 | stf8 [ptr2] = fvalue, 24 | ||
272 | br.cloop.dptk.many .l2 | ||
273 | ;; } | ||
274 | .store_words: | ||
275 | { .mib | ||
276 | cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? | ||
277 | (p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch | ||
278 | ;; } | ||
279 | |||
280 | { .mmi | ||
281 | stf8 [ptr1] = fvalue, 8 // store | ||
282 | cmp.le p_y, p_n = 16, cnt | ||
283 | add cnt = -8, cnt // subtract | ||
284 | ;; } | ||
285 | { .mmi | ||
286 | (p_y) stf8 [ptr1] = fvalue, 8 // store | ||
287 | (p_y) cmp.le.unc p_yy, p_nn = 16, cnt | ||
288 | (p_y) add cnt = -8, cnt // subtract | ||
289 | ;; } | ||
290 | { .mmi // store | ||
291 | (p_yy) stf8 [ptr1] = fvalue, 8 | ||
292 | (p_yy) add cnt = -8, cnt // subtract | ||
293 | ;; } | ||
294 | |||
295 | .move_bytes_from_alignment: | ||
296 | { .mib | ||
297 | cmp.eq p_scr, p0 = cnt, r0 | ||
298 | tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ? | ||
299 | (p_scr) br.cond.dpnt.few .restore_and_exit | ||
300 | ;; } | ||
301 | { .mib | ||
302 | (p_y) st4 [ptr1] = value,4 | ||
303 | tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ? | ||
304 | ;; } | ||
305 | { .mib | ||
306 | (p_yy) st2 [ptr1] = value,2 | ||
307 | tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ? | ||
308 | ;; } | ||
309 | |||
310 | { .mib | ||
311 | (p_y) st1 [ptr1] = value | ||
312 | ;; } | ||
313 | .restore_and_exit: | ||
314 | { .mib | ||
315 | nop.m 0 | ||
316 | mov.i ar.lc = save_lc | ||
317 | br.ret.sptk.many rp | ||
318 | ;; } | ||
319 | |||
320 | .move_bytes_unaligned: | ||
321 | { .mmi | ||
322 | .pred.rel "mutex",p_y, p_n | ||
323 | .pred.rel "mutex",p_yy, p_nn | ||
324 | (p_n) cmp.le p_yy, p_nn = 4, cnt | ||
325 | (p_y) cmp.le p_yy, p_nn = 5, cnt | ||
326 | (p_n) add ptr2 = 2, ptr1 | ||
327 | } { .mmi | ||
328 | (p_y) add ptr2 = 3, ptr1 | ||
329 | (p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte [15, 14 (or less) left] | ||
330 | (p_y) add cnt = -1, cnt | ||
331 | ;; } | ||
332 | { .mmi | ||
333 | (p_yy) cmp.le.unc p_y, p0 = 8, cnt | ||
334 | add ptr3 = ptr1, cnt // prepare last store | ||
335 | mov.i ar.lc = save_lc | ||
336 | } { .mmi | ||
337 | (p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes | ||
338 | (p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [11, 10 (or less) left] | ||
339 | (p_yy) add cnt = -4, cnt | ||
340 | ;; } | ||
341 | { .mmi | ||
342 | (p_y) cmp.le.unc p_yy, p0 = 8, cnt | ||
343 | add ptr3 = -1, ptr3 // last store | ||
344 | tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ? | ||
345 | } { .mmi | ||
346 | (p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes | ||
347 | (p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [7, 6 (or less) left] | ||
348 | (p_y) add cnt = -4, cnt | ||
349 | ;; } | ||
350 | { .mmi | ||
351 | (p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes | ||
352 | (p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [3, 2 (or less) left] | ||
353 | tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ? | ||
354 | } { .mmi | ||
355 | (p_yy) add cnt = -4, cnt | ||
356 | ;; } | ||
357 | { .mmb | ||
358 | (p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes | ||
359 | (p_y) st1 [ptr3] = value // fill last byte (using ptr3) | ||
360 | br.ret.sptk.many rp | ||
361 | } | ||
362 | END(memset) | ||
diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S new file mode 100644 index 000000000000..e0cdac0a85b8 --- /dev/null +++ b/arch/ia64/lib/strlen.S | |||
@@ -0,0 +1,192 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Optimized version of the standard strlen() function | ||
4 | * | ||
5 | * | ||
6 | * Inputs: | ||
7 | * in0 address of string | ||
8 | * | ||
9 | * Outputs: | ||
10 | * ret0 the number of characters in the string (0 if empty string) | ||
11 | * does not count the \0 | ||
12 | * | ||
13 | * Copyright (C) 1999, 2001 Hewlett-Packard Co | ||
14 | * Stephane Eranian <eranian@hpl.hp.com> | ||
15 | * | ||
16 | * 09/24/99 S.Eranian add speculation recovery code | ||
17 | */ | ||
18 | |||
19 | #include <asm/asmmacro.h> | ||
20 | |||
21 | // | ||
22 | // | ||
23 | // This is an enhanced version of the basic strlen. It includes a combination | ||
24 | // of compute zero index (czx), parallel comparisons, speculative loads and | ||
25 | // loop unrolling using rotating registers. | ||
26 | // | ||
27 | // General Ideas about the algorithm: | ||
28 | // The goal is to look at the string in chunks of 8 bytes, | ||
29 | // so we need to do a few extra checks at the beginning because the | ||
30 | // string may not be 8-byte aligned. In this case we load the 8-byte | ||
31 | // quantity which includes the start of the string and mask the unused | ||
32 | // bytes with 0xff to avoid confusing czx. | ||
33 | // We use speculative loads and software pipelining to hide memory | ||
34 | // latency and do read ahead safely. This way we defer any exception. | ||
35 | // | ||
36 | // Because we don't want the kernel to be relying on particular | ||
37 | // settings of the DCR register, we provide recovery code in case | ||
38 | // speculation fails. The recovery code is going to "redo" the work using | ||
39 | // only normal loads. If we still get a fault then we generate a | ||
40 | // kernel panic. Otherwise we return the strlen as usual. | ||
41 | // | ||
42 | // The fact that speculation may fail can be caused, for instance, by | ||
43 | // the DCR.dm bit being set. In this case TLB misses are deferred, i.e., | ||
44 | // a NaT bit will be set if the translation is not present. The normal | ||
45 | // load, on the other hand, will cause the translation to be inserted | ||
46 | // if the mapping exists. | ||
47 | // | ||
48 | // It should be noted that we execute recovery code only when we need | ||
49 | // to use the data that has been speculatively loaded: we don't execute | ||
50 | // recovery code on pure read ahead data. | ||
51 | // | ||
52 | // Remarks: | ||
53 | // - the cmp r0,r0 is used as a fast way to initialize a predicate | ||
54 | // register to 1. This is required to make sure that we get the parallel | ||
55 | // compare correct. | ||
56 | // | ||
57 | // - we don't use the epilogue counter to exit the loop but we need to set | ||
58 | // it to zero beforehand. | ||
59 | // | ||
60 | // - after the loop we must test for NaT values because neither the | ||
61 | // czx nor the cmp instruction raises a NaT consumption fault. We must be | ||
62 | // careful not to look too far for a NaT we don't care about. | ||
63 | // For instance we don't need to look at a NaT in val2 if the zero byte | ||
64 | // was in val1. | ||
65 | // | ||
66 | // - Clearly performance tuning is required. | ||
67 | // | ||
68 | // | ||
69 | // | ||
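The same scheme reads more easily in plain C. The sketch below is illustrative only: it drops the speculation, software pipelining and NaT handling, and the zero_byte_index() helper stands in for what czx1.r computes in a single instruction.

    #include <stdint.h>
    #include <stddef.h>

    /* czx1.r-style helper: index of the first zero byte (counting from the
     * least significant byte), or 8 if there is none. */
    static unsigned zero_byte_index(uint64_t w)
    {
            for (unsigned i = 0; i < 8; i++)
                    if (((w >> (8 * i)) & 0xff) == 0)
                            return i;
            return 8;
    }

    /* illustrative sketch of the scheme, not the kernel routine */
    static size_t strlen_sketch(const char *s)
    {
            uintptr_t a = (uintptr_t)s;
            const uint64_t *p = (const uint64_t *)(a & ~(uintptr_t)7);
            unsigned off = (unsigned)(a & 7);          /* bytes before the string  */
            uint64_t mask = off ? ((1ULL << (8 * off)) - 1) : 0;
            uint64_t w = *p++ | mask;                  /* poison the leading bytes */
            unsigned idx;

            while ((idx = zero_byte_index(w)) == 8)    /* scan 8 bytes at a time   */
                    w = *p++;

            /* like the assembly, whole aligned words are read, so this may look
             * a few bytes past the terminating NUL. */
            return (size_t)((const char *)(p - 1) + idx - s);
    }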
70 | #define saved_pfs r11 | ||
71 | #define tmp r10 | ||
72 | #define base r16 | ||
73 | #define orig r17 | ||
74 | #define saved_pr r18 | ||
75 | #define src r19 | ||
76 | #define mask r20 | ||
77 | #define val r21 | ||
78 | #define val1 r22 | ||
79 | #define val2 r23 | ||
80 | |||
81 | GLOBAL_ENTRY(strlen) | ||
82 | .prologue | ||
83 | .save ar.pfs, saved_pfs | ||
84 | alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8 | ||
85 | |||
86 | .rotr v[2], w[2] // declares our 4 aliases | ||
87 | |||
88 | extr.u tmp=in0,0,3 // tmp=least significant 3 bits | ||
89 | mov orig=in0 // keep track of initial byte address | ||
90 | dep src=0,in0,0,3 // src=8byte-aligned in0 address | ||
91 | .save pr, saved_pr | ||
92 | mov saved_pr=pr // preserve predicates (rotation) | ||
93 | ;; | ||
94 | |||
95 | .body | ||
96 | |||
97 | ld8 v[1]=[src],8 // must not speculate: can fail here | ||
98 | shl tmp=tmp,3 // multiply by 8bits/byte | ||
99 | mov mask=-1 // our mask | ||
100 | ;; | ||
101 | ld8.s w[1]=[src],8 // speculatively load next | ||
102 | cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and | ||
103 | sub tmp=64,tmp // how many bits to shift our mask to the right | ||
104 | ;; | ||
105 | shr.u mask=mask,tmp // zero enough bits to hold v[1]'s valuable part | ||
106 | mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs) | ||
107 | ;; | ||
108 | add base=-16,src // keep track of aligned base | ||
109 | or v[1]=v[1],mask // now we have a safe initial byte pattern | ||
110 | ;; | ||
111 | 1: | ||
112 | ld8.s v[0]=[src],8 // speculatively load next | ||
113 | czx1.r val1=v[1] // search 0 byte from right | ||
114 | czx1.r val2=w[1] // search 0 byte from right following 8bytes | ||
115 | ;; | ||
116 | ld8.s w[0]=[src],8 // speculatively load next to next | ||
117 | cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8 | ||
118 | cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8 | ||
119 | (p6) br.wtop.dptk 1b // loop until p6 == 0 | ||
120 | ;; | ||
121 | // | ||
122 | // We must try the recovery code iff | ||
123 | // val1_is_nat || (val1==8 && val2_is_nat) | ||
124 | // | ||
125 | // XXX Fixme | ||
126 | // - there must be a better way of doing the test | ||
127 | // | ||
128 | cmp.eq p8,p9=8,val1 // p8 = no zero byte in v[1] (disambiguate) | ||
129 | tnat.nz p6,p7=val1 // test NaT on val1 | ||
130 | (p6) br.cond.spnt .recover // jump to recovery if val1 is NaT | ||
131 | ;; | ||
132 | // | ||
133 | // if we come here p7 is true, i.e., initialized for the parallel compare | ||
134 | // | ||
135 | cmp.eq.and p7,p0=8,val1 // val1==8? | ||
136 | tnat.nz.and p7,p0=val2 // test NaT if val2 | ||
137 | (p7) br.cond.spnt .recover // jump to recovery if val2 is NaT | ||
138 | ;; | ||
139 | (p8) mov val1=val2 // the other test got us out of the loop | ||
140 | (p8) adds src=-16,src // correct position when 3 ahead | ||
141 | (p9) adds src=-24,src // correct position when 4 ahead | ||
142 | ;; | ||
143 | sub ret0=src,orig // distance from base | ||
144 | sub tmp=8,val1 // which byte in word | ||
145 | mov pr=saved_pr,0xffffffffffff0000 | ||
146 | ;; | ||
147 | sub ret0=ret0,tmp // adjust | ||
148 | mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what | ||
149 | br.ret.sptk.many rp // end of normal execution | ||
150 | |||
151 | // | ||
152 | // Outlined recovery code when speculation failed | ||
153 | // | ||
154 | // This time we don't use speculation and rely on the normal exception | ||
156 | // mechanism. That's why the loop is not as good as the previous one, | ||
157 | // because read ahead is not possible | ||
157 | // | ||
158 | // IMPORTANT: | ||
159 | // Please note that in the case of strlen() as opposed to strlen_user() | ||
160 | // we don't use the exception mechanism, as this function is not | ||
161 | // supposed to fail. If that happens it means we have a bug and the | ||
162 | // code will cause a kernel fault. | ||
163 | // | ||
164 | // XXX Fixme | ||
165 | // - today we restart from the beginning of the string instead | ||
166 | // of trying to continue where we left off. | ||
167 | // | ||
168 | .recover: | ||
169 | ld8 val=[base],8 // will fail if unrecoverable fault | ||
170 | ;; | ||
171 | or val=val,mask // remask first bytes | ||
172 | cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop | ||
173 | ;; | ||
174 | // | ||
175 | // ar.ec is still zero here | ||
176 | // | ||
177 | 2: | ||
178 | (p6) ld8 val=[base],8 // will fail if unrecoverable fault | ||
179 | ;; | ||
180 | czx1.r val1=val // search 0 byte from right | ||
181 | ;; | ||
182 | cmp.eq p6,p0=8,val1 // val1==8 ? | ||
183 | (p6) br.wtop.dptk 2b // loop until p6 == 0 | ||
184 | ;; // (avoid WAW on p63) | ||
185 | sub ret0=base,orig // distance from base | ||
186 | sub tmp=8,val1 | ||
187 | mov pr=saved_pr,0xffffffffffff0000 | ||
188 | ;; | ||
189 | sub ret0=ret0,tmp // length=now - back -1 | ||
190 | mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what | ||
191 | br.ret.sptk.many rp // end of successful recovery code | ||
192 | END(strlen) | ||
diff --git a/arch/ia64/lib/strlen_user.S b/arch/ia64/lib/strlen_user.S new file mode 100644 index 000000000000..c71eded4285e --- /dev/null +++ b/arch/ia64/lib/strlen_user.S | |||
@@ -0,0 +1,198 @@ | |||
1 | /* | ||
2 | * Optimized version of the strlen_user() function | ||
3 | * | ||
4 | * Inputs: | ||
5 | * in0 address of buffer | ||
6 | * | ||
7 | * Outputs: | ||
8 | * ret0 0 in case of fault, strlen(buffer)+1 otherwise | ||
9 | * | ||
10 | * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co | ||
11 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
12 | * Stephane Eranian <eranian@hpl.hp.com> | ||
13 | * | ||
14 | * 01/19/99 S.Eranian heavily enhanced version (see details below) | ||
15 | * 09/24/99 S.Eranian added speculation recovery code | ||
16 | */ | ||
17 | |||
18 | #include <asm/asmmacro.h> | ||
19 | |||
20 | // | ||
21 | // int strlen_user(char *) | ||
22 | // ------------------------ | ||
23 | // Returns: | ||
24 | // - length of string + 1 | ||
25 | // - 0 in case an exception is raised | ||
26 | // | ||
27 | // This is an enhanced version of the basic strlen_user. It includes a | ||
28 | // combination of compute zero index (czx), parallel comparisons, speculative | ||
29 | // loads and loop unroll using rotating registers. | ||
30 | // | ||
31 | // General Ideas about the algorithm: | ||
32 | // The goal is to look at the string in chunks of 8 bytes, | ||
33 | // so we need to do a few extra checks at the beginning because the | ||
34 | // string may not be 8-byte aligned. In this case we load the 8-byte | ||
35 | // quantity which includes the start of the string and mask the unused | ||
36 | // bytes with 0xff to avoid confusing czx. | ||
37 | // We use speculative loads and software pipelining to hide memory | ||
38 | // latency and do read ahead safely. This way we defer any exception. | ||
39 | // | ||
40 | // Because we don't want the kernel to be relying on particular | ||
41 | // settings of the DCR register, we provide recovery code in case | ||
42 | // speculation fails. The recovery code is going to "redo" the work using | ||
43 | // only normal loads. If we still get a fault then we return an | ||
44 | // error (ret0=0). Otherwise we return the strlen+1 as usual. | ||
45 | // The fact that speculation may fail can be caused, for instance, by | ||
46 | // the DCR.dm bit being set. In this case TLB misses are deferred, i.e., | ||
47 | // a NaT bit will be set if the translation is not present. The normal | ||
48 | // load, on the other hand, will cause the translation to be inserted | ||
49 | // if the mapping exists. | ||
50 | // | ||
51 | // It should be noted that we execute recovery code only when we need | ||
52 | // to use the data that has been speculatively loaded: we don't execute | ||
53 | // recovery code on pure read ahead data. | ||
54 | // | ||
55 | // Remarks: | ||
56 | // - the cmp r0,r0 is used as a fast way to initialize a predicate | ||
57 | // register to 1. This is required to make sure that we get the parallel | ||
58 | // compare correct. | ||
59 | // | ||
60 | // - we don't use the epilogue counter to exit the loop but we need to set | ||
61 | // it to zero beforehand. | ||
62 | // | ||
63 | // - after the loop we must test for NaT values because neither the | ||
64 | // czx nor the cmp instruction raises a NaT consumption fault. We must be | ||
65 | // careful not to look too far for a NaT we don't care about. | ||
66 | // For instance we don't need to look at a NaT in val2 if the zero byte | ||
67 | // was in val1. | ||
68 | // | ||
69 | // - Clearly performance tuning is required. | ||
70 | // | ||
71 | |||
72 | #define saved_pfs r11 | ||
73 | #define tmp r10 | ||
74 | #define base r16 | ||
75 | #define orig r17 | ||
76 | #define saved_pr r18 | ||
77 | #define src r19 | ||
78 | #define mask r20 | ||
79 | #define val r21 | ||
80 | #define val1 r22 | ||
81 | #define val2 r23 | ||
82 | |||
83 | GLOBAL_ENTRY(__strlen_user) | ||
84 | .prologue | ||
85 | .save ar.pfs, saved_pfs | ||
86 | alloc saved_pfs=ar.pfs,11,0,0,8 | ||
87 | |||
88 | .rotr v[2], w[2] // declares our 4 aliases | ||
89 | |||
90 | extr.u tmp=in0,0,3 // tmp=least significant 3 bits | ||
91 | mov orig=in0 // keep track of initial byte address | ||
92 | dep src=0,in0,0,3 // src=8byte-aligned in0 address | ||
93 | .save pr, saved_pr | ||
94 | mov saved_pr=pr // preserve predicates (rotation) | ||
95 | ;; | ||
96 | |||
97 | .body | ||
98 | |||
99 | ld8.s v[1]=[src],8 // load the initial 8 bytes (must speculate) | ||
100 | shl tmp=tmp,3 // multiply by 8bits/byte | ||
101 | mov mask=-1 // our mask | ||
102 | ;; | ||
103 | ld8.s w[1]=[src],8 // load next 8 bytes in 2nd pipeline | ||
104 | cmp.eq p6,p0=r0,r0 // sets p6 (required because of // cmp.and) | ||
105 | sub tmp=64,tmp // how many bits to shift our mask to the right | ||
106 | ;; | ||
107 | shr.u mask=mask,tmp // zero enough bits to hold v[1]'s valuable part | ||
108 | mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs) | ||
109 | ;; | ||
110 | add base=-16,src // keep track of aligned base | ||
111 | chk.s v[1], .recover // if already NaT, then directly skip to recover | ||
112 | or v[1]=v[1],mask // now we have a safe initial byte pattern | ||
113 | ;; | ||
114 | 1: | ||
115 | ld8.s v[0]=[src],8 // speculatively load next | ||
116 | czx1.r val1=v[1] // search 0 byte from right | ||
117 | czx1.r val2=w[1] // search 0 byte from right following 8bytes | ||
118 | ;; | ||
119 | ld8.s w[0]=[src],8 // speculatively load next to next | ||
120 | cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8 | ||
121 | cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8 | ||
122 | (p6) br.wtop.dptk.few 1b // loop until p6 == 0 | ||
123 | ;; | ||
124 | // | ||
125 | // We must try the recovery code iff | ||
126 | // val1_is_nat || (val1==8 && val2_is_nat) | ||
127 | // | ||
128 | // XXX Fixme | ||
129 | // - there must be a better way of doing the test | ||
130 | // | ||
131 | cmp.eq p8,p9=8,val1 // p8 = no zero byte in v[1] (disambiguate) | ||
132 | tnat.nz p6,p7=val1 // test NaT on val1 | ||
133 | (p6) br.cond.spnt .recover // jump to recovery if val1 is NaT | ||
134 | ;; | ||
135 | // | ||
136 | // if we come here p7 is true, i.e., initialized for the parallel compare | ||
137 | // | ||
138 | cmp.eq.and p7,p0=8,val1 // val1==8? | ||
139 | tnat.nz.and p7,p0=val2 // test NaT if val2 | ||
140 | (p7) br.cond.spnt .recover // jump to recovery if val2 is NaT | ||
141 | ;; | ||
142 | (p8) mov val1=val2 // val2 contains the value | ||
143 | (p8) adds src=-16,src // correct position when 3 ahead | ||
144 | (p9) adds src=-24,src // correct position when 4 ahead | ||
145 | ;; | ||
146 | sub ret0=src,orig // distance from origin | ||
147 | sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 | ||
148 | mov pr=saved_pr,0xffffffffffff0000 | ||
149 | ;; | ||
150 | sub ret0=ret0,tmp // length=now - back -1 | ||
151 | mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what | ||
152 | br.ret.sptk.many rp // end of normal execution | ||
153 | |||
154 | // | ||
155 | // Outlined recovery code when speculation failed | ||
156 | // | ||
157 | // This time we don't use speculation and rely on the normal exception | ||
158 | // mechanism. That's why the loop is not as good as the previous one, | ||
159 | // because read ahead is not possible | ||
160 | // | ||
161 | // XXX Fixme | ||
162 | // - today we restart from the beginning of the string instead | ||
163 | // of trying to continue where we left off. | ||
164 | // | ||
165 | .recover: | ||
166 | EX(.Lexit1, ld8 val=[base],8) // load the initial bytes | ||
167 | ;; | ||
168 | or val=val,mask // remask first bytes | ||
169 | cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop | ||
170 | ;; | ||
171 | // | ||
172 | // ar.ec is still zero here | ||
173 | // | ||
174 | 2: | ||
175 | EX(.Lexit1, (p6) ld8 val=[base],8) | ||
176 | ;; | ||
177 | czx1.r val1=val // search 0 byte from right | ||
178 | ;; | ||
179 | cmp.eq p6,p0=8,val1 // val1==8 ? | ||
180 | (p6) br.wtop.dptk.few 2b // loop until p6 == 0 | ||
181 | ;; | ||
182 | sub ret0=base,orig // distance from base | ||
183 | sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 | ||
184 | mov pr=saved_pr,0xffffffffffff0000 | ||
185 | ;; | ||
186 | sub ret0=ret0,tmp // length=now - back -1 | ||
187 | mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what | ||
188 | br.ret.sptk.many rp // end of successful recovery code | ||
189 | |||
190 | // | ||
191 | // We failed even on the normal load (called from exception handler) | ||
192 | // | ||
193 | .Lexit1: | ||
194 | mov ret0=0 | ||
195 | mov pr=saved_pr,0xffffffffffff0000 | ||
196 | mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what | ||
197 | br.ret.sptk.many rp | ||
198 | END(__strlen_user) | ||
diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S new file mode 100644 index 000000000000..a504381f31eb --- /dev/null +++ b/arch/ia64/lib/strncpy_from_user.S | |||
@@ -0,0 +1,44 @@ | |||
1 | /* | ||
2 | * Just like strncpy() except that if a fault occurs during copying, | ||
3 | * -EFAULT is returned. | ||
4 | * | ||
5 | * Inputs: | ||
6 | * in0: address of destination buffer | ||
7 | * in1: address of string to be copied | ||
8 | * in2: length of buffer in bytes | ||
9 | * Outputs: | ||
10 | * r8: -EFAULT in case of fault or number of bytes copied if no fault | ||
11 | * | ||
12 | * Copyright (C) 1998-2001 Hewlett-Packard Co | ||
13 | * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com> | ||
14 | * | ||
15 | * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by | ||
16 | * by Andreas Schwab <schwab@suse.de>). | ||
17 | */ | ||
18 | |||
19 | #include <asm/asmmacro.h> | ||
20 | |||
21 | GLOBAL_ENTRY(__strncpy_from_user) | ||
22 | alloc r2=ar.pfs,3,0,0,0 | ||
23 | mov r8=0 | ||
24 | mov r9=in1 | ||
25 | ;; | ||
26 | add r10=in1,in2 | ||
27 | cmp.eq p6,p0=r0,in2 | ||
28 | (p6) br.ret.spnt.many rp | ||
29 | |||
30 | // XXX braindead copy loop---this needs to be optimized | ||
31 | .Loop1: | ||
32 | EX(.Lexit, ld1 r8=[in1],1) | ||
33 | ;; | ||
34 | EX(.Lexit, st1 [in0]=r8,1) | ||
35 | cmp.ne p6,p7=r8,r0 | ||
36 | ;; | ||
37 | (p6) cmp.ne.unc p8,p0=in1,r10 | ||
38 | (p8) br.cond.dpnt.few .Loop1 | ||
39 | ;; | ||
40 | (p6) mov r8=in2 // buffer filled up---return buffer length | ||
41 | (p7) sub r8=in1,r9,1 // return string length (excluding NUL character) | ||
42 | [.Lexit:] | ||
43 | br.ret.sptk.many rp | ||
44 | END(__strncpy_from_user) | ||
diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S new file mode 100644 index 000000000000..d09066b1e49d --- /dev/null +++ b/arch/ia64/lib/strnlen_user.S | |||
@@ -0,0 +1,45 @@ | |||
1 | /* | ||
2 | * Returns 0 if exception before NUL or reaching the supplied limit (N), | ||
3 | * a value greater than N if the string is longer than the limit, else | ||
4 | * strlen. | ||
5 | * | ||
6 | * Inputs: | ||
7 | * in0: address of buffer | ||
8 | * in1: string length limit N | ||
9 | * Outputs: | ||
10 | * r8: 0 in case of fault, strlen(buffer)+1 otherwise | ||
11 | * | ||
12 | * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com> | ||
13 | */ | ||
14 | |||
15 | #include <asm/asmmacro.h> | ||
16 | |||
17 | GLOBAL_ENTRY(__strnlen_user) | ||
18 | .prologue | ||
19 | alloc r2=ar.pfs,2,0,0,0 | ||
20 | .save ar.lc, r16 | ||
21 | mov r16=ar.lc // preserve ar.lc | ||
22 | |||
23 | .body | ||
24 | |||
25 | add r3=-1,in1 | ||
26 | ;; | ||
27 | mov ar.lc=r3 | ||
28 | mov r9=0 | ||
29 | ;; | ||
30 | // XXX braindead strlen loop---this needs to be optimized | ||
31 | .Loop1: | ||
32 | EXCLR(.Lexit, ld1 r8=[in0],1) | ||
33 | add r9=1,r9 | ||
34 | ;; | ||
35 | cmp.eq p6,p0=r8,r0 | ||
36 | (p6) br.cond.dpnt .Lexit | ||
37 | br.cloop.dptk.few .Loop1 | ||
38 | |||
39 | add r9=1,in1 // NUL not found---return N+1 | ||
40 | ;; | ||
41 | .Lexit: | ||
42 | mov r8=r9 | ||
43 | mov ar.lc=r16 // restore ar.lc | ||
44 | br.ret.sptk.many rp | ||
45 | END(__strnlen_user) | ||
diff --git a/arch/ia64/lib/swiotlb.c b/arch/ia64/lib/swiotlb.c new file mode 100644 index 000000000000..ab7b3ad99a7f --- /dev/null +++ b/arch/ia64/lib/swiotlb.c | |||
@@ -0,0 +1,658 @@ | |||
1 | /* | ||
2 | * Dynamic DMA mapping support. | ||
3 | * | ||
4 | * This implementation is for IA-64 platforms that do not support | ||
5 | * I/O TLBs (aka DMA address translation hardware). | ||
6 | * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com> | ||
7 | * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com> | ||
8 | * Copyright (C) 2000, 2003 Hewlett-Packard Co | ||
9 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
10 | * | ||
11 | * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. | ||
12 | * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid | ||
13 | * unnecessary i-cache flushing. | ||
14 | * 04/07/.. ak Better overflow handling. Assorted fixes. | ||
15 | */ | ||
16 | |||
17 | #include <linux/cache.h> | ||
18 | #include <linux/mm.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/pci.h> | ||
21 | #include <linux/spinlock.h> | ||
22 | #include <linux/string.h> | ||
23 | #include <linux/types.h> | ||
24 | #include <linux/ctype.h> | ||
25 | |||
26 | #include <asm/io.h> | ||
27 | #include <asm/pci.h> | ||
28 | #include <asm/dma.h> | ||
29 | |||
30 | #include <linux/init.h> | ||
31 | #include <linux/bootmem.h> | ||
32 | |||
33 | #define OFFSET(val,align) ((unsigned long) \ | ||
34 | ( (val) & ( (align) - 1))) | ||
35 | |||
36 | #define SG_ENT_VIRT_ADDRESS(sg) (page_address((sg)->page) + (sg)->offset) | ||
37 | #define SG_ENT_PHYS_ADDRESS(SG) virt_to_phys(SG_ENT_VIRT_ADDRESS(SG)) | ||
38 | |||
39 | /* | ||
40 | * Maximum allowable number of contiguous slabs to map, | ||
41 | * must be a power of 2. What is the appropriate value? | ||
42 | * The complexity of {map,unmap}_single is linearly dependent on this value. | ||
43 | */ | ||
44 | #define IO_TLB_SEGSIZE 128 | ||
45 | |||
46 | /* | ||
47 | * log of the size of each IO TLB slab. The number of slabs is command line | ||
48 | * controllable. | ||
49 | */ | ||
50 | #define IO_TLB_SHIFT 11 | ||
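With these constants each IO TLB slab is 1 << 11 = 2 KB, and the largest request that map_single() below can satisfy is one segment of IO_TLB_SEGSIZE = 128 contiguous slabs, i.e. 256 KB. The 64 MB pool that swiotlb_init() reserves by default further down therefore consists of 32768 slabs (256 segments).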
51 | |||
52 | int swiotlb_force; | ||
53 | |||
54 | /* | ||
55 | * Used to do a quick range check in swiotlb_unmap_single and | ||
56 | * swiotlb_sync_single_*, to see if the memory was in fact allocated by this | ||
57 | * API. | ||
58 | */ | ||
59 | static char *io_tlb_start, *io_tlb_end; | ||
60 | |||
61 | /* | ||
62 | * The number of IO TLB blocks (in groups of 64) between io_tlb_start and | ||
63 | * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. | ||
64 | */ | ||
65 | static unsigned long io_tlb_nslabs; | ||
66 | |||
67 | /* | ||
68 | * When the IOMMU overflows we return a fallback buffer. This sets the size. | ||
69 | */ | ||
70 | static unsigned long io_tlb_overflow = 32*1024; | ||
71 | |||
72 | void *io_tlb_overflow_buffer; | ||
73 | |||
74 | /* | ||
75 | * This is a free list describing the number of free entries available from | ||
76 | * each index. | ||
77 | */ | ||
78 | static unsigned int *io_tlb_list; | ||
79 | static unsigned int io_tlb_index; | ||
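To make the free-list encoding concrete: after initialization (see swiotlb_init_with_default_size() below), io_tlb_list[i] holds the number of contiguous free slabs starting at index i without crossing an IO_TLB_SEGSIZE boundary, so each 128-slab segment starts out as 128, 127, ..., 1. A request for n slabs can be satisfied at index i exactly when io_tlb_list[i] >= n; map_single() then zeroes those n entries, and unmap_single() rebuilds the counts when the slots are returned.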
80 | |||
81 | /* | ||
82 | * We need to save away the original address corresponding to a mapped entry | ||
83 | * for the sync operations. | ||
84 | */ | ||
85 | static unsigned char **io_tlb_orig_addr; | ||
86 | |||
87 | /* | ||
88 | * Protect the above data structures in the map and unmap calls | ||
89 | */ | ||
90 | static DEFINE_SPINLOCK(io_tlb_lock); | ||
91 | |||
92 | static int __init | ||
93 | setup_io_tlb_npages(char *str) | ||
94 | { | ||
95 | if (isdigit(*str)) { | ||
96 | io_tlb_nslabs = simple_strtoul(str, &str, 0) << | ||
97 | (PAGE_SHIFT - IO_TLB_SHIFT); | ||
98 | /* avoid tail segment of size < IO_TLB_SEGSIZE */ | ||
99 | io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); | ||
100 | } | ||
101 | if (*str == ',') | ||
102 | ++str; | ||
103 | if (!strcmp(str, "force")) | ||
104 | swiotlb_force = 1; | ||
105 | return 1; | ||
106 | } | ||
107 | __setup("swiotlb=", setup_io_tlb_npages); | ||
108 | /* make io_tlb_overflow tunable too? */ | ||
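For illustration, the parser above accepts a kernel command line option of the form swiotlb=<npages>[,force]; swiotlb=force on its own also works. As an example (the number is arbitrary, not a recommendation), swiotlb=32768 reserves 32768 pages for the bounce pool, the page count being converted to 2 KB slabs by the PAGE_SHIFT - IO_TLB_SHIFT shift (with 16 KB pages each page contributes 8 slabs), while swiotlb=32768,force additionally sets swiotlb_force.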
109 | |||
110 | /* | ||
111 | * Statically reserve bounce buffer space and initialize bounce buffer data | ||
112 | * structures for the software IO TLB used to implement the PCI DMA API. | ||
113 | */ | ||
114 | void | ||
115 | swiotlb_init_with_default_size (size_t default_size) | ||
116 | { | ||
117 | unsigned long i; | ||
118 | |||
119 | if (!io_tlb_nslabs) { | ||
120 | io_tlb_nslabs = (default_size >> PAGE_SHIFT); | ||
121 | io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Get IO TLB memory from the low pages | ||
126 | */ | ||
127 | io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs * | ||
128 | (1 << IO_TLB_SHIFT)); | ||
129 | if (!io_tlb_start) | ||
130 | panic("Cannot allocate SWIOTLB buffer"); | ||
131 | io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT); | ||
132 | |||
133 | /* | ||
134 | * Allocate and initialize the free list array. This array is used | ||
135 | * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE | ||
136 | * between io_tlb_start and io_tlb_end. | ||
137 | */ | ||
138 | io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); | ||
139 | for (i = 0; i < io_tlb_nslabs; i++) | ||
140 | io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); | ||
141 | io_tlb_index = 0; | ||
142 | io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(char *)); | ||
143 | |||
144 | /* | ||
145 | * Get the overflow emergency buffer | ||
146 | */ | ||
147 | io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); | ||
148 | printk(KERN_INFO "Placing software IO TLB between 0x%lx - 0x%lx\n", | ||
149 | virt_to_phys(io_tlb_start), virt_to_phys(io_tlb_end)); | ||
150 | } | ||
151 | |||
152 | void | ||
153 | swiotlb_init (void) | ||
154 | { | ||
155 | swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */ | ||
156 | } | ||
157 | |||
158 | static inline int | ||
159 | address_needs_mapping(struct device *hwdev, dma_addr_t addr) | ||
160 | { | ||
161 | dma_addr_t mask = 0xffffffff; | ||
162 | /* If the device has a mask, use it, otherwise default to 32 bits */ | ||
163 | if (hwdev && hwdev->dma_mask) | ||
164 | mask = *hwdev->dma_mask; | ||
165 | return (addr & ~mask) != 0; | ||
166 | } | ||
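For example, with no per-device mask the default of 0xffffffff applies, so a buffer at bus address 0x100000000 or above leaves bits set after the & ~mask and the function returns non-zero, meaning the buffer must be bounced through the IO TLB; an address below 4 GB yields 0 and can be handed to the device directly.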
167 | |||
168 | /* | ||
169 | * Allocates bounce buffer and returns its kernel virtual address. | ||
170 | */ | ||
171 | static void * | ||
172 | map_single(struct device *hwdev, char *buffer, size_t size, int dir) | ||
173 | { | ||
174 | unsigned long flags; | ||
175 | char *dma_addr; | ||
176 | unsigned int nslots, stride, index, wrap; | ||
177 | int i; | ||
178 | |||
179 | /* | ||
180 | * For mappings greater than a page, we limit the stride (and | ||
181 | * hence alignment) to a page size. | ||
182 | */ | ||
183 | nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; | ||
184 | if (size > PAGE_SIZE) | ||
185 | stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); | ||
186 | else | ||
187 | stride = 1; | ||
188 | |||
189 | if (!nslots) | ||
190 | BUG(); | ||
191 | |||
192 | /* | ||
193 | * Find a suitable number of IO TLB entries that will fit this | ||
194 | * request and allocate a buffer from that IO TLB pool. | ||
195 | */ | ||
196 | spin_lock_irqsave(&io_tlb_lock, flags); | ||
197 | { | ||
198 | wrap = index = ALIGN(io_tlb_index, stride); | ||
199 | |||
200 | if (index >= io_tlb_nslabs) | ||
201 | wrap = index = 0; | ||
202 | |||
203 | do { | ||
204 | /* | ||
205 | * If we find a slot that indicates we have 'nslots' | ||
206 | * number of contiguous buffers, we allocate the | ||
207 | * buffers from that slot and mark the entries as '0' | ||
208 | * indicating unavailable. | ||
209 | */ | ||
210 | if (io_tlb_list[index] >= nslots) { | ||
211 | int count = 0; | ||
212 | |||
213 | for (i = index; i < (int) (index + nslots); i++) | ||
214 | io_tlb_list[i] = 0; | ||
215 | for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) | ||
216 | io_tlb_list[i] = ++count; | ||
217 | dma_addr = io_tlb_start + (index << IO_TLB_SHIFT); | ||
218 | |||
219 | /* | ||
220 | * Update the indices to avoid searching in | ||
221 | * the next round. | ||
222 | */ | ||
223 | io_tlb_index = ((index + nslots) < io_tlb_nslabs | ||
224 | ? (index + nslots) : 0); | ||
225 | |||
226 | goto found; | ||
227 | } | ||
228 | index += stride; | ||
229 | if (index >= io_tlb_nslabs) | ||
230 | index = 0; | ||
231 | } while (index != wrap); | ||
232 | |||
233 | spin_unlock_irqrestore(&io_tlb_lock, flags); | ||
234 | return NULL; | ||
235 | } | ||
236 | found: | ||
237 | spin_unlock_irqrestore(&io_tlb_lock, flags); | ||
238 | |||
239 | /* | ||
240 | * Save away the mapping from the original address to the DMA address. | ||
241 | * This is needed when we sync the memory. Then we sync the buffer if | ||
242 | * needed. | ||
243 | */ | ||
244 | io_tlb_orig_addr[index] = buffer; | ||
245 | if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) | ||
246 | memcpy(dma_addr, buffer, size); | ||
247 | |||
248 | return dma_addr; | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * dma_addr is the kernel virtual address of the bounce buffer to unmap. | ||
253 | */ | ||
254 | static void | ||
255 | unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) | ||
256 | { | ||
257 | unsigned long flags; | ||
258 | int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; | ||
259 | int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; | ||
260 | char *buffer = io_tlb_orig_addr[index]; | ||
261 | |||
262 | /* | ||
263 | * First, sync the memory before unmapping the entry | ||
264 | */ | ||
265 | if (buffer && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) | ||
266 | /* | ||
267 | * bounce... copy the data back into the original buffer and | ||
268 | * delete the bounce buffer. | ||
269 | */ | ||
270 | memcpy(buffer, dma_addr, size); | ||
271 | |||
272 | /* | ||
273 | * Return the buffer to the free list by setting the corresponding | ||
274 | * entries to indicate the number of contiguous entries available. | ||
275 | * While returning the entries to the free list, we merge the entries | ||
276 | * with slots below and above the pool being returned. | ||
277 | */ | ||
278 | spin_lock_irqsave(&io_tlb_lock, flags); | ||
279 | { | ||
280 | count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? | ||
281 | io_tlb_list[index + nslots] : 0); | ||
282 | /* | ||
283 | * Step 1: return the slots to the free list, merging the | ||
284 | * slots with succeeding slots | ||
285 | */ | ||
286 | for (i = index + nslots - 1; i >= index; i--) | ||
287 | io_tlb_list[i] = ++count; | ||
288 | /* | ||
289 | * Step 2: merge the returned slots with the preceding slots, | ||
290 | * if available (non-zero) | ||
291 | */ | ||
292 | for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) | ||
293 | io_tlb_list[i] = ++count; | ||
294 | } | ||
295 | spin_unlock_irqrestore(&io_tlb_lock, flags); | ||
296 | } | ||
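Continuing the same toy model (again only an illustration), freeing a run seeds the count from any free slots that follow it in the segment, then walks backwards so the returned slots and any preceding free slots describe one merged run.

#include <stdio.h>

#define SEGSIZE 8	/* toy stand-in for IO_TLB_SEGSIZE */

static int list[SEGSIZE] = { 3, 2, 1, 0, 0, 3, 2, 1 };	/* slots 3 and 4 in use */

static void toy_free(int index, int nslots)	/* mirrors the loops in unmap_single() */
{
	int i;
	int count = (index + nslots < SEGSIZE) ? list[index + nslots] : 0;

	for (i = index + nslots - 1; i >= index; i--)
		list[i] = ++count;		/* step 1: merge with succeeding free slots */
	for (i = index - 1; i >= 0 && list[i]; i--)
		list[i] = ++count;		/* step 2: merge with preceding free slots */
}

int main(void)
{
	int i;

	toy_free(3, 2);				/* give slots 3 and 4 back */
	for (i = 0; i < SEGSIZE; i++)
		printf("%d ", list[i]);		/* prints: 8 7 6 5 4 3 2 1 */
	printf("\n");
	return 0;
}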
297 | |||
298 | static void | ||
299 | sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir) | ||
300 | { | ||
301 | int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; | ||
302 | char *buffer = io_tlb_orig_addr[index]; | ||
303 | |||
304 | /* | ||
305 | * bounce... copy the data back into/from the original buffer | ||
306 | * XXX How do you handle DMA_BIDIRECTIONAL here? | ||
307 | */ | ||
308 | if (dir == DMA_FROM_DEVICE) | ||
309 | memcpy(buffer, dma_addr, size); | ||
310 | else if (dir == DMA_TO_DEVICE) | ||
311 | memcpy(dma_addr, buffer, size); | ||
312 | else | ||
313 | BUG(); | ||
314 | } | ||
315 | |||
316 | void * | ||
317 | swiotlb_alloc_coherent(struct device *hwdev, size_t size, | ||
318 | dma_addr_t *dma_handle, int flags) | ||
319 | { | ||
320 | unsigned long dev_addr; | ||
321 | void *ret; | ||
322 | int order = get_order(size); | ||
323 | |||
324 | /* | ||
325 | * XXX fix me: the DMA API should pass us an explicit DMA mask | ||
326 | * instead, or use ZONE_DMA32 (ia64 overloads ZONE_DMA to be a ~32 | ||
327 | * bit range instead of a 16MB one). | ||
328 | */ | ||
329 | flags |= GFP_DMA; | ||
330 | |||
331 | ret = (void *)__get_free_pages(flags, order); | ||
332 | if (ret && address_needs_mapping(hwdev, virt_to_phys(ret))) { | ||
333 | /* | ||
334 | * The allocated memory isn't reachable by the device. | ||
335 | * Fall back on swiotlb_map_single(). | ||
336 | */ | ||
337 | free_pages((unsigned long) ret, order); | ||
338 | ret = NULL; | ||
339 | } | ||
340 | if (!ret) { | ||
341 | /* | ||
342 | * We are either out of memory or the device can't DMA | ||
343 | * to GFP_DMA memory; fall back on | ||
344 | * swiotlb_map_single(), which will grab memory from | ||
345 | * the lowest available address range. | ||
346 | */ | ||
347 | dma_addr_t handle; | ||
348 | handle = swiotlb_map_single(NULL, NULL, size, DMA_FROM_DEVICE); | ||
349 | if (dma_mapping_error(handle)) | ||
350 | return NULL; | ||
351 | |||
352 | ret = phys_to_virt(handle); | ||
353 | } | ||
354 | |||
355 | memset(ret, 0, size); | ||
356 | dev_addr = virt_to_phys(ret); | ||
357 | |||
358 | /* Confirm address can be DMA'd by device */ | ||
359 | if (address_needs_mapping(hwdev, dev_addr)) { | ||
360 | printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016lx\n", | ||
361 | (unsigned long long)*hwdev->dma_mask, dev_addr); | ||
362 | panic("swiotlb_alloc_coherent: allocated memory is out of " | ||
363 | "range for device"); | ||
364 | } | ||
365 | *dma_handle = dev_addr; | ||
366 | return ret; | ||
367 | } | ||
368 | |||
369 | void | ||
370 | swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, | ||
371 | dma_addr_t dma_handle) | ||
372 | { | ||
373 | if (!(vaddr >= (void *)io_tlb_start | ||
374 | && vaddr < (void *)io_tlb_end)) | ||
375 | free_pages((unsigned long) vaddr, get_order(size)); | ||
376 | else | ||
377 | /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ | ||
378 | swiotlb_unmap_single (hwdev, dma_handle, size, DMA_TO_DEVICE); | ||
379 | } | ||
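A hedged sketch of how a caller might use this coherent pair; 'hwdev', 'RING_BYTES' and the helper names are illustrative, and the extern prototypes are simply restated from this file.

#include <linux/device.h>
#include <linux/types.h>
#include <linux/gfp.h>

#define RING_BYTES 4096		/* illustrative buffer size */

extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
				    dma_addr_t *dma_handle, int flags);
extern void swiotlb_free_coherent(struct device *hwdev, size_t size,
				  void *vaddr, dma_addr_t dma_handle);

static void *alloc_ring(struct device *hwdev, dma_addr_t *ring_dma)
{
	/* Returns a zeroed buffer the device can reach, plus its bus address. */
	return swiotlb_alloc_coherent(hwdev, RING_BYTES, ring_dma, GFP_KERNEL);
}

static void free_ring(struct device *hwdev, void *ring, dma_addr_t ring_dma)
{
	swiotlb_free_coherent(hwdev, RING_BYTES, ring, ring_dma);
}

In practice drivers typically reach this code through dma_alloc_coherent()/dma_free_coherent() rather than calling the swiotlb entry points directly.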
380 | |||
381 | static void | ||
382 | swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) | ||
383 | { | ||
384 | /* | ||
385 | * Ran out of IOMMU space for this operation. This is very bad. | ||
386 | * Unfortunately, drivers cannot handle this failure properly | ||
387 | * unless they check for pci_dma_mapping_error (most don't). | ||
388 | * When the mapping is small enough, return a static buffer to limit | ||
389 | * the damage, or panic when the transfer is too big. | ||
390 | */ | ||
391 | printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at " | ||
392 | "device %s\n", size, dev ? dev->bus_id : "?"); | ||
393 | |||
394 | if (size > io_tlb_overflow && do_panic) { | ||
395 | if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) | ||
396 | panic("PCI-DMA: Memory would be corrupted\n"); | ||
397 | if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) | ||
398 | panic("PCI-DMA: Random memory would be DMAed\n"); | ||
399 | } | ||
400 | } | ||
401 | |||
402 | /* | ||
403 | * Map a single buffer of the indicated size for DMA in streaming mode. The | ||
404 | * PCI address to use is returned. | ||
405 | * | ||
406 | * Once the device is given the dma address, the device owns this memory until | ||
407 | * either swiotlb_unmap_single or swiotlb_sync_single_for_cpu is performed. | ||
408 | */ | ||
409 | dma_addr_t | ||
410 | swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir) | ||
411 | { | ||
412 | unsigned long dev_addr = virt_to_phys(ptr); | ||
413 | void *map; | ||
414 | |||
415 | if (dir == DMA_NONE) | ||
416 | BUG(); | ||
417 | /* | ||
418 | * If the pointer passed in happens to be in the device's DMA window, | ||
419 | * we can safely return the device addr and not worry about bounce | ||
420 | * buffering it. | ||
421 | */ | ||
422 | if (!address_needs_mapping(hwdev, dev_addr) && !swiotlb_force) | ||
423 | return dev_addr; | ||
424 | |||
425 | /* | ||
426 | * Oh well, have to allocate and map a bounce buffer. | ||
427 | */ | ||
428 | map = map_single(hwdev, ptr, size, dir); | ||
429 | if (!map) { | ||
430 | swiotlb_full(hwdev, size, dir, 1); | ||
431 | map = io_tlb_overflow_buffer; | ||
432 | } | ||
433 | |||
434 | dev_addr = virt_to_phys(map); | ||
435 | |||
436 | /* | ||
437 | * Ensure that the address returned is DMA'ble | ||
438 | */ | ||
439 | if (address_needs_mapping(hwdev, dev_addr)) | ||
440 | panic("map_single: bounce buffer is not DMA'ble"); | ||
441 | |||
442 | return dev_addr; | ||
443 | } | ||
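An illustrative caller of the streaming single-buffer path; 'send_buffer', 'buf' and 'len' are made-up names, and the extern prototypes are restated from this file.

#include <linux/device.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/dma-mapping.h>

extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr,
				     size_t size, int dir);
extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
				 size_t size, int dir);
extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);

static int send_buffer(struct device *hwdev, void *buf, size_t len)
{
	dma_addr_t bus = swiotlb_map_single(hwdev, buf, len, DMA_TO_DEVICE);

	if (swiotlb_dma_mapping_error(bus))
		return -ENOMEM;		/* only the overflow buffer was left */

	/* ... hand 'bus' to the device and wait for the transfer ... */

	swiotlb_unmap_single(hwdev, bus, len, DMA_TO_DEVICE);
	return 0;
}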
444 | |||
445 | /* | ||
446 | * Since DMA is i-cache coherent, any (complete) pages that were written via | ||
447 | * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to | ||
448 | * flush them when they get mapped into an executable vm-area. | ||
449 | */ | ||
450 | static void | ||
451 | mark_clean(void *addr, size_t size) | ||
452 | { | ||
453 | unsigned long pg_addr, end; | ||
454 | |||
455 | pg_addr = PAGE_ALIGN((unsigned long) addr); | ||
456 | end = (unsigned long) addr + size; | ||
457 | while (pg_addr + PAGE_SIZE <= end) { | ||
458 | struct page *page = virt_to_page(pg_addr); | ||
459 | set_bit(PG_arch_1, &page->flags); | ||
460 | pg_addr += PAGE_SIZE; | ||
461 | } | ||
462 | } | ||
463 | |||
464 | /* | ||
465 | * Unmap a single streaming mode DMA translation. The dma_addr and size must | ||
466 | * match what was provided for in a previous swiotlb_map_single call. All | ||
467 | * other usages are undefined. | ||
468 | * | ||
469 | * After this call, reads by the cpu to the buffer are guaranteed to see | ||
470 | * whatever the device wrote there. | ||
471 | */ | ||
472 | void | ||
473 | swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, | ||
474 | int dir) | ||
475 | { | ||
476 | char *dma_addr = phys_to_virt(dev_addr); | ||
477 | |||
478 | if (dir == DMA_NONE) | ||
479 | BUG(); | ||
480 | if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) | ||
481 | unmap_single(hwdev, dma_addr, size, dir); | ||
482 | else if (dir == DMA_FROM_DEVICE) | ||
483 | mark_clean(dma_addr, size); | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * Make physical memory consistent for a single streaming mode DMA translation | ||
488 | * after a transfer. | ||
489 | * | ||
490 | * If you perform a swiotlb_map_single() but wish to interrogate the buffer | ||
491 | * using the cpu, yet do not wish to tear down the PCI dma mapping, you must | ||
492 | * call this function before doing so. At the next point you give the PCI dma | ||
493 | * address back to the card, you must first perform a | ||
494 | * swiotlb_sync_single_for_device, and then the device again owns the buffer. | ||
495 | */ | ||
496 | void | ||
497 | swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, | ||
498 | size_t size, int dir) | ||
499 | { | ||
500 | char *dma_addr = phys_to_virt(dev_addr); | ||
501 | |||
502 | if (dir == DMA_NONE) | ||
503 | BUG(); | ||
504 | if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) | ||
505 | sync_single(hwdev, dma_addr, size, dir); | ||
506 | else if (dir == DMA_FROM_DEVICE) | ||
507 | mark_clean(dma_addr, size); | ||
508 | } | ||
509 | |||
510 | void | ||
511 | swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, | ||
512 | size_t size, int dir) | ||
513 | { | ||
514 | char *dma_addr = phys_to_virt(dev_addr); | ||
515 | |||
516 | if (dir == DMA_NONE) | ||
517 | BUG(); | ||
518 | if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) | ||
519 | sync_single(hwdev, dma_addr, size, dir); | ||
520 | else if (dir == DMA_FROM_DEVICE) | ||
521 | mark_clean(dma_addr, size); | ||
522 | } | ||
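The ownership rule in the comment above, as an illustrative sketch ('inspect_rx', 'cpu_buf' and friends are assumptions; the extern prototypes are restated from this file).

#include <linux/device.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

extern void swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
					size_t size, int dir);
extern void swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
					   size_t size, int dir);

static void inspect_rx(struct device *hwdev, dma_addr_t bus, void *cpu_buf, size_t len)
{
	swiotlb_sync_single_for_cpu(hwdev, bus, len, DMA_FROM_DEVICE);

	/* ... the CPU may read cpu_buf here; any bounce data has been copied back ... */

	swiotlb_sync_single_for_device(hwdev, bus, len, DMA_FROM_DEVICE);
	/* From here on the device owns the buffer again. */
}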
523 | |||
524 | /* | ||
525 | * Map a set of buffers described by scatterlist in streaming mode for DMA. | ||
526 | * This is the scatter-gather version of the above swiotlb_map_single | ||
527 | * interface. Here the scatter gather list elements are each tagged with the | ||
528 | * appropriate dma address and length. They are obtained via | ||
529 | * sg_dma_{address,length}(SG). | ||
530 | * | ||
531 | * NOTE: An implementation may be able to use a smaller number of | ||
532 | * DMA address/length pairs than there are SG table elements. | ||
533 | * (for example via virtual mapping capabilities) | ||
534 | * The routine returns the number of addr/length pairs actually | ||
535 | * used, at most nents. | ||
536 | * | ||
537 | * Device ownership issues as mentioned above for swiotlb_map_single are the | ||
538 | * same here. | ||
539 | */ | ||
540 | int | ||
541 | swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems, | ||
542 | int dir) | ||
543 | { | ||
544 | void *addr; | ||
545 | unsigned long dev_addr; | ||
546 | int i; | ||
547 | |||
548 | if (dir == DMA_NONE) | ||
549 | BUG(); | ||
550 | |||
551 | for (i = 0; i < nelems; i++, sg++) { | ||
552 | addr = SG_ENT_VIRT_ADDRESS(sg); | ||
553 | dev_addr = virt_to_phys(addr); | ||
554 | if (swiotlb_force || address_needs_mapping(hwdev, dev_addr)) { | ||
555 | sg->dma_address = (dma_addr_t) virt_to_phys(map_single(hwdev, addr, sg->length, dir)); | ||
556 | if (!sg->dma_address) { | ||
557 | /* Don't panic here, we expect map_sg users | ||
558 | to do proper error handling. */ | ||
559 | swiotlb_full(hwdev, sg->length, dir, 0); | ||
560 | swiotlb_unmap_sg(hwdev, sg - i, i, dir); | ||
561 | sg[0].dma_length = 0; | ||
562 | return 0; | ||
563 | } | ||
564 | } else | ||
565 | sg->dma_address = dev_addr; | ||
566 | sg->dma_length = sg->length; | ||
567 | } | ||
568 | return nelems; | ||
569 | } | ||
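An illustrative scatter-gather caller; 'map_request' and 'program_descriptor' are hypothetical, and the loop deliberately walks the count returned by map_sg rather than the original nents.

#include <linux/device.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/dma-mapping.h>
#include <asm/scatterlist.h>

extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
			  int nelems, int dir);

extern void program_descriptor(dma_addr_t addr, unsigned int len);	/* hypothetical */

static int map_request(struct device *hwdev, struct scatterlist *sgl, int nents)
{
	int i, mapped = swiotlb_map_sg(hwdev, sgl, nents, DMA_TO_DEVICE);

	if (mapped == 0)
		return -ENOMEM;		/* map_sg has already unwound its work */

	for (i = 0; i < mapped; i++)
		program_descriptor(sg_dma_address(&sgl[i]), sg_dma_len(&sgl[i]));

	return mapped;
}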
570 | |||
571 | /* | ||
572 | * Unmap a set of streaming mode DMA translations. Again, cpu read rules | ||
573 | * concerning calls here are the same as for swiotlb_unmap_single() above. | ||
574 | */ | ||
575 | void | ||
576 | swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems, | ||
577 | int dir) | ||
578 | { | ||
579 | int i; | ||
580 | |||
581 | if (dir == DMA_NONE) | ||
582 | BUG(); | ||
583 | |||
584 | for (i = 0; i < nelems; i++, sg++) | ||
585 | if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) | ||
586 | unmap_single(hwdev, (void *) phys_to_virt(sg->dma_address), sg->dma_length, dir); | ||
587 | else if (dir == DMA_FROM_DEVICE) | ||
588 | mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length); | ||
589 | } | ||
590 | |||
591 | /* | ||
592 | * Make physical memory consistent for a set of streaming mode DMA translations | ||
593 | * after a transfer. | ||
594 | * | ||
595 | * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules | ||
596 | * and usage. | ||
597 | */ | ||
598 | void | ||
599 | swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, | ||
600 | int nelems, int dir) | ||
601 | { | ||
602 | int i; | ||
603 | |||
604 | if (dir == DMA_NONE) | ||
605 | BUG(); | ||
606 | |||
607 | for (i = 0; i < nelems; i++, sg++) | ||
608 | if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) | ||
609 | sync_single(hwdev, (void *) sg->dma_address, | ||
610 | sg->dma_length, dir); | ||
611 | } | ||
612 | |||
613 | void | ||
614 | swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, | ||
615 | int nelems, int dir) | ||
616 | { | ||
617 | int i; | ||
618 | |||
619 | if (dir == DMA_NONE) | ||
620 | BUG(); | ||
621 | |||
622 | for (i = 0; i < nelems; i++, sg++) | ||
623 | if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) | ||
624 | sync_single(hwdev, (void *) sg->dma_address, | ||
625 | sg->dma_length, dir); | ||
626 | } | ||
627 | |||
628 | int | ||
629 | swiotlb_dma_mapping_error(dma_addr_t dma_addr) | ||
630 | { | ||
631 | return (dma_addr == virt_to_phys(io_tlb_overflow_buffer)); | ||
632 | } | ||
633 | |||
634 | /* | ||
635 | * Return whether the given PCI device DMA address mask can be supported | ||
636 | * properly. For example, if your device can only drive the low 24-bits | ||
637 | * during PCI bus mastering, then you would pass 0x00ffffff as the mask to | ||
638 | * this function. | ||
639 | */ | ||
640 | int | ||
641 | swiotlb_dma_supported (struct device *hwdev, u64 mask) | ||
642 | { | ||
643 | return (virt_to_phys (io_tlb_end) - 1) <= mask; | ||
644 | } | ||
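For instance, the 24-bit example from the comment above succeeds only if the whole bounce pool sits below 16MB; a minimal hedged probe ('check_24bit_device' is illustrative, the prototype restated from this file).

#include <linux/device.h>
#include <linux/types.h>
#include <linux/errno.h>

extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);

static int check_24bit_device(struct device *hwdev)
{
	if (!swiotlb_dma_supported(hwdev, 0x00ffffffULL))
		return -EIO;		/* end of the bounce pool is above the 24-bit limit */
	return 0;
}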
645 | |||
646 | EXPORT_SYMBOL(swiotlb_init); | ||
647 | EXPORT_SYMBOL(swiotlb_map_single); | ||
648 | EXPORT_SYMBOL(swiotlb_unmap_single); | ||
649 | EXPORT_SYMBOL(swiotlb_map_sg); | ||
650 | EXPORT_SYMBOL(swiotlb_unmap_sg); | ||
651 | EXPORT_SYMBOL(swiotlb_sync_single_for_cpu); | ||
652 | EXPORT_SYMBOL(swiotlb_sync_single_for_device); | ||
653 | EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu); | ||
654 | EXPORT_SYMBOL(swiotlb_sync_sg_for_device); | ||
655 | EXPORT_SYMBOL(swiotlb_dma_mapping_error); | ||
656 | EXPORT_SYMBOL(swiotlb_alloc_coherent); | ||
657 | EXPORT_SYMBOL(swiotlb_free_coherent); | ||
658 | EXPORT_SYMBOL(swiotlb_dma_supported); | ||
diff --git a/arch/ia64/lib/xor.S b/arch/ia64/lib/xor.S new file mode 100644 index 000000000000..54e3f7eab8e9 --- /dev/null +++ b/arch/ia64/lib/xor.S | |||
@@ -0,0 +1,184 @@ | |||
1 | /* | ||
2 | * arch/ia64/lib/xor.S | ||
3 | * | ||
4 | * Optimized RAID-5 checksumming functions for IA-64. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2, or (at your option) | ||
9 | * any later version. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
13 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
14 | */ | ||
15 | |||
16 | #include <asm/asmmacro.h> | ||
17 | |||
18 | GLOBAL_ENTRY(xor_ia64_2) | ||
19 | .prologue | ||
20 | .fframe 0 | ||
21 | .save ar.pfs, r31 | ||
22 | alloc r31 = ar.pfs, 3, 0, 13, 16 | ||
23 | .save ar.lc, r30 | ||
24 | mov r30 = ar.lc | ||
25 | .save pr, r29 | ||
26 | mov r29 = pr | ||
27 | ;; | ||
28 | .body | ||
29 | mov r8 = in1 | ||
30 | mov ar.ec = 6 + 2 | ||
31 | shr in0 = in0, 3 | ||
32 | ;; | ||
33 | adds in0 = -1, in0 | ||
34 | mov r16 = in1 | ||
35 | mov r17 = in2 | ||
36 | ;; | ||
37 | mov ar.lc = in0 | ||
38 | mov pr.rot = 1 << 16 | ||
39 | ;; | ||
40 | .rotr s1[6+1], s2[6+1], d[2] | ||
41 | .rotp p[6+2] | ||
42 | 0: | ||
43 | (p[0]) ld8.nta s1[0] = [r16], 8 | ||
44 | (p[0]) ld8.nta s2[0] = [r17], 8 | ||
45 | (p[6]) xor d[0] = s1[6], s2[6] | ||
46 | (p[6+1])st8.nta [r8] = d[1], 8 | ||
47 | nop.f 0 | ||
48 | br.ctop.dptk.few 0b | ||
49 | ;; | ||
50 | mov ar.lc = r30 | ||
51 | mov pr = r29, -1 | ||
52 | br.ret.sptk.few rp | ||
53 | END(xor_ia64_2) | ||
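For reference, the rotating-register loop above is a software-pipelined form of a plain two-source XOR; this C rendering is only an illustrative equivalent, assuming the usual xor-block convention of a byte count that is a multiple of 8 with the result written back over the first operand.

/* Illustrative C equivalent of xor_ia64_2; the assembly keeps 8 loop stages
 * in flight (ar.ec = 6 + 2) so the loads, the xor and the store overlap. */
static void xor_2_equiv(unsigned long bytes, unsigned long *p1,
			const unsigned long *p2)
{
	unsigned long i, words = bytes / 8;

	for (i = 0; i < words; i++)
		p1[i] ^= p2[i];
}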
54 | |||
55 | GLOBAL_ENTRY(xor_ia64_3) | ||
56 | .prologue | ||
57 | .fframe 0 | ||
58 | .save ar.pfs, r31 | ||
59 | alloc r31 = ar.pfs, 4, 0, 20, 24 | ||
60 | .save ar.lc, r30 | ||
61 | mov r30 = ar.lc | ||
62 | .save pr, r29 | ||
63 | mov r29 = pr | ||
64 | ;; | ||
65 | .body | ||
66 | mov r8 = in1 | ||
67 | mov ar.ec = 6 + 2 | ||
68 | shr in0 = in0, 3 | ||
69 | ;; | ||
70 | adds in0 = -1, in0 | ||
71 | mov r16 = in1 | ||
72 | mov r17 = in2 | ||
73 | ;; | ||
74 | mov r18 = in3 | ||
75 | mov ar.lc = in0 | ||
76 | mov pr.rot = 1 << 16 | ||
77 | ;; | ||
78 | .rotr s1[6+1], s2[6+1], s3[6+1], d[2] | ||
79 | .rotp p[6+2] | ||
80 | 0: | ||
81 | (p[0]) ld8.nta s1[0] = [r16], 8 | ||
82 | (p[0]) ld8.nta s2[0] = [r17], 8 | ||
83 | (p[6]) xor d[0] = s1[6], s2[6] | ||
84 | ;; | ||
85 | (p[0]) ld8.nta s3[0] = [r18], 8 | ||
86 | (p[6+1])st8.nta [r8] = d[1], 8 | ||
87 | (p[6]) xor d[0] = d[0], s3[6] | ||
88 | br.ctop.dptk.few 0b | ||
89 | ;; | ||
90 | mov ar.lc = r30 | ||
91 | mov pr = r29, -1 | ||
92 | br.ret.sptk.few rp | ||
93 | END(xor_ia64_3) | ||
94 | |||
95 | GLOBAL_ENTRY(xor_ia64_4) | ||
96 | .prologue | ||
97 | .fframe 0 | ||
98 | .save ar.pfs, r31 | ||
99 | alloc r31 = ar.pfs, 5, 0, 27, 32 | ||
100 | .save ar.lc, r30 | ||
101 | mov r30 = ar.lc | ||
102 | .save pr, r29 | ||
103 | mov r29 = pr | ||
104 | ;; | ||
105 | .body | ||
106 | mov r8 = in1 | ||
107 | mov ar.ec = 6 + 2 | ||
108 | shr in0 = in0, 3 | ||
109 | ;; | ||
110 | adds in0 = -1, in0 | ||
111 | mov r16 = in1 | ||
112 | mov r17 = in2 | ||
113 | ;; | ||
114 | mov r18 = in3 | ||
115 | mov ar.lc = in0 | ||
116 | mov pr.rot = 1 << 16 | ||
117 | mov r19 = in4 | ||
118 | ;; | ||
119 | .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2] | ||
120 | .rotp p[6+2] | ||
121 | 0: | ||
122 | (p[0]) ld8.nta s1[0] = [r16], 8 | ||
123 | (p[0]) ld8.nta s2[0] = [r17], 8 | ||
124 | (p[6]) xor d[0] = s1[6], s2[6] | ||
125 | (p[0]) ld8.nta s3[0] = [r18], 8 | ||
126 | (p[0]) ld8.nta s4[0] = [r19], 8 | ||
127 | (p[6]) xor r20 = s3[6], s4[6] | ||
128 | ;; | ||
129 | (p[6+1])st8.nta [r8] = d[1], 8 | ||
130 | (p[6]) xor d[0] = d[0], r20 | ||
131 | br.ctop.dptk.few 0b | ||
132 | ;; | ||
133 | mov ar.lc = r30 | ||
134 | mov pr = r29, -1 | ||
135 | br.ret.sptk.few rp | ||
136 | END(xor_ia64_4) | ||
137 | |||
138 | GLOBAL_ENTRY(xor_ia64_5) | ||
139 | .prologue | ||
140 | .fframe 0 | ||
141 | .save ar.pfs, r31 | ||
142 | alloc r31 = ar.pfs, 6, 0, 34, 40 | ||
143 | .save ar.lc, r30 | ||
144 | mov r30 = ar.lc | ||
145 | .save pr, r29 | ||
146 | mov r29 = pr | ||
147 | ;; | ||
148 | .body | ||
149 | mov r8 = in1 | ||
150 | mov ar.ec = 6 + 2 | ||
151 | shr in0 = in0, 3 | ||
152 | ;; | ||
153 | adds in0 = -1, in0 | ||
154 | mov r16 = in1 | ||
155 | mov r17 = in2 | ||
156 | ;; | ||
157 | mov r18 = in3 | ||
158 | mov ar.lc = in0 | ||
159 | mov pr.rot = 1 << 16 | ||
160 | mov r19 = in4 | ||
161 | mov r20 = in5 | ||
162 | ;; | ||
163 | .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2] | ||
164 | .rotp p[6+2] | ||
165 | 0: | ||
166 | (p[0]) ld8.nta s1[0] = [r16], 8 | ||
167 | (p[0]) ld8.nta s2[0] = [r17], 8 | ||
168 | (p[6]) xor d[0] = s1[6], s2[6] | ||
169 | (p[0]) ld8.nta s3[0] = [r18], 8 | ||
170 | (p[0]) ld8.nta s4[0] = [r19], 8 | ||
171 | (p[6]) xor r21 = s3[6], s4[6] | ||
172 | ;; | ||
173 | (p[0]) ld8.nta s5[0] = [r20], 8 | ||
174 | (p[6+1])st8.nta [r8] = d[1], 8 | ||
175 | (p[6]) xor d[0] = d[0], r21 | ||
176 | ;; | ||
177 | (p[6]) xor d[0] = d[0], s5[6] | ||
178 | nop.f 0 | ||
179 | br.ctop.dptk.few 0b | ||
180 | ;; | ||
181 | mov ar.lc = r30 | ||
182 | mov pr = r29, -1 | ||
183 | br.ret.sptk.few rp | ||
184 | END(xor_ia64_5) | ||