Linux-2.6.12-rc2v2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ia64/lib/do_csum.S
1 files changed, 323 insertions, 0 deletions
diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S
new file mode 100644
index 000000000000..6bec2fc9f5b2
--- /dev/null
+++ b/arch/ia64/lib/do_csum.S
@@ -0,0 +1,323 @@
+/*
+ *
+ * Optimized version of the standard do_csum() function
+ *
+ * Return: a 64bit quantity containing the 16bit Internet checksum
+ *
+ * Inputs:
+ *      in0: address of buffer to checksum (char *)
+ *      in1: length of the buffer (int)
+ *
+ * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
+ *      Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * 02/04/22     Ken Chen <kenneth.w.chen@intel.com>
+ *              Data locality study on the checksum buffer.
+ *              More optimization cleanup - remove excessive stop bits.
+ * 02/04/08     David Mosberger <davidm@hpl.hp.com>
+ *              More cleanup and tuning.
+ * 01/04/18     Jun Nakajima <jun.nakajima@intel.com>
+ *              Clean up and optimize the software pipeline, loading two
+ *              back-to-back 8-byte words per loop. Clean up the initialization
+ *              for the loop. Support the cases where load latency = 1 or 2.
+ *              Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
+ */
+#include <asm/asmmacro.h>
+//
+// Theory of operations:
+//      The goal is to go as quickly as possible to the point where
+//      we can checksum 16 bytes/loop. Before reaching that point we must
+//      take care of incorrect alignment of first byte.
+//
+//      The code hereafter also takes care of the "tail" part of the buffer
+//      before entering the core loop, if any. The checksum is a sum so it
+//      allows us to commute operations. So we do the "head" and "tail"
+//      first to finish at full speed in the body. Once we get the head and
+//      tail values, we feed them into the pipeline, very handy initialization.
+//
+//      Of course we deal with the special case where the whole buffer fits
+//      into one 8 byte word. In this case we have only one entry in the pipeline.
+//
+//      We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
+//      possible load latency and also to accommodate for head and tail.
+//
+//      The end of the function deals with folding the checksum from 64bits
+//      down to 16bits taking care of the carry.
+//
+//      This version avoids synchronization in the core loop by also using a
+//      pipeline for the accumulation of the checksum in resultx[] (x=1,2).
+//
+//       wordx[] (x=1,2)
+//      |---|
+//      |   | 0                 : new value loaded in pipeline
+//      |---|
+//      |   | -                 : in transit data
+//      |---|
+//      |   | LOAD_LATENCY      : current value to add to checksum
+//      |---|
+//      |   | LOAD_LATENCY+1    : previous value added to checksum
+//      |---|                   (previous iteration)
+//
+//      resultx[] (x=1,2)
+//      |---|
+//      |   | 0                 : initial value
+//      |---|
+//      |   | LOAD_LATENCY-1    : new checksum
+//      |---|
+//      |   | LOAD_LATENCY      : previous value of checksum
+//      |---|
+//      |   | LOAD_LATENCY+1    : final checksum when out of the loop
+//      |---|
+//
+//
+//      See RFC1071 "Computing the Internet Checksum" for various techniques for
+//      calculating the Internet checksum.
+//
+// NOT YET DONE:
+//      - Maybe another algorithm which would take care of the folding at the
+//        end in a different manner
+//      - Work with people more knowledgeable than me on the network stack
+//        to figure out if we could not split the function depending on the
+//        type of packet or alignment we get. Like the ip_fast_csum() routine
+//        where we know we have at least 20bytes worth of data to checksum.
+//      - Do a better job of handling small packets.
+//      - Note on prefetching: it was found that under various load, i.e. ftp read/write,
+//        nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
+//        on the data that buffer points to (partly because the checksum is often preceded by
+//        a copy_from_user()).  This finding indicates that lfetch will not be beneficial since
+//        the data is already in the cache.
+//
+#define saved_pfs       r11     // caller's ar.pfs, written by alloc
+#define hmask           r16     // mask applied to the first (head) 8-byte word
+#define tmask           r17     // mask applied to the last (tail) 8-byte word
+#define first1          r18     // aligned address of the first word, then load pointer for word1[]
+#define firstval        r19     // unmasked contents of the first word
+#define firstoff        r20     // buf & 7
+#define last            r21     // aligned address of the word holding the last byte
+#define lastval         r22     // unmasked contents of the last word
+#define lastoff         r23     // (buf+len) & 7
+#define saved_lc        r24     // caller's ar.lc
+#define saved_pr        r25     // caller's predicate registers
+#define tmp1            r26     // scratch
+#define tmp2            r27     // scratch
+#define tmp3            r28     // scratch; reused as "count" in the body
+#define carry1          r29     // number of carries seen while accumulating result1[]
+#define carry2          r30     // number of carries seen while accumulating result2[]
+#define first2          r31     // load pointer for word2[]
+#define buf             in0     // first argument: buffer address
+#define len             in1     // second argument: buffer length
+#define LOAD_LATENCY    2       // XXX fix me
+#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
+# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
+#endif
+#define PIPE_DEPTH                      (LOAD_LATENCY+2)        // number of software pipeline stages
+#define ELD     p[LOAD_LATENCY]         // end of load
+#define ELD_1   p[LOAD_LATENCY+1]       // and next stage
+// unsigned long do_csum(unsigned char *buf,long len)
+GLOBAL_ENTRY(do_csum)
+        .prologue
+        .save ar.pfs, saved_pfs
+        alloc saved_pfs=ar.pfs,2,16,0,16       // 2 inputs, 16 locals, 0 outputs, 16 rotating registers
+        .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]       // NOTE(review): .rotr seems to number from r32, so word1[0]/word1[1] would overlay in0/in1 - confirm
+        .rotp p[PIPE_DEPTH], pC1[2], pC2[2]    // p[]: pipeline stage predicates, pC1/pC2: carry predicates
+        mov ret0=r0             // in case we have zero length
+        cmp.lt p0,p6=r0,len     // p6 = !(0 < len): zero length or negative (32bit len)
+        ;;
+        add tmp1=buf,len        // tmp1 = buf+len, i.e. one past the last byte
+        .save pr, saved_pr
+        mov saved_pr=pr         // preserve predicates (rotation)
+(p6)    br.ret.spnt.many rp     // return if zero or negative length
+        mov hmask=-1            // initialize head mask
+        tbit.nz p15,p0=buf,0    // p15 = buf is an odd address (result is byte-swapped at exit)
+        and first1=-8,buf       // 8-byte align down address of first1 element
+        and firstoff=7,buf      // how many bytes off for first1 element
+        mov tmask=-1            // initialize tail mask
+        ;;
+        adds tmp2=-1,tmp1       // tmp2 = buf+len-1, the address of the last byte
+        and lastoff=7,tmp1      // how many bytes off for last element
+        ;;
+        sub tmp1=8,lastoff      // complement to lastoff
+        and last=-8,tmp2        // address of word containing last byte
+        ;;
+        sub tmp3=last,first1    // tmp3=distance from first1 to last
+        .save ar.lc, saved_lc
+        mov saved_lc=ar.lc      // save lc
+        cmp.eq p8,p9=last,first1        // everything fits in one word ?
+        ld8 firstval=[first1],8 // load, ahead of time, "first1" word; first1 advances by 8
+        and tmp1=7, tmp1        // make sure that if tmp1==8 -> tmp1=0
+        shl tmp2=firstoff,3     // number of bits
+        ;;
+(p9)    ld8 lastval=[last]      // load, ahead of time, "last" word, if needed
+        shl tmp1=tmp1,3         // number of bits
+(p9)    adds tmp3=-8,tmp3       // effectively loaded
+        ;;
+(p8)    mov lastval=r0          // we don't need lastval if first1==last
+        shl hmask=hmask,tmp2    // build head mask, mask off bytes [0,firstoff[
+        shr.u tmask=tmask,tmp1  // build tail mask, mask off bytes [lastoff,8[ (nothing if lastoff==0)
+        ;;
+        .body
+#define count tmp3      // tmp3 now counts the remaining 8-byte words
+(p8)    and hmask=hmask,tmask   // apply tail mask to head mask if 1 word only
+(p9)    and word2[0]=lastval,tmask      // mask the last word as appropriate
+        shr.u count=count,3     // how many full 8-byte words between head and tail?
+        ;;
+        // If count is odd, finish this 8-byte word so that we can
+        // load two back-to-back 8-byte words per loop thereafter.
+        and word1[0]=firstval,hmask     // mask the first word as appropriate
+        tbit.nz p10,p11=count,0         // p10 = count is odd, p11 = count is even
+        ;;
+(p8)    mov result1[0]=word1[0]                // single word: the masked head is the whole sum
+(p9)    add result1[0]=word1[0],word2[0]       // otherwise start from head + tail
+        ;;
+        cmp.ltu p6,p0=result1[0],word1[0]       // check the carry
+        cmp.eq.or.andcm p8,p0=0,count           // p8 |= (count == 0): exit if zero 8-byte
+        ;;
+(p6)    adds result1[0]=1,result1[0]   // add the carry back in
+(p8)    br.cond.dptk .do_csum_exit      // if (within an 8-byte word)
+(p11)   br.cond.dptk .do_csum16         // if (count is even)
+        // Here count is odd.
+        ld8 word1[1]=[first1],8         // load an 8-byte word
+        cmp.eq p9,p10=1,count           // if (count == 1)
+        adds count=-1,count             // loaded an 8-byte word
+        ;;
+        add result1[0]=result1[0],word1[1]
+        ;;
+        cmp.ltu p6,p0=result1[0],word1[1]      // check the carry
+        ;;
+(p6)    adds result1[0]=1,result1[0]   // add the carry back in
+(p9)    br.cond.sptk .do_csum_exit      // if (count == 1) exit
+        // Fall through to calculate the checksum, using the current
+        // result1[0] as the initial value of the result1[] pipeline.
+        //
+        // Calculate the checksum loading two 8-byte words per loop.
+        //
+.do_csum16:
+        add first2=8,first1     // second load stream starts 8 bytes after the first
+        shr.u count=count,1     // we do 16 bytes per loop
+        ;;
+        adds count=-1,count     // ar.lc is loaded with (iterations - 1)
+        mov carry1=r0
+        mov carry2=r0
+        brp.loop.imp 1f,2f
+        ;;
+        mov ar.ec=PIPE_DEPTH    // epilogue count = number of pipeline stages
+        mov ar.lc=count // set lc
+        mov pr.rot=1<<16        // set p16, clear the other rotating predicates
+        // result1[0] must be initialized in advance.
+        mov result2[0]=r0
+        ;;
+        .align 32
+1:
+(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1] // did the add of the previous stage wrap?
+(pC1[1])adds carry1=1,carry1   // count the carry detected one iteration earlier
+(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1] // same check for the second stream
+(pC2[1])adds carry2=1,carry2
+(ELD)   add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY] // accumulate the word whose load has completed
+(ELD)   add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
+2:
+(p[0])  ld8 word1[0]=[first1],16       // each stream advances 16 bytes per iteration
+(p[0])  ld8 word2[0]=[first2],16
+        br.ctop.sptk 1b
+        ;;
+        // Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
+(pC1[1])adds carry1=1,carry1    // since we miss the last one
+(pC2[1])adds carry2=1,carry2
+        ;;
+        add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1     // add the counted carries to each partial sum
+        add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
+        ;;
+        cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1   // did adding the carries wrap?
+        cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
+        ;;
+(p6)    adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
+(p7)    adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
+        ;;
+        add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1] // merge the two partial sums
+        ;;
+        cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]       // check the carry
+        ;;
+(p6)    adds result1[0]=1,result1[0]   // add the carry back in
+        ;;
+.do_csum_exit:
+        //
+        // now fold 64 into 16 bits taking care of carry
+        // that's not very good because it has lots of sequentiality
+        //
+        mov tmp3=0xffff                 // 16-bit mask
+        zxt4 tmp1=result1[0]            // low 32 bits
+        shr.u tmp2=result1[0],32        // high 32 bits
+        ;;
+        add result1[0]=tmp1,tmp2
+        ;;
+        and tmp1=result1[0],tmp3        // low 16 bits
+        shr.u tmp2=result1[0],16        // everything above them
+        ;;
+        add result1[0]=tmp1,tmp2
+        ;;
+        and tmp1=result1[0],tmp3
+        shr.u tmp2=result1[0],16
+        ;;
+        add result1[0]=tmp1,tmp2
+        ;;
+        and tmp1=result1[0],tmp3
+        shr.u tmp2=result1[0],16
+        ;;
+        add ret0=tmp1,tmp2
+        mov pr=saved_pr,0xffffffffffff0000     // restore only the rotating predicates; p15 is still needed below
+        ;;
+        // if buf was odd then swap bytes
+        mov ar.pfs=saved_pfs            // restore ar.ec
+(p15)   mux1 ret0=ret0,@rev             // reverse the byte order of ret0
+        ;;
+        mov ar.lc=saved_lc
+(p15)   shr.u ret0=ret0,64-16   // + shift back to position = swap bytes
+        br.ret.sptk.many rp
+//      I (Jun Nakajima) wrote an equivalent code (see below), but it was
+//      not much better than the original. So keep the original there so that
+//      someone else can challenge.
+//
+//      shr.u word1[0]=result1[0],32
+//      zxt4 result1[0]=result1[0]
+//      ;;
+//      add result1[0]=result1[0],word1[0]
+//      ;;
+//      zxt2 result2[0]=result1[0]
+//      extr.u word1[0]=result1[0],16,16
+//      shr.u carry1=result1[0],32
+//      ;;
+//      add result2[0]=result2[0],word1[0]
+//      ;;
+//      add result2[0]=result2[0],carry1
+//      ;;
+//      extr.u ret0=result2[0],16,16
+//      ;;
+//      add ret0=ret0,result2[0]
+//      ;;
+//      zxt2 ret0=ret0
+//      mov ar.pfs=saved_pfs             // restore ar.ec
+//      mov pr=saved_pr,0xffffffffffff0000
+//      ;;
+//      // if buf was odd then swap bytes
+//      mov ar.lc=saved_lc
+//(p15) mux1 ret0=ret0,@rev             // reverse word
+//      ;;
+//(p15) shr.u ret0=ret0,64-16   // + shift back to position = swap bytes
+//      br.ret.sptk.many rp
+END(do_csum)
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ia64/lib/do_csum.S

diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S new file mode 100644 index 000000000000..6bec2fc9f5b2 --- /dev/null +++ b/arch/ia64/lib/do_csum.S
@@ -0,0 +1,323 @@
	1	/*
	2	*
	3	* Optimized version of the standard do_csum() function
	4	*
	5	* Return: a 64bit quantity containing the 16bit Internet checksum
	6	*
	7	* Inputs:
	8	* in0: address of buffer to checksum (char *)
	9	* in1: length of the buffer (int)
	10	*
	11	* Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
	12	* Stephane Eranian <eranian@hpl.hp.com>
	13	*
	14	* 02/04/22 Ken Chen <kenneth.w.chen@intel.com>
	15	* Data locality study on the checksum buffer.
	16	* More optimization cleanup - remove excessive stop bits.
	17	* 02/04/08 David Mosberger <davidm@hpl.hp.com>
	18	* More cleanup and tuning.
	19	* 01/04/18 Jun Nakajima <jun.nakajima@intel.com>
	20	* Clean up and optimize the software pipeline, loading two
	21	* back-to-back 8-byte words per loop. Clean up the initialization
	22	* for the loop. Support the cases where load latency = 1 or 2.
	23	* Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
	24	*/
	25
	26	#include <asm/asmmacro.h>
	27
	28	//
	29	// Theory of operations:
	30	// The goal is to go as quickly as possible to the point where
	31	// we can checksum 16 bytes/loop. Before reaching that point we must
	32	// take care of incorrect alignment of first byte.
	33	//
	34	// The code hereafter also takes care of the "tail" part of the buffer
	35	// before entering the core loop, if any. The checksum is a sum so it
	36	// allows us to commute operations. So we do the "head" and "tail"
	37	// first to finish at full speed in the body. Once we get the head and
	38	// tail values, we feed them into the pipeline, very handy initialization.
	39	//
	40	// Of course we deal with the special case where the whole buffer fits
	41	// into one 8 byte word. In this case we have only one entry in the pipeline.
	42	//
	43	// We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
	44	// possible load latency and also to accommodate for head and tail.
	45	//
	46	// The end of the function deals with folding the checksum from 64bits
	47	// down to 16bits taking care of the carry.
	48	//
	49	// This version avoids synchronization in the core loop by also using a
	50	// pipeline for the accumulation of the checksum in resultx[] (x=1,2).
	51	//
	52	// wordx[] (x=1,2)
	53	// \|---\|
	54	// \| \| 0 : new value loaded in pipeline
	55	// \|---\|
	56	// \| \| - : in transit data
	57	// \|---\|
	58	// \| \| LOAD_LATENCY : current value to add to checksum
	59	// \|---\|
	60	// \| \| LOAD_LATENCY+1 : previous value added to checksum
	61	// \|---\| (previous iteration)
	62	//
	63	// resultx[] (x=1,2)
	64	// \|---\|
	65	// \| \| 0 : initial value
	66	// \|---\|
	67	// \| \| LOAD_LATENCY-1 : new checksum
	68	// \|---\|
	69	// \| \| LOAD_LATENCY : previous value of checksum
	70	// \|---\|
	71	// \| \| LOAD_LATENCY+1 : final checksum when out of the loop
	72	// \|---\|
	73	//
	74	//
	75	// See RFC1071 "Computing the Internet Checksum" for various techniques for
	76	// calculating the Internet checksum.
	77	//
	78	// NOT YET DONE:
	79	// - Maybe another algorithm which would take care of the folding at the
	80	// end in a different manner
	81	// - Work with people more knowledgeable than me on the network stack
	82	// to figure out if we could not split the function depending on the
	83	// type of packet or alignment we get. Like the ip_fast_csum() routine
	84	// where we know we have at least 20bytes worth of data to checksum.
	85	// - Do a better job of handling small packets.
	86	// - Note on prefetching: it was found that under various load, i.e. ftp read/write,
	87	// nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
	88	// on the data that buffer points to (partly because the checksum is often preceded by
	89	// a copy_from_user()). This finding indicates that lfetch will not be beneficial since
	90	// the data is already in the cache.
	91	//
	92
	93	#define saved_pfs r11 // caller's ar.pfs, written by alloc
	94	#define hmask r16 // mask applied to the first (head) 8-byte word
	95	#define tmask r17 // mask applied to the last (tail) 8-byte word
	96	#define first1 r18 // aligned address of the first word, then load pointer for word1[]
	97	#define firstval r19 // unmasked contents of the first word
	98	#define firstoff r20 // buf & 7
	99	#define last r21 // aligned address of the word holding the last byte
	100	#define lastval r22 // unmasked contents of the last word
	101	#define lastoff r23 // (buf+len) & 7
	102	#define saved_lc r24 // caller's ar.lc
	103	#define saved_pr r25 // caller's predicate registers
	104	#define tmp1 r26 // scratch
	105	#define tmp2 r27 // scratch
	106	#define tmp3 r28 // scratch; reused as "count" in the body
	107	#define carry1 r29 // number of carries seen while accumulating result1[]
	108	#define carry2 r30 // number of carries seen while accumulating result2[]
	109	#define first2 r31 // load pointer for word2[]
	110
	111	#define buf in0 // first argument: buffer address
	112	#define len in1 // second argument: buffer length
	113
	114	#define LOAD_LATENCY 2 // XXX fix me
	115
	116	#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
	117	# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
	118	#endif
	119
	120	#define PIPE_DEPTH (LOAD_LATENCY+2) // number of software pipeline stages
	121	#define ELD p[LOAD_LATENCY] // end of load
	122	#define ELD_1 p[LOAD_LATENCY+1] // and next stage
	123
	124	// unsigned long do_csum(unsigned char *buf,long len)
	125
	126	GLOBAL_ENTRY(do_csum)
	127	.prologue
	128	.save ar.pfs, saved_pfs
	129	alloc saved_pfs=ar.pfs,2,16,0,16 // 2 inputs, 16 locals, 0 outputs, 16 rotating registers
	130	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2] // NOTE(review): .rotr seems to number from r32, so word1[0]/word1[1] would overlay in0/in1 - confirm
	131	.rotp p[PIPE_DEPTH], pC1[2], pC2[2] // p[]: pipeline stage predicates, pC1/pC2: carry predicates
	132	mov ret0=r0 // in case we have zero length
	133	cmp.lt p0,p6=r0,len // p6 = !(0 < len): zero length or negative (32bit len)
	134	;;
	135	add tmp1=buf,len // tmp1 = buf+len, i.e. one past the last byte
	136	.save pr, saved_pr
	137	mov saved_pr=pr // preserve predicates (rotation)
	138	(p6) br.ret.spnt.many rp // return if zero or negative length
	139
	140	mov hmask=-1 // initialize head mask
	141	tbit.nz p15,p0=buf,0 // p15 = buf is an odd address (result is byte-swapped at exit)
	142	and first1=-8,buf // 8-byte align down address of first1 element
	143
	144	and firstoff=7,buf // how many bytes off for first1 element
	145	mov tmask=-1 // initialize tail mask
	146
	147	;;
	148	adds tmp2=-1,tmp1 // tmp2 = buf+len-1, the address of the last byte
	149	and lastoff=7,tmp1 // how many bytes off for last element
	150	;;
	151	sub tmp1=8,lastoff // complement to lastoff
	152	and last=-8,tmp2 // address of word containing last byte
	153	;;
	154	sub tmp3=last,first1 // tmp3=distance from first1 to last
	155	.save ar.lc, saved_lc
	156	mov saved_lc=ar.lc // save lc
	157	cmp.eq p8,p9=last,first1 // everything fits in one word ?
	158
	159	ld8 firstval=[first1],8 // load, ahead of time, "first1" word; first1 advances by 8
	160	and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
	161	shl tmp2=firstoff,3 // number of bits
	162	;;
	163	(p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed
	164	shl tmp1=tmp1,3 // number of bits
	165	(p9) adds tmp3=-8,tmp3 // effectively loaded
	166	;;
	167	(p8) mov lastval=r0 // we don't need lastval if first1==last
	168	shl hmask=hmask,tmp2 // build head mask, mask off bytes [0,firstoff[
	169	shr.u tmask=tmask,tmp1 // build tail mask, mask off bytes [lastoff,8[ (nothing if lastoff==0)
	170	;;
	171	.body
	172	#define count tmp3 // tmp3 now counts the remaining 8-byte words
	173
	174	(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only
	175	(p9) and word2[0]=lastval,tmask // mask the last word as appropriate
	176	shr.u count=count,3 // how many full 8-byte words between head and tail?
	177	;;
	178	// If count is odd, finish this 8-byte word so that we can
	179	// load two back-to-back 8-byte words per loop thereafter.
	180	and word1[0]=firstval,hmask // mask the first word as appropriate
	181	tbit.nz p10,p11=count,0 // p10 = count is odd, p11 = count is even
	182	;;
	183	(p8) mov result1[0]=word1[0] // single word: the masked head is the whole sum
	184	(p9) add result1[0]=word1[0],word2[0] // otherwise start from head + tail
	185	;;
	186	cmp.ltu p6,p0=result1[0],word1[0] // check the carry
	187	cmp.eq.or.andcm p8,p0=0,count // p8 |= (count == 0): exit if zero 8-byte
	188	;;
	189	(p6) adds result1[0]=1,result1[0] // add the carry back in
	190	(p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word)
	191	(p11) br.cond.dptk .do_csum16 // if (count is even)
	192
	193	// Here count is odd.
	194	ld8 word1[1]=[first1],8 // load an 8-byte word
	195	cmp.eq p9,p10=1,count // if (count == 1)
	196	adds count=-1,count // loaded an 8-byte word
	197	;;
	198	add result1[0]=result1[0],word1[1]
	199	;;
	200	cmp.ltu p6,p0=result1[0],word1[1] // check the carry
	201	;;
	202	(p6) adds result1[0]=1,result1[0] // add the carry back in
	203	(p9) br.cond.sptk .do_csum_exit // if (count == 1) exit
	204	// Fall through to calculate the checksum, using the current
	205	// result1[0] as the initial value of the result1[] pipeline.
	206	//
	207	// Calculate the checksum loading two 8-byte words per loop.
	208	//
	209	.do_csum16:
	210	add first2=8,first1 // second load stream starts 8 bytes after the first
	211	shr.u count=count,1 // we do 16 bytes per loop
	212	;;
	213	adds count=-1,count // ar.lc is loaded with (iterations - 1)
	214	mov carry1=r0
	215	mov carry2=r0
	216	brp.loop.imp 1f,2f
	217	;;
	218	mov ar.ec=PIPE_DEPTH // epilogue count = number of pipeline stages
	219	mov ar.lc=count // set lc
	220	mov pr.rot=1<<16 // set p16, clear the other rotating predicates
	221	// result1[0] must be initialized in advance.
	222	mov result2[0]=r0
	223	;;
	224	.align 32
	225	1:
	226	(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1] // did the add of the previous stage wrap?
	227	(pC1[1])adds carry1=1,carry1 // count the carry detected one iteration earlier
	228	(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1] // same check for the second stream
	229	(pC2[1])adds carry2=1,carry2
	230	(ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY] // accumulate the word whose load has completed
	231	(ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
	232	2:
	233	(p[0]) ld8 word1[0]=[first1],16 // each stream advances 16 bytes per iteration
	234	(p[0]) ld8 word2[0]=[first2],16
	235	br.ctop.sptk 1b
	236	;;
	237	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
	238	(pC1[1])adds carry1=1,carry1 // since we miss the last one
	239	(pC2[1])adds carry2=1,carry2
	240	;;
	241	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1 // add the counted carries to each partial sum
	242	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
	243	;;
	244	cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1 // did adding the carries wrap?
	245	cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
	246	;;
	247	(p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
	248	(p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
	249	;;
	250	add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1] // merge the two partial sums
	251	;;
	252	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1] // check the carry
	253	;;
	254	(p6) adds result1[0]=1,result1[0] // add the carry back in
	255	;;
	256	.do_csum_exit:
	257	//
	258	// now fold 64 into 16 bits taking care of carry
	259	// that's not very good because it has lots of sequentiality
	260	//
	261	mov tmp3=0xffff // 16-bit mask
	262	zxt4 tmp1=result1[0] // low 32 bits
	263	shr.u tmp2=result1[0],32 // high 32 bits
	264	;;
	265	add result1[0]=tmp1,tmp2
	266	;;
	267	and tmp1=result1[0],tmp3 // low 16 bits
	268	shr.u tmp2=result1[0],16 // everything above them
	269	;;
	270	add result1[0]=tmp1,tmp2
	271	;;
	272	and tmp1=result1[0],tmp3
	273	shr.u tmp2=result1[0],16
	274	;;
	275	add result1[0]=tmp1,tmp2
	276	;;
	277	and tmp1=result1[0],tmp3
	278	shr.u tmp2=result1[0],16
	279	;;
	280	add ret0=tmp1,tmp2
	281	mov pr=saved_pr,0xffffffffffff0000 // restore only the rotating predicates; p15 is still needed below
	282	;;
	283	// if buf was odd then swap bytes
	284	mov ar.pfs=saved_pfs // restore ar.ec
	285	(p15) mux1 ret0=ret0,@rev // reverse the byte order of ret0
	286	;;
	287	mov ar.lc=saved_lc
	288	(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
	289	br.ret.sptk.many rp
	290
	291	// I (Jun Nakajima) wrote an equivalent code (see below), but it was
	292	// not much better than the original. So keep the original there so that
	293	// someone else can challenge.
	294	//
	295	// shr.u word1[0]=result1[0],32
	296	// zxt4 result1[0]=result1[0]
	297	// ;;
	298	// add result1[0]=result1[0],word1[0]
	299	// ;;
	300	// zxt2 result2[0]=result1[0]
	301	// extr.u word1[0]=result1[0],16,16
	302	// shr.u carry1=result1[0],32
	303	// ;;
	304	// add result2[0]=result2[0],word1[0]
	305	// ;;
	306	// add result2[0]=result2[0],carry1
	307	// ;;
	308	// extr.u ret0=result2[0],16,16
	309	// ;;
	310	// add ret0=ret0,result2[0]
	311	// ;;
	312	// zxt2 ret0=ret0
	313	// mov ar.pfs=saved_pfs // restore ar.ec
	314	// mov pr=saved_pr,0xffffffffffff0000
	315	// ;;
	316	// // if buf was odd then swap bytes
	317	// mov ar.lc=saved_lc
	318	//(p15) mux1 ret0=ret0,@rev // reverse word
	319	// ;;
	320	//(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
	321	// br.ret.sptk.many rp
	322
	323	END(do_csum)