Linux-2.6.12-rc2 (tag: v2.6.12-rc2)

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/arm26/lib/csumpartialcopygeneric.S
1 files changed, 352 insertions, 0 deletions
diff --git a/arch/arm26/lib/csumpartialcopygeneric.S b/arch/arm26/lib/csumpartialcopygeneric.S
new file mode 100644
index 000000000000..5249c3ad11db
--- /dev/null
+++ b/arch/arm26/lib/csumpartialcopygeneric.S
@@ -0,0 +1,352 @@
+/*
+ *  linux/arch/arm26/lib/csumpartialcopygeneric.S
+ *
+ *  Copyright (C) 1995-2001 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * JMA 01/06/03 Commented out some shl0s; probably irrelevant to arm26
+ *
+ */
+/*
+ * unsigned int
+ * csum_partial_copy_xxx(const char *src, char *dst, int len, int sum, )
+ *  r0 = src, r1 = dst, r2 = len, r3 = sum
+ *  Returns : r0 = checksum
+ *
+ * Note that 'tst' and 'teq' preserve the carry flag.
+ */
+/* Quick hack: entry-time register save.  r1 (dst) is included so .done can reload it. */
+                .macro  save_regs
+                stmfd   sp!, {r1, r4 - r8, fp, ip, lr, pc}      @ r1 is the lowest register stored, so it ends up at [sp, #0]
+                .endm
+/* end Quick Hack */
+src     .req    r0                      @ source pointer (first argument)
+dst     .req    r1                      @ destination pointer (second argument)
+len     .req    r2                      @ byte count, decremented as data is consumed
+sum     .req    r3                      @ running checksum accumulator
+.zero:          mov     r0, sum                 @ len == 0 (from .less8): result is the incoming sum; load_regs comes from the includer and presumably unwinds and returns - confirm there
+                load_regs       ea
+                /*
+                 * Align an unaligned destination pointer (entered with bl, returns
+                 * via lr).  Copies one to three bytes and adds each to sum.  We know
+                 * that we have >= 8 bytes here, so we don't need to check the
+                 * length.  Note that the source pointer hasn't been aligned yet.
+                 */
+.dst_unaligned: tst     dst, #1                 @ odd dst? then move a single byte first
+                beq     .dst_16bit
+                load1b  ip
+                sub     len, len, #1
+                adcs    sum, sum, ip, lsl #byte(1)      @ update checksum
+                strb    ip, [dst], #1
+                tst     dst, #2                 @ dst is even now; tst leaves the carry alone
+                moveq   pc, lr                  @ dst is now 32bit aligned
+.dst_16bit:     load2b  r8, ip
+                sub     len, len, #2
+                adcs    sum, sum, r8, lsl #byte(0)      @ byte stored at the even address
+                strb    r8, [dst], #1
+                adcs    sum, sum, ip, lsl #byte(1)      @ byte stored at the odd address
+                strb    ip, [dst], #1
+                mov     pc, lr                  @ dst is now 32bit aligned
+                /*
+                 * Handle 0 to 7 bytes, with any alignment of source and destination
+                 * pointers.  When we get here C = 0: we arrive via blo (carry clear).
+                 */
+.less8:         teq     len, #0                 @ check for zero count
+                beq     .zero
+                /* we must have at least one byte. */
+                tst     dst, #1                 @ dst 16-bit aligned
+                beq     .less8_aligned
+                /* Align dst */
+                load1b  ip
+                sub     len, len, #1
+                adcs    sum, sum, ip, lsl #byte(1)      @ update checksum
+                strb    ip, [dst], #1
+                tst     len, #6                 @ len < 8 here, so this asks: len >= 2?
+                beq     .less8_byteonly
+1:              load2b  r8, ip
+                sub     len, len, #2            @ one byte pair per pass
+                adcs    sum, sum, r8, lsl #byte(0)
+                strb    r8, [dst], #1
+                adcs    sum, sum, ip, lsl #byte(1)
+                strb    ip, [dst], #1
+.less8_aligned: tst     len, #6                 @ loop while at least two bytes remain
+                bne     1b
+.less8_byteonly:
+                tst     len, #1                 @ one odd byte left over?
+                beq     .done
+                load1b  r8
+                adcs    sum, sum, r8, lsl #byte(0)      @ update checksum
+                strb    r8, [dst], #1
+                b       .done
+FN_ENTRY
+                mov     ip, sp                  @ frame setup: remember the entry sp
+                save_regs
+                sub     fp, ip, #4              @ fp = entry sp - 4
+                cmp     len, #8                 @ Ensure that we have at least
+                blo     .less8                  @ 8 bytes to copy.
+                adds    sum, sum, #0            @ C = 0
+                tst     dst, #3                 @ Test destination alignment
+                blne    .dst_unaligned          @ align destination, return here
+                /*
+                 * Ok, the dst pointer is now 32bit aligned, and we know
+                 * that we must have more than 4 bytes to copy.  Note
+                 * that C contains the carry from the dst alignment above.
+                 */
+                tst     src, #3                 @ Test source alignment
+                bne     .src_not_aligned
+                /* Routine for src & dst aligned */
+                bics    ip, len, #15            @ ip = len rounded down to a multiple of 16
+                beq     2f
+1:              load4l  r4, r5, r6, r7
+                stmia   dst!, {r4, r5, r6, r7}
+                adcs    sum, sum, r4            @ carry ripples through the whole adcs chain
+                adcs    sum, sum, r5
+                adcs    sum, sum, r6
+                adcs    sum, sum, r7
+                sub     ip, ip, #16
+                teq     ip, #0                  @ teq keeps C intact for the next adcs
+                bne     1b
+2:              ands    ip, len, #12            @ whole words still to copy (4, 8 or 12 bytes)?
+                beq     4f
+                tst     ip, #8
+                beq     3f
+                load2l  r4, r5
+                stmia   dst!, {r4, r5}
+                adcs    sum, sum, r4
+                adcs    sum, sum, r5
+                tst     ip, #4
+                beq     4f
+3:              load1l  r4
+                str     r4, [dst], #4
+                adcs    sum, sum, r4
+4:              ands    len, len, #3            @ one to three trailing bytes?
+                beq     .done
+                load1l  r4
+                tst     len, #2
+                mov     r5, r4                  @ r5 = tail word; its byte 0 is stored next.  This
+                                                @ copy had been commented out as a zero shift, which
+                                                @ left r5 stale.  Plain mov keeps the tst flags.
+                beq     .exit
+                adcs    sum, sum, r4, push #16
+                strb    r5, [dst], #1
+                mov     r5, r4, lsr #byte(1)
+                strb    r5, [dst], #1
+                mov     r5, r4, lsr #byte(2)    @ possible third byte, finished off at .exit
+.exit:          tst     len, #1                 @ odd trailing byte? paths jumping here are expected to leave it in the low 8 bits of r5
+                strneb  r5, [dst], #1
+                andne   r5, r5, #255            @ keep only the byte that was just stored
+                adcnes  sum, sum, r5, lsl #byte(0)
+                /*
+                 * If the dst pointer was not 16-bit aligned, we
+                 * need to rotate the checksum here to get around
+                 * the inefficient byte manipulations in the
+                 * architecture independent code.
+                 */
+.done:          adc     r0, sum, #0             @ fold the last pending carry into the result
+                ldr     sum, [sp, #0]           @ original dst, as pushed by save_regs
+                tst     sum, #1
+                movne   sum, r0, lsl #8         @ odd dst: rotate the result left by 8 bits,
+                orrne   r0, sum, r0, lsr #24    @ i.e. r0 = (r0 << 8) | (r0 >> 24)
+                load_regs       ea
+.src_not_aligned:                               @ dst is word aligned, src is not; pull/push are shift macros from the includer
+                adc     sum, sum, #0            @ include C from dst alignment
+                and     ip, src, #3             @ ip = byte offset of src within its word
+                bic     src, src, #3            @ round src down; data is fetched as whole words
+                load1l  r5
+                cmp     ip, #2
+                beq     .src2_aligned
+                bhi     .src3_aligned
+                mov     r4, r5, pull #8         @ C = 0
+                bics    ip, len, #15            @ offset 1 case: ip = len rounded down to 16
+                beq     2f
+1:              load4l  r5, r6, r7, r8
+                orr     r4, r4, r5, push #24    @ finish each output word with the leading
+                mov     r5, r5, pull #8         @ part of the next input word, then carry
+                orr     r5, r5, r6, push #24    @ the rest of that input word forward
+                mov     r6, r6, pull #8
+                orr     r6, r6, r7, push #24
+                mov     r7, r7, pull #8
+                orr     r7, r7, r8, push #24
+                stmia   dst!, {r4, r5, r6, r7}
+                adcs    sum, sum, r4
+                adcs    sum, sum, r5
+                adcs    sum, sum, r6
+                adcs    sum, sum, r7
+                mov     r4, r8, pull #8         @ leftover of r8 seeds the next output word
+                sub     ip, ip, #16
+                teq     ip, #0                  @ teq keeps C intact for the next adcs
+                bne     1b
+2:              ands    ip, len, #12            @ whole words still to copy (4, 8 or 12 bytes)?
+                beq     4f
+                tst     ip, #8
+                beq     3f
+                load2l  r5, r6
+                orr     r4, r4, r5, push #24
+                mov     r5, r5, pull #8
+                orr     r5, r5, r6, push #24
+                stmia   dst!, {r4, r5}
+                adcs    sum, sum, r4
+                adcs    sum, sum, r5
+                mov     r4, r6, pull #8
+                tst     ip, #4
+                beq     4f
+3:              load1l  r5
+                orr     r4, r4, r5, push #24
+                str     r4, [dst], #4
+                adcs    sum, sum, r4
+                mov     r4, r5, pull #8
+4:              ands    len, len, #3            @ one to three trailing bytes, all pending in r4
+                beq     .done
+                mov     r5, r4                  @ r5 = pending bytes; byte 0 is stored first.  This
+                                                @ copy had been commented out as a zero shift, so r5
+                                                @ still held the raw source word instead of r4.
+                tst     len, #2
+                beq     .exit
+                adcs    sum, sum, r4, push #16
+                strb    r5, [dst], #1
+                mov     r5, r4, lsr #byte(1)
+                strb    r5, [dst], #1
+                mov     r5, r4, lsr #byte(2)    @ possible third byte, finished off at .exit
+                b       .exit
+.src2_aligned:  mov     r4, r5, pull #16        @ src offset 2: r4 carries the pending half of the first word
+                adds    sum, sum, #0            @ C = 0 (the cmp that selected this path set it)
+                bics    ip, len, #15            @ ip = len rounded down to a multiple of 16
+                beq     2f
+1:              load4l  r5, r6, r7, r8
+                orr     r4, r4, r5, push #16    @ finish each output word with half of the
+                mov     r5, r5, pull #16        @ next input word, carry the other half on
+                orr     r5, r5, r6, push #16
+                mov     r6, r6, pull #16
+                orr     r6, r6, r7, push #16
+                mov     r7, r7, pull #16
+                orr     r7, r7, r8, push #16
+                stmia   dst!, {r4, r5, r6, r7}
+                adcs    sum, sum, r4
+                adcs    sum, sum, r5
+                adcs    sum, sum, r6
+                adcs    sum, sum, r7
+                mov     r4, r8, pull #16        @ leftover of r8 seeds the next output word
+                sub     ip, ip, #16
+                teq     ip, #0                  @ teq keeps C intact for the next adcs
+                bne     1b
+2:              ands    ip, len, #12            @ whole words still to copy (4, 8 or 12 bytes)?
+                beq     4f
+                tst     ip, #8
+                beq     3f
+                load2l  r5, r6
+                orr     r4, r4, r5, push #16
+                mov     r5, r5, pull #16
+                orr     r5, r5, r6, push #16
+                stmia   dst!, {r4, r5}
+                adcs    sum, sum, r4
+                adcs    sum, sum, r5
+                mov     r4, r6, pull #16
+                tst     ip, #4
+                beq     4f
+3:              load1l  r5
+                orr     r4, r4, r5, push #16
+                str     r4, [dst], #4
+                adcs    sum, sum, r4
+                mov     r4, r5, pull #16
+4:              ands    len, len, #3            @ one to three trailing bytes?
+                beq     .done
+                mov     r5, r4                  @ r5 = pending bytes; byte 0 is stored first.  This
+                                                @ copy had been commented out as a zero shift, so r5
+                                                @ still held the raw source word instead of r4.
+                tst     len, #2
+                beq     .exit
+                adcs    sum, sum, r4            @ add the pending pair held in r4
+                strb    r5, [dst], #1
+                mov     r5, r4, lsr #byte(1)
+                strb    r5, [dst], #1
+                tst     len, #1                 @ a third byte is not in r4; it is fetched below
+                beq     .done
+                load1b  r5
+                b       .exit
+.src3_aligned:  mov     r4, r5, pull #24        @ src offset 3: r4 carries the pending rest of the first word
+                adds    sum, sum, #0            @ C = 0 (the cmp that selected this path set it)
+                bics    ip, len, #15            @ ip = len rounded down to a multiple of 16
+                beq     2f
+1:              load4l  r5, r6, r7, r8
+                orr     r4, r4, r5, push #8     @ finish each output word with the leading
+                mov     r5, r5, pull #24        @ part of the next input word, then carry
+                orr     r5, r5, r6, push #8     @ the rest of that input word forward
+                mov     r6, r6, pull #24
+                orr     r6, r6, r7, push #8
+                mov     r7, r7, pull #24
+                orr     r7, r7, r8, push #8
+                stmia   dst!, {r4, r5, r6, r7}
+                adcs    sum, sum, r4
+                adcs    sum, sum, r5
+                adcs    sum, sum, r6
+                adcs    sum, sum, r7
+                mov     r4, r8, pull #24        @ leftover of r8 seeds the next output word
+                sub     ip, ip, #16
+                teq     ip, #0                  @ teq keeps C intact for the next adcs
+                bne     1b
+2:              ands    ip, len, #12            @ whole words still to copy (4, 8 or 12 bytes)?
+                beq     4f
+                tst     ip, #8
+                beq     3f
+                load2l  r5, r6
+                orr     r4, r4, r5, push #8
+                mov     r5, r5, pull #24
+                orr     r5, r5, r6, push #8
+                stmia   dst!, {r4, r5}
+                adcs    sum, sum, r4
+                adcs    sum, sum, r5
+                mov     r4, r6, pull #24
+                tst     ip, #4
+                beq     4f
+3:              load1l  r5
+                orr     r4, r4, r5, push #8
+                str     r4, [dst], #4
+                adcs    sum, sum, r4
+                mov     r4, r5, pull #24
+4:              ands    len, len, #3            @ one to three trailing bytes?
+                beq     .done
+                mov     r5, r4                  @ r5 = pending byte, stored first.  This copy had
+                                                @ been commented out as a zero shift, so r5 still
+                                                @ held the raw source word instead of r4.
+                tst     len, #2
+                beq     .exit
+                strb    r5, [dst], #1
+                adcs    sum, sum, r4
+                load1l  r4
+                mov     r5, r4                  @ second byte is byte 0 of the word just loaded;
+                                                @ this copy was commented out as well, so the
+                                                @ previous byte would have been stored twice.
+                strb    r5, [dst], #1
+                adcs    sum, sum, r4, push #24
+                mov     r5, r4, lsr #byte(1)    @ possible third byte, finished off at .exit
+                b       .exit
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/arm26/lib/csumpartialcopygeneric.S

diff --git a/arch/arm26/lib/csumpartialcopygeneric.S b/arch/arm26/lib/csumpartialcopygeneric.S new file mode 100644 index 000000000000..5249c3ad11db --- /dev/null +++ b/arch/arm26/lib/csumpartialcopygeneric.S
@@ -0,0 +1,352 @@
	1	/*
	2	* linux/arch/arm26/lib/csumpartialcopygeneric.S
	3	*
	4	* Copyright (C) 1995-2001 Russell King
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License version 2 as
	8	* published by the Free Software Foundation.
	9	*
	10	* JMA 01/06/03 Commented out some shl0s; probably irrelevant to arm26
	11	*
	12	*/
	13
	14	/*
	15	* unsigned int
	16	* csum_partial_copy_xxx(const char *src, char *dst, int len, int sum, )
	17	* r0 = src, r1 = dst, r2 = len, r3 = sum
	18	* Returns : r0 = checksum
	19	*
	20	* Note that 'tst' and 'teq' preserve the carry flag.
	21	*/
	22
	23	/* Quick hack */
	24	.macro save_regs
	25	stmfd sp!, {r1, r4 - r8, fp, ip, lr, pc}
	26	.endm
	27
	28	/* end Quick Hack */
	29
	30	src .req r0
	31	dst .req r1
	32	len .req r2
	33	sum .req r3
	34
	35	.zero: mov r0, sum
	36	load_regs ea
	37
	38	/*
	39	* Align an unaligned destination pointer. We know that
	40	* we have >= 8 bytes here, so we don't need to check
	41	* the length. Note that the source pointer hasn't been
	42	* aligned yet.
	43	*/
	44	.dst_unaligned: tst dst, #1
	45	beq .dst_16bit
	46
	47	load1b ip
	48	sub len, len, #1
	49	adcs sum, sum, ip, lsl #byte(1) @ update checksum
	50	strb ip, [dst], #1
	51	tst dst, #2
	52	moveq pc, lr @ dst is now 32bit aligned
	53
	54	.dst_16bit: load2b r8, ip
	55	sub len, len, #2
	56	adcs sum, sum, r8, lsl #byte(0)
	57	strb r8, [dst], #1
	58	adcs sum, sum, ip, lsl #byte(1)
	59	strb ip, [dst], #1
	60	mov pc, lr @ dst is now 32bit aligned
	61
	62	/*
	63	* Handle 0 to 7 bytes, with any alignment of source and
	64	* destination pointers. Note that when we get here, C = 0
	65	*/
	66	.less8: teq len, #0 @ check for zero count
	67	beq .zero
	68
	69	/* we must have at least one byte. */
	70	tst dst, #1 @ dst 16-bit aligned
	71	beq .less8_aligned
	72
	73	/* Align dst */
	74	load1b ip
	75	sub len, len, #1
	76	adcs sum, sum, ip, lsl #byte(1) @ update checksum
	77	strb ip, [dst], #1
	78	tst len, #6
	79	beq .less8_byteonly
	80
	81	1: load2b r8, ip
	82	sub len, len, #2
	83	adcs sum, sum, r8, lsl #byte(0)
	84	strb r8, [dst], #1
	85	adcs sum, sum, ip, lsl #byte(1)
	86	strb ip, [dst], #1
	87	.less8_aligned: tst len, #6
	88	bne 1b
	89	.less8_byteonly:
	90	tst len, #1
	91	beq .done
	92	load1b r8
	93	adcs sum, sum, r8, lsl #byte(0) @ update checksum
	94	strb r8, [dst], #1
	95	b .done
	96
	97	FN_ENTRY
	98	mov ip, sp
	99	save_regs
	100	sub fp, ip, #4
	101
	102	cmp len, #8 @ Ensure that we have at least
	103	blo .less8 @ 8 bytes to copy.
	104
	105	adds sum, sum, #0 @ C = 0
	106	tst dst, #3 @ Test destination alignment
	107	blne .dst_unaligned @ align destination, return here
	108
	109	/*
	110	* Ok, the dst pointer is now 32bit aligned, and we know
	111	* that we must have more than 4 bytes to copy. Note
	112	* that C contains the carry from the dst alignment above.
	113	*/
	114
	115	tst src, #3 @ Test source alignment
	116	bne .src_not_aligned
	117
	118	/* Routine for src & dst aligned */
	119
	120	bics ip, len, #15
	121	beq 2f
	122
	123	1: load4l r4, r5, r6, r7
	124	stmia dst!, {r4, r5, r6, r7}
	125	adcs sum, sum, r4
	126	adcs sum, sum, r5
	127	adcs sum, sum, r6
	128	adcs sum, sum, r7
	129	sub ip, ip, #16
	130	teq ip, #0
	131	bne 1b
	132
	133	2: ands ip, len, #12
	134	beq 4f
	135	tst ip, #8
	136	beq 3f
	137	load2l r4, r5
	138	stmia dst!, {r4, r5}
	139	adcs sum, sum, r4
	140	adcs sum, sum, r5
	141	tst ip, #4
	142	beq 4f
	143
	144	3: load1l r4
	145	str r4, [dst], #4
	146	adcs sum, sum, r4
	147
	148	4: ands len, len, #3
	149	beq .done
	150	load1l r4
	151	tst len, #2
	152	/* mov r5, r4, lsr #byte(0)
	153	FIXME? 0 Shift anyhow!
	154	*/
	155	beq .exit
	156	adcs sum, sum, r4, push #16
	157	strb r5, [dst], #1
	158	mov r5, r4, lsr #byte(1)
	159	strb r5, [dst], #1
	160	mov r5, r4, lsr #byte(2)
	161	.exit: tst len, #1
	162	strneb r5, [dst], #1
	163	andne r5, r5, #255
	164	adcnes sum, sum, r5, lsl #byte(0)
	165
	166	/*
	167	* If the dst pointer was not 16-bit aligned, we
	168	* need to rotate the checksum here to get around
	169	* the inefficient byte manipulations in the
	170	* architecture independent code.
	171	*/
	172	.done: adc r0, sum, #0
	173	ldr sum, [sp, #0] @ dst
	174	tst sum, #1
	175	movne sum, r0, lsl #8
	176	orrne r0, sum, r0, lsr #24
	177	load_regs ea
	178
	179	.src_not_aligned:
	180	adc sum, sum, #0 @ include C from dst alignment
	181	and ip, src, #3
	182	bic src, src, #3
	183	load1l r5
	184	cmp ip, #2
	185	beq .src2_aligned
	186	bhi .src3_aligned
	187	mov r4, r5, pull #8 @ C = 0
	188	bics ip, len, #15
	189	beq 2f
	190	1: load4l r5, r6, r7, r8
	191	orr r4, r4, r5, push #24
	192	mov r5, r5, pull #8
	193	orr r5, r5, r6, push #24
	194	mov r6, r6, pull #8
	195	orr r6, r6, r7, push #24
	196	mov r7, r7, pull #8
	197	orr r7, r7, r8, push #24
	198	stmia dst!, {r4, r5, r6, r7}
	199	adcs sum, sum, r4
	200	adcs sum, sum, r5
	201	adcs sum, sum, r6
	202	adcs sum, sum, r7
	203	mov r4, r8, pull #8
	204	sub ip, ip, #16
	205	teq ip, #0
	206	bne 1b
	207	2: ands ip, len, #12
	208	beq 4f
	209	tst ip, #8
	210	beq 3f
	211	load2l r5, r6
	212	orr r4, r4, r5, push #24
	213	mov r5, r5, pull #8
	214	orr r5, r5, r6, push #24
	215	stmia dst!, {r4, r5}
	216	adcs sum, sum, r4
	217	adcs sum, sum, r5
	218	mov r4, r6, pull #8
	219	tst ip, #4
	220	beq 4f
	221	3: load1l r5
	222	orr r4, r4, r5, push #24
	223	str r4, [dst], #4
	224	adcs sum, sum, r4
	225	mov r4, r5, pull #8
	226	4: ands len, len, #3
	227	beq .done
	228	/* mov r5, r4, lsr #byte(0)
	229	FIXME? 0 Shift anyhow
	230	*/
	231	tst len, #2
	232	beq .exit
	233	adcs sum, sum, r4, push #16
	234	strb r5, [dst], #1
	235	mov r5, r4, lsr #byte(1)
	236	strb r5, [dst], #1
	237	mov r5, r4, lsr #byte(2)
	238	b .exit
	239
	240	.src2_aligned: mov r4, r5, pull #16
	241	adds sum, sum, #0
	242	bics ip, len, #15
	243	beq 2f
	244	1: load4l r5, r6, r7, r8
	245	orr r4, r4, r5, push #16
	246	mov r5, r5, pull #16
	247	orr r5, r5, r6, push #16
	248	mov r6, r6, pull #16
	249	orr r6, r6, r7, push #16
	250	mov r7, r7, pull #16
	251	orr r7, r7, r8, push #16
	252	stmia dst!, {r4, r5, r6, r7}
	253	adcs sum, sum, r4
	254	adcs sum, sum, r5
	255	adcs sum, sum, r6
	256	adcs sum, sum, r7
	257	mov r4, r8, pull #16
	258	sub ip, ip, #16
	259	teq ip, #0
	260	bne 1b
	261	2: ands ip, len, #12
	262	beq 4f
	263	tst ip, #8
	264	beq 3f
	265	load2l r5, r6
	266	orr r4, r4, r5, push #16
	267	mov r5, r5, pull #16
	268	orr r5, r5, r6, push #16
	269	stmia dst!, {r4, r5}
	270	adcs sum, sum, r4
	271	adcs sum, sum, r5
	272	mov r4, r6, pull #16
	273	tst ip, #4
	274	beq 4f
	275	3: load1l r5
	276	orr r4, r4, r5, push #16
	277	str r4, [dst], #4
	278	adcs sum, sum, r4
	279	mov r4, r5, pull #16
	280	4: ands len, len, #3
	281	beq .done
	282	/* mov r5, r4, lsr #byte(0)
	283	FIXME? 0 Shift anyhow
	284	*/
	285	tst len, #2
	286	beq .exit
	287	adcs sum, sum, r4
	288	strb r5, [dst], #1
	289	mov r5, r4, lsr #byte(1)
	290	strb r5, [dst], #1
	291	tst len, #1
	292	beq .done
	293	load1b r5
	294	b .exit
	295
	296	.src3_aligned: mov r4, r5, pull #24
	297	adds sum, sum, #0
	298	bics ip, len, #15
	299	beq 2f
	300	1: load4l r5, r6, r7, r8
	301	orr r4, r4, r5, push #8
	302	mov r5, r5, pull #24
	303	orr r5, r5, r6, push #8
	304	mov r6, r6, pull #24
	305	orr r6, r6, r7, push #8
	306	mov r7, r7, pull #24
	307	orr r7, r7, r8, push #8
	308	stmia dst!, {r4, r5, r6, r7}
	309	adcs sum, sum, r4
	310	adcs sum, sum, r5
	311	adcs sum, sum, r6
	312	adcs sum, sum, r7
	313	mov r4, r8, pull #24
	314	sub ip, ip, #16
	315	teq ip, #0
	316	bne 1b
	317	2: ands ip, len, #12
	318	beq 4f
	319	tst ip, #8
	320	beq 3f
	321	load2l r5, r6
	322	orr r4, r4, r5, push #8
	323	mov r5, r5, pull #24
	324	orr r5, r5, r6, push #8
	325	stmia dst!, {r4, r5}
	326	adcs sum, sum, r4
	327	adcs sum, sum, r5
	328	mov r4, r6, pull #24
	329	tst ip, #4
	330	beq 4f
	331	3: load1l r5
	332	orr r4, r4, r5, push #8
	333	str r4, [dst], #4
	334	adcs sum, sum, r4
	335	mov r4, r5, pull #24
	336	4: ands len, len, #3
	337	beq .done
	338	/* mov r5, r4, lsr #byte(0)
	339	FIXME? 0 Shift anyhow
	340	*/
	341	tst len, #2
	342	beq .exit
	343	strb r5, [dst], #1
	344	adcs sum, sum, r4
	345	load1l r4
	346	/* mov r5, r4, lsr #byte(0)
	347	FIXME? 0 Shift anyhow
	348	*/
	349	strb r5, [dst], #1
	350	adcs sum, sum, r4, push #24
	351	mov r5, r4, lsr #byte(1)
	352	b .exit