1 files changed, 274 insertions, 0 deletions
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
new file mode 100644
index 000000000000..8593bc82398a
--- /dev/null
+++ b/arch/tile/lib/memset_32.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#include <arch/chip.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+/*
+ * memset() - fill a block of memory with a byte value.
+ * @s: start of the block to fill
+ * @c: fill value; it is narrowed to a byte when stored
+ * @n: number of bytes to fill
+ *
+ * Blocks shorter than BYTE_CUTOFF are filled one byte at a time.  For
+ * longer blocks the head and tail are byte-filled until both the pointer
+ * and the remaining length are multiples of four, and the middle is then
+ * stored as 32-bit words, a cache line at a time, using the strategy
+ * selected at compile time by CHIP_HAS_WH64().
+ *
+ * Returns @s.
+ */
+void *memset(void *s, int c, size_t n)
+{
+        uint32_t *out32;        /* word-sized store cursor */
+        int n32;                /* number of 32-bit words left to store */
+        uint32_t v16, v32;      /* intermediate and final fill pattern */
+        uint8_t *out8 = s;      /* byte-sized store cursor */
+#if !CHIP_HAS_WH64()
+        int ahead32;            /* prefetch distance, in words */
+#else
+        int to_align32;         /* words to store to reach line alignment */
+#endif
+        /* Experimentation shows that a trivial tight loop is a win up until
+         * around a size of 20, where writing a word at a time starts to win.
+         */
+#define BYTE_CUTOFF 20
+#if BYTE_CUTOFF < 3
+        /* This must be at least this big, or some code later
+         * on doesn't work.
+         */
+#error "BYTE_CUTOFF is too small"
+#endif
+        if (n < BYTE_CUTOFF) {
+                /* Strangely, this turns out to be the tightest way to
+                 * write this loop.
+                 */
+                if (n != 0) {
+                        do {
+                                /* Strangely, combining these into one line
+                                 * performs worse.
+                                 */
+                                *out8 = c;
+                                out8++;
+                        } while (--n != 0);
+                }
+                return s;
+        }
+#if !CHIP_HAS_WH64()
+        /* Use a spare issue slot to start prefetching the first cache
+         * line early. This instruction is free as the store can be buried
+         * in otherwise idle issue slots doing ALU ops.
+         */
+        __insn_prefetch(out8);
+        /* We prefetch the end so that a short memset that spans two cache
+         * lines gets some prefetching benefit. Again we believe this is free
+         * to issue.
+         */
+        __insn_prefetch(&out8[n - 1]);
+#endif /* !CHIP_HAS_WH64() */
+        /* Align 'out8'. We know n >= 3 so this won't write past the end. */
+        while (((uintptr_t) out8 & 3) != 0) {
+                *out8++ = c;
+                --n;
+        }
+        /* Align 'n' by filling the trailing bytes from the end backwards. */
+        while (n & 3)
+                out8[--n] = c;
+        /* Both the pointer and the length are now multiples of four. */
+        out32 = (uint32_t *) out8;
+        n32 = n >> 2;
+        /* Tile input byte out to 32 bits. */
+        v16 = __insn_intlb(c, c);
+        v32 = __insn_intlh(v16, v16);
+        /* This must be at least 8 or the following loop doesn't work. */
+#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
+#if !CHIP_HAS_WH64()
+        ahead32 = CACHE_LINE_SIZE_IN_WORDS;
+        /* We already prefetched the first and last cache lines, so
+         * we only need to do more prefetching if we are storing
+         * to more than two cache lines.
+         */
+        if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
+                int i;
+                /* Prefetch the next several cache lines.
+                 * This is the setup code for the software-pipelined
+                 * loop below.
+                 */
+#define MAX_PREFETCH 5
+                /* Round down to whole lines, capped at MAX_PREFETCH lines. */
+                ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
+                if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
+                        ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
+                for (i = CACHE_LINE_SIZE_IN_WORDS;
+                     i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
+                        __insn_prefetch(&out32[i]);
+        }
+        if (n32 > ahead32) {
+                while (1) {
+                        int j;
+                        /* Prefetch by reading one word several cache lines
+                         * ahead.  Since loads are non-blocking this will
+                         * cause the full cache line to be read while we are
+                         * finishing earlier cache lines.  Using a store
+                         * here causes microarchitectural performance
+                         * problems where a victimizing store miss goes to
+                         * the head of the retry FIFO and locks the pipe for
+                         * a few cycles.  So a few subsequent stores in this
+                         * loop go into the retry FIFO, and then later
+                         * stores see other stores to the same cache line
+                         * are already in the retry FIFO and themselves go
+                         * into the retry FIFO, filling it up and grinding
+                         * to a halt waiting for the original miss to be
+                         * satisfied.
+                         */
+                        __insn_prefetch(&out32[ahead32]);
+                        /* The partially unrolled variant is the one built. */
+#if 1
+#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
+#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
+#endif
+                        n32 -= CACHE_LINE_SIZE_IN_WORDS;
+                        /* Save icache space by only partially unrolling
+                         * this loop.
+                         */
+                        for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
+                                *out32++ = v32;
+                                *out32++ = v32;
+                                *out32++ = v32;
+                                *out32++ = v32;
+                        }
+#else
+                        /* Unfortunately, due to a code generator flaw this
+                         * allocates a separate register for each of these
+                         * stores, which requires a large number of spills,
+                         * which makes this procedure enormously bigger
+                         * (something like 70%)
+                         *
+                         * This disabled variant hard-codes a 16-word cache
+                         * line: the number of stores below must equal the
+                         * amount subtracted from n32, or one word per line
+                         * would be left unwritten.
+                         */
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        *out32++ = v32;
+                        n32 -= 16;
+#endif
+                        /* To save compiled code size, reuse this loop even
+                         * when we run out of prefetching to do by dropping
+                         * ahead32 down.
+                         */
+                        if (n32 <= ahead32) {
+                                /* Not even a full cache line left,
+                                 * so stop now.
+                                 */
+                                if (n32 < CACHE_LINE_SIZE_IN_WORDS)
+                                        break;
+                                /* Choose a small enough value that we don't
+                                 * prefetch past the end.  There's no sense
+                                 * in touching cache lines we don't have to.
+                                 */
+                                ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
+                        }
+                }
+        }
+#else /* CHIP_HAS_WH64() */
+        /* Determine how many words we need to emit before the 'out32'
+         * pointer becomes aligned modulo the cache line size.
+         */
+        to_align32 =
+                (-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1);
+        /* Only bother aligning and using wh64 if there is at least
+         * one full cache line to process.  This check also prevents
+         * overrunning the end of the buffer with alignment words.
+         */
+        if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) {
+                int lines_left;
+                /* Align out32 mod the cache line size so we can use wh64. */
+                n32 -= to_align32;
+                for (; to_align32 != 0; to_align32--) {
+                        *out32 = v32;
+                        out32++;
+                }
+                /* Use unsigned divide to turn this into a right shift. */
+                lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS;
+                do {
+                        /* Only wh64 a few lines at a time, so we don't
+                         * exceed the maximum number of victim lines.
+                         */
+                        int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
+                                  ? lines_left
+                                  : CHIP_MAX_OUTSTANDING_VICTIMS());
+                        uint32_t *wh = out32;
+                        int i = x;
+                        int j;
+                        lines_left -= x;
+                        /* Issue wh64 for each line of this batch first... */
+                        do {
+                                __insn_wh64(wh);
+                                wh += CACHE_LINE_SIZE_IN_WORDS;
+                        } while (--i);
+                        /* ...then fill those 'x' lines, four words a pass. */
+                        for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4); j != 0; j--) {
+                                *out32++ = v32;
+                                *out32++ = v32;
+                                *out32++ = v32;
+                                *out32++ = v32;
+                        }
+                } while (lines_left != 0);
+                /* We processed all full lines above, so only this many
+                 * words remain to be processed.
+                 */
+                n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
+        }
+#endif /* CHIP_HAS_WH64() */
+        /* Now handle any leftover values. */
+        if (n32 != 0) {
+                do {
+                        *out32 = v32;
+                        out32++;
+                } while (--n32 != 0);
+        }
+        return s;
+}
+EXPORT_SYMBOL(memset);

diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c new file mode 100644 index 000000000000..8593bc82398a --- /dev/null +++ b/arch/tile/lib/memset_32.c
@@ -0,0 +1,274 @@
	1	/*
	2	* Copyright 2010 Tilera Corporation. All Rights Reserved.
	3	*
	4	* This program is free software; you can redistribute it and/or
	5	* modify it under the terms of the GNU General Public License
	6	* as published by the Free Software Foundation, version 2.
	7	*
	8	* This program is distributed in the hope that it will be useful, but
	9	* WITHOUT ANY WARRANTY; without even the implied warranty of
	10	* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
	11	* NON INFRINGEMENT. See the GNU General Public License for
	12	* more details.
	13	*/
	14
	15	#include <arch/chip.h>
	16
	17	#include <linux/types.h>
	18	#include <linux/string.h>
	19	#include <linux/module.h>
	20
	21	/* memset(): fill 'n' bytes starting at 's' with the byte 'c'; returns 's'. */
	22	void *memset(void *s, int c, size_t n)
	23	{
	24	uint32_t *out32;
	25	int n32;
	26	uint32_t v16, v32;
	27	uint8_t *out8 = s;
	28	#if !CHIP_HAS_WH64()
	29	int ahead32;
	30	#else
	31	int to_align32;
	32	#endif
	33
	34	/* Experimentation shows that a trivial tight loop is a win up until
	35	* around a size of 20, where writing a word at a time starts to win.
	36	*/
	37	#define BYTE_CUTOFF 20
	38
	39	#if BYTE_CUTOFF < 3
	40	/* This must be at least this big, or some code later
	41	* on doesn't work.
	42	*/
	43	#error "BYTE_CUTOFF is too small"
	44	#endif
	45
	46	if (n < BYTE_CUTOFF) {
	47	/* Strangely, this turns out to be the tightest way to
	48	* write this loop.
	49	*/
	50	if (n != 0) {
	51	do {
	52	/* Strangely, combining these into one line
	53	* performs worse.
	54	*/
	55	*out8 = c;
	56	out8++;
	57	} while (--n != 0);
	58	}
	59
	60	return s;
	61	}
	62
	63	#if !CHIP_HAS_WH64()
	64	/* Use a spare issue slot to start prefetching the first cache
	65	* line early. This instruction is free as the store can be buried
	66	* in otherwise idle issue slots doing ALU ops.
	67	*/
	68	__insn_prefetch(out8);
	69
	70	/* We prefetch the end so that a short memset that spans two cache
	71	* lines gets some prefetching benefit. Again we believe this is free
	72	* to issue.
	73	*/
	74	__insn_prefetch(&out8[n - 1]);
	75	#endif /* !CHIP_HAS_WH64() */
	76
	77
	78	/* Align 'out8'. We know n >= 3 so this won't write past the end. */
	79	while (((uintptr_t) out8 & 3) != 0) {
	80	*out8++ = c;
	81	--n;
	82	}
	83
	84	/* Align 'n' by filling the trailing bytes from the end backwards. */
	85	while (n & 3)
	86	out8[--n] = c;
	87
	88	out32 = (uint32_t *) out8;
	89	n32 = n >> 2;
	90
	91	/* Tile input byte out to 32 bits. */
	92	v16 = __insn_intlb(c, c);
	93	v32 = __insn_intlh(v16, v16);
	94
	95	/* This must be at least 8 or the following loop doesn't work. */
	96	#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
	97
	98	#if !CHIP_HAS_WH64()
	99
	100	ahead32 = CACHE_LINE_SIZE_IN_WORDS;
	101
	102	/* We already prefetched the first and last cache lines, so
	103	* we only need to do more prefetching if we are storing
	104	* to more than two cache lines.
	105	*/
	106	if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
	107	int i;
	108
	109	/* Prefetch the next several cache lines.
	110	* This is the setup code for the software-pipelined
	111	* loop below.
	112	*/
	113	#define MAX_PREFETCH 5
	114	ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
	115	if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
	116	ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
	117
	118	for (i = CACHE_LINE_SIZE_IN_WORDS;
	119	i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
	120	__insn_prefetch(&out32[i]);
	121	}
	122
	123	if (n32 > ahead32) {
	124	while (1) {
	125	int j;
	126
	127	/* Prefetch by reading one word several cache lines
	128	* ahead. Since loads are non-blocking this will
	129	* cause the full cache line to be read while we are
	130	* finishing earlier cache lines. Using a store
	131	* here causes microarchitectural performance
	132	* problems where a victimizing store miss goes to
	133	* the head of the retry FIFO and locks the pipe for
	134	* a few cycles. So a few subsequent stores in this
	135	* loop go into the retry FIFO, and then later
	136	* stores see other stores to the same cache line
	137	* are already in the retry FIFO and themselves go
	138	* into the retry FIFO, filling it up and grinding
	139	* to a halt waiting for the original miss to be
	140	* satisfied.
	141	*/
	142	__insn_prefetch(&out32[ahead32]);
	143
	144	#if 1
	145	#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
	146	#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
	147	#endif
	148
	149	n32 -= CACHE_LINE_SIZE_IN_WORDS;
	150
	151	/* Save icache space by only partially unrolling
	152	* this loop.
	153	*/
	154	for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
	155	*out32++ = v32;
	156	*out32++ = v32;
	157	*out32++ = v32;
	158	*out32++ = v32;
	159	}
	160	#else
	161	/* Unfortunately, due to a code generator flaw this
	162	* allocates a separate register for each of these
	163	* stores, which requires a large number of spills,
	164	* which makes this procedure enormously bigger
	165	* (something like 70%).  Needs 16 stores to match 'n32 -= 16'.
	166	*/
	167	*out32++ = v32;
	168	*out32++ = v32;
	169	*out32++ = v32;
	170	*out32++ = v32;
	171	*out32++ = v32;
	172	*out32++ = v32;
	173	*out32++ = v32;
	174	*out32++ = v32;
	175	*out32++ = v32;
	176	*out32++ = v32;
	177	*out32++ = v32;
	178	*out32++ = v32;
	179	*out32++ = v32;
	180	*out32++ = v32;
	181	*out32++ = v32;
	182	*out32++ = v32;
	183	n32 -= 16;
	184	#endif
	185	/* To save compiled code size, reuse this loop even
	186	* when we run out of prefetching to do by dropping
	187	* ahead32 down.
	188	*/
	189	if (n32 <= ahead32) {
	190	/* Not even a full cache line left,
	191	* so stop now.
	192	*/
	193	if (n32 < CACHE_LINE_SIZE_IN_WORDS)
	194	break;
	195
	196	/* Choose a small enough value that we don't
	197	* prefetch past the end. There's no sense
	198	* in touching cache lines we don't have to.
	199	*/
	200	ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
	201	}
	202	}
	203	}
	204
	205	#else /* CHIP_HAS_WH64() */
	206
	207	/* Determine how many words we need to emit before the 'out32'
	208	* pointer becomes aligned modulo the cache line size.
	209	*/
	210	to_align32 =
	211	(-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1);
	212
	213	/* Only bother aligning and using wh64 if there is at least
	214	* one full cache line to process. This check also prevents
	215	* overrunning the end of the buffer with alignment words.
	216	*/
	217	if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) {
	218	int lines_left;
	219
	220	/* Align out32 mod the cache line size so we can use wh64. */
	221	n32 -= to_align32;
	222	for (; to_align32 != 0; to_align32--) {
	223	*out32 = v32;
	224	out32++;
	225	}
	226
	227	/* Use unsigned divide to turn this into a right shift. */
	228	lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS;
	229
	230	do {
	231	/* Only wh64 a few lines at a time, so we don't
	232	* exceed the maximum number of victim lines.
	233	*/
	234	int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
	235	? lines_left
	236	: CHIP_MAX_OUTSTANDING_VICTIMS());
	237	uint32_t *wh = out32;
	238	int i = x;
	239	int j;
	240
	241	lines_left -= x;
	242
	243	do {
	244	__insn_wh64(wh);
	245	wh += CACHE_LINE_SIZE_IN_WORDS;
	246	} while (--i);
	247
	248	for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4); j != 0; j--) {
	249	*out32++ = v32;
	250	*out32++ = v32;
	251	*out32++ = v32;
	252	*out32++ = v32;
	253	}
	254	} while (lines_left != 0);
	255
	256	/* We processed all full lines above, so only this many
	257	* words remain to be processed.
	258	*/
	259	n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
	260	}
	261
	262	#endif /* CHIP_HAS_WH64() */
	263
	264	/* Now handle any leftover values. */
	265	if (n32 != 0) {
	266	do {
	267	*out32 = v32;
	268	out32++;
	269	} while (--n32 != 0);
	270	}
	271
	272	return s;
	273	}
	274	EXPORT_SYMBOL(memset);