1 file changed, 1 insertion, 109 deletions
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 57dbb3a5bff8..2042bfe6595f 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -12,13 +12,10 @@
 *   more details.
 */
-#include <arch/chip.h>
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
+#include <arch/chip.h>
-#undef memset
 void *memset(void *s, int c, size_t n)
 {
@@ -26,11 +23,7 @@ void *memset(void *s, int c, size_t n)
        int n32;
        uint32_t v16, v32;
        uint8_t *out8 = s;
-#if !CHIP_HAS_WH64()
-        int ahead32;
-#else
        int to_align32;
-#endif
        /* Experimentation shows that a trivial tight loop is a win up until
         * around a size of 20, where writing a word at a time starts to win.
@@ -61,21 +54,6 @@ void *memset(void *s, int c, size_t n)
                return s;
        }
-#if !CHIP_HAS_WH64()
-        /* Use a spare issue slot to start prefetching the first cache
-         * line early. This instruction is free as the store can be buried
-         * in otherwise idle issue slots doing ALU ops.
-         */
-        __insn_prefetch(out8);
-        /* We prefetch the end so that a short memset that spans two cache
-         * lines gets some prefetching benefit. Again we believe this is free
-         * to issue.
-         */
-        __insn_prefetch(&out8[n - 1]);
-#endif /* !CHIP_HAS_WH64() */
        /* Align 'out8'. We know n >= 3 so this won't write past the end. */
        while (((uintptr_t) out8 & 3) != 0) {
                *out8++ = c;
@@ -96,90 +74,6 @@ void *memset(void *s, int c, size_t n)
        /* This must be at least 8 or the following loop doesn't work. */
 #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
-#if !CHIP_HAS_WH64()
-        ahead32 = CACHE_LINE_SIZE_IN_WORDS;
-        /* We already prefetched the first and last cache lines, so
-         * we only need to do more prefetching if we are storing
-         * to more than two cache lines.
-         */
-        if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
-                int i;
-                /* Prefetch the next several cache lines.
-                 * This is the setup code for the software-pipelined
-                 * loop below.
-                 */
-#define MAX_PREFETCH 5
-                ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
-                if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
-                        ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
-                for (i = CACHE_LINE_SIZE_IN_WORDS;
-                     i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
-                        __insn_prefetch(&out32[i]);
-        }
-        if (n32 > ahead32) {
-                while (1) {
-                        int j;
-                        /* Prefetch by reading one word several cache lines
-                         * ahead.  Since loads are non-blocking this will
-                         * cause the full cache line to be read while we are
-                         * finishing earlier cache lines.  Using a store
-                         * here causes microarchitectural performance
-                         * problems where a victimizing store miss goes to
-                         * the head of the retry FIFO and locks the pipe for
-                         * a few cycles.  So a few subsequent stores in this
-                         * loop go into the retry FIFO, and then later
-                         * stores see other stores to the same cache line
-                         * are already in the retry FIFO and themselves go
-                         * into the retry FIFO, filling it up and grinding
-                         * to a halt waiting for the original miss to be
-                         * satisfied.
-                         */
-                        __insn_prefetch(&out32[ahead32]);
-#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
-#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
-#endif
-                        n32 -= CACHE_LINE_SIZE_IN_WORDS;
-                        /* Save icache space by only partially unrolling
-                         * this loop.
-                         */
-                        for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
-                                *out32++ = v32;
-                                *out32++ = v32;
-                                *out32++ = v32;
-                                *out32++ = v32;
-                        }
-                        /* To save compiled code size, reuse this loop even
-                         * when we run out of prefetching to do by dropping
-                         * ahead32 down.
-                         */
-                        if (n32 <= ahead32) {
-                                /* Not even a full cache line left,
-                                 * so stop now.
-                                 */
-                                if (n32 < CACHE_LINE_SIZE_IN_WORDS)
-                                        break;
-                                /* Choose a small enough value that we don't
-                                 * prefetch past the end.  There's no sense
-                                 * in touching cache lines we don't have to.
-                                 */
-                                ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
-                        }
-                }
-        }
-#else /* CHIP_HAS_WH64() */
        /* Determine how many words we need to emit before the 'out32'
         * pointer becomes aligned modulo the cache line size.
         */
@@ -236,8 +130,6 @@ void *memset(void *s, int c, size_t n)
                n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
        }
-#endif /* CHIP_HAS_WH64() */
        /* Now handle any leftover values. */
        if (n32 != 0) {
                do {

diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 57dbb3a5bff8..2042bfe6595f 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -12,13 +12,10 @@
12	* more details.	12	* more details.
13	*/	13	*/
14		14
15	#include <arch/chip.h>
16
17	#include <linux/types.h>	15	#include <linux/types.h>
18	#include <linux/string.h>	16	#include <linux/string.h>
19	#include <linux/module.h>	17	#include <linux/module.h>
20		18	#include <arch/chip.h>
21	#undef memset
22		19
23	void *memset(void *s, int c, size_t n)	20	void *memset(void *s, int c, size_t n)
24	{	21	{
@@ -26,11 +23,7 @@ void *memset(void *s, int c, size_t n)
26	int n32;	23	int n32;
27	uint32_t v16, v32;	24	uint32_t v16, v32;
28	uint8_t *out8 = s;	25	uint8_t *out8 = s;
29	#if !CHIP_HAS_WH64()
30	int ahead32;
31	#else
32	int to_align32;	26	int to_align32;
33	#endif
34		27
35	/* Experimentation shows that a trivial tight loop is a win up until	28	/* Experimentation shows that a trivial tight loop is a win up until
36	* around a size of 20, where writing a word at a time starts to win.	29	* around a size of 20, where writing a word at a time starts to win.
@@ -61,21 +54,6 @@ void *memset(void *s, int c, size_t n)
61	return s;	54	return s;
62	}	55	}
63		56
64	#if !CHIP_HAS_WH64()
65	/* Use a spare issue slot to start prefetching the first cache
66	* line early. This instruction is free as the store can be buried
67	* in otherwise idle issue slots doing ALU ops.
68	*/
69	__insn_prefetch(out8);
70
71	/* We prefetch the end so that a short memset that spans two cache
72	* lines gets some prefetching benefit. Again we believe this is free
73	* to issue.
74	*/
75	__insn_prefetch(&out8[n - 1]);
76	#endif /* !CHIP_HAS_WH64() */
77
78
79	/* Align 'out8'. We know n >= 3 so this won't write past the end. */	57	/* Align 'out8'. We know n >= 3 so this won't write past the end. */
80	while (((uintptr_t) out8 & 3) != 0) {	58	while (((uintptr_t) out8 & 3) != 0) {
81	*out8++ = c;	59	*out8++ = c;
@@ -96,90 +74,6 @@ void *memset(void *s, int c, size_t n)
96	/* This must be at least 8 or the following loop doesn't work. */	74	/* This must be at least 8 or the following loop doesn't work. */
97	#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)	75	#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
98		76
99	#if !CHIP_HAS_WH64()
100
101	ahead32 = CACHE_LINE_SIZE_IN_WORDS;
102
103	/* We already prefetched the first and last cache lines, so
104	* we only need to do more prefetching if we are storing
105	* to more than two cache lines.
106	*/
107	if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
108	int i;
109
110	/* Prefetch the next several cache lines.
111	* This is the setup code for the software-pipelined
112	* loop below.
113	*/
114	#define MAX_PREFETCH 5
115	ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
116	if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
117	ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
118
119	for (i = CACHE_LINE_SIZE_IN_WORDS;
120	i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
121	__insn_prefetch(&out32[i]);
122	}
123
124	if (n32 > ahead32) {
125	while (1) {
126	int j;
127
128	/* Prefetch by reading one word several cache lines
129	* ahead. Since loads are non-blocking this will
130	* cause the full cache line to be read while we are
131	* finishing earlier cache lines. Using a store
132	* here causes microarchitectural performance
133	* problems where a victimizing store miss goes to
134	* the head of the retry FIFO and locks the pipe for
135	* a few cycles. So a few subsequent stores in this
136	* loop go into the retry FIFO, and then later
137	* stores see other stores to the same cache line
138	* are already in the retry FIFO and themselves go
139	* into the retry FIFO, filling it up and grinding
140	* to a halt waiting for the original miss to be
141	* satisfied.
142	*/
143	__insn_prefetch(&out32[ahead32]);
144
145	#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
146	#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
147	#endif
148
149	n32 -= CACHE_LINE_SIZE_IN_WORDS;
150
151	/* Save icache space by only partially unrolling
152	* this loop.
153	*/
154	for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
155	*out32++ = v32;
156	*out32++ = v32;
157	*out32++ = v32;
158	*out32++ = v32;
159	}
160
161	/* To save compiled code size, reuse this loop even
162	* when we run out of prefetching to do by dropping
163	* ahead32 down.
164	*/
165	if (n32 <= ahead32) {
166	/* Not even a full cache line left,
167	* so stop now.
168	*/
169	if (n32 < CACHE_LINE_SIZE_IN_WORDS)
170	break;
171
172	/* Choose a small enough value that we don't
173	* prefetch past the end. There's no sense
174	* in touching cache lines we don't have to.
175	*/
176	ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
177	}
178	}
179	}
180
181	#else /* CHIP_HAS_WH64() */
182
183	/* Determine how many words we need to emit before the 'out32'	77	/* Determine how many words we need to emit before the 'out32'
184	* pointer becomes aligned modulo the cache line size.	78	* pointer becomes aligned modulo the cache line size.
185	*/	79	*/
@@ -236,8 +130,6 @@ void *memset(void *s, int c, size_t n)
236	n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;	130	n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
237	}	131	}
238		132
239	#endif /* CHIP_HAS_WH64() */
240
241	/* Now handle any leftover values. */	133	/* Now handle any leftover values. */
242	if (n32 != 0) {	134	if (n32 != 0) {
243	do {	135	do {