author     David S. Miller <davem@davemloft.net>   2017-05-25 15:07:08 -0400
committer  David S. Miller <davem@davemloft.net>   2017-05-25 15:07:08 -0400
commit     60925ee97e2be4993fb7a2f7e70be0fbce08cf0f (patch)
tree       8c2db73ea429d948a46c6b8577ce6c40a3fd18bd
parent     be941bf2e6a32605935865972df7abf74087944f (diff)
parent     145d978585977438ebb55079487827006c604e39 (diff)
Merge branch 'sparc64-queued-locks'
Babu Moger says:

====================
Enable queued rwlock and queued spinlock for SPARC

This series of patches enables queued rwlock and queued spinlock support
for SPARC. These features were introduced upstream some time ago. Here are
some of the earlier discussions:

  https://lwn.net/Articles/572765/
  https://lwn.net/Articles/582200/
  https://lwn.net/Articles/561775/
  https://lwn.net/Articles/590243/

Tests: ran the AIM7 benchmark (https://github.com/davidlohr/areaim) to
verify the performance on various workloads. The same benchmark was used
when this feature was introduced and enabled on x86. Here are the test
results.

  Workload                       4.11.0-rc6 baseline  4.11.0-rc6 + queued locks  Change (%)
                                 (Avg No. of jobs)    (Avg No. of jobs)
  High systime 10-100 users           17290.48             17295.18               +0.02
  High systime 200-1000 users        109814.95            110248.87               +0.39
  High systime 1200-2000 users       107912.40            127923.16              +18.54
  Disk IO 10-100 users               168910.16            158834.17               -5.96
  Disk IO 200-1000 users             242781.74            281285.80              +15.85
  Disk IO 1200-2000 users            228518.23            218421.23               -4.41
  Disk IO 10-100 users               183933.77            207928.67              +13.04
  Disk IO 200-1000 users             491981.56            500162.33               +1.66
  Disk IO 1200-2000 users            463395.66            467312.70               +0.84
  fserver 10-100 users               254177.53            270283.08               +6.33
  fserver IO 200-1000 users          269017.35            324812.2               +20.74
  fserver IO 1200-2000 users         229538.87            284713.77              +24.03

Some of the disk I/O results are slightly negative, but the majority of the
changes are positive, and significantly so in some cases.

Changes:

v3 -> v4:
  1. Addressed Geert Uytterhoeven's comment about patch #3 (def_bool y).
  2. A separate patch set is being prepared to define CPU_BIG_ENDIAN for
     all default big-endian architectures, based on feedback from Geert
     and Arnd.

v2 -> v3:
  1. Rebased the patches on top of 4.12-rc2.
  2. Re-ordered patch #1 and patch #2 to match the order in which the
     issues were seen, so they are addressed in that order: patch #1
     removes the __LINUX_SPINLOCK_TYPES_H check, and patch #2 addresses
     the compile error with qrwlock.c. This addresses Dave Miller's
     comments on v2.

v1 -> v2: Addressed the comments from David Miller.
  1. Added CPU_BIG_ENDIAN for all of SPARC.
  2. Removed the #ifndef __LINUX_SPINLOCK_TYPES_H guard from
     spinlock_types.h.
  3. Removed the check for CONFIG_QUEUED_RWLOCKS on SPARC64, since it is
     now the default there, and cleaned up the previous arch_read_xxx and
     arch_write_xxx definitions, which are now provided by qrwlock.h.
  4. Removed the check for CONFIG_QUEUED_SPINLOCKS on SPARC64, since it is
     now the default there, and cleaned up the previous arch_spin_xxx
     definitions, which are now provided by qspinlock.h.

v1: Initial version
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
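Background, not part of this series: the spinlock being removed below is a
simple ldstub-based test-and-set lock, which gives no fairness guarantee
under contention, while queued spinlocks hand the lock to waiters in FIFO
order. The following is a minimal user-space C sketch of that difference,
using a ticket lock only as a stand-in for the queued lock's FIFO property
(all names here are illustrative; build with: cc -std=c11 tas_vs_ticket.c).

/* tas_vs_ticket.c - illustrative only, not kernel code */
#include <stdatomic.h>

/* Test-and-set lock: whoever wins the atomic exchange gets the lock,
 * so a waiter can starve while others keep re-acquiring it. */
struct tas_lock { atomic_flag locked; };

static void tas_acquire(struct tas_lock *l)
{
	while (atomic_flag_test_and_set_explicit(&l->locked, memory_order_acquire))
		;	/* spin */
}

static void tas_release(struct tas_lock *l)
{
	atomic_flag_clear_explicit(&l->locked, memory_order_release);
}

/* Ticket lock: each waiter takes a ticket and is served strictly in
 * order, the FIFO property that queued spinlocks also provide. */
struct ticket_lock {
	atomic_uint next;	/* next ticket to hand out */
	atomic_uint owner;	/* ticket currently being served */
};

static void ticket_acquire(struct ticket_lock *l)
{
	unsigned int me = atomic_fetch_add_explicit(&l->next, 1,
						    memory_order_relaxed);

	while (atomic_load_explicit(&l->owner, memory_order_acquire) != me)
		;	/* spin until our ticket comes up */
}

static void ticket_release(struct ticket_lock *l)
{
	atomic_fetch_add_explicit(&l->owner, 1, memory_order_release);
}

int main(void)
{
	struct tas_lock tl = { ATOMIC_FLAG_INIT };
	struct ticket_lock ql = { 0, 0 };

	tas_acquire(&tl);
	tas_release(&tl);
	ticket_acquire(&ql);
	ticket_release(&ql);
	return 0;
}

Under heavy contention the test-and-set variant lets the same CPU win the
lock repeatedly, which is consistent with the large gains in the
1200-2000 user AIM7 runs above.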
-rw-r--r--  arch/sparc/Kconfig                        |   5
-rw-r--r--  arch/sparc/include/asm/cmpxchg_64.h       |  76
-rw-r--r--  arch/sparc/include/asm/qrwlock.h          |   7
-rw-r--r--  arch/sparc/include/asm/qspinlock.h        |   7
-rw-r--r--  arch/sparc/include/asm/spinlock_64.h      | 208
-rw-r--r--  arch/sparc/include/asm/spinlock_types.h   |  12
-rw-r--r--  kernel/locking/qrwlock.c                  |   1
7 files changed, 97 insertions, 219 deletions
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 58243b0d21c0..78af684f63b9 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -83,6 +83,8 @@ config SPARC64
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select HAVE_NMI
 	select HAVE_REGS_AND_STACK_ACCESS_API
+	select ARCH_USE_QUEUED_RWLOCKS
+	select ARCH_USE_QUEUED_SPINLOCKS
 
 config ARCH_DEFCONFIG
 	string
@@ -92,6 +94,9 @@ config ARCH_DEFCONFIG
 config ARCH_PROC_KCORE_TEXT
 	def_bool y
 
+config CPU_BIG_ENDIAN
+	def_bool y
+
 config ARCH_ATU
 	bool
 	default y if SPARC64
diff --git a/arch/sparc/include/asm/cmpxchg_64.h b/arch/sparc/include/asm/cmpxchg_64.h
index faa2f61058c2..4028f4f1e561 100644
--- a/arch/sparc/include/asm/cmpxchg_64.h
+++ b/arch/sparc/include/asm/cmpxchg_64.h
@@ -6,6 +6,17 @@
 #ifndef __ARCH_SPARC64_CMPXCHG__
 #define __ARCH_SPARC64_CMPXCHG__
 
+static inline unsigned long
+__cmpxchg_u32(volatile int *m, int old, int new)
+{
+	__asm__ __volatile__("cas [%2], %3, %0"
+			     : "=&r" (new)
+			     : "0" (new), "r" (m), "r" (old)
+			     : "memory");
+
+	return new;
+}
+
 static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val)
 {
 	unsigned long tmp1, tmp2;
@@ -44,10 +55,38 @@ static inline unsigned long xchg64(__volatile__ unsigned long *m, unsigned long
 
 void __xchg_called_with_bad_pointer(void);
 
+/*
+ * Use 4 byte cas instruction to achieve 2 byte xchg. Main logic
+ * here is to get the bit shift of the byte we are interested in.
+ * The XOR is handy for reversing the bits for big-endian byte order.
+ */
+static inline unsigned long
+xchg16(__volatile__ unsigned short *m, unsigned short val)
+{
+	unsigned long maddr = (unsigned long)m;
+	int bit_shift = (((unsigned long)m & 2) ^ 2) << 3;
+	unsigned int mask = 0xffff << bit_shift;
+	unsigned int *ptr = (unsigned int *) (maddr & ~2);
+	unsigned int old32, new32, load32;
+
+	/* Read the old value */
+	load32 = *ptr;
+
+	do {
+		old32 = load32;
+		new32 = (load32 & (~mask)) | val << bit_shift;
+		load32 = __cmpxchg_u32(ptr, old32, new32);
+	} while (load32 != old32);
+
+	return (load32 & mask) >> bit_shift;
+}
+
 static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr,
 				   int size)
 {
 	switch (size) {
+	case 2:
+		return xchg16(ptr, x);
 	case 4:
 		return xchg32(ptr, x);
 	case 8:
@@ -65,10 +104,11 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr,
 
 #include <asm-generic/cmpxchg-local.h>
 
+
 static inline unsigned long
-__cmpxchg_u32(volatile int *m, int old, int new)
+__cmpxchg_u64(volatile long *m, unsigned long old, unsigned long new)
 {
-	__asm__ __volatile__("cas [%2], %3, %0"
+	__asm__ __volatile__("casx [%2], %3, %0"
 			     : "=&r" (new)
 			     : "0" (new), "r" (m), "r" (old)
 			     : "memory");
@@ -76,15 +116,31 @@ __cmpxchg_u32(volatile int *m, int old, int new)
 	return new;
 }
 
+/*
+ * Use 4 byte cas instruction to achieve 1 byte cmpxchg. Main logic
+ * here is to get the bit shift of the byte we are interested in.
+ * The XOR is handy for reversing the bits for big-endian byte order
+ */
 static inline unsigned long
-__cmpxchg_u64(volatile long *m, unsigned long old, unsigned long new)
+__cmpxchg_u8(volatile unsigned char *m, unsigned char old, unsigned char new)
 {
-	__asm__ __volatile__("casx [%2], %3, %0"
-			     : "=&r" (new)
-			     : "0" (new), "r" (m), "r" (old)
-			     : "memory");
-
-	return new;
+	unsigned long maddr = (unsigned long)m;
+	int bit_shift = (((unsigned long)m & 3) ^ 3) << 3;
+	unsigned int mask = 0xff << bit_shift;
+	unsigned int *ptr = (unsigned int *) (maddr & ~3);
+	unsigned int old32, new32, load;
+	unsigned int load32 = *ptr;
+
+	do {
+		new32 = (load32 & ~mask) | (new << bit_shift);
+		old32 = (load32 & ~mask) | (old << bit_shift);
+		load32 = __cmpxchg_u32(ptr, old32, new32);
+		if (load32 == old32)
+			return old;
+		load = (load32 & mask) >> bit_shift;
+	} while (load == old);
+
+	return load;
 }
 
 /* This function doesn't exist, so you'll get a linker error
@@ -95,6 +151,8 @@ static inline unsigned long
 __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32(ptr, old, new);
 	case 8:
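The xchg16()/__cmpxchg_u8() helpers added above emulate sub-word atomics
with the 4-byte cas instruction: isolate the target byte's shift within its
aligned 32-bit word, splice the old and new values into that word, and loop
on a 32-bit compare-and-swap. Below is a rough user-space sketch of the same
idea, not from the patch: the little-endian shift is used for simplicity
(the kernel code XORs the offset for big-endian SPARC), and GCC's
__sync_val_compare_and_swap builtin stands in for __cmpxchg_u32().

/* emulate_cmpxchg_u8.c - illustrative only; build with: cc -std=c11 emulate_cmpxchg_u8.c */
#include <stdint.h>
#include <stdio.h>

static uint8_t emulated_cmpxchg_u8(uint8_t *p, uint8_t old, uint8_t new)
{
	uintptr_t addr = (uintptr_t)p;
	uint32_t *word = (uint32_t *)(addr & ~(uintptr_t)3);	/* containing 32-bit word */
	unsigned int shift = (addr & 3) * 8;			/* byte's shift, little-endian */
	uint32_t mask = (uint32_t)0xff << shift;
	uint32_t cur = *word;

	for (;;) {
		/* Stop as soon as the target byte no longer matches 'old'. */
		if ((uint8_t)((cur & mask) >> shift) != old)
			return (cur & mask) >> shift;

		uint32_t old32 = cur;
		uint32_t new32 = (cur & ~mask) | ((uint32_t)new << shift);

		/* 4-byte CAS; returns the value that was actually in memory. */
		cur = __sync_val_compare_and_swap(word, old32, new32);
		if (cur == old32)
			return old;	/* the new byte was swapped in */
	}
}

int main(void)
{
	/* Keep the buffer 4-byte aligned so the word cast is valid. */
	_Alignas(uint32_t) uint8_t buf[4] = { 1, 2, 3, 4 };

	uint8_t prev = emulated_cmpxchg_u8(&buf[1], 2, 9);
	printf("prev=%u buf[1]=%u\n", prev, buf[1]);	/* prev=2 buf[1]=9 */
	return 0;
}

The early return when the byte differs mirrors the kernel loop's
while (load == old) exit condition, and the retry handles the case where
the CAS fails only because some other byte in the same word changed.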
diff --git a/arch/sparc/include/asm/qrwlock.h b/arch/sparc/include/asm/qrwlock.h
new file mode 100644
index 000000000000..d68a4b102100
--- /dev/null
+++ b/arch/sparc/include/asm/qrwlock.h
@@ -0,0 +1,7 @@
+#ifndef _ASM_SPARC_QRWLOCK_H
+#define _ASM_SPARC_QRWLOCK_H
+
+#include <asm-generic/qrwlock_types.h>
+#include <asm-generic/qrwlock.h>
+
+#endif /* _ASM_SPARC_QRWLOCK_H */
diff --git a/arch/sparc/include/asm/qspinlock.h b/arch/sparc/include/asm/qspinlock.h
new file mode 100644
index 000000000000..5ae9a2802846
--- /dev/null
+++ b/arch/sparc/include/asm/qspinlock.h
@@ -0,0 +1,7 @@
+#ifndef _ASM_SPARC_QSPINLOCK_H
+#define _ASM_SPARC_QSPINLOCK_H
+
+#include <asm-generic/qspinlock_types.h>
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_SPARC_QSPINLOCK_H */
diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h
index 07c9f2e9bf57..f7028f5e1a5a 100644
--- a/arch/sparc/include/asm/spinlock_64.h
+++ b/arch/sparc/include/asm/spinlock_64.h
@@ -10,216 +10,12 @@
 
 #include <asm/processor.h>
 #include <asm/barrier.h>
-
-/* To get debugging spinlocks which detect and catch
- * deadlock situations, set CONFIG_DEBUG_SPINLOCK
- * and rebuild your kernel.
- */
-
-/* Because we play games to save cycles in the non-contention case, we
- * need to be extra careful about branch targets into the "spinning"
- * code. They live in their own section, but the newer V9 branches
- * have a shorter range than the traditional 32-bit sparc branch
- * variants. The rule is that the branches that go into and out of
- * the spinner sections must be pre-V9 branches.
- */
-
-#define arch_spin_is_locked(lp)	((lp)->lock != 0)
-
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-	unsigned long tmp;
-
-	__asm__ __volatile__(
-"1:	ldstub		[%1], %0\n"
-"	brnz,pn		%0, 2f\n"
-"	 nop\n"
-"	.subsection	2\n"
-"2:	ldub		[%1], %0\n"
-"	brnz,pt		%0, 2b\n"
-"	 nop\n"
-"	ba,a,pt		%%xcc, 1b\n"
-"	.previous"
-	: "=&r" (tmp)
-	: "r" (lock)
-	: "memory");
-}
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-	unsigned long result;
-
-	__asm__ __volatile__(
-"	ldstub		[%1], %0\n"
-	: "=r" (result)
-	: "r" (lock)
-	: "memory");
-
-	return (result == 0UL);
-}
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-	__asm__ __volatile__(
-"	stb		%%g0, [%0]"
-	: /* No outputs */
-	: "r" (lock)
-	: "memory");
-}
-
-static inline void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
-{
-	unsigned long tmp1, tmp2;
-
-	__asm__ __volatile__(
-"1:	ldstub		[%2], %0\n"
-"	brnz,pn		%0, 2f\n"
-"	 nop\n"
-"	.subsection	2\n"
-"2:	rdpr		%%pil, %1\n"
-"	wrpr		%3, %%pil\n"
-"3:	ldub		[%2], %0\n"
-"	brnz,pt		%0, 3b\n"
-"	 nop\n"
-"	ba,pt		%%xcc, 1b\n"
-"	 wrpr		%1, %%pil\n"
-"	.previous"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r"(lock), "r"(flags)
-	: "memory");
-}
-
-/* Multi-reader locks, these are much saner than the 32-bit Sparc ones... */
-
-static inline void arch_read_lock(arch_rwlock_t *lock)
-{
-	unsigned long tmp1, tmp2;
-
-	__asm__ __volatile__ (
-"1:	ldsw		[%2], %0\n"
-"	brlz,pn		%0, 2f\n"
-"4:	 add		%0, 1, %1\n"
-"	cas		[%2], %0, %1\n"
-"	cmp		%0, %1\n"
-"	bne,pn		%%icc, 1b\n"
-"	 nop\n"
-"	.subsection	2\n"
-"2:	ldsw		[%2], %0\n"
-"	brlz,pt		%0, 2b\n"
-"	 nop\n"
-"	ba,a,pt		%%xcc, 4b\n"
-"	.previous"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r" (lock)
-	: "memory");
-}
-
-static inline int arch_read_trylock(arch_rwlock_t *lock)
-{
-	int tmp1, tmp2;
-
-	__asm__ __volatile__ (
-"1:	ldsw		[%2], %0\n"
-"	brlz,a,pn	%0, 2f\n"
-"	 mov		0, %0\n"
-"	add		%0, 1, %1\n"
-"	cas		[%2], %0, %1\n"
-"	cmp		%0, %1\n"
-"	bne,pn		%%icc, 1b\n"
-"	 mov		1, %0\n"
-"2:"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r" (lock)
-	: "memory");
-
-	return tmp1;
-}
-
-static inline void arch_read_unlock(arch_rwlock_t *lock)
-{
-	unsigned long tmp1, tmp2;
-
-	__asm__ __volatile__(
-"1:	lduw	[%2], %0\n"
-"	sub	%0, 1, %1\n"
-"	cas	[%2], %0, %1\n"
-"	cmp	%0, %1\n"
-"	bne,pn	%%xcc, 1b\n"
-"	 nop"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r" (lock)
-	: "memory");
-}
-
-static inline void arch_write_lock(arch_rwlock_t *lock)
-{
-	unsigned long mask, tmp1, tmp2;
-
-	mask = 0x80000000UL;
-
-	__asm__ __volatile__(
-"1:	lduw		[%2], %0\n"
-"	brnz,pn		%0, 2f\n"
-"4:	 or		%0, %3, %1\n"
-"	cas		[%2], %0, %1\n"
-"	cmp		%0, %1\n"
-"	bne,pn		%%icc, 1b\n"
-"	 nop\n"
-"	.subsection	2\n"
-"2:	lduw		[%2], %0\n"
-"	brnz,pt		%0, 2b\n"
-"	 nop\n"
-"	ba,a,pt		%%xcc, 4b\n"
-"	.previous"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r" (lock), "r" (mask)
-	: "memory");
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *lock)
-{
-	__asm__ __volatile__(
-"	stw		%%g0, [%0]"
-	: /* no outputs */
-	: "r" (lock)
-	: "memory");
-}
-
-static inline int arch_write_trylock(arch_rwlock_t *lock)
-{
-	unsigned long mask, tmp1, tmp2, result;
-
-	mask = 0x80000000UL;
-
-	__asm__ __volatile__(
-"	mov		0, %2\n"
-"1:	lduw		[%3], %0\n"
-"	brnz,pn		%0, 2f\n"
-"	 or		%0, %4, %1\n"
-"	cas		[%3], %0, %1\n"
-"	cmp		%0, %1\n"
-"	bne,pn		%%icc, 1b\n"
-"	 nop\n"
-"	mov		1, %2\n"
-"2:"
-	: "=&r" (tmp1), "=&r" (tmp2), "=&r" (result)
-	: "r" (lock), "r" (mask)
-	: "memory");
-
-	return result;
-}
+#include <asm/qrwlock.h>
+#include <asm/qspinlock.h>
 
 #define arch_read_lock_flags(p, f)	arch_read_lock(p)
 #define arch_write_lock_flags(p, f)	arch_write_lock(p)
 
-#define arch_read_can_lock(rw)		(!((rw)->lock & 0x80000000UL))
-#define arch_write_can_lock(rw)	(!(rw)->lock)
-
 #define arch_spin_relax(lock)	cpu_relax()
 #define arch_read_relax(lock)	cpu_relax()
 #define arch_write_relax(lock)	cpu_relax()
diff --git a/arch/sparc/include/asm/spinlock_types.h b/arch/sparc/include/asm/spinlock_types.h
index 9c454fdeaad8..bce8ef44dfa9 100644
--- a/arch/sparc/include/asm/spinlock_types.h
+++ b/arch/sparc/include/asm/spinlock_types.h
@@ -1,20 +1,24 @@
 #ifndef __SPARC_SPINLOCK_TYPES_H
 #define __SPARC_SPINLOCK_TYPES_H
 
-#ifndef __LINUX_SPINLOCK_TYPES_H
-# error "please don't include this file directly"
-#endif
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#else
 
 typedef struct {
 	volatile unsigned char lock;
 } arch_spinlock_t;
 
 #define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 
+#ifdef CONFIG_QUEUED_RWLOCKS
+#include <asm-generic/qrwlock_types.h>
+#else
 typedef struct {
 	volatile unsigned int lock;
 } arch_rwlock_t;
 
 #define __ARCH_RW_LOCK_UNLOCKED { 0 }
-
+#endif /* CONFIG_QUEUED_RWLOCKS */
 #endif
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index cc3ed0ccdfa2..2655f26ec882 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -20,6 +20,7 @@
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/spinlock.h>
 #include <asm/qrwlock.h>
 
 /*