aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHendrik Brueckner <brueckner@linux.vnet.ibm.com>2015-02-18 08:46:00 -0500
committerMartin Schwidefsky <schwidefsky@de.ibm.com>2016-06-14 10:54:11 -0400
commit04864808029e59ea1bf075c756a0f35c8398fc11 (patch)
tree266b3ce8c6c7d6f3389778ffafb4112b528114b0
parentde3fa841e429de7e288facf9b642948677fac581 (diff)
s390/vx: add support functions for in-kernel FPU use
Introduce the kernel_fpu_begin() and kernel_fpu_end() functions to enclose any in-kernel use of FPU instructions and registers. In enclosed sections, you can perform floating-point or vector (SIMD) computations. The functions take care of saving and restoring FPU register contents and controls. For usage details, see the guidelines in arch/s390/include/asm/fpu/api.h. Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com> Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
-rw-r--r--arch/s390/include/asm/fpu/api.h75
-rw-r--r--arch/s390/include/asm/fpu/types.h10
-rw-r--r--arch/s390/kernel/Makefile2
-rw-r--r--arch/s390/kernel/fpu.c249
4 files changed, 335 insertions, 1 deletions
diff --git a/arch/s390/include/asm/fpu/api.h b/arch/s390/include/asm/fpu/api.h
index 5e04f3cbd320..78ba3ddb9e18 100644
--- a/arch/s390/include/asm/fpu/api.h
+++ b/arch/s390/include/asm/fpu/api.h
@@ -1,6 +1,41 @@
1/* 1/*
2 * In-kernel FPU support functions 2 * In-kernel FPU support functions
3 * 3 *
4 *
5 * Consider these guidelines before using in-kernel FPU functions:
6 *
7 * 1. Use kernel_fpu_begin() and kernel_fpu_end() to enclose all in-kernel
8 * use of floating-point or vector registers and instructions.
9 *
10 * 2. For kernel_fpu_begin(), specify the vector register range you want to
11 * use with the KERNEL_VXR_* constants. Consider these usage guidelines:
12 *
13 * a) If your function typically runs in process-context, use the lower
14 * half of the vector registers, for example, specify KERNEL_VXR_LOW.
15 * b) If your function typically runs in soft-irq or hard-irq context,
16 * prefer using the upper half of the vector registers, for example,
17 * specify KERNEL_VXR_HIGH.
18 *
19 * If you adhere to these guidelines, the vector registers of an
20 * interrupted process context need not be saved and restored because
21 * the register ranges are disjoint.
22 *
23 * Also note that the __kernel_fpu_begin()/__kernel_fpu_end() functions
24 * include logic to save and restore up to 16 vector registers at once.
25 *
26 * 3. You can nest kernel_fpu_begin()/kernel_fpu_end() by using different
27 * struct kernel_fpu states. Vector registers that are in use by outer
28 * levels are saved and restored. You can minimize the save and restore
29 * effort by choosing disjoint vector register ranges.
30 *
31 * 4. To use vector floating-point instructions, specify the KERNEL_FPC
32 * flag to save and restore floating-point controls in addition to any
33 * vector register range.
34 *
35 * 5. To use floating-point registers and instructions only, specify the
36 * KERNEL_FPR flag. This flag triggers a save and restore of vector
37 * registers V0 to V15 and floating-point controls.
38 *
4 * Copyright IBM Corp. 2015 39 * Copyright IBM Corp. 2015
5 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> 40 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
6 */ 41 */
@@ -8,6 +43,8 @@
8#ifndef _ASM_S390_FPU_API_H 43#ifndef _ASM_S390_FPU_API_H
9#define _ASM_S390_FPU_API_H 44#define _ASM_S390_FPU_API_H
10 45
46#include <linux/preempt.h>
47
11void save_fpu_regs(void); 48void save_fpu_regs(void);
12 49
13static inline int test_fp_ctl(u32 fpc) 50static inline int test_fp_ctl(u32 fpc)
@@ -27,4 +64,42 @@ static inline int test_fp_ctl(u32 fpc)
27 return rc; 64 return rc;
28} 65}
29 66
/*
 * Flags for kernel_fpu_begin(). Each KERNEL_VXR_* bit stands for a group
 * of eight vector registers; the inline assemblies in
 * arch/s390/kernel/fpu.c test these bit positions with literal masks, so
 * the numeric values must not change.
 */
#define KERNEL_VXR_V0V7		0x001
#define KERNEL_VXR_V8V15	0x002
#define KERNEL_VXR_V16V23	0x004
#define KERNEL_VXR_V24V31	0x008
#define KERNEL_FPR		0x010
#define KERNEL_FPC		0x100

/* Ranges of sixteen consecutive vector registers */
#define KERNEL_VXR_LOW		(KERNEL_VXR_V0V7 | KERNEL_VXR_V8V15)
#define KERNEL_VXR_MID		(KERNEL_VXR_V8V15 | KERNEL_VXR_V16V23)
#define KERNEL_VXR_HIGH		(KERNEL_VXR_V16V23 | KERNEL_VXR_V24V31)

/* All register-range flags; KERNEL_FPC is not part of this mask */
#define KERNEL_FPU_MASK		(KERNEL_VXR_LOW | KERNEL_VXR_HIGH | KERNEL_FPR)
79
80struct kernel_fpu;
81
82/*
83 * Note the functions below must be called with preemption disabled.
84 * Do not enable preemption before calling __kernel_fpu_end() to prevent
85 * corruption of an existing kernel FPU state.
86 *
87 * Prefer using the kernel_fpu_begin()/kernel_fpu_end() pair of functions.
88 */
89void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags);
90void __kernel_fpu_end(struct kernel_fpu *state);
91
92
93static inline void kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
94{
95 preempt_disable();
96 __kernel_fpu_begin(state, flags);
97}
98
/**
 * kernel_fpu_end() - close a section opened with kernel_fpu_begin()
 * @state: the save area that was passed to kernel_fpu_begin()
 *
 * Calls __kernel_fpu_end() and only afterwards re-enables preemption.
 */
static inline void kernel_fpu_end(struct kernel_fpu *state)
{
	/* Preemption must stay disabled until __kernel_fpu_end() is done */
	__kernel_fpu_end(state);
	preempt_enable();
}
104
30#endif /* _ASM_S390_FPU_API_H */ 105#endif /* _ASM_S390_FPU_API_H */
diff --git a/arch/s390/include/asm/fpu/types.h b/arch/s390/include/asm/fpu/types.h
index fe937c9b6471..bce255ead72b 100644
--- a/arch/s390/include/asm/fpu/types.h
+++ b/arch/s390/include/asm/fpu/types.h
@@ -24,4 +24,14 @@ struct fpu {
24/* VX array structure for address operand constraints in inline assemblies */ 24/* VX array structure for address operand constraints in inline assemblies */
25struct vx_array { __vector128 _[__NUM_VXRS]; }; 25struct vx_array { __vector128 _[__NUM_VXRS]; };
26 26
27/* In-kernel FPU state structure */
28struct kernel_fpu {
29 u32 mask;
30 u32 fpc;
31 union {
32 freg_t fprs[__NUM_FPRS];
33 __vector128 vxrs[__NUM_VXRS];
34 };
35};
36
27#endif /* _ASM_S390_FPU_TYPES_H */ 37#endif /* _ASM_S390_FPU_TYPES_H */
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 2f5586ab8a6a..8d1419120bb7 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -45,7 +45,7 @@ obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
45obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o 45obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
46obj-y += debug.o irq.o ipl.o dis.o diag.o sclp.o vdso.o 46obj-y += debug.o irq.o ipl.o dis.o diag.o sclp.o vdso.o
47obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o 47obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o
48obj-y += runtime_instr.o cache.o dumpstack.o 48obj-y += runtime_instr.o cache.o fpu.o dumpstack.o
49obj-y += entry.o reipl.o relocate_kernel.o 49obj-y += entry.o reipl.o relocate_kernel.o
50 50
51extra-y += head.o head64.o vmlinux.lds 51extra-y += head.o head64.o vmlinux.lds
diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c
new file mode 100644
index 000000000000..81d1d1887507
--- /dev/null
+++ b/arch/s390/kernel/fpu.c
@@ -0,0 +1,249 @@
1/*
2 * In-kernel vector facility support functions
3 *
4 * Copyright IBM Corp. 2015
5 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
6 */
7#include <linux/kernel.h>
8#include <linux/cpu.h>
9#include <linux/sched.h>
10#include <asm/fpu/types.h>
11#include <asm/fpu/api.h>
12
13/*
14 * Per-CPU variable to maintain FPU register ranges that are in use
15 * by the kernel.
16 */
17static DEFINE_PER_CPU(u32, kernel_fpu_state);
18
19#define KERNEL_FPU_STATE_MASK (KERNEL_FPU_MASK|KERNEL_FPC)
20
21
22void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
23{
24 if (!__this_cpu_read(kernel_fpu_state)) {
25 /*
26 * Save user space FPU state and register contents. Multiple
27 * calls because of interruptions do not matter and return
28 * immediately. This also sets CIF_FPU to lazy restore FP/VX
29 * register contents when returning to user space.
30 */
31 save_fpu_regs();
32 }
33
34 /* Update flags to use the vector facility for KERNEL_FPR */
35 if (MACHINE_HAS_VX && (state->mask & KERNEL_FPR)) {
36 flags |= KERNEL_VXR_LOW | KERNEL_FPC;
37 flags &= ~KERNEL_FPR;
38 }
39
40 /* Save and update current kernel VX state */
41 state->mask = __this_cpu_read(kernel_fpu_state);
42 __this_cpu_or(kernel_fpu_state, flags & KERNEL_FPU_STATE_MASK);
43
44 /*
45 * If this is the first call to __kernel_fpu_begin(), no additional
46 * work is required.
47 */
48 if (!(state->mask & KERNEL_FPU_STATE_MASK))
49 return;
50
51 /*
52 * If KERNEL_FPR is still set, the vector facility is not available
53 * and, thus, save floating-point control and registers only.
54 */
55 if (state->mask & KERNEL_FPR) {
56 asm volatile("stfpc %0" : "=Q" (state->fpc));
57 asm volatile("std 0,%0" : "=Q" (state->fprs[0]));
58 asm volatile("std 1,%0" : "=Q" (state->fprs[1]));
59 asm volatile("std 2,%0" : "=Q" (state->fprs[2]));
60 asm volatile("std 3,%0" : "=Q" (state->fprs[3]));
61 asm volatile("std 4,%0" : "=Q" (state->fprs[4]));
62 asm volatile("std 5,%0" : "=Q" (state->fprs[5]));
63 asm volatile("std 6,%0" : "=Q" (state->fprs[6]));
64 asm volatile("std 7,%0" : "=Q" (state->fprs[7]));
65 asm volatile("std 8,%0" : "=Q" (state->fprs[8]));
66 asm volatile("std 9,%0" : "=Q" (state->fprs[9]));
67 asm volatile("std 10,%0" : "=Q" (state->fprs[10]));
68 asm volatile("std 11,%0" : "=Q" (state->fprs[11]));
69 asm volatile("std 12,%0" : "=Q" (state->fprs[12]));
70 asm volatile("std 13,%0" : "=Q" (state->fprs[13]));
71 asm volatile("std 14,%0" : "=Q" (state->fprs[14]));
72 asm volatile("std 15,%0" : "=Q" (state->fprs[15]));
73 return;
74 }
75
76 /*
77 * If this is a nested call to __kernel_fpu_begin(), check the saved
78 * state mask to save and later restore the vector registers that
79 * are already in use. Let's start with checking floating-point
80 * controls.
81 */
82 if (state->mask & KERNEL_FPC)
83 asm volatile("stfpc %0" : "=m" (state->fpc));
84
85 /* Test and save vector registers */
86 asm volatile (
87 /*
88 * Test if any vector register must be saved and, if so,
89 * test if all register can be saved.
90 */
91 " tmll %[m],15\n" /* KERNEL_VXR_MASK */
92 " jz 20f\n" /* no work -> done */
93 " la 1,%[vxrs]\n" /* load save area */
94 " jo 18f\n" /* -> save V0..V31 */
95
96 /*
97 * Test if V8..V23 can be saved at once... this speeds up
98 * for KERNEL_fpu_MID only. Otherwise continue to split the
99 * range of vector registers into two halves and test them
100 * separately.
101 */
102 " tmll %[m],6\n" /* KERNEL_VXR_MID */
103 " jo 17f\n" /* -> save V8..V23 */
104
105 /* Test and save the first half of 16 vector registers */
106 "1: tmll %[m],3\n" /* KERNEL_VXR_LOW */
107 " jz 10f\n" /* -> KERNEL_VXR_HIGH */
108 " jo 2f\n" /* 11 -> save V0..V15 */
109 " brc 4,3f\n" /* 01 -> save V0..V7 */
110 " brc 2,4f\n" /* 10 -> save V8..V15 */
111
112 /* Test and save the second half of 16 vector registers */
113 "10: tmll %[m],12\n" /* KERNEL_VXR_HIGH */
114 " jo 19f\n" /* 11 -> save V16..V31 */
115 " brc 4,11f\n" /* 01 -> save V16..V23 */
116 " brc 2,12f\n" /* 10 -> save V24..V31 */
117 " j 20f\n" /* 00 -> done */
118
119 /*
120 * Below are the vstm combinations to save multiple vector
121 * registers at once.
122 */
123 "2: .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
124 " j 10b\n" /* -> VXR_HIGH */
125 "3: .word 0xe707,0x1000,0x003e\n" /* vstm 0,7,0(1) */
126 " j 10b\n" /* -> VXR_HIGH */
127 "4: .word 0xe78f,0x1080,0x003e\n" /* vstm 8,15,128(1) */
128 " j 10b\n" /* -> VXR_HIGH */
129 "\n"
130 "11: .word 0xe707,0x1100,0x0c3e\n" /* vstm 16,23,256(1) */
131 " j 20f\n" /* -> done */
132 "12: .word 0xe78f,0x1180,0x0c3e\n" /* vstm 24,31,384(1) */
133 " j 20f\n" /* -> done */
134 "\n"
135 "17: .word 0xe787,0x1080,0x043e\n" /* vstm 8,23,128(1) */
136 " nill %[m],249\n" /* m &= ~VXR_MID */
137 " j 1b\n" /* -> VXR_LOW */
138 "\n"
139 "18: .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
140 "19: .word 0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */
141 "20:"
142 : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs)
143 : [m] "d" (state->mask)
144 : "1", "cc");
145}
146EXPORT_SYMBOL(__kernel_fpu_begin);
147
148void __kernel_fpu_end(struct kernel_fpu *state)
149{
150 /* Just update the per-CPU state if there is nothing to restore */
151 if (!(state->mask & KERNEL_FPU_STATE_MASK))
152 goto update_fpu_state;
153
154 /*
155 * If KERNEL_FPR is specified, the vector facility is not available
156 * and, thus, restore floating-point control and registers only.
157 */
158 if (state->mask & KERNEL_FPR) {
159 asm volatile("lfpc %0" : : "Q" (state->fpc));
160 asm volatile("ld 0,%0" : : "Q" (state->fprs[0]));
161 asm volatile("ld 1,%0" : : "Q" (state->fprs[1]));
162 asm volatile("ld 2,%0" : : "Q" (state->fprs[2]));
163 asm volatile("ld 3,%0" : : "Q" (state->fprs[3]));
164 asm volatile("ld 4,%0" : : "Q" (state->fprs[4]));
165 asm volatile("ld 5,%0" : : "Q" (state->fprs[5]));
166 asm volatile("ld 6,%0" : : "Q" (state->fprs[6]));
167 asm volatile("ld 7,%0" : : "Q" (state->fprs[7]));
168 asm volatile("ld 8,%0" : : "Q" (state->fprs[8]));
169 asm volatile("ld 9,%0" : : "Q" (state->fprs[9]));
170 asm volatile("ld 10,%0" : : "Q" (state->fprs[10]));
171 asm volatile("ld 11,%0" : : "Q" (state->fprs[11]));
172 asm volatile("ld 12,%0" : : "Q" (state->fprs[12]));
173 asm volatile("ld 13,%0" : : "Q" (state->fprs[13]));
174 asm volatile("ld 14,%0" : : "Q" (state->fprs[14]));
175 asm volatile("ld 15,%0" : : "Q" (state->fprs[15]));
176 goto update_fpu_state;
177 }
178
179 /* Test and restore floating-point controls */
180 if (state->mask & KERNEL_FPC)
181 asm volatile("lfpc %0" : : "Q" (state->fpc));
182
183 /* Test and restore (load) vector registers */
184 asm volatile (
185 /*
186 * Test if any vector registers must be loaded and, if so,
187 * test if all registers can be loaded at once.
188 */
189 " tmll %[m],15\n" /* KERNEL_VXR_MASK */
190 " jz 20f\n" /* no work -> done */
191 " la 1,%[vxrs]\n" /* load load area */
192 " jo 18f\n" /* -> load V0..V31 */
193
194 /*
195 * Test if V8..V23 can be restored at once... this speeds up
196 * for KERNEL_VXR_MID only. Otherwise continue to split the
197 * range of vector registers into two halves and test them
198 * separately.
199 */
200 " tmll %[m],6\n" /* KERNEL_VXR_MID */
201 " jo 17f\n" /* -> load V8..V23 */
202
203 /* Test and load the first half of 16 vector registers */
204 "1: tmll %[m],3\n" /* KERNEL_VXR_LOW */
205 " jz 10f\n" /* -> KERNEL_VXR_HIGH */
206 " jo 2f\n" /* 11 -> load V0..V15 */
207 " brc 4,3f\n" /* 01 -> load V0..V7 */
208 " brc 2,4f\n" /* 10 -> load V8..V15 */
209
210 /* Test and load the second half of 16 vector registers */
211 "10: tmll %[m],12\n" /* KERNEL_VXR_HIGH */
212 " jo 19f\n" /* 11 -> load V16..V31 */
213 " brc 4,11f\n" /* 01 -> load V16..V23 */
214 " brc 2,12f\n" /* 10 -> load V24..V31 */
215 " j 20f\n" /* 00 -> done */
216
217 /*
218 * Below are the vstm combinations to load multiple vector
219 * registers at once.
220 */
221 "2: .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
222 " j 10b\n" /* -> VXR_HIGH */
223 "3: .word 0xe707,0x1000,0x0036\n" /* vlm 0,7,0(1) */
224 " j 10b\n" /* -> VXR_HIGH */
225 "4: .word 0xe78f,0x1080,0x0036\n" /* vlm 8,15,128(1) */
226 " j 10b\n" /* -> VXR_HIGH */
227 "\n"
228 "11: .word 0xe707,0x1100,0x0c36\n" /* vlm 16,23,256(1) */
229 " j 20f\n" /* -> done */
230 "12: .word 0xe78f,0x1180,0x0c36\n" /* vlm 24,31,384(1) */
231 " j 20f\n" /* -> done */
232 "\n"
233 "17: .word 0xe787,0x1080,0x0436\n" /* vlm 8,23,128(1) */
234 " nill %[m],249\n" /* m &= ~VXR_MID */
235 " j 1b\n" /* -> VXR_LOW */
236 "\n"
237 "18: .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
238 "19: .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
239 "20:"
240 :
241 : [vxrs] "Q" (*(struct vx_array *) &state->vxrs),
242 [m] "d" (state->mask)
243 : "1", "cc");
244
245update_fpu_state:
246 /* Update current kernel VX state */
247 __this_cpu_write(kernel_fpu_state, state->mask);
248}
249EXPORT_SYMBOL(__kernel_fpu_end);