s390/vx: add support functions for in-kernel FPU use

Introduce the kernel_fpu_begin() and kernel_fpu_end() functions to enclose any in-kernel use of FPU instructions and registers. In enclosed sections, you can perform floating-point or vector (SIMD) computations. The functions take care of saving and restoring FPU register contents and controls. For usage details, see the guidelines in arch/s390/include/asm/fpu/api.h. Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com> Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
author: Hendrik Brueckner <brueckner@linux.vnet.ibm.com> 2015-02-18 08:46:00 -0500
committer: Martin Schwidefsky <schwidefsky@de.ibm.com> 2016-06-14 10:54:11 -0400
commit: 04864808029e59ea1bf075c756a0f35c8398fc11 (patch)
tree: 266b3ce8c6c7d6f3389778ffafb4112b528114b0
parent: de3fa841e429de7e288facf9b642948677fac581 (diff)
4 files changed, 335 insertions, 1 deletions
diff --git a/arch/s390/include/asm/fpu/api.h b/arch/s390/include/asm/fpu/api.h
index 5e04f3cbd320..78ba3ddb9e18 100644
--- a/arch/s390/include/asm/fpu/api.h
+++ b/arch/s390/include/asm/fpu/api.h
@@ -1,6 +1,41 @@
 /*
 * In-kernel FPU support functions
 *
+ *
+ * Consider these guidelines before using in-kernel FPU functions:
+ *
+ *  1. Use kernel_fpu_begin() and kernel_fpu_end() to enclose all in-kernel
+ *     use of floating-point or vector registers and instructions.
+ *
+ *  2. For kernel_fpu_begin(), specify the vector register range you want to
+ *     use with the KERNEL_VXR_* constants. Consider these usage guidelines:
+ *
+ *     a) If your function typically runs in process-context, use the lower
+ *        half of the vector registers, for example, specify KERNEL_VXR_LOW.
+ *     b) If your function typically runs in soft-irq or hard-irq context,
+ *        prefer using the upper half of the vector registers, for example,
+ *        specify KERNEL_VXR_HIGH.
+ *
+ *     If you adhere to these guidelines, an interrupted process context
+ *     does not require to save and restore vector registers because of
+ *     disjoint register ranges.
+ *
+ *     Also note that the __kernel_fpu_begin()/__kernel_fpu_end() functions
+ *     include logic to save and restore up to 16 vector registers at once.
+ *
+ *  3. You can nest kernel_fpu_begin()/kernel_fpu_end() by using different
+ *     struct kernel_fpu states.  Vector registers that are in use by outer
+ *     levels are saved and restored.  You can minimize the save and restore
+ *     effort by choosing disjoint vector register ranges.
+ *
+ *  4. To use vector floating-point instructions, specify the KERNEL_FPC
+ *     flag to save and restore floating-point controls in addition to any
+ *     vector register range.
+ *
+ *  5. To use floating-point registers and instructions only, specify the
+ *     KERNEL_FPR flag.  This flag triggers a save and restore of vector
+ *     registers V0 to V15 and floating-point controls.
+ *
 * Copyright IBM Corp. 2015
 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
 */
@@ -8,6 +43,8 @@
 #ifndef _ASM_S390_FPU_API_H
 #define _ASM_S390_FPU_API_H
+#include <linux/preempt.h>
 void save_fpu_regs(void);
 static inline int test_fp_ctl(u32 fpc)
@@ -27,4 +64,42 @@ static inline int test_fp_ctl(u32 fpc)
        return rc;
 }
+#define KERNEL_VXR_V0V7         1       /* vector registers V0..V7 */
+#define KERNEL_VXR_V8V15        2       /* vector registers V8..V15 */
+#define KERNEL_VXR_V16V23       4       /* vector registers V16..V23 */
+#define KERNEL_VXR_V24V31       8       /* vector registers V24..V31 */
+#define KERNEL_FPR              16      /* floating-point registers only */
+#define KERNEL_FPC              256     /* floating-point controls */
+#define KERNEL_VXR_LOW          (KERNEL_VXR_V0V7|KERNEL_VXR_V8V15)      /* V0..V15 */
+#define KERNEL_VXR_MID          (KERNEL_VXR_V8V15|KERNEL_VXR_V16V23)    /* V8..V23 */
+#define KERNEL_VXR_HIGH         (KERNEL_VXR_V16V23|KERNEL_VXR_V24V31)   /* V16..V31 */
+#define KERNEL_FPU_MASK         (KERNEL_VXR_LOW|KERNEL_VXR_HIGH|KERNEL_FPR)     /* every register-range bit; KERNEL_FPC is not included */
+struct kernel_fpu;
+/*
+ * Note the functions below must be called with preemption disabled.
+ * Do not enable preemption before calling __kernel_fpu_end() to prevent
+ * a corruption of an existing kernel FPU state.
+ *
+ * Prefer using the kernel_fpu_begin()/kernel_fpu_end() pair of functions.
+ */
+void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags);
+void __kernel_fpu_end(struct kernel_fpu *state);
+static inline void kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
+{
+        preempt_disable();      /* __kernel_fpu_begin() must run with preemption disabled */
+        __kernel_fpu_begin(state, flags);       /* records the previous usage mask in *state */
+}
+static inline void kernel_fpu_end(struct kernel_fpu *state)
+{
+        __kernel_fpu_end(state);        /* restore from *state while preemption is still disabled */
+        preempt_enable();       /* balances preempt_disable() in kernel_fpu_begin() */
+}
 #endif /* _ASM_S390_FPU_API_H */
diff --git a/arch/s390/include/asm/fpu/types.h b/arch/s390/include/asm/fpu/types.h
index fe937c9b6471..bce255ead72b 100644
--- a/arch/s390/include/asm/fpu/types.h
+++ b/arch/s390/include/asm/fpu/types.h
@@ -24,4 +24,14 @@ struct fpu {
 /* VX array structure for address operand constraints in inline assemblies */
 struct vx_array { __vector128 _[__NUM_VXRS]; };
+/*
+ * In-kernel FPU state structure
+ *
+ * Save area that __kernel_fpu_begin() fills and __kernel_fpu_end() reads
+ * back.
+ */
+struct kernel_fpu {
+        u32         mask;       /* per-CPU kernel FPU state as it was before the begin call */
+        u32         fpc;        /* saved floating-point control register */
+        union {
+                freg_t fprs[__NUM_FPRS];        /* used when KERNEL_FPR is set in mask */
+                __vector128 vxrs[__NUM_VXRS];   /* used otherwise (vector register save area) */
+        };
+};
 #endif /* _ASM_S390_FPU_TYPES_H */
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 2f5586ab8a6a..8d1419120bb7 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -45,7 +45,7 @@ obj-y	:= traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
 obj-y   += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
 obj-y   += debug.o irq.o ipl.o dis.o diag.o sclp.o vdso.o
 obj-y   += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o
-obj-y   += runtime_instr.o cache.o dumpstack.o
+obj-y   += runtime_instr.o cache.o fpu.o dumpstack.o
 obj-y   += entry.o reipl.o relocate_kernel.o
 extra-y                         += head.o head64.o vmlinux.lds
diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c
new file mode 100644
index 000000000000..81d1d1887507
--- /dev/null
+++ b/arch/s390/kernel/fpu.c
@@ -0,0 +1,249 @@
+/*
+ * In-kernel vector facility support functions
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <asm/fpu/types.h>
+#include <asm/fpu/api.h>
+/*
+ * Per-CPU variable to maintain FPU register ranges that are in use
+ * by the kernel.  __kernel_fpu_begin() ORs the requested flags into it and
+ * __kernel_fpu_end() writes the previously saved value back.
+ */
+static DEFINE_PER_CPU(u32, kernel_fpu_state);
+#define KERNEL_FPU_STATE_MASK   (KERNEL_FPU_MASK|KERNEL_FPC)    /* flag bits recorded in kernel_fpu_state */
+/*
+ * __kernel_fpu_begin() - start a section that uses FPU or vector registers
+ * @state: save area; receives the previous per-CPU usage mask and the
+ *         contents of every register range that mask marks as in use
+ * @flags: KERNEL_VXR_*, KERNEL_FPR and KERNEL_FPC bits the caller will use
+ *
+ * Must be called with preemption disabled; pair with __kernel_fpu_end().
+ */
+void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
+{
+        u32 mask;
+        if (!__this_cpu_read(kernel_fpu_state)) {
+                /*
+                 * Save user space FPU state and register contents.  Multiple
+                 * calls because of interruptions do not matter and return
+                 * immediately.  This also sets CIF_FPU to lazy restore FP/VX
+                 * register contents when returning to user space.
+                 */
+                save_fpu_regs();
+        }
+        /*
+         * Update flags to use the vector facility for KERNEL_FPR.  The
+         * requested flags are tested here: state->mask is only assigned
+         * below and still holds whatever the caller left in it.
+         */
+        if (MACHINE_HAS_VX && (flags & KERNEL_FPR)) {
+                flags |= KERNEL_VXR_LOW | KERNEL_FPC;
+                flags &= ~KERNEL_FPR;
+        }
+        /* Save and update current kernel VX state */
+        state->mask = __this_cpu_read(kernel_fpu_state);
+        __this_cpu_or(kernel_fpu_state, flags & KERNEL_FPU_STATE_MASK);
+        /*
+         * If this is the first call to __kernel_fpu_begin(), no additional
+         * work is required.
+         */
+        if (!(state->mask & KERNEL_FPU_STATE_MASK))
+                return;
+        /*
+         * If KERNEL_FPR is still set, the vector facility is not available
+         * and, thus, save floating-point control and registers only.
+         */
+        if (state->mask & KERNEL_FPR) {
+                asm volatile("stfpc %0" : "=Q" (state->fpc));
+                asm volatile("std 0,%0" : "=Q" (state->fprs[0]));
+                asm volatile("std 1,%0" : "=Q" (state->fprs[1]));
+                asm volatile("std 2,%0" : "=Q" (state->fprs[2]));
+                asm volatile("std 3,%0" : "=Q" (state->fprs[3]));
+                asm volatile("std 4,%0" : "=Q" (state->fprs[4]));
+                asm volatile("std 5,%0" : "=Q" (state->fprs[5]));
+                asm volatile("std 6,%0" : "=Q" (state->fprs[6]));
+                asm volatile("std 7,%0" : "=Q" (state->fprs[7]));
+                asm volatile("std 8,%0" : "=Q" (state->fprs[8]));
+                asm volatile("std 9,%0" : "=Q" (state->fprs[9]));
+                asm volatile("std 10,%0" : "=Q" (state->fprs[10]));
+                asm volatile("std 11,%0" : "=Q" (state->fprs[11]));
+                asm volatile("std 12,%0" : "=Q" (state->fprs[12]));
+                asm volatile("std 13,%0" : "=Q" (state->fprs[13]));
+                asm volatile("std 14,%0" : "=Q" (state->fprs[14]));
+                asm volatile("std 15,%0" : "=Q" (state->fprs[15]));
+                return;
+        }
+        /*
+         * If this is a nested call to __kernel_fpu_begin(), check the saved
+         * state mask to save and later restore the vector registers that
+         * are already in use.  Let's start with checking floating-point
+         * controls.
+         */
+        if (state->mask & KERNEL_FPC)
+                asm volatile("stfpc %0" : "=Q" (state->fpc));
+        /*
+         * The assembly below modifies its mask operand (nill), so hand it
+         * a scratch copy declared read-write instead of state->mask.
+         */
+        mask = state->mask;
+        /* Test and save vector registers */
+        asm volatile (
+                /*
+                 * Test if any vector register must be saved and, if so,
+                 * test if all registers can be saved at once.
+                 */
+                "       tmll    %[m],15\n"      /* KERNEL_VXR_MASK */
+                "       jz      20f\n"          /* no work -> done */
+                "       la      1,%[vxrs]\n"    /* load save area */
+                "       jo      18f\n"          /* -> save V0..V31 */
+                /*
+                 * Test if V8..V23 can be saved at once... this speeds up
+                 * for KERNEL_VXR_MID only. Otherwise continue to split the
+                 * range of vector registers into two halves and test them
+                 * separately.
+                 */
+                "       tmll    %[m],6\n"       /* KERNEL_VXR_MID */
+                "       jo      17f\n"          /* -> save V8..V23 */
+                /* Test and save the first half of 16 vector registers */
+                "1:     tmll    %[m],3\n"       /* KERNEL_VXR_LOW */
+                "       jz      10f\n"          /* -> KERNEL_VXR_HIGH */
+                "       jo      2f\n"           /* 11 -> save V0..V15 */
+                "       brc     4,3f\n"         /* 01 -> save V0..V7  */
+                "       brc     2,4f\n"         /* 10 -> save V8..V15 */
+                /* Test and save the second half of 16 vector registers */
+                "10:    tmll    %[m],12\n"      /* KERNEL_VXR_HIGH */
+                "       jo      19f\n"          /* 11 -> save V16..V31 */
+                "       brc     4,11f\n"        /* 01 -> save V16..V23  */
+                "       brc     2,12f\n"        /* 10 -> save V24..V31 */
+                "       j       20f\n"          /* 00 -> done */
+                /*
+                 * Below are the vstm combinations to save multiple vector
+                 * registers at once.
+                 */
+                "2:     .word   0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
+                "       j       10b\n"                  /* -> VXR_HIGH */
+                "3:     .word   0xe707,0x1000,0x003e\n" /* vstm 0,7,0(1) */
+                "       j       10b\n"                  /* -> VXR_HIGH */
+                "4:     .word   0xe78f,0x1080,0x003e\n" /* vstm 8,15,128(1) */
+                "       j       10b\n"                  /* -> VXR_HIGH */
+                "\n"
+                "11:    .word   0xe707,0x1100,0x0c3e\n" /* vstm 16,23,256(1) */
+                "       j       20f\n"                  /* -> done */
+                "12:    .word   0xe78f,0x1180,0x0c3e\n" /* vstm 24,31,384(1) */
+                "       j       20f\n"                  /* -> done */
+                "\n"
+                "17:    .word   0xe787,0x1080,0x043e\n" /* vstm 8,23,128(1) */
+                "       nill    %[m],249\n"             /* m &= ~VXR_MID    */
+                "       j       1b\n"                   /* -> VXR_LOW */
+                "\n"
+                "18:    .word   0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
+                "19:    .word   0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */
+                "20:"
+                : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs),
+                  [m] "+d" (mask)
+                :
+                : "1", "cc");
+}
+EXPORT_SYMBOL(__kernel_fpu_begin);
+/*
+ * __kernel_fpu_end() - finish a section started with __kernel_fpu_begin()
+ * @state: save area that the matching __kernel_fpu_begin() call filled
+ *
+ * Reloads the registers and controls that state->mask marks as saved and
+ * writes state->mask back as the per-CPU usage state.  Must be called with
+ * preemption still disabled.
+ */
+void __kernel_fpu_end(struct kernel_fpu *state)
+{
+        u32 mask;
+        /* Just update the per-CPU state if there is nothing to restore */
+        if (!(state->mask & KERNEL_FPU_STATE_MASK))
+                goto update_fpu_state;
+        /*
+         * If KERNEL_FPR is specified, the vector facility is not available
+         * and, thus, restore floating-point control and registers only.
+         */
+        if (state->mask & KERNEL_FPR) {
+                asm volatile("lfpc %0" : : "Q" (state->fpc));
+                asm volatile("ld 0,%0" : : "Q" (state->fprs[0]));
+                asm volatile("ld 1,%0" : : "Q" (state->fprs[1]));
+                asm volatile("ld 2,%0" : : "Q" (state->fprs[2]));
+                asm volatile("ld 3,%0" : : "Q" (state->fprs[3]));
+                asm volatile("ld 4,%0" : : "Q" (state->fprs[4]));
+                asm volatile("ld 5,%0" : : "Q" (state->fprs[5]));
+                asm volatile("ld 6,%0" : : "Q" (state->fprs[6]));
+                asm volatile("ld 7,%0" : : "Q" (state->fprs[7]));
+                asm volatile("ld 8,%0" : : "Q" (state->fprs[8]));
+                asm volatile("ld 9,%0" : : "Q" (state->fprs[9]));
+                asm volatile("ld 10,%0" : : "Q" (state->fprs[10]));
+                asm volatile("ld 11,%0" : : "Q" (state->fprs[11]));
+                asm volatile("ld 12,%0" : : "Q" (state->fprs[12]));
+                asm volatile("ld 13,%0" : : "Q" (state->fprs[13]));
+                asm volatile("ld 14,%0" : : "Q" (state->fprs[14]));
+                asm volatile("ld 15,%0" : : "Q" (state->fprs[15]));
+                goto update_fpu_state;
+        }
+        /* Test and restore floating-point controls */
+        if (state->mask & KERNEL_FPC)
+                asm volatile("lfpc %0" : : "Q" (state->fpc));
+        /*
+         * The assembly below modifies its mask operand (nill), while
+         * state->mask is still needed unchanged for the per-CPU update at
+         * the end.  Hand the assembly a scratch copy declared read-write.
+         */
+        mask = state->mask;
+        /* Test and restore (load) vector registers */
+        asm volatile (
+                /*
+                 * Test if any vector registers must be loaded and, if so,
+                 * test if all registers can be loaded at once.
+                 */
+                "       tmll    %[m],15\n"      /* KERNEL_VXR_MASK */
+                "       jz      20f\n"          /* no work -> done */
+                "       la      1,%[vxrs]\n"    /* load save area address */
+                "       jo      18f\n"          /* -> load V0..V31 */
+                /*
+                 * Test if V8..V23 can be restored at once... this speeds up
+                 * for KERNEL_VXR_MID only. Otherwise continue to split the
+                 * range of vector registers into two halves and test them
+                 * separately.
+                 */
+                "       tmll    %[m],6\n"       /* KERNEL_VXR_MID */
+                "       jo      17f\n"          /* -> load V8..V23 */
+                /* Test and load the first half of 16 vector registers */
+                "1:     tmll    %[m],3\n"       /* KERNEL_VXR_LOW */
+                "       jz      10f\n"          /* -> KERNEL_VXR_HIGH */
+                "       jo      2f\n"           /* 11 -> load V0..V15 */
+                "       brc     4,3f\n"         /* 01 -> load V0..V7  */
+                "       brc     2,4f\n"         /* 10 -> load V8..V15 */
+                /* Test and load the second half of 16 vector registers */
+                "10:    tmll    %[m],12\n"      /* KERNEL_VXR_HIGH */
+                "       jo      19f\n"          /* 11 -> load V16..V31 */
+                "       brc     4,11f\n"        /* 01 -> load V16..V23  */
+                "       brc     2,12f\n"        /* 10 -> load V24..V31 */
+                "       j       20f\n"          /* 00 -> done */
+                /*
+                 * Below are the vlm combinations to load multiple vector
+                 * registers at once.
+                 */
+                "2:     .word   0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
+                "       j       10b\n"                  /* -> VXR_HIGH */
+                "3:     .word   0xe707,0x1000,0x0036\n" /* vlm 0,7,0(1) */
+                "       j       10b\n"                  /* -> VXR_HIGH */
+                "4:     .word   0xe78f,0x1080,0x0036\n" /* vlm 8,15,128(1) */
+                "       j       10b\n"                  /* -> VXR_HIGH */
+                "\n"
+                "11:    .word   0xe707,0x1100,0x0c36\n" /* vlm 16,23,256(1) */
+                "       j       20f\n"                  /* -> done */
+                "12:    .word   0xe78f,0x1180,0x0c36\n" /* vlm 24,31,384(1) */
+                "       j       20f\n"                  /* -> done */
+                "\n"
+                "17:    .word   0xe787,0x1080,0x0436\n" /* vlm 8,23,128(1) */
+                "       nill    %[m],249\n"             /* m &= ~VXR_MID    */
+                "       j       1b\n"                   /* -> VXR_LOW */
+                "\n"
+                "18:    .word   0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
+                "19:    .word   0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
+                "20:"
+                : [m] "+d" (mask)
+                : [vxrs] "Q" (*(struct vx_array *) &state->vxrs)
+                : "1", "cc");
+update_fpu_state:
+        /* Update current kernel VX state */
+        __this_cpu_write(kernel_fpu_state, state->mask);
+}
+EXPORT_SYMBOL(__kernel_fpu_end);
author	Hendrik Brueckner <brueckner@linux.vnet.ibm.com>	2015-02-18 08:46:00 -0500
committer	Martin Schwidefsky <schwidefsky@de.ibm.com>	2016-06-14 10:54:11 -0400
commit	04864808029e59ea1bf075c756a0f35c8398fc11 (patch)
tree	266b3ce8c6c7d6f3389778ffafb4112b528114b0
parent	de3fa841e429de7e288facf9b642948677fac581 (diff)

diff --git a/arch/s390/include/asm/fpu/api.h b/arch/s390/include/asm/fpu/api.h index 5e04f3cbd320..78ba3ddb9e18 100644 --- a/arch/s390/include/asm/fpu/api.h +++ b/arch/s390/include/asm/fpu/api.h
@@ -1,6 +1,41 @@
1	/*	1	/*
2	* In-kernel FPU support functions	2	* In-kernel FPU support functions
3	*	3	*
		4	*
		5	* Consider these guidelines before using in-kernel FPU functions:
		6	*
		7	* 1. Use kernel_fpu_begin() and kernel_fpu_end() to enclose all in-kernel
		8	* use of floating-point or vector registers and instructions.
		9	*
		10	* 2. For kernel_fpu_begin(), specify the vector register range you want to
		11	* use with the KERNEL_VXR_* constants. Consider these usage guidelines:
		12	*
		13	* a) If your function typically runs in process-context, use the lower
		14	* half of the vector registers, for example, specify KERNEL_VXR_LOW.
		15	* b) If your function typically runs in soft-irq or hard-irq context,
		16	* prefer using the upper half of the vector registers, for example,
		17	* specify KERNEL_VXR_HIGH.
		18	*
		19	* If you adhere to these guidelines, an interrupted process context
		20	* does not require to save and restore vector registers because of
		21	* disjoint register ranges.
		22	*
		23	* Also note that the __kernel_fpu_begin()/__kernel_fpu_end() functions
		24	* include logic to save and restore up to 16 vector registers at once.
		25	*
		26	* 3. You can nest kernel_fpu_begin()/kernel_fpu_end() by using different
		27	* struct kernel_fpu states. Vector registers that are in use by outer
		28	* levels are saved and restored. You can minimize the save and restore
		29	* effort by choosing disjoint vector register ranges.
		30	*
		31	* 4. To use vector floating-point instructions, specify the KERNEL_FPC
		32	* flag to save and restore floating-point controls in addition to any
		33	* vector register range.
		34	*
		35	* 5. To use floating-point registers and instructions only, specify the
		36	* KERNEL_FPR flag. This flag triggers a save and restore of vector
		37	* registers V0 to V15 and floating-point controls.
		38	*
4	* Copyright IBM Corp. 2015	39	* Copyright IBM Corp. 2015
5	* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>	40	* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
6	*/	41	*/
@@ -8,6 +43,8 @@
8	#ifndef _ASM_S390_FPU_API_H	43	#ifndef _ASM_S390_FPU_API_H
9	#define _ASM_S390_FPU_API_H	44	#define _ASM_S390_FPU_API_H
10		45
		46	#include <linux/preempt.h>
		47
11	void save_fpu_regs(void);	48	void save_fpu_regs(void);
12		49
13	static inline int test_fp_ctl(u32 fpc)	50	static inline int test_fp_ctl(u32 fpc)
@@ -27,4 +64,42 @@ static inline int test_fp_ctl(u32 fpc)
27	return rc;	64	return rc;
28	}	65	}
29		66
		67	#define KERNEL_VXR_V0V7 1
		68	#define KERNEL_VXR_V8V15 2
		69	#define KERNEL_VXR_V16V23 4
		70	#define KERNEL_VXR_V24V31 8
		71	#define KERNEL_FPR 16
		72	#define KERNEL_FPC 256
		73
		74	#define KERNEL_VXR_LOW (KERNEL_VXR_V0V7|KERNEL_VXR_V8V15)
		75	#define KERNEL_VXR_MID (KERNEL_VXR_V8V15|KERNEL_VXR_V16V23)
		76	#define KERNEL_VXR_HIGH (KERNEL_VXR_V16V23|KERNEL_VXR_V24V31)
		77
		78	#define KERNEL_FPU_MASK (KERNEL_VXR_LOW|KERNEL_VXR_HIGH|KERNEL_FPR)
		79
		80	struct kernel_fpu;
		81
		82	/*
		83	* Note the functions below must be called with preemption disabled.
		84	* Do not enable preemption before calling __kernel_fpu_end() to prevent
		85	* a corruption of an existing kernel FPU state.
		86	*
		87	* Prefer using the kernel_fpu_begin()/kernel_fpu_end() pair of functions.
		88	*/
		89	void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags);
		90	void __kernel_fpu_end(struct kernel_fpu *state);
		91
		92
		93	static inline void kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
		94	{
		95	preempt_disable();
		96	__kernel_fpu_begin(state, flags);
		97	}
		98
		99	static inline void kernel_fpu_end(struct kernel_fpu *state)
		100	{
		101	__kernel_fpu_end(state);
		102	preempt_enable();
		103	}
		104
30	#endif /* _ASM_S390_FPU_API_H */	105	#endif /* _ASM_S390_FPU_API_H */


diff --git a/arch/s390/include/asm/fpu/types.h b/arch/s390/include/asm/fpu/types.h index fe937c9b6471..bce255ead72b 100644 --- a/arch/s390/include/asm/fpu/types.h +++ b/arch/s390/include/asm/fpu/types.h
@@ -24,4 +24,14 @@ struct fpu {
24	/* VX array structure for address operand constraints in inline assemblies */	24	/* VX array structure for address operand constraints in inline assemblies */
25	struct vx_array { __vector128 _[__NUM_VXRS]; };	25	struct vx_array { __vector128 _[__NUM_VXRS]; };
26		26
		27	/* In-kernel FPU state structure */
		28	struct kernel_fpu {
		29	u32 mask;
		30	u32 fpc;
		31	union {
		32	freg_t fprs[__NUM_FPRS];
		33	__vector128 vxrs[__NUM_VXRS];
		34	};
		35	};
		36
27	#endif /* _ASM_S390_FPU_TYPES_H */	37	#endif /* _ASM_S390_FPU_TYPES_H */


diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 2f5586ab8a6a..8d1419120bb7 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile
@@ -45,7 +45,7 @@ obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
45	obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o	45	obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
46	obj-y += debug.o irq.o ipl.o dis.o diag.o sclp.o vdso.o	46	obj-y += debug.o irq.o ipl.o dis.o diag.o sclp.o vdso.o
47	obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o	47	obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o
48	obj-y += runtime_instr.o cache.o dumpstack.o	48	obj-y += runtime_instr.o cache.o fpu.o dumpstack.o
49	obj-y += entry.o reipl.o relocate_kernel.o	49	obj-y += entry.o reipl.o relocate_kernel.o
50		50
51	extra-y += head.o head64.o vmlinux.lds	51	extra-y += head.o head64.o vmlinux.lds


diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c new file mode 100644 index 000000000000..81d1d1887507 --- /dev/null +++ b/arch/s390/kernel/fpu.c
@@ -0,0 +1,249 @@
		1	/*
		2	* In-kernel vector facility support functions
		3	*
		4	* Copyright IBM Corp. 2015
		5	* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
		6	*/
		7	#include <linux/kernel.h>
		8	#include <linux/cpu.h>
		9	#include <linux/sched.h>
		10	#include <asm/fpu/types.h>
		11	#include <asm/fpu/api.h>
		12
		13	/*
		14	* Per-CPU variable to maintain FPU register ranges that are in use
		15	* by the kernel.
		16	*/
		17	static DEFINE_PER_CPU(u32, kernel_fpu_state);
		18
		19	#define KERNEL_FPU_STATE_MASK (KERNEL_FPU_MASK|KERNEL_FPC)
		20
		21
		22	void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
		23	{
		24	if (!__this_cpu_read(kernel_fpu_state)) {
		25	/*
		26	* Save user space FPU state and register contents. Multiple
		27	* calls because of interruptions do not matter and return
		28	* immediately. This also sets CIF_FPU to lazy restore FP/VX
		29	* register contents when returning to user space.
		30	*/
		31	save_fpu_regs();
		32	}
		33
		34	/* Update flags to use the vector facility for KERNEL_FPR */
		35	if (MACHINE_HAS_VX && (state->mask & KERNEL_FPR)) {
		36	flags |= KERNEL_VXR_LOW | KERNEL_FPC;
		37	flags &= ~KERNEL_FPR;
		38	}
		39
		40	/* Save and update current kernel VX state */
		41	state->mask = __this_cpu_read(kernel_fpu_state);
		42	__this_cpu_or(kernel_fpu_state, flags & KERNEL_FPU_STATE_MASK);
		43
		44	/*
		45	* If this is the first call to __kernel_fpu_begin(), no additional
		46	* work is required.
		47	*/
		48	if (!(state->mask & KERNEL_FPU_STATE_MASK))
		49	return;
		50
		51	/*
		52	* If KERNEL_FPR is still set, the vector facility is not available
		53	* and, thus, save floating-point control and registers only.
		54	*/
		55	if (state->mask & KERNEL_FPR) {
		56	asm volatile("stfpc %0" : "=Q" (state->fpc));
		57	asm volatile("std 0,%0" : "=Q" (state->fprs[0]));
		58	asm volatile("std 1,%0" : "=Q" (state->fprs[1]));
		59	asm volatile("std 2,%0" : "=Q" (state->fprs[2]));
		60	asm volatile("std 3,%0" : "=Q" (state->fprs[3]));
		61	asm volatile("std 4,%0" : "=Q" (state->fprs[4]));
		62	asm volatile("std 5,%0" : "=Q" (state->fprs[5]));
		63	asm volatile("std 6,%0" : "=Q" (state->fprs[6]));
		64	asm volatile("std 7,%0" : "=Q" (state->fprs[7]));
		65	asm volatile("std 8,%0" : "=Q" (state->fprs[8]));
		66	asm volatile("std 9,%0" : "=Q" (state->fprs[9]));
		67	asm volatile("std 10,%0" : "=Q" (state->fprs[10]));
		68	asm volatile("std 11,%0" : "=Q" (state->fprs[11]));
		69	asm volatile("std 12,%0" : "=Q" (state->fprs[12]));
		70	asm volatile("std 13,%0" : "=Q" (state->fprs[13]));
		71	asm volatile("std 14,%0" : "=Q" (state->fprs[14]));
		72	asm volatile("std 15,%0" : "=Q" (state->fprs[15]));
		73	return;
		74	}
		75
		76	/*
		77	* If this is a nested call to __kernel_fpu_begin(), check the saved
		78	* state mask to save and later restore the vector registers that
		79	* are already in use. Let's start with checking floating-point
		80	* controls.
		81	*/
		82	if (state->mask & KERNEL_FPC)
		83	asm volatile("stfpc %0" : "=m" (state->fpc));
		84
		85	/* Test and save vector registers */
		86	asm volatile (
		87	/*
		88	* Test if any vector register must be saved and, if so,
		89	* test if all register can be saved.
		90	*/
		91	" tmll %[m],15\n" /* KERNEL_VXR_MASK */
		92	" jz 20f\n" /* no work -> done */
		93	" la 1,%[vxrs]\n" /* load save area */
		94	" jo 18f\n" /* -> save V0..V31 */
		95
		96	/*
		97	* Test if V8..V23 can be saved at once... this speeds up
		98	* for KERNEL_VXR_MID only. Otherwise continue to split the
		99	* range of vector registers into two halves and test them
		100	* separately.
		101	*/
		102	" tmll %[m],6\n" /* KERNEL_VXR_MID */
		103	" jo 17f\n" /* -> save V8..V23 */
		104
		105	/* Test and save the first half of 16 vector registers */
		106	"1: tmll %[m],3\n" /* KERNEL_VXR_LOW */
		107	" jz 10f\n" /* -> KERNEL_VXR_HIGH */
		108	" jo 2f\n" /* 11 -> save V0..V15 */
		109	" brc 4,3f\n" /* 01 -> save V0..V7 */
		110	" brc 2,4f\n" /* 10 -> save V8..V15 */
		111
		112	/* Test and save the second half of 16 vector registers */
		113	"10: tmll %[m],12\n" /* KERNEL_VXR_HIGH */
		114	" jo 19f\n" /* 11 -> save V16..V31 */
		115	" brc 4,11f\n" /* 01 -> save V16..V23 */
		116	" brc 2,12f\n" /* 10 -> save V24..V31 */
		117	" j 20f\n" /* 00 -> done */
		118
		119	/*
		120	* Below are the vstm combinations to save multiple vector
		121	* registers at once.
		122	*/
		123	"2: .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
		124	" j 10b\n" /* -> VXR_HIGH */
		125	"3: .word 0xe707,0x1000,0x003e\n" /* vstm 0,7,0(1) */
		126	" j 10b\n" /* -> VXR_HIGH */
		127	"4: .word 0xe78f,0x1080,0x003e\n" /* vstm 8,15,128(1) */
		128	" j 10b\n" /* -> VXR_HIGH */
		129	"\n"
		130	"11: .word 0xe707,0x1100,0x0c3e\n" /* vstm 16,23,256(1) */
		131	" j 20f\n" /* -> done */
		132	"12: .word 0xe78f,0x1180,0x0c3e\n" /* vstm 24,31,384(1) */
		133	" j 20f\n" /* -> done */
		134	"\n"
		135	"17: .word 0xe787,0x1080,0x043e\n" /* vstm 8,23,128(1) */
		136	" nill %[m],249\n" /* m &= ~VXR_MID */
		137	" j 1b\n" /* -> VXR_LOW */
		138	"\n"
		139	"18: .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
		140	"19: .word 0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */
		141	"20:"
		142	: [vxrs] "=Q" (*(struct vx_array *) &state->vxrs)
		143	: [m] "d" (state->mask)
		144	: "1", "cc");
		145	}
		146	EXPORT_SYMBOL(__kernel_fpu_begin);
		147
		148	void __kernel_fpu_end(struct kernel_fpu *state)
		149	{
		150	/* Just update the per-CPU state if there is nothing to restore */
		151	if (!(state->mask & KERNEL_FPU_STATE_MASK))
		152	goto update_fpu_state;
		153
		154	/*
		155	* If KERNEL_FPR is specified, the vector facility is not available
		156	* and, thus, restore floating-point control and registers only.
		157	*/
		158	if (state->mask & KERNEL_FPR) {
		159	asm volatile("lfpc %0" : : "Q" (state->fpc));
		160	asm volatile("ld 0,%0" : : "Q" (state->fprs[0]));
		161	asm volatile("ld 1,%0" : : "Q" (state->fprs[1]));
		162	asm volatile("ld 2,%0" : : "Q" (state->fprs[2]));
		163	asm volatile("ld 3,%0" : : "Q" (state->fprs[3]));
		164	asm volatile("ld 4,%0" : : "Q" (state->fprs[4]));
		165	asm volatile("ld 5,%0" : : "Q" (state->fprs[5]));
		166	asm volatile("ld 6,%0" : : "Q" (state->fprs[6]));
		167	asm volatile("ld 7,%0" : : "Q" (state->fprs[7]));
		168	asm volatile("ld 8,%0" : : "Q" (state->fprs[8]));
		169	asm volatile("ld 9,%0" : : "Q" (state->fprs[9]));
		170	asm volatile("ld 10,%0" : : "Q" (state->fprs[10]));
		171	asm volatile("ld 11,%0" : : "Q" (state->fprs[11]));
		172	asm volatile("ld 12,%0" : : "Q" (state->fprs[12]));
		173	asm volatile("ld 13,%0" : : "Q" (state->fprs[13]));
		174	asm volatile("ld 14,%0" : : "Q" (state->fprs[14]));
		175	asm volatile("ld 15,%0" : : "Q" (state->fprs[15]));
		176	goto update_fpu_state;
		177	}
		178
		179	/* Test and restore floating-point controls */
		180	if (state->mask & KERNEL_FPC)
		181	asm volatile("lfpc %0" : : "Q" (state->fpc));
		182
		183	/* Test and restore (load) vector registers */
		184	asm volatile (
		185	/*
		186	* Test if any vector registers must be loaded and, if so,
		187	* test if all registers can be loaded at once.
		188	*/
		189	" tmll %[m],15\n" /* KERNEL_VXR_MASK */
		190	" jz 20f\n" /* no work -> done */
		191	" la 1,%[vxrs]\n" /* load load area */
		192	" jo 18f\n" /* -> load V0..V31 */
		193
		194	/*
		195	* Test if V8..V23 can be restored at once... this speeds up
		196	* for KERNEL_VXR_MID only. Otherwise continue to split the
		197	* range of vector registers into two halves and test them
		198	* separately.
		199	*/
		200	" tmll %[m],6\n" /* KERNEL_VXR_MID */
		201	" jo 17f\n" /* -> load V8..V23 */
		202
		203	/* Test and load the first half of 16 vector registers */
		204	"1: tmll %[m],3\n" /* KERNEL_VXR_LOW */
		205	" jz 10f\n" /* -> KERNEL_VXR_HIGH */
		206	" jo 2f\n" /* 11 -> load V0..V15 */
		207	" brc 4,3f\n" /* 01 -> load V0..V7 */
		208	" brc 2,4f\n" /* 10 -> load V8..V15 */
		209
		210	/* Test and load the second half of 16 vector registers */
		211	"10: tmll %[m],12\n" /* KERNEL_VXR_HIGH */
		212	" jo 19f\n" /* 11 -> load V16..V31 */
		213	" brc 4,11f\n" /* 01 -> load V16..V23 */
		214	" brc 2,12f\n" /* 10 -> load V24..V31 */
		215	" j 20f\n" /* 00 -> done */
		216
		217	/*
		218	* Below are the vlm combinations to load multiple vector
		219	* registers at once.
		220	*/
		221	"2: .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
		222	" j 10b\n" /* -> VXR_HIGH */
		223	"3: .word 0xe707,0x1000,0x0036\n" /* vlm 0,7,0(1) */
		224	" j 10b\n" /* -> VXR_HIGH */
		225	"4: .word 0xe78f,0x1080,0x0036\n" /* vlm 8,15,128(1) */
		226	" j 10b\n" /* -> VXR_HIGH */
		227	"\n"
		228	"11: .word 0xe707,0x1100,0x0c36\n" /* vlm 16,23,256(1) */
		229	" j 20f\n" /* -> done */
		230	"12: .word 0xe78f,0x1180,0x0c36\n" /* vlm 24,31,384(1) */
		231	" j 20f\n" /* -> done */
		232	"\n"
		233	"17: .word 0xe787,0x1080,0x0436\n" /* vlm 8,23,128(1) */
		234	" nill %[m],249\n" /* m &= ~VXR_MID */
		235	" j 1b\n" /* -> VXR_LOW */
		236	"\n"
		237	"18: .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
		238	"19: .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
		239	"20:"
		240	:
		241	: [vxrs] "Q" (*(struct vx_array *) &state->vxrs),
		242	[m] "d" (state->mask)
		243	: "1", "cc");
		244
		245	update_fpu_state:
		246	/* Update current kernel VX state */
		247	__this_cpu_write(kernel_fpu_state, state->mask);
		248	}
		249	EXPORT_SYMBOL(__kernel_fpu_end);