about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- arch/arm/net/bpf_jit_32.c | 225
-rw-r--r-- arch/x86/entry/entry_32.S | 11
-rw-r--r-- arch/x86/entry/entry_64.S | 11
-rw-r--r-- arch/x86/events/intel/rapl.c | 4
-rw-r--r-- arch/x86/include/asm/apic.h | 1
-rw-r--r-- arch/x86/include/asm/cpufeatures.h | 3
-rw-r--r-- arch/x86/include/asm/mem_encrypt.h | 4
-rw-r--r-- arch/x86/include/asm/nospec-branch.h | 6
-rw-r--r-- arch/x86/kernel/apic/apic.c | 49
-rw-r--r-- arch/x86/kernel/apic/vector.c | 7
-rw-r--r-- arch/x86/kernel/cpu/bugs.c | 36
-rw-r--r-- arch/x86/kernel/cpu/intel_rdt.c | 8
-rw-r--r-- arch/x86/kernel/cpu/scattered.c | 1
-rw-r--r-- arch/x86/kernel/head64.c | 4
-rw-r--r-- arch/x86/kernel/idt.c | 12
-rw-r--r-- arch/x86/kernel/irqinit.c | 3
-rw-r--r-- arch/x86/kernel/setup.c | 10
-rw-r--r-- arch/x86/kernel/tsc.c | 9
-rw-r--r-- arch/x86/mm/fault.c | 7
-rw-r--r-- arch/x86/mm/kasan_init_64.c | 24
-rw-r--r-- arch/x86/mm/mem_encrypt.c | 356
-rw-r--r-- arch/x86/mm/mem_encrypt_boot.S | 80
-rw-r--r-- drivers/gpio/gpio-mmio.c | 30
-rw-r--r-- drivers/mmc/host/sdhci-esdhc-imx.c | 14
-rw-r--r-- drivers/nvme/host/pci.c | 28
-rw-r--r-- include/linux/delayacct.h | 8
-rw-r--r-- include/linux/vermagic.h | 8
-rw-r--r-- kernel/delayacct.c | 42
-rw-r--r-- kernel/futex.c | 86
-rw-r--r-- kernel/locking/rtmutex.c | 26
-rw-r--r-- kernel/locking/rtmutex_common.h | 1
-rw-r--r-- kernel/sched/core.c | 6
-rw-r--r-- kernel/time/timer.c | 2
-rw-r--r-- scripts/Makefile.build | 14
-rw-r--r-- tools/objtool/elf.c | 4
35 files changed, 768 insertions, 372 deletions
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index c199990e12b6..323a4df59a6c 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -27,14 +27,58 @@
27 27
28int bpf_jit_enable __read_mostly; 28int bpf_jit_enable __read_mostly;
29 29
30/*
31 * eBPF prog stack layout:
32 *
33 * high
34 * original ARM_SP => +-----+
35 * | | callee saved registers
36 * +-----+ <= (BPF_FP + SCRATCH_SIZE)
37 * | ... | eBPF JIT scratch space
38 * eBPF fp register => +-----+
39 * (BPF_FP) | ... | eBPF prog stack
40 * +-----+
41 * |RSVD | JIT scratchpad
42 * current ARM_SP => +-----+ <= (BPF_FP - STACK_SIZE + SCRATCH_SIZE)
43 * | |
44 * | ... | Function call stack
45 * | |
46 * +-----+
47 * low
48 *
49 * The callee saved registers depend on whether frame pointers are enabled.
50 * With frame pointers (to be compliant with the ABI):
51 *
52 * high
53 * original ARM_SP => +------------------+ \
54 * | pc | |
55 * current ARM_FP => +------------------+ } callee saved registers
56 * |r4-r8,r10,fp,ip,lr| |
57 * +------------------+ /
58 * low
59 *
60 * Without frame pointers:
61 *
62 * high
63 * original ARM_SP => +------------------+
64 * | r4-r8,r10,fp,lr | callee saved registers
65 * current ARM_FP => +------------------+
66 * low
67 *
68 * When popping registers off the stack at the end of a BPF function, we
69 * reference them via the current ARM_FP register.
70 */
71#define CALLEE_MASK (1 << ARM_R4 | 1 << ARM_R5 | 1 << ARM_R6 | \
72 1 << ARM_R7 | 1 << ARM_R8 | 1 << ARM_R10 | \
73 1 << ARM_FP)
74#define CALLEE_PUSH_MASK (CALLEE_MASK | 1 << ARM_LR)
75#define CALLEE_POP_MASK (CALLEE_MASK | 1 << ARM_PC)
76
30#define STACK_OFFSET(k) (k) 77#define STACK_OFFSET(k) (k)
31#define TMP_REG_1 (MAX_BPF_JIT_REG + 0) /* TEMP Register 1 */ 78#define TMP_REG_1 (MAX_BPF_JIT_REG + 0) /* TEMP Register 1 */
32#define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* TEMP Register 2 */ 79#define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* TEMP Register 2 */
33#define TCALL_CNT (MAX_BPF_JIT_REG + 2) /* Tail Call Count */ 80#define TCALL_CNT (MAX_BPF_JIT_REG + 2) /* Tail Call Count */
34 81
35/* Flags used for JIT optimization */
36#define SEEN_CALL (1 << 0)
37
38#define FLAG_IMM_OVERFLOW (1 << 0) 82#define FLAG_IMM_OVERFLOW (1 << 0)
39 83
40/* 84/*
@@ -95,7 +139,6 @@ static const u8 bpf2a32[][2] = {
95 * idx : index of current last JITed instruction. 139 * idx : index of current last JITed instruction.
96 * prologue_bytes : bytes used in prologue. 140 * prologue_bytes : bytes used in prologue.
97 * epilogue_offset : offset of epilogue starting. 141 * epilogue_offset : offset of epilogue starting.
98 * seen : bit mask used for JIT optimization.
99 * offsets : array of eBPF instruction offsets in 142 * offsets : array of eBPF instruction offsets in
100 * JITed code. 143 * JITed code.
101 * target : final JITed code. 144 * target : final JITed code.
@@ -110,7 +153,6 @@ struct jit_ctx {
110 unsigned int idx; 153 unsigned int idx;
111 unsigned int prologue_bytes; 154 unsigned int prologue_bytes;
112 unsigned int epilogue_offset; 155 unsigned int epilogue_offset;
113 u32 seen;
114 u32 flags; 156 u32 flags;
115 u32 *offsets; 157 u32 *offsets;
116 u32 *target; 158 u32 *target;
@@ -179,8 +221,13 @@ static void jit_fill_hole(void *area, unsigned int size)
179 *ptr++ = __opcode_to_mem_arm(ARM_INST_UDF); 221 *ptr++ = __opcode_to_mem_arm(ARM_INST_UDF);
180} 222}
181 223
182/* Stack must be multiples of 16 Bytes */ 224#if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5)
183#define STACK_ALIGN(sz) (((sz) + 3) & ~3) 225/* EABI requires the stack to be aligned to 64-bit boundaries */
226#define STACK_ALIGNMENT 8
227#else
228/* Stack must be aligned to 32-bit boundaries */
229#define STACK_ALIGNMENT 4
230#endif
184 231
185/* Stack space for BPF_REG_2, BPF_REG_3, BPF_REG_4, 232/* Stack space for BPF_REG_2, BPF_REG_3, BPF_REG_4,
186 * BPF_REG_5, BPF_REG_7, BPF_REG_8, BPF_REG_9, 233 * BPF_REG_5, BPF_REG_7, BPF_REG_8, BPF_REG_9,
@@ -194,7 +241,7 @@ static void jit_fill_hole(void *area, unsigned int size)
194 + SCRATCH_SIZE + \ 241 + SCRATCH_SIZE + \
195 + 4 /* extra for skb_copy_bits buffer */) 242 + 4 /* extra for skb_copy_bits buffer */)
196 243
197#define STACK_SIZE STACK_ALIGN(_STACK_SIZE) 244#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
198 245
199/* Get the offset of eBPF REGISTERs stored on scratch space. */ 246/* Get the offset of eBPF REGISTERs stored on scratch space. */
200#define STACK_VAR(off) (STACK_SIZE-off-4) 247#define STACK_VAR(off) (STACK_SIZE-off-4)
@@ -285,16 +332,19 @@ static inline void emit_mov_i(const u8 rd, u32 val, struct jit_ctx *ctx)
285 emit_mov_i_no8m(rd, val, ctx); 332 emit_mov_i_no8m(rd, val, ctx);
286} 333}
287 334
288static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx) 335static void emit_bx_r(u8 tgt_reg, struct jit_ctx *ctx)
289{ 336{
290 ctx->seen |= SEEN_CALL;
291#if __LINUX_ARM_ARCH__ < 5
292 emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx);
293
294 if (elf_hwcap & HWCAP_THUMB) 337 if (elf_hwcap & HWCAP_THUMB)
295 emit(ARM_BX(tgt_reg), ctx); 338 emit(ARM_BX(tgt_reg), ctx);
296 else 339 else
297 emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx); 340 emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx);
341}
342
343static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx)
344{
345#if __LINUX_ARM_ARCH__ < 5
346 emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx);
347 emit_bx_r(tgt_reg, ctx);
298#else 348#else
299 emit(ARM_BLX_R(tgt_reg), ctx); 349 emit(ARM_BLX_R(tgt_reg), ctx);
300#endif 350#endif
@@ -354,7 +404,6 @@ static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op)
354 } 404 }
355 405
356 /* Call appropriate function */ 406 /* Call appropriate function */
357 ctx->seen |= SEEN_CALL;
358 emit_mov_i(ARM_IP, op == BPF_DIV ? 407 emit_mov_i(ARM_IP, op == BPF_DIV ?
359 (u32)jit_udiv32 : (u32)jit_mod32, ctx); 408 (u32)jit_udiv32 : (u32)jit_mod32, ctx);
360 emit_blx_r(ARM_IP, ctx); 409 emit_blx_r(ARM_IP, ctx);
@@ -620,8 +669,6 @@ static inline void emit_a32_lsh_r64(const u8 dst[], const u8 src[], bool dstk,
620 /* Do LSH operation */ 669 /* Do LSH operation */
621 emit(ARM_SUB_I(ARM_IP, rt, 32), ctx); 670 emit(ARM_SUB_I(ARM_IP, rt, 32), ctx);
622 emit(ARM_RSB_I(tmp2[0], rt, 32), ctx); 671 emit(ARM_RSB_I(tmp2[0], rt, 32), ctx);
623 /* As we are using ARM_LR */
624 ctx->seen |= SEEN_CALL;
625 emit(ARM_MOV_SR(ARM_LR, rm, SRTYPE_ASL, rt), ctx); 672 emit(ARM_MOV_SR(ARM_LR, rm, SRTYPE_ASL, rt), ctx);
626 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd, SRTYPE_ASL, ARM_IP), ctx); 673 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd, SRTYPE_ASL, ARM_IP), ctx);
627 emit(ARM_ORR_SR(ARM_IP, ARM_LR, rd, SRTYPE_LSR, tmp2[0]), ctx); 674 emit(ARM_ORR_SR(ARM_IP, ARM_LR, rd, SRTYPE_LSR, tmp2[0]), ctx);
@@ -656,8 +703,6 @@ static inline void emit_a32_arsh_r64(const u8 dst[], const u8 src[], bool dstk,
656 /* Do the ARSH operation */ 703 /* Do the ARSH operation */
657 emit(ARM_RSB_I(ARM_IP, rt, 32), ctx); 704 emit(ARM_RSB_I(ARM_IP, rt, 32), ctx);
658 emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx); 705 emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
659 /* As we are using ARM_LR */
660 ctx->seen |= SEEN_CALL;
661 emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx); 706 emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx);
662 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx); 707 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx);
663 _emit(ARM_COND_MI, ARM_B(0), ctx); 708 _emit(ARM_COND_MI, ARM_B(0), ctx);
@@ -692,8 +737,6 @@ static inline void emit_a32_lsr_r64(const u8 dst[], const u8 src[], bool dstk,
692 /* Do LSR operation */ 737 /* Do LSR operation */
693 emit(ARM_RSB_I(ARM_IP, rt, 32), ctx); 738 emit(ARM_RSB_I(ARM_IP, rt, 32), ctx);
694 emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx); 739 emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
695 /* As we are using ARM_LR */
696 ctx->seen |= SEEN_CALL;
697 emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx); 740 emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx);
698 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx); 741 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx);
699 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_LSR, tmp2[0]), ctx); 742 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_LSR, tmp2[0]), ctx);
@@ -828,8 +871,6 @@ static inline void emit_a32_mul_r64(const u8 dst[], const u8 src[], bool dstk,
828 /* Do Multiplication */ 871 /* Do Multiplication */
829 emit(ARM_MUL(ARM_IP, rd, rn), ctx); 872 emit(ARM_MUL(ARM_IP, rd, rn), ctx);
830 emit(ARM_MUL(ARM_LR, rm, rt), ctx); 873 emit(ARM_MUL(ARM_LR, rm, rt), ctx);
831 /* As we are using ARM_LR */
832 ctx->seen |= SEEN_CALL;
833 emit(ARM_ADD_R(ARM_LR, ARM_IP, ARM_LR), ctx); 874 emit(ARM_ADD_R(ARM_LR, ARM_IP, ARM_LR), ctx);
834 875
835 emit(ARM_UMULL(ARM_IP, rm, rd, rt), ctx); 876 emit(ARM_UMULL(ARM_IP, rm, rd, rt), ctx);
@@ -872,33 +913,53 @@ static inline void emit_str_r(const u8 dst, const u8 src, bool dstk,
872} 913}
873 914
874/* dst = *(size*)(src + off) */ 915/* dst = *(size*)(src + off) */
875static inline void emit_ldx_r(const u8 dst, const u8 src, bool dstk, 916static inline void emit_ldx_r(const u8 dst[], const u8 src, bool dstk,
876 const s32 off, struct jit_ctx *ctx, const u8 sz){ 917 s32 off, struct jit_ctx *ctx, const u8 sz){
877 const u8 *tmp = bpf2a32[TMP_REG_1]; 918 const u8 *tmp = bpf2a32[TMP_REG_1];
878 u8 rd = dstk ? tmp[1] : dst; 919 const u8 *rd = dstk ? tmp : dst;
879 u8 rm = src; 920 u8 rm = src;
921 s32 off_max;
880 922
881 if (off) { 923 if (sz == BPF_H)
924 off_max = 0xff;
925 else
926 off_max = 0xfff;
927
928 if (off < 0 || off > off_max) {
882 emit_a32_mov_i(tmp[0], off, false, ctx); 929 emit_a32_mov_i(tmp[0], off, false, ctx);
883 emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx); 930 emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx);
884 rm = tmp[0]; 931 rm = tmp[0];
932 off = 0;
933 } else if (rd[1] == rm) {
934 emit(ARM_MOV_R(tmp[0], rm), ctx);
935 rm = tmp[0];
885 } 936 }
886 switch (sz) { 937 switch (sz) {
887 case BPF_W: 938 case BPF_B:
888 /* Load a Word */ 939 /* Load a Byte */
889 emit(ARM_LDR_I(rd, rm, 0), ctx); 940 emit(ARM_LDRB_I(rd[1], rm, off), ctx);
941 emit_a32_mov_i(dst[0], 0, dstk, ctx);
890 break; 942 break;
891 case BPF_H: 943 case BPF_H:
892 /* Load a HalfWord */ 944 /* Load a HalfWord */
893 emit(ARM_LDRH_I(rd, rm, 0), ctx); 945 emit(ARM_LDRH_I(rd[1], rm, off), ctx);
946 emit_a32_mov_i(dst[0], 0, dstk, ctx);
894 break; 947 break;
895 case BPF_B: 948 case BPF_W:
896 /* Load a Byte */ 949 /* Load a Word */
897 emit(ARM_LDRB_I(rd, rm, 0), ctx); 950 emit(ARM_LDR_I(rd[1], rm, off), ctx);
951 emit_a32_mov_i(dst[0], 0, dstk, ctx);
952 break;
953 case BPF_DW:
954 /* Load a Double Word */
955 emit(ARM_LDR_I(rd[1], rm, off), ctx);
956 emit(ARM_LDR_I(rd[0], rm, off + 4), ctx);
898 break; 957 break;
899 } 958 }
900 if (dstk) 959 if (dstk)
901 emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst)), ctx); 960 emit(ARM_STR_I(rd[1], ARM_SP, STACK_VAR(dst[1])), ctx);
961 if (dstk && sz == BPF_DW)
962 emit(ARM_STR_I(rd[0], ARM_SP, STACK_VAR(dst[0])), ctx);
902} 963}
903 964
904/* Arithmetic Operation */ 965/* Arithmetic Operation */
@@ -906,7 +967,6 @@ static inline void emit_ar_r(const u8 rd, const u8 rt, const u8 rm,
906 const u8 rn, struct jit_ctx *ctx, u8 op) { 967 const u8 rn, struct jit_ctx *ctx, u8 op) {
907 switch (op) { 968 switch (op) {
908 case BPF_JSET: 969 case BPF_JSET:
909 ctx->seen |= SEEN_CALL;
910 emit(ARM_AND_R(ARM_IP, rt, rn), ctx); 970 emit(ARM_AND_R(ARM_IP, rt, rn), ctx);
911 emit(ARM_AND_R(ARM_LR, rd, rm), ctx); 971 emit(ARM_AND_R(ARM_LR, rd, rm), ctx);
912 emit(ARM_ORRS_R(ARM_IP, ARM_LR, ARM_IP), ctx); 972 emit(ARM_ORRS_R(ARM_IP, ARM_LR, ARM_IP), ctx);
@@ -945,7 +1005,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx)
945 const u8 *tcc = bpf2a32[TCALL_CNT]; 1005 const u8 *tcc = bpf2a32[TCALL_CNT];
946 const int idx0 = ctx->idx; 1006 const int idx0 = ctx->idx;
947#define cur_offset (ctx->idx - idx0) 1007#define cur_offset (ctx->idx - idx0)
948#define jmp_offset (out_offset - (cur_offset)) 1008#define jmp_offset (out_offset - (cur_offset) - 2)
949 u32 off, lo, hi; 1009 u32 off, lo, hi;
950 1010
951 /* if (index >= array->map.max_entries) 1011 /* if (index >= array->map.max_entries)
@@ -956,7 +1016,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx)
956 emit_a32_mov_i(tmp[1], off, false, ctx); 1016 emit_a32_mov_i(tmp[1], off, false, ctx);
957 emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r2[1])), ctx); 1017 emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r2[1])), ctx);
958 emit(ARM_LDR_R(tmp[1], tmp2[1], tmp[1]), ctx); 1018 emit(ARM_LDR_R(tmp[1], tmp2[1], tmp[1]), ctx);
959 /* index (64 bit) */ 1019 /* index is 32-bit for arrays */
960 emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r3[1])), ctx); 1020 emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r3[1])), ctx);
961 /* index >= array->map.max_entries */ 1021 /* index >= array->map.max_entries */
962 emit(ARM_CMP_R(tmp2[1], tmp[1]), ctx); 1022 emit(ARM_CMP_R(tmp2[1], tmp[1]), ctx);
@@ -997,7 +1057,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx)
997 emit_a32_mov_i(tmp2[1], off, false, ctx); 1057 emit_a32_mov_i(tmp2[1], off, false, ctx);
998 emit(ARM_LDR_R(tmp[1], tmp[1], tmp2[1]), ctx); 1058 emit(ARM_LDR_R(tmp[1], tmp[1], tmp2[1]), ctx);
999 emit(ARM_ADD_I(tmp[1], tmp[1], ctx->prologue_bytes), ctx); 1059 emit(ARM_ADD_I(tmp[1], tmp[1], ctx->prologue_bytes), ctx);
1000 emit(ARM_BX(tmp[1]), ctx); 1060 emit_bx_r(tmp[1], ctx);
1001 1061
1002 /* out: */ 1062 /* out: */
1003 if (out_offset == -1) 1063 if (out_offset == -1)
@@ -1070,54 +1130,22 @@ static void build_prologue(struct jit_ctx *ctx)
1070 const u8 r2 = bpf2a32[BPF_REG_1][1]; 1130 const u8 r2 = bpf2a32[BPF_REG_1][1];
1071 const u8 r3 = bpf2a32[BPF_REG_1][0]; 1131 const u8 r3 = bpf2a32[BPF_REG_1][0];
1072 const u8 r4 = bpf2a32[BPF_REG_6][1]; 1132 const u8 r4 = bpf2a32[BPF_REG_6][1];
1073 const u8 r5 = bpf2a32[BPF_REG_6][0];
1074 const u8 r6 = bpf2a32[TMP_REG_1][1];
1075 const u8 r7 = bpf2a32[TMP_REG_1][0];
1076 const u8 r8 = bpf2a32[TMP_REG_2][1];
1077 const u8 r10 = bpf2a32[TMP_REG_2][0];
1078 const u8 fplo = bpf2a32[BPF_REG_FP][1]; 1133 const u8 fplo = bpf2a32[BPF_REG_FP][1];
1079 const u8 fphi = bpf2a32[BPF_REG_FP][0]; 1134 const u8 fphi = bpf2a32[BPF_REG_FP][0];
1080 const u8 sp = ARM_SP;
1081 const u8 *tcc = bpf2a32[TCALL_CNT]; 1135 const u8 *tcc = bpf2a32[TCALL_CNT];
1082 1136
1083 u16 reg_set = 0;
1084
1085 /*
1086 * eBPF prog stack layout
1087 *
1088 * high
1089 * original ARM_SP => +-----+ eBPF prologue
1090 * |FP/LR|
1091 * current ARM_FP => +-----+
1092 * | ... | callee saved registers
1093 * eBPF fp register => +-----+ <= (BPF_FP)
1094 * | ... | eBPF JIT scratch space
1095 * | | eBPF prog stack
1096 * +-----+
1097 * |RSVD | JIT scratchpad
1098 * current A64_SP => +-----+ <= (BPF_FP - STACK_SIZE)
1099 * | |
1100 * | ... | Function call stack
1101 * | |
1102 * +-----+
1103 * low
1104 */
1105
1106 /* Save callee saved registers. */ 1137 /* Save callee saved registers. */
1107 reg_set |= (1<<r4) | (1<<r5) | (1<<r6) | (1<<r7) | (1<<r8) | (1<<r10);
1108#ifdef CONFIG_FRAME_POINTER 1138#ifdef CONFIG_FRAME_POINTER
1109 reg_set |= (1<<ARM_FP) | (1<<ARM_IP) | (1<<ARM_LR) | (1<<ARM_PC); 1139 u16 reg_set = CALLEE_PUSH_MASK | 1 << ARM_IP | 1 << ARM_PC;
1110 emit(ARM_MOV_R(ARM_IP, sp), ctx); 1140 emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx);
1111 emit(ARM_PUSH(reg_set), ctx); 1141 emit(ARM_PUSH(reg_set), ctx);
1112 emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx); 1142 emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx);
1113#else 1143#else
1114 /* Check if call instruction exists in BPF body */ 1144 emit(ARM_PUSH(CALLEE_PUSH_MASK), ctx);
1115 if (ctx->seen & SEEN_CALL) 1145 emit(ARM_MOV_R(ARM_FP, ARM_SP), ctx);
1116 reg_set |= (1<<ARM_LR);
1117 emit(ARM_PUSH(reg_set), ctx);
1118#endif 1146#endif
1119 /* Save frame pointer for later */ 1147 /* Save frame pointer for later */
1120 emit(ARM_SUB_I(ARM_IP, sp, SCRATCH_SIZE), ctx); 1148 emit(ARM_SUB_I(ARM_IP, ARM_SP, SCRATCH_SIZE), ctx);
1121 1149
1122 ctx->stack_size = imm8m(STACK_SIZE); 1150 ctx->stack_size = imm8m(STACK_SIZE);
1123 1151
@@ -1140,33 +1168,19 @@ static void build_prologue(struct jit_ctx *ctx)
1140 /* end of prologue */ 1168 /* end of prologue */
1141} 1169}
1142 1170
1171/* restore callee saved registers. */
1143static void build_epilogue(struct jit_ctx *ctx) 1172static void build_epilogue(struct jit_ctx *ctx)
1144{ 1173{
1145 const u8 r4 = bpf2a32[BPF_REG_6][1];
1146 const u8 r5 = bpf2a32[BPF_REG_6][0];
1147 const u8 r6 = bpf2a32[TMP_REG_1][1];
1148 const u8 r7 = bpf2a32[TMP_REG_1][0];
1149 const u8 r8 = bpf2a32[TMP_REG_2][1];
1150 const u8 r10 = bpf2a32[TMP_REG_2][0];
1151 u16 reg_set = 0;
1152
1153 /* unwind function call stack */
1154 emit(ARM_ADD_I(ARM_SP, ARM_SP, ctx->stack_size), ctx);
1155
1156 /* restore callee saved registers. */
1157 reg_set |= (1<<r4) | (1<<r5) | (1<<r6) | (1<<r7) | (1<<r8) | (1<<r10);
1158#ifdef CONFIG_FRAME_POINTER 1174#ifdef CONFIG_FRAME_POINTER
1159 /* the first instruction of the prologue was: mov ip, sp */ 1175 /* When using frame pointers, some additional registers need to
1160 reg_set |= (1<<ARM_FP) | (1<<ARM_SP) | (1<<ARM_PC); 1176 * be loaded. */
1177 u16 reg_set = CALLEE_POP_MASK | 1 << ARM_SP;
1178 emit(ARM_SUB_I(ARM_SP, ARM_FP, hweight16(reg_set) * 4), ctx);
1161 emit(ARM_LDM(ARM_SP, reg_set), ctx); 1179 emit(ARM_LDM(ARM_SP, reg_set), ctx);
1162#else 1180#else
1163 if (ctx->seen & SEEN_CALL)
1164 reg_set |= (1<<ARM_PC);
1165 /* Restore callee saved registers. */ 1181 /* Restore callee saved registers. */
1166 emit(ARM_POP(reg_set), ctx); 1182 emit(ARM_MOV_R(ARM_SP, ARM_FP), ctx);
1167 /* Return back to the callee function */ 1183 emit(ARM_POP(CALLEE_POP_MASK), ctx);
1168 if (!(ctx->seen & SEEN_CALL))
1169 emit(ARM_BX(ARM_LR), ctx);
1170#endif 1184#endif
1171} 1185}
1172 1186
@@ -1394,8 +1408,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
1394 emit_rev32(rt, rt, ctx); 1408 emit_rev32(rt, rt, ctx);
1395 goto emit_bswap_uxt; 1409 goto emit_bswap_uxt;
1396 case 64: 1410 case 64:
1397 /* Because of the usage of ARM_LR */
1398 ctx->seen |= SEEN_CALL;
1399 emit_rev32(ARM_LR, rt, ctx); 1411 emit_rev32(ARM_LR, rt, ctx);
1400 emit_rev32(rt, rd, ctx); 1412 emit_rev32(rt, rd, ctx);
1401 emit(ARM_MOV_R(rd, ARM_LR), ctx); 1413 emit(ARM_MOV_R(rd, ARM_LR), ctx);
@@ -1448,22 +1460,7 @@ exit:
1448 rn = sstk ? tmp2[1] : src_lo; 1460 rn = sstk ? tmp2[1] : src_lo;
1449 if (sstk) 1461 if (sstk)
1450 emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx); 1462 emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx);
1451 switch (BPF_SIZE(code)) { 1463 emit_ldx_r(dst, rn, dstk, off, ctx, BPF_SIZE(code));
1452 case BPF_W:
1453 /* Load a Word */
1454 case BPF_H:
1455 /* Load a Half-Word */
1456 case BPF_B:
1457 /* Load a Byte */
1458 emit_ldx_r(dst_lo, rn, dstk, off, ctx, BPF_SIZE(code));
1459 emit_a32_mov_i(dst_hi, 0, dstk, ctx);
1460 break;
1461 case BPF_DW:
1462 /* Load a double word */
1463 emit_ldx_r(dst_lo, rn, dstk, off, ctx, BPF_W);
1464 emit_ldx_r(dst_hi, rn, dstk, off+4, ctx, BPF_W);
1465 break;
1466 }
1467 break; 1464 break;
1468 /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */ 1465 /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */
1469 case BPF_LD | BPF_ABS | BPF_W: 1466 case BPF_LD | BPF_ABS | BPF_W:
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index a1f28a54f23a..60c4c342316c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -244,6 +244,17 @@ ENTRY(__switch_to_asm)
244 movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset 244 movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
245#endif 245#endif
246 246
247#ifdef CONFIG_RETPOLINE
248 /*
249 * When switching from a shallower to a deeper call stack
250 * the RSB may either underflow or use entries populated
251 * with userspace addresses. On CPUs where those concerns
252 * exist, overwrite the RSB with entries which capture
253 * speculative execution to prevent attack.
254 */
255 FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
256#endif
257
247 /* restore callee-saved registers */ 258 /* restore callee-saved registers */
248 popl %esi 259 popl %esi
249 popl %edi 260 popl %edi
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4f8e1d35a97c..aa15b4c0e3d1 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -491,6 +491,17 @@ ENTRY(__switch_to_asm)
491 movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset 491 movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
492#endif 492#endif
493 493
494#ifdef CONFIG_RETPOLINE
495 /*
496 * When switching from a shallower to a deeper call stack
497 * the RSB may either underflow or use entries populated
498 * with userspace addresses. On CPUs where those concerns
499 * exist, overwrite the RSB with entries which capture
500 * speculative execution to prevent attack.
501 */
502 FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
503#endif
504
494 /* restore callee-saved registers */ 505 /* restore callee-saved registers */
495 popq %r15 506 popq %r15
496 popq %r14 507 popq %r14
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 005908ee9333..a2efb490f743 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -755,14 +755,14 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
755 X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, snbep_rapl_init), 755 X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, snbep_rapl_init),
756 756
757 X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_rapl_init), 757 X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_rapl_init),
758 X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsw_rapl_init), 758 X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsx_rapl_init),
759 X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_rapl_init), 759 X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_rapl_init),
760 X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_rapl_init), 760 X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_rapl_init),
761 761
762 X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init), 762 X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init),
763 X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init), 763 X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init),
764 X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init), 764 X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init),
765 X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsw_rapl_init), 765 X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsx_rapl_init),
766 766
767 X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init), 767 X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init),
768 X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_rapl_init), 768 X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_rapl_init),
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index a9e57f08bfa6..98722773391d 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -136,6 +136,7 @@ extern void disconnect_bsp_APIC(int virt_wire_setup);
136extern void disable_local_APIC(void); 136extern void disable_local_APIC(void);
137extern void lapic_shutdown(void); 137extern void lapic_shutdown(void);
138extern void sync_Arb_IDs(void); 138extern void sync_Arb_IDs(void);
139extern void init_bsp_APIC(void);
139extern void apic_intr_mode_init(void); 140extern void apic_intr_mode_init(void);
140extern void setup_local_APIC(void); 141extern void setup_local_APIC(void);
141extern void init_apic_mappings(void); 142extern void init_apic_mappings(void);
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index f275447862f4..25b9375c1484 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -206,11 +206,11 @@
206#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ 206#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */
207#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ 207#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */
208#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ 208#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
209#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
210#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ 209#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
211#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ 210#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */
212 211
213#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ 212#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
213#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
214 214
215/* Virtualization flags: Linux defined, word 8 */ 215/* Virtualization flags: Linux defined, word 8 */
216#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ 216#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -245,6 +245,7 @@
245#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ 245#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
246#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ 246#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
247#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ 247#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
248#define X86_FEATURE_INTEL_PT ( 9*32+25) /* Intel Processor Trace */
248#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ 249#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
249#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ 250#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
250#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ 251#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index c9459a4c3c68..22c5f3e6f820 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -39,7 +39,7 @@ void __init sme_unmap_bootdata(char *real_mode_data);
39 39
40void __init sme_early_init(void); 40void __init sme_early_init(void);
41 41
42void __init sme_encrypt_kernel(void); 42void __init sme_encrypt_kernel(struct boot_params *bp);
43void __init sme_enable(struct boot_params *bp); 43void __init sme_enable(struct boot_params *bp);
44 44
45int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); 45int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
@@ -67,7 +67,7 @@ static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
67 67
68static inline void __init sme_early_init(void) { } 68static inline void __init sme_early_init(void) { }
69 69
70static inline void __init sme_encrypt_kernel(void) { } 70static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
71static inline void __init sme_enable(struct boot_params *bp) { } 71static inline void __init sme_enable(struct boot_params *bp) { }
72 72
73static inline bool sme_active(void) { return false; } 73static inline bool sme_active(void) { return false; }
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 402a11c803c3..7b45d8424150 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -11,7 +11,7 @@
11 * Fill the CPU return stack buffer. 11 * Fill the CPU return stack buffer.
12 * 12 *
13 * Each entry in the RSB, if used for a speculative 'ret', contains an 13 * Each entry in the RSB, if used for a speculative 'ret', contains an
14 * infinite 'pause; jmp' loop to capture speculative execution. 14 * infinite 'pause; lfence; jmp' loop to capture speculative execution.
15 * 15 *
16 * This is required in various cases for retpoline and IBRS-based 16 * This is required in various cases for retpoline and IBRS-based
17 * mitigations for the Spectre variant 2 vulnerability. Sometimes to 17 * mitigations for the Spectre variant 2 vulnerability. Sometimes to
@@ -38,11 +38,13 @@
38 call 772f; \ 38 call 772f; \
39773: /* speculation trap */ \ 39773: /* speculation trap */ \
40 pause; \ 40 pause; \
41 lfence; \
41 jmp 773b; \ 42 jmp 773b; \
42772: \ 43772: \
43 call 774f; \ 44 call 774f; \
44775: /* speculation trap */ \ 45775: /* speculation trap */ \
45 pause; \ 46 pause; \
47 lfence; \
46 jmp 775b; \ 48 jmp 775b; \
47774: \ 49774: \
48 dec reg; \ 50 dec reg; \
@@ -73,6 +75,7 @@
73 call .Ldo_rop_\@ 75 call .Ldo_rop_\@
74.Lspec_trap_\@: 76.Lspec_trap_\@:
75 pause 77 pause
78 lfence
76 jmp .Lspec_trap_\@ 79 jmp .Lspec_trap_\@
77.Ldo_rop_\@: 80.Ldo_rop_\@:
78 mov \reg, (%_ASM_SP) 81 mov \reg, (%_ASM_SP)
@@ -165,6 +168,7 @@
165 " .align 16\n" \ 168 " .align 16\n" \
166 "901: call 903f;\n" \ 169 "901: call 903f;\n" \
167 "902: pause;\n" \ 170 "902: pause;\n" \
171 " lfence;\n" \
168 " jmp 902b;\n" \ 172 " jmp 902b;\n" \
169 " .align 16\n" \ 173 " .align 16\n" \
170 "903: addl $4, %%esp;\n" \ 174 "903: addl $4, %%esp;\n" \
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 880441f24146..25ddf02598d2 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1286,6 +1286,55 @@ static int __init apic_intr_mode_select(void)
1286 return APIC_SYMMETRIC_IO; 1286 return APIC_SYMMETRIC_IO;
1287} 1287}
1288 1288
1289/*
1290 * An initial setup of the virtual wire mode.
1291 */
1292void __init init_bsp_APIC(void)
1293{
1294 unsigned int value;
1295
1296 /*
1297 * Don't do the setup now if we have a SMP BIOS as the
1298 * through-I/O-APIC virtual wire mode might be active.
1299 */
1300 if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC))
1301 return;
1302
1303 /*
1304 * Do not trust the local APIC being empty at bootup.
1305 */
1306 clear_local_APIC();
1307
1308 /*
1309 * Enable APIC.
1310 */
1311 value = apic_read(APIC_SPIV);
1312 value &= ~APIC_VECTOR_MASK;
1313 value |= APIC_SPIV_APIC_ENABLED;
1314
1315#ifdef CONFIG_X86_32
1316 /* This bit is reserved on P4/Xeon and should be cleared */
1317 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
1318 (boot_cpu_data.x86 == 15))
1319 value &= ~APIC_SPIV_FOCUS_DISABLED;
1320 else
1321#endif
1322 value |= APIC_SPIV_FOCUS_DISABLED;
1323 value |= SPURIOUS_APIC_VECTOR;
1324 apic_write(APIC_SPIV, value);
1325
1326 /*
1327 * Set up the virtual wire mode.
1328 */
1329 apic_write(APIC_LVT0, APIC_DM_EXTINT);
1330 value = APIC_DM_NMI;
1331 if (!lapic_is_integrated()) /* 82489DX */
1332 value |= APIC_LVT_LEVEL_TRIGGER;
1333 if (apic_extnmi == APIC_EXTNMI_NONE)
1334 value |= APIC_LVT_MASKED;
1335 apic_write(APIC_LVT1, value);
1336}
1337
1289/* Init the interrupt delivery mode for the BSP */ 1338/* Init the interrupt delivery mode for the BSP */
1290void __init apic_intr_mode_init(void) 1339void __init apic_intr_mode_init(void)
1291{ 1340{
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index f8b03bb8e725..3cc471beb50b 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -542,14 +542,17 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
542 542
543 err = assign_irq_vector_policy(irqd, info); 543 err = assign_irq_vector_policy(irqd, info);
544 trace_vector_setup(virq + i, false, err); 544 trace_vector_setup(virq + i, false, err);
545 if (err) 545 if (err) {
546 irqd->chip_data = NULL;
547 free_apic_chip_data(apicd);
546 goto error; 548 goto error;
549 }
547 } 550 }
548 551
549 return 0; 552 return 0;
550 553
551error: 554error:
552 x86_vector_free_irqs(domain, virq, i + 1); 555 x86_vector_free_irqs(domain, virq, i);
553 return err; 556 return err;
554} 557}
555 558
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index e4dc26185aa7..390b3dc3d438 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -23,6 +23,7 @@
23#include <asm/alternative.h> 23#include <asm/alternative.h>
24#include <asm/pgtable.h> 24#include <asm/pgtable.h>
25#include <asm/set_memory.h> 25#include <asm/set_memory.h>
26#include <asm/intel-family.h>
26 27
27static void __init spectre_v2_select_mitigation(void); 28static void __init spectre_v2_select_mitigation(void);
28 29
@@ -155,6 +156,23 @@ disable:
155 return SPECTRE_V2_CMD_NONE; 156 return SPECTRE_V2_CMD_NONE;
156} 157}
157 158
159/* Check for Skylake-like CPUs (for RSB handling) */
160static bool __init is_skylake_era(void)
161{
162 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
163 boot_cpu_data.x86 == 6) {
164 switch (boot_cpu_data.x86_model) {
165 case INTEL_FAM6_SKYLAKE_MOBILE:
166 case INTEL_FAM6_SKYLAKE_DESKTOP:
167 case INTEL_FAM6_SKYLAKE_X:
168 case INTEL_FAM6_KABYLAKE_MOBILE:
169 case INTEL_FAM6_KABYLAKE_DESKTOP:
170 return true;
171 }
172 }
173 return false;
174}
175
158static void __init spectre_v2_select_mitigation(void) 176static void __init spectre_v2_select_mitigation(void)
159{ 177{
160 enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); 178 enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -213,6 +231,24 @@ retpoline_auto:
213 231
214 spectre_v2_enabled = mode; 232 spectre_v2_enabled = mode;
215 pr_info("%s\n", spectre_v2_strings[mode]); 233 pr_info("%s\n", spectre_v2_strings[mode]);
234
235 /*
236 * If neither SMEP or KPTI are available, there is a risk of
237 * hitting userspace addresses in the RSB after a context switch
238 * from a shallow call stack to a deeper one. To prevent this fill
239 * the entire RSB, even when using IBRS.
240 *
241 * Skylake era CPUs have a separate issue with *underflow* of the
242 * RSB, when they will predict 'ret' targets from the generic BTB.
243 * The proper mitigation for this is IBRS. If IBRS is not supported
244 * or deactivated in favour of retpolines the RSB fill on context
245 * switch is required.
246 */
247 if ((!boot_cpu_has(X86_FEATURE_PTI) &&
248 !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
249 setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
250 pr_info("Filling RSB on context switch\n");
251 }
216} 252}
217 253
218#undef pr_fmt 254#undef pr_fmt
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 88dcf8479013..99442370de40 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -525,10 +525,6 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
525 */ 525 */
526 if (static_branch_unlikely(&rdt_mon_enable_key)) 526 if (static_branch_unlikely(&rdt_mon_enable_key))
527 rmdir_mondata_subdir_allrdtgrp(r, d->id); 527 rmdir_mondata_subdir_allrdtgrp(r, d->id);
528 kfree(d->ctrl_val);
529 kfree(d->rmid_busy_llc);
530 kfree(d->mbm_total);
531 kfree(d->mbm_local);
532 list_del(&d->list); 528 list_del(&d->list);
533 if (is_mbm_enabled()) 529 if (is_mbm_enabled())
534 cancel_delayed_work(&d->mbm_over); 530 cancel_delayed_work(&d->mbm_over);
@@ -545,6 +541,10 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
545 cancel_delayed_work(&d->cqm_limbo); 541 cancel_delayed_work(&d->cqm_limbo);
546 } 542 }
547 543
544 kfree(d->ctrl_val);
545 kfree(d->rmid_busy_llc);
546 kfree(d->mbm_total);
547 kfree(d->mbm_local);
548 kfree(d); 548 kfree(d);
549 return; 549 return;
550 } 550 }
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 05459ad3db46..d0e69769abfd 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -21,7 +21,6 @@ struct cpuid_bit {
21static const struct cpuid_bit cpuid_bits[] = { 21static const struct cpuid_bit cpuid_bits[] = {
22 { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, 22 { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
23 { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, 23 { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
24 { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 },
25 { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, 24 { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 },
26 { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, 25 { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 },
27 { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, 26 { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 6a5d757b9cfd..7ba5d819ebe3 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -157,8 +157,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
157 p = fixup_pointer(&phys_base, physaddr); 157 p = fixup_pointer(&phys_base, physaddr);
158 *p += load_delta - sme_get_me_mask(); 158 *p += load_delta - sme_get_me_mask();
159 159
160 /* Encrypt the kernel (if SME is active) */ 160 /* Encrypt the kernel and related (if SME is active) */
161 sme_encrypt_kernel(); 161 sme_encrypt_kernel(bp);
162 162
163 /* 163 /*
164 * Return the SME encryption mask (if SME is active) to be used as a 164 * Return the SME encryption mask (if SME is active) to be used as a
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index d985cef3984f..56d99be3706a 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -56,7 +56,7 @@ struct idt_data {
56 * Early traps running on the DEFAULT_STACK because the other interrupt 56 * Early traps running on the DEFAULT_STACK because the other interrupt
57 * stacks work only after cpu_init(). 57 * stacks work only after cpu_init().
58 */ 58 */
59static const __initdata struct idt_data early_idts[] = { 59static const __initconst struct idt_data early_idts[] = {
60 INTG(X86_TRAP_DB, debug), 60 INTG(X86_TRAP_DB, debug),
61 SYSG(X86_TRAP_BP, int3), 61 SYSG(X86_TRAP_BP, int3),
62#ifdef CONFIG_X86_32 62#ifdef CONFIG_X86_32
@@ -70,7 +70,7 @@ static const __initdata struct idt_data early_idts[] = {
70 * the traps which use them are reinitialized with IST after cpu_init() has 70 * the traps which use them are reinitialized with IST after cpu_init() has
71 * set up TSS. 71 * set up TSS.
72 */ 72 */
73static const __initdata struct idt_data def_idts[] = { 73static const __initconst struct idt_data def_idts[] = {
74 INTG(X86_TRAP_DE, divide_error), 74 INTG(X86_TRAP_DE, divide_error),
75 INTG(X86_TRAP_NMI, nmi), 75 INTG(X86_TRAP_NMI, nmi),
76 INTG(X86_TRAP_BR, bounds), 76 INTG(X86_TRAP_BR, bounds),
@@ -108,7 +108,7 @@ static const __initdata struct idt_data def_idts[] = {
108/* 108/*
109 * The APIC and SMP idt entries 109 * The APIC and SMP idt entries
110 */ 110 */
111static const __initdata struct idt_data apic_idts[] = { 111static const __initconst struct idt_data apic_idts[] = {
112#ifdef CONFIG_SMP 112#ifdef CONFIG_SMP
113 INTG(RESCHEDULE_VECTOR, reschedule_interrupt), 113 INTG(RESCHEDULE_VECTOR, reschedule_interrupt),
114 INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), 114 INTG(CALL_FUNCTION_VECTOR, call_function_interrupt),
@@ -150,7 +150,7 @@ static const __initdata struct idt_data apic_idts[] = {
150 * Early traps running on the DEFAULT_STACK because the other interrupt 150 * Early traps running on the DEFAULT_STACK because the other interrupt
151 * stacks work only after cpu_init(). 151 * stacks work only after cpu_init().
152 */ 152 */
153static const __initdata struct idt_data early_pf_idts[] = { 153static const __initconst struct idt_data early_pf_idts[] = {
154 INTG(X86_TRAP_PF, page_fault), 154 INTG(X86_TRAP_PF, page_fault),
155}; 155};
156 156
@@ -158,7 +158,7 @@ static const __initdata struct idt_data early_pf_idts[] = {
158 * Override for the debug_idt. Same as the default, but with interrupt 158 * Override for the debug_idt. Same as the default, but with interrupt
159 * stack set to DEFAULT_STACK (0). Required for NMI trap handling. 159 * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
160 */ 160 */
161static const __initdata struct idt_data dbg_idts[] = { 161static const __initconst struct idt_data dbg_idts[] = {
162 INTG(X86_TRAP_DB, debug), 162 INTG(X86_TRAP_DB, debug),
163 INTG(X86_TRAP_BP, int3), 163 INTG(X86_TRAP_BP, int3),
164}; 164};
@@ -180,7 +180,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
180 * The exceptions which use Interrupt stacks. They are setup after 180 * The exceptions which use Interrupt stacks. They are setup after
181 * cpu_init() when the TSS has been initialized. 181 * cpu_init() when the TSS has been initialized.
182 */ 182 */
183static const __initdata struct idt_data ist_idts[] = { 183static const __initconst struct idt_data ist_idts[] = {
184 ISTG(X86_TRAP_DB, debug, DEBUG_STACK), 184 ISTG(X86_TRAP_DB, debug, DEBUG_STACK),
185 ISTG(X86_TRAP_NMI, nmi, NMI_STACK), 185 ISTG(X86_TRAP_NMI, nmi, NMI_STACK),
186 SISTG(X86_TRAP_BP, int3, DEBUG_STACK), 186 SISTG(X86_TRAP_BP, int3, DEBUG_STACK),
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 8da3e909e967..a539410c4ea9 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -61,6 +61,9 @@ void __init init_ISA_irqs(void)
61 struct irq_chip *chip = legacy_pic->chip; 61 struct irq_chip *chip = legacy_pic->chip;
62 int i; 62 int i;
63 63
64#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
65 init_bsp_APIC();
66#endif
64 legacy_pic->init(0); 67 legacy_pic->init(0);
65 68
66 for (i = 0; i < nr_legacy_irqs(); i++) 69 for (i = 0; i < nr_legacy_irqs(); i++)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 145810b0edf6..68d7ab81c62f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -364,16 +364,6 @@ static void __init reserve_initrd(void)
364 !ramdisk_image || !ramdisk_size) 364 !ramdisk_image || !ramdisk_size)
365 return; /* No initrd provided by bootloader */ 365 return; /* No initrd provided by bootloader */
366 366
367 /*
368 * If SME is active, this memory will be marked encrypted by the
369 * kernel when it is accessed (including relocation). However, the
370 * ramdisk image was loaded decrypted by the bootloader, so make
371 * sure that it is encrypted before accessing it. For SEV the
372 * ramdisk will already be encrypted, so only do this for SME.
373 */
374 if (sme_active())
375 sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
376
377 initrd_start = 0; 367 initrd_start = 0;
378 368
379 mapped_size = memblock_mem_size(max_pfn_mapped); 369 mapped_size = memblock_mem_size(max_pfn_mapped);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 8ea117f8142e..e169e85db434 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -602,7 +602,6 @@ unsigned long native_calibrate_tsc(void)
602 case INTEL_FAM6_KABYLAKE_DESKTOP: 602 case INTEL_FAM6_KABYLAKE_DESKTOP:
603 crystal_khz = 24000; /* 24.0 MHz */ 603 crystal_khz = 24000; /* 24.0 MHz */
604 break; 604 break;
605 case INTEL_FAM6_SKYLAKE_X:
606 case INTEL_FAM6_ATOM_DENVERTON: 605 case INTEL_FAM6_ATOM_DENVERTON:
607 crystal_khz = 25000; /* 25.0 MHz */ 606 crystal_khz = 25000; /* 25.0 MHz */
608 break; 607 break;
@@ -612,6 +611,8 @@ unsigned long native_calibrate_tsc(void)
612 } 611 }
613 } 612 }
614 613
614 if (crystal_khz == 0)
615 return 0;
615 /* 616 /*
616 * TSC frequency determined by CPUID is a "hardware reported" 617 * TSC frequency determined by CPUID is a "hardware reported"
617 * frequency and is the most accurate one so far we have. This 618 * frequency and is the most accurate one so far we have. This
@@ -1315,6 +1316,12 @@ void __init tsc_init(void)
1315 (unsigned long)cpu_khz / 1000, 1316 (unsigned long)cpu_khz / 1000,
1316 (unsigned long)cpu_khz % 1000); 1317 (unsigned long)cpu_khz % 1000);
1317 1318
1319 if (cpu_khz != tsc_khz) {
1320 pr_info("Detected %lu.%03lu MHz TSC",
1321 (unsigned long)tsc_khz / 1000,
1322 (unsigned long)tsc_khz % 1000);
1323 }
1324
1318 /* Sanitize TSC ADJUST before cyc2ns gets initialized */ 1325 /* Sanitize TSC ADJUST before cyc2ns gets initialized */
1319 tsc_store_and_check_tsc_adjust(true); 1326 tsc_store_and_check_tsc_adjust(true);
1320 1327
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 06fe3d51d385..b3e40773dce0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -172,14 +172,15 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
172 * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really 172 * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
173 * faulted on a pte with its pkey=4. 173 * faulted on a pte with its pkey=4.
174 */ 174 */
175static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) 175static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info,
176 u32 *pkey)
176{ 177{
177 /* This is effectively an #ifdef */ 178 /* This is effectively an #ifdef */
178 if (!boot_cpu_has(X86_FEATURE_OSPKE)) 179 if (!boot_cpu_has(X86_FEATURE_OSPKE))
179 return; 180 return;
180 181
181 /* Fault not from Protection Keys: nothing to do */ 182 /* Fault not from Protection Keys: nothing to do */
182 if (si_code != SEGV_PKUERR) 183 if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV))
183 return; 184 return;
184 /* 185 /*
185 * force_sig_info_fault() is called from a number of 186 * force_sig_info_fault() is called from a number of
@@ -218,7 +219,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
218 lsb = PAGE_SHIFT; 219 lsb = PAGE_SHIFT;
219 info.si_addr_lsb = lsb; 220 info.si_addr_lsb = lsb;
220 221
221 fill_sig_info_pkey(si_code, &info, pkey); 222 fill_sig_info_pkey(si_signo, si_code, &info, pkey);
222 223
223 force_sig_info(si_signo, &info, tsk); 224 force_sig_info(si_signo, &info, tsk);
224} 225}
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 47388f0c0e59..af6f2f9c6a26 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -21,10 +21,14 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES];
21 21
22static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); 22static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
23 23
24static __init void *early_alloc(size_t size, int nid) 24static __init void *early_alloc(size_t size, int nid, bool panic)
25{ 25{
26 return memblock_virt_alloc_try_nid_nopanic(size, size, 26 if (panic)
27 __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); 27 return memblock_virt_alloc_try_nid(size, size,
28 __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
29 else
30 return memblock_virt_alloc_try_nid_nopanic(size, size,
31 __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
28} 32}
29 33
30static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, 34static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
@@ -38,14 +42,14 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
38 if (boot_cpu_has(X86_FEATURE_PSE) && 42 if (boot_cpu_has(X86_FEATURE_PSE) &&
39 ((end - addr) == PMD_SIZE) && 43 ((end - addr) == PMD_SIZE) &&
40 IS_ALIGNED(addr, PMD_SIZE)) { 44 IS_ALIGNED(addr, PMD_SIZE)) {
41 p = early_alloc(PMD_SIZE, nid); 45 p = early_alloc(PMD_SIZE, nid, false);
42 if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) 46 if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
43 return; 47 return;
44 else if (p) 48 else if (p)
45 memblock_free(__pa(p), PMD_SIZE); 49 memblock_free(__pa(p), PMD_SIZE);
46 } 50 }
47 51
48 p = early_alloc(PAGE_SIZE, nid); 52 p = early_alloc(PAGE_SIZE, nid, true);
49 pmd_populate_kernel(&init_mm, pmd, p); 53 pmd_populate_kernel(&init_mm, pmd, p);
50 } 54 }
51 55
@@ -57,7 +61,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
57 if (!pte_none(*pte)) 61 if (!pte_none(*pte))
58 continue; 62 continue;
59 63
60 p = early_alloc(PAGE_SIZE, nid); 64 p = early_alloc(PAGE_SIZE, nid, true);
61 entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); 65 entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL);
62 set_pte_at(&init_mm, addr, pte, entry); 66 set_pte_at(&init_mm, addr, pte, entry);
63 } while (pte++, addr += PAGE_SIZE, addr != end); 67 } while (pte++, addr += PAGE_SIZE, addr != end);
@@ -75,14 +79,14 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
75 if (boot_cpu_has(X86_FEATURE_GBPAGES) && 79 if (boot_cpu_has(X86_FEATURE_GBPAGES) &&
76 ((end - addr) == PUD_SIZE) && 80 ((end - addr) == PUD_SIZE) &&
77 IS_ALIGNED(addr, PUD_SIZE)) { 81 IS_ALIGNED(addr, PUD_SIZE)) {
78 p = early_alloc(PUD_SIZE, nid); 82 p = early_alloc(PUD_SIZE, nid, false);
79 if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) 83 if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
80 return; 84 return;
81 else if (p) 85 else if (p)
82 memblock_free(__pa(p), PUD_SIZE); 86 memblock_free(__pa(p), PUD_SIZE);
83 } 87 }
84 88
85 p = early_alloc(PAGE_SIZE, nid); 89 p = early_alloc(PAGE_SIZE, nid, true);
86 pud_populate(&init_mm, pud, p); 90 pud_populate(&init_mm, pud, p);
87 } 91 }
88 92
@@ -101,7 +105,7 @@ static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
101 unsigned long next; 105 unsigned long next;
102 106
103 if (p4d_none(*p4d)) { 107 if (p4d_none(*p4d)) {
104 void *p = early_alloc(PAGE_SIZE, nid); 108 void *p = early_alloc(PAGE_SIZE, nid, true);
105 109
106 p4d_populate(&init_mm, p4d, p); 110 p4d_populate(&init_mm, p4d, p);
107 } 111 }
@@ -122,7 +126,7 @@ static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr,
122 unsigned long next; 126 unsigned long next;
123 127
124 if (pgd_none(*pgd)) { 128 if (pgd_none(*pgd)) {
125 p = early_alloc(PAGE_SIZE, nid); 129 p = early_alloc(PAGE_SIZE, nid, true);
126 pgd_populate(&init_mm, pgd, p); 130 pgd_populate(&init_mm, pgd, p);
127 } 131 }
128 132
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 391b13402e40..3ef362f598e3 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -464,37 +464,62 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
464 set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); 464 set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
465} 465}
466 466
467static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start, 467struct sme_populate_pgd_data {
468 unsigned long end) 468 void *pgtable_area;
469 pgd_t *pgd;
470
471 pmdval_t pmd_flags;
472 pteval_t pte_flags;
473 unsigned long paddr;
474
475 unsigned long vaddr;
476 unsigned long vaddr_end;
477};
478
479static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
469{ 480{
470 unsigned long pgd_start, pgd_end, pgd_size; 481 unsigned long pgd_start, pgd_end, pgd_size;
471 pgd_t *pgd_p; 482 pgd_t *pgd_p;
472 483
473 pgd_start = start & PGDIR_MASK; 484 pgd_start = ppd->vaddr & PGDIR_MASK;
474 pgd_end = end & PGDIR_MASK; 485 pgd_end = ppd->vaddr_end & PGDIR_MASK;
475 486
476 pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1); 487 pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t);
477 pgd_size *= sizeof(pgd_t);
478 488
479 pgd_p = pgd_base + pgd_index(start); 489 pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
480 490
481 memset(pgd_p, 0, pgd_size); 491 memset(pgd_p, 0, pgd_size);
482} 492}
483 493
484#define PGD_FLAGS _KERNPG_TABLE_NOENC 494#define PGD_FLAGS _KERNPG_TABLE_NOENC
485#define P4D_FLAGS _KERNPG_TABLE_NOENC 495#define P4D_FLAGS _KERNPG_TABLE_NOENC
486#define PUD_FLAGS _KERNPG_TABLE_NOENC 496#define PUD_FLAGS _KERNPG_TABLE_NOENC
487#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) 497#define PMD_FLAGS _KERNPG_TABLE_NOENC
498
499#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
500
501#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
502#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
503 (_PAGE_PAT | _PAGE_PWT))
504
505#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
506
507#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)
508
509#define PTE_FLAGS_DEC PTE_FLAGS
510#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
511 (_PAGE_PAT | _PAGE_PWT))
512
513#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
488 514
489static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, 515static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
490 unsigned long vaddr, pmdval_t pmd_val)
491{ 516{
492 pgd_t *pgd_p; 517 pgd_t *pgd_p;
493 p4d_t *p4d_p; 518 p4d_t *p4d_p;
494 pud_t *pud_p; 519 pud_t *pud_p;
495 pmd_t *pmd_p; 520 pmd_t *pmd_p;
496 521
497 pgd_p = pgd_base + pgd_index(vaddr); 522 pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
498 if (native_pgd_val(*pgd_p)) { 523 if (native_pgd_val(*pgd_p)) {
499 if (IS_ENABLED(CONFIG_X86_5LEVEL)) 524 if (IS_ENABLED(CONFIG_X86_5LEVEL))
500 p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); 525 p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
@@ -504,15 +529,15 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
504 pgd_t pgd; 529 pgd_t pgd;
505 530
506 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 531 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
507 p4d_p = pgtable_area; 532 p4d_p = ppd->pgtable_area;
508 memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); 533 memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
509 pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; 534 ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
510 535
511 pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); 536 pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
512 } else { 537 } else {
513 pud_p = pgtable_area; 538 pud_p = ppd->pgtable_area;
514 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); 539 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
515 pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; 540 ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
516 541
517 pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); 542 pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
518 } 543 }
@@ -520,58 +545,160 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
520 } 545 }
521 546
522 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 547 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
523 p4d_p += p4d_index(vaddr); 548 p4d_p += p4d_index(ppd->vaddr);
524 if (native_p4d_val(*p4d_p)) { 549 if (native_p4d_val(*p4d_p)) {
525 pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); 550 pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
526 } else { 551 } else {
527 p4d_t p4d; 552 p4d_t p4d;
528 553
529 pud_p = pgtable_area; 554 pud_p = ppd->pgtable_area;
530 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); 555 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
531 pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; 556 ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
532 557
533 p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); 558 p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
534 native_set_p4d(p4d_p, p4d); 559 native_set_p4d(p4d_p, p4d);
535 } 560 }
536 } 561 }
537 562
538 pud_p += pud_index(vaddr); 563 pud_p += pud_index(ppd->vaddr);
539 if (native_pud_val(*pud_p)) { 564 if (native_pud_val(*pud_p)) {
540 if (native_pud_val(*pud_p) & _PAGE_PSE) 565 if (native_pud_val(*pud_p) & _PAGE_PSE)
541 goto out; 566 return NULL;
542 567
543 pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); 568 pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
544 } else { 569 } else {
545 pud_t pud; 570 pud_t pud;
546 571
547 pmd_p = pgtable_area; 572 pmd_p = ppd->pgtable_area;
548 memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); 573 memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
549 pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; 574 ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
550 575
551 pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); 576 pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
552 native_set_pud(pud_p, pud); 577 native_set_pud(pud_p, pud);
553 } 578 }
554 579
555 pmd_p += pmd_index(vaddr); 580 return pmd_p;
581}
582
583static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
584{
585 pmd_t *pmd_p;
586
587 pmd_p = sme_prepare_pgd(ppd);
588 if (!pmd_p)
589 return;
590
591 pmd_p += pmd_index(ppd->vaddr);
556 if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) 592 if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
557 native_set_pmd(pmd_p, native_make_pmd(pmd_val)); 593 native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags));
594}
558 595
559out: 596static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
560 return pgtable_area; 597{
598 pmd_t *pmd_p;
599 pte_t *pte_p;
600
601 pmd_p = sme_prepare_pgd(ppd);
602 if (!pmd_p)
603 return;
604
605 pmd_p += pmd_index(ppd->vaddr);
606 if (native_pmd_val(*pmd_p)) {
607 if (native_pmd_val(*pmd_p) & _PAGE_PSE)
608 return;
609
610 pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK);
611 } else {
612 pmd_t pmd;
613
614 pte_p = ppd->pgtable_area;
615 memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE);
616 ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE;
617
618 pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS);
619 native_set_pmd(pmd_p, pmd);
620 }
621
622 pte_p += pte_index(ppd->vaddr);
623 if (!native_pte_val(*pte_p))
624 native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags));
625}
626
627static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
628{
629 while (ppd->vaddr < ppd->vaddr_end) {
630 sme_populate_pgd_large(ppd);
631
632 ppd->vaddr += PMD_PAGE_SIZE;
633 ppd->paddr += PMD_PAGE_SIZE;
634 }
635}
636
637static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
638{
639 while (ppd->vaddr < ppd->vaddr_end) {
640 sme_populate_pgd(ppd);
641
642 ppd->vaddr += PAGE_SIZE;
643 ppd->paddr += PAGE_SIZE;
644 }
645}
646
647static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
648 pmdval_t pmd_flags, pteval_t pte_flags)
649{
650 unsigned long vaddr_end;
651
652 ppd->pmd_flags = pmd_flags;
653 ppd->pte_flags = pte_flags;
654
655 /* Save original end value since we modify the struct value */
656 vaddr_end = ppd->vaddr_end;
657
658 /* If start is not 2MB aligned, create PTE entries */
659 ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
660 __sme_map_range_pte(ppd);
661
662 /* Create PMD entries */
663 ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
664 __sme_map_range_pmd(ppd);
665
666 /* If end is not 2MB aligned, create PTE entries */
667 ppd->vaddr_end = vaddr_end;
668 __sme_map_range_pte(ppd);
669}
670
671static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
672{
673 __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
674}
675
676static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
677{
678 __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
679}
680
681static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
682{
683 __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
561} 684}
562 685
563static unsigned long __init sme_pgtable_calc(unsigned long len) 686static unsigned long __init sme_pgtable_calc(unsigned long len)
564{ 687{
565 unsigned long p4d_size, pud_size, pmd_size; 688 unsigned long p4d_size, pud_size, pmd_size, pte_size;
566 unsigned long total; 689 unsigned long total;
567 690
568 /* 691 /*
569 * Perform a relatively simplistic calculation of the pagetable 692 * Perform a relatively simplistic calculation of the pagetable
570 * entries that are needed. That mappings will be covered by 2MB 693 * entries that are needed. Those mappings will be covered mostly
571 * PMD entries so we can conservatively calculate the required 694 * by 2MB PMD entries so we can conservatively calculate the required
572 * number of P4D, PUD and PMD structures needed to perform the 695 * number of P4D, PUD and PMD structures needed to perform the
573 * mappings. Incrementing the count for each covers the case where 696 * mappings. For mappings that are not 2MB aligned, PTE mappings
574 * the addresses cross entries. 697 * would be needed for the start and end portion of the address range
698 * that fall outside of the 2MB alignment. This results in, at most,
699 * two extra pages to hold PTE entries for each range that is mapped.
700 * Incrementing the count for each covers the case where the addresses
701 * cross entries.
575 */ 702 */
576 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 703 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
577 p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; 704 p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
@@ -585,8 +712,9 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
585 } 712 }
586 pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; 713 pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
587 pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; 714 pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
715 pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE;
588 716
589 total = p4d_size + pud_size + pmd_size; 717 total = p4d_size + pud_size + pmd_size + pte_size;
590 718
591 /* 719 /*
592 * Now calculate the added pagetable structures needed to populate 720 * Now calculate the added pagetable structures needed to populate
@@ -610,29 +738,29 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
610 return total; 738 return total;
611} 739}
612 740
613void __init sme_encrypt_kernel(void) 741void __init sme_encrypt_kernel(struct boot_params *bp)
614{ 742{
615 unsigned long workarea_start, workarea_end, workarea_len; 743 unsigned long workarea_start, workarea_end, workarea_len;
616 unsigned long execute_start, execute_end, execute_len; 744 unsigned long execute_start, execute_end, execute_len;
617 unsigned long kernel_start, kernel_end, kernel_len; 745 unsigned long kernel_start, kernel_end, kernel_len;
746 unsigned long initrd_start, initrd_end, initrd_len;
747 struct sme_populate_pgd_data ppd;
618 unsigned long pgtable_area_len; 748 unsigned long pgtable_area_len;
619 unsigned long paddr, pmd_flags;
620 unsigned long decrypted_base; 749 unsigned long decrypted_base;
621 void *pgtable_area;
622 pgd_t *pgd;
623 750
624 if (!sme_active()) 751 if (!sme_active())
625 return; 752 return;
626 753
627 /* 754 /*
628 * Prepare for encrypting the kernel by building new pagetables with 755 * Prepare for encrypting the kernel and initrd by building new
629 * the necessary attributes needed to encrypt the kernel in place. 756 * pagetables with the necessary attributes needed to encrypt the
757 * kernel in place.
630 * 758 *
631 * One range of virtual addresses will map the memory occupied 759 * One range of virtual addresses will map the memory occupied
632 * by the kernel as encrypted. 760 * by the kernel and initrd as encrypted.
633 * 761 *
634 * Another range of virtual addresses will map the memory occupied 762 * Another range of virtual addresses will map the memory occupied
635 * by the kernel as decrypted and write-protected. 763 * by the kernel and initrd as decrypted and write-protected.
636 * 764 *
637 * The use of write-protect attribute will prevent any of the 765 * The use of write-protect attribute will prevent any of the
638 * memory from being cached. 766 * memory from being cached.
@@ -643,6 +771,20 @@ void __init sme_encrypt_kernel(void)
643 kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); 771 kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
644 kernel_len = kernel_end - kernel_start; 772 kernel_len = kernel_end - kernel_start;
645 773
774 initrd_start = 0;
775 initrd_end = 0;
776 initrd_len = 0;
777#ifdef CONFIG_BLK_DEV_INITRD
778 initrd_len = (unsigned long)bp->hdr.ramdisk_size |
779 ((unsigned long)bp->ext_ramdisk_size << 32);
780 if (initrd_len) {
781 initrd_start = (unsigned long)bp->hdr.ramdisk_image |
782 ((unsigned long)bp->ext_ramdisk_image << 32);
783 initrd_end = PAGE_ALIGN(initrd_start + initrd_len);
784 initrd_len = initrd_end - initrd_start;
785 }
786#endif
787
646 /* Set the encryption workarea to be immediately after the kernel */ 788 /* Set the encryption workarea to be immediately after the kernel */
647 workarea_start = kernel_end; 789 workarea_start = kernel_end;
648 790
@@ -665,16 +807,21 @@ void __init sme_encrypt_kernel(void)
665 */ 807 */
666 pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; 808 pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
667 pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; 809 pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
810 if (initrd_len)
811 pgtable_area_len += sme_pgtable_calc(initrd_len) * 2;
668 812
669 /* PUDs and PMDs needed in the current pagetables for the workarea */ 813 /* PUDs and PMDs needed in the current pagetables for the workarea */
670 pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); 814 pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
671 815
672 /* 816 /*
673 * The total workarea includes the executable encryption area and 817 * The total workarea includes the executable encryption area and
674 * the pagetable area. 818 * the pagetable area. The start of the workarea is already 2MB
819 * aligned, align the end of the workarea on a 2MB boundary so that
820 * we don't try to create/allocate PTE entries from the workarea
821 * before it is mapped.
675 */ 822 */
676 workarea_len = execute_len + pgtable_area_len; 823 workarea_len = execute_len + pgtable_area_len;
677 workarea_end = workarea_start + workarea_len; 824 workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);
678 825
679 /* 826 /*
680 * Set the address to the start of where newly created pagetable 827 * Set the address to the start of where newly created pagetable
@@ -683,45 +830,30 @@ void __init sme_encrypt_kernel(void)
683 * pagetables and when the new encrypted and decrypted kernel 830 * pagetables and when the new encrypted and decrypted kernel
684 * mappings are populated. 831 * mappings are populated.
685 */ 832 */
686 pgtable_area = (void *)execute_end; 833 ppd.pgtable_area = (void *)execute_end;
687 834
688 /* 835 /*
689 * Make sure the current pagetable structure has entries for 836 * Make sure the current pagetable structure has entries for
690 * addressing the workarea. 837 * addressing the workarea.
691 */ 838 */
692 pgd = (pgd_t *)native_read_cr3_pa(); 839 ppd.pgd = (pgd_t *)native_read_cr3_pa();
693 paddr = workarea_start; 840 ppd.paddr = workarea_start;
694 while (paddr < workarea_end) { 841 ppd.vaddr = workarea_start;
695 pgtable_area = sme_populate_pgd(pgd, pgtable_area, 842 ppd.vaddr_end = workarea_end;
696 paddr, 843 sme_map_range_decrypted(&ppd);
697 paddr + PMD_FLAGS);
698
699 paddr += PMD_PAGE_SIZE;
700 }
701 844
702 /* Flush the TLB - no globals so cr3 is enough */ 845 /* Flush the TLB - no globals so cr3 is enough */
703 native_write_cr3(__native_read_cr3()); 846 native_write_cr3(__native_read_cr3());
704 847
705 /* 848 /*
706 * A new pagetable structure is being built to allow for the kernel 849 * A new pagetable structure is being built to allow for the kernel
707 * to be encrypted. It starts with an empty PGD that will then be 850 * and initrd to be encrypted. It starts with an empty PGD that will
708 * populated with new PUDs and PMDs as the encrypted and decrypted 851 * then be populated with new PUDs and PMDs as the encrypted and
709 * kernel mappings are created. 852 * decrypted kernel mappings are created.
710 */ 853 */
711 pgd = pgtable_area; 854 ppd.pgd = ppd.pgtable_area;
712 memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD); 855 memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
713 pgtable_area += sizeof(*pgd) * PTRS_PER_PGD; 856 ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
714
715 /* Add encrypted kernel (identity) mappings */
716 pmd_flags = PMD_FLAGS | _PAGE_ENC;
717 paddr = kernel_start;
718 while (paddr < kernel_end) {
719 pgtable_area = sme_populate_pgd(pgd, pgtable_area,
720 paddr,
721 paddr + pmd_flags);
722
723 paddr += PMD_PAGE_SIZE;
724 }
725 857
726 /* 858 /*
727 * A different PGD index/entry must be used to get different 859 * A different PGD index/entry must be used to get different
@@ -730,47 +862,79 @@ void __init sme_encrypt_kernel(void)
730 * the base of the mapping. 862 * the base of the mapping.
731 */ 863 */
732 decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); 864 decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
865 if (initrd_len) {
866 unsigned long check_base;
867
868 check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
869 decrypted_base = max(decrypted_base, check_base);
870 }
733 decrypted_base <<= PGDIR_SHIFT; 871 decrypted_base <<= PGDIR_SHIFT;
734 872
873 /* Add encrypted kernel (identity) mappings */
874 ppd.paddr = kernel_start;
875 ppd.vaddr = kernel_start;
876 ppd.vaddr_end = kernel_end;
877 sme_map_range_encrypted(&ppd);
878
735 /* Add decrypted, write-protected kernel (non-identity) mappings */ 879 /* Add decrypted, write-protected kernel (non-identity) mappings */
736 pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT); 880 ppd.paddr = kernel_start;
737 paddr = kernel_start; 881 ppd.vaddr = kernel_start + decrypted_base;
738 while (paddr < kernel_end) { 882 ppd.vaddr_end = kernel_end + decrypted_base;
739 pgtable_area = sme_populate_pgd(pgd, pgtable_area, 883 sme_map_range_decrypted_wp(&ppd);
740 paddr + decrypted_base, 884
741 paddr + pmd_flags); 885 if (initrd_len) {
742 886 /* Add encrypted initrd (identity) mappings */
743 paddr += PMD_PAGE_SIZE; 887 ppd.paddr = initrd_start;
888 ppd.vaddr = initrd_start;
889 ppd.vaddr_end = initrd_end;
890 sme_map_range_encrypted(&ppd);
891 /*
892 * Add decrypted, write-protected initrd (non-identity) mappings
893 */
894 ppd.paddr = initrd_start;
895 ppd.vaddr = initrd_start + decrypted_base;
896 ppd.vaddr_end = initrd_end + decrypted_base;
897 sme_map_range_decrypted_wp(&ppd);
744 } 898 }
745 899
746 /* Add decrypted workarea mappings to both kernel mappings */ 900 /* Add decrypted workarea mappings to both kernel mappings */
747 paddr = workarea_start; 901 ppd.paddr = workarea_start;
748 while (paddr < workarea_end) { 902 ppd.vaddr = workarea_start;
749 pgtable_area = sme_populate_pgd(pgd, pgtable_area, 903 ppd.vaddr_end = workarea_end;
750 paddr, 904 sme_map_range_decrypted(&ppd);
751 paddr + PMD_FLAGS);
752 905
753 pgtable_area = sme_populate_pgd(pgd, pgtable_area, 906 ppd.paddr = workarea_start;
754 paddr + decrypted_base, 907 ppd.vaddr = workarea_start + decrypted_base;
755 paddr + PMD_FLAGS); 908 ppd.vaddr_end = workarea_end + decrypted_base;
756 909 sme_map_range_decrypted(&ppd);
757 paddr += PMD_PAGE_SIZE;
758 }
759 910
760 /* Perform the encryption */ 911 /* Perform the encryption */
761 sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, 912 sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
762 kernel_len, workarea_start, (unsigned long)pgd); 913 kernel_len, workarea_start, (unsigned long)ppd.pgd);
914
915 if (initrd_len)
916 sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
917 initrd_len, workarea_start,
918 (unsigned long)ppd.pgd);
763 919
764 /* 920 /*
765 * At this point we are running encrypted. Remove the mappings for 921 * At this point we are running encrypted. Remove the mappings for
766 * the decrypted areas - all that is needed for this is to remove 922 * the decrypted areas - all that is needed for this is to remove
767 * the PGD entry/entries. 923 * the PGD entry/entries.
768 */ 924 */
769 sme_clear_pgd(pgd, kernel_start + decrypted_base, 925 ppd.vaddr = kernel_start + decrypted_base;
770 kernel_end + decrypted_base); 926 ppd.vaddr_end = kernel_end + decrypted_base;
927 sme_clear_pgd(&ppd);
928
929 if (initrd_len) {
930 ppd.vaddr = initrd_start + decrypted_base;
931 ppd.vaddr_end = initrd_end + decrypted_base;
932 sme_clear_pgd(&ppd);
933 }
771 934
772 sme_clear_pgd(pgd, workarea_start + decrypted_base, 935 ppd.vaddr = workarea_start + decrypted_base;
773 workarea_end + decrypted_base); 936 ppd.vaddr_end = workarea_end + decrypted_base;
937 sme_clear_pgd(&ppd);
774 938
775 /* Flush the TLB - no globals so cr3 is enough */ 939 /* Flush the TLB - no globals so cr3 is enough */
776 native_write_cr3(__native_read_cr3()); 940 native_write_cr3(__native_read_cr3());
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
index 730e6d541df1..01f682cf77a8 100644
--- a/arch/x86/mm/mem_encrypt_boot.S
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -22,9 +22,9 @@ ENTRY(sme_encrypt_execute)
22 22
23 /* 23 /*
24 * Entry parameters: 24 * Entry parameters:
25 * RDI - virtual address for the encrypted kernel mapping 25 * RDI - virtual address for the encrypted mapping
26 * RSI - virtual address for the decrypted kernel mapping 26 * RSI - virtual address for the decrypted mapping
27 * RDX - length of kernel 27 * RDX - length to encrypt
28 * RCX - virtual address of the encryption workarea, including: 28 * RCX - virtual address of the encryption workarea, including:
29 * - stack page (PAGE_SIZE) 29 * - stack page (PAGE_SIZE)
30 * - encryption routine page (PAGE_SIZE) 30 * - encryption routine page (PAGE_SIZE)
@@ -41,9 +41,9 @@ ENTRY(sme_encrypt_execute)
41 addq $PAGE_SIZE, %rax /* Workarea encryption routine */ 41 addq $PAGE_SIZE, %rax /* Workarea encryption routine */
42 42
43 push %r12 43 push %r12
44 movq %rdi, %r10 /* Encrypted kernel */ 44 movq %rdi, %r10 /* Encrypted area */
45 movq %rsi, %r11 /* Decrypted kernel */ 45 movq %rsi, %r11 /* Decrypted area */
46 movq %rdx, %r12 /* Kernel length */ 46 movq %rdx, %r12 /* Area length */
47 47
48 /* Copy encryption routine into the workarea */ 48 /* Copy encryption routine into the workarea */
49 movq %rax, %rdi /* Workarea encryption routine */ 49 movq %rax, %rdi /* Workarea encryption routine */
@@ -52,10 +52,10 @@ ENTRY(sme_encrypt_execute)
52 rep movsb 52 rep movsb
53 53
54 /* Setup registers for call */ 54 /* Setup registers for call */
55 movq %r10, %rdi /* Encrypted kernel */ 55 movq %r10, %rdi /* Encrypted area */
56 movq %r11, %rsi /* Decrypted kernel */ 56 movq %r11, %rsi /* Decrypted area */
57 movq %r8, %rdx /* Pagetables used for encryption */ 57 movq %r8, %rdx /* Pagetables used for encryption */
58 movq %r12, %rcx /* Kernel length */ 58 movq %r12, %rcx /* Area length */
59 movq %rax, %r8 /* Workarea encryption routine */ 59 movq %rax, %r8 /* Workarea encryption routine */
60 addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */ 60 addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */
61 61
@@ -71,7 +71,7 @@ ENDPROC(sme_encrypt_execute)
71 71
72ENTRY(__enc_copy) 72ENTRY(__enc_copy)
73/* 73/*
74 * Routine used to encrypt kernel. 74 * Routine used to encrypt memory in place.
75 * This routine must be run outside of the kernel proper since 75 * This routine must be run outside of the kernel proper since
76 * the kernel will be encrypted during the process. So this 76 * the kernel will be encrypted during the process. So this
77 * routine is defined here and then copied to an area outside 77 * routine is defined here and then copied to an area outside
@@ -79,19 +79,19 @@ ENTRY(__enc_copy)
79 * during execution. 79 * during execution.
80 * 80 *
81 * On entry the registers must be: 81 * On entry the registers must be:
82 * RDI - virtual address for the encrypted kernel mapping 82 * RDI - virtual address for the encrypted mapping
83 * RSI - virtual address for the decrypted kernel mapping 83 * RSI - virtual address for the decrypted mapping
84 * RDX - address of the pagetables to use for encryption 84 * RDX - address of the pagetables to use for encryption
85 * RCX - length of kernel 85 * RCX - length of area
86 * R8 - intermediate copy buffer 86 * R8 - intermediate copy buffer
87 * 87 *
88 * RAX - points to this routine 88 * RAX - points to this routine
89 * 89 *
90 * The kernel will be encrypted by copying from the non-encrypted 90 * The area will be encrypted by copying from the non-encrypted
91 * kernel space to an intermediate buffer and then copying from the 91 * memory space to an intermediate buffer and then copying from the
92 * intermediate buffer back to the encrypted kernel space. The physical 92 * intermediate buffer back to the encrypted memory space. The physical
93 * addresses of the two kernel space mappings are the same which 93 * addresses of the two mappings are the same which results in the area
94 * results in the kernel being encrypted "in place". 94 * being encrypted "in place".
95 */ 95 */
96 /* Enable the new page tables */ 96 /* Enable the new page tables */
97 mov %rdx, %cr3 97 mov %rdx, %cr3
@@ -103,47 +103,55 @@ ENTRY(__enc_copy)
103 orq $X86_CR4_PGE, %rdx 103 orq $X86_CR4_PGE, %rdx
104 mov %rdx, %cr4 104 mov %rdx, %cr4
105 105
106 push %r15
107 push %r12
108
109 movq %rcx, %r9 /* Save area length */
110 movq %rdi, %r10 /* Save encrypted area address */
111 movq %rsi, %r11 /* Save decrypted area address */
112
106 /* Set the PAT register PA5 entry to write-protect */ 113 /* Set the PAT register PA5 entry to write-protect */
107 push %rcx
108 movl $MSR_IA32_CR_PAT, %ecx 114 movl $MSR_IA32_CR_PAT, %ecx
109 rdmsr 115 rdmsr
110 push %rdx /* Save original PAT value */ 116 mov %rdx, %r15 /* Save original PAT value */
111 andl $0xffff00ff, %edx /* Clear PA5 */ 117 andl $0xffff00ff, %edx /* Clear PA5 */
112 orl $0x00000500, %edx /* Set PA5 to WP */ 118 orl $0x00000500, %edx /* Set PA5 to WP */
113 wrmsr 119 wrmsr
114 pop %rdx /* RDX contains original PAT value */
115 pop %rcx
116
117 movq %rcx, %r9 /* Save kernel length */
118 movq %rdi, %r10 /* Save encrypted kernel address */
119 movq %rsi, %r11 /* Save decrypted kernel address */
120 120
121 wbinvd /* Invalidate any cache entries */ 121 wbinvd /* Invalidate any cache entries */
122 122
123 /* Copy/encrypt 2MB at a time */ 123 /* Copy/encrypt up to 2MB at a time */
124 movq $PMD_PAGE_SIZE, %r12
1241: 1251:
125 movq %r11, %rsi /* Source - decrypted kernel */ 126 cmpq %r12, %r9
127 jnb 2f
128 movq %r9, %r12
129
1302:
131 movq %r11, %rsi /* Source - decrypted area */
126 movq %r8, %rdi /* Dest - intermediate copy buffer */ 132 movq %r8, %rdi /* Dest - intermediate copy buffer */
127 movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ 133 movq %r12, %rcx
128 rep movsb 134 rep movsb
129 135
130 movq %r8, %rsi /* Source - intermediate copy buffer */ 136 movq %r8, %rsi /* Source - intermediate copy buffer */
131 movq %r10, %rdi /* Dest - encrypted kernel */ 137 movq %r10, %rdi /* Dest - encrypted area */
132 movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ 138 movq %r12, %rcx
133 rep movsb 139 rep movsb
134 140
135 addq $PMD_PAGE_SIZE, %r11 141 addq %r12, %r11
136 addq $PMD_PAGE_SIZE, %r10 142 addq %r12, %r10
137 subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */ 143 subq %r12, %r9 /* Kernel length decrement */
138 jnz 1b /* Kernel length not zero? */ 144 jnz 1b /* Kernel length not zero? */
139 145
140 /* Restore PAT register */ 146 /* Restore PAT register */
141 push %rdx /* Save original PAT value */
142 movl $MSR_IA32_CR_PAT, %ecx 147 movl $MSR_IA32_CR_PAT, %ecx
143 rdmsr 148 rdmsr
144 pop %rdx /* Restore original PAT value */ 149 mov %r15, %rdx /* Restore original PAT value */
145 wrmsr 150 wrmsr
146 151
152 pop %r12
153 pop %r15
154
147 ret 155 ret
148.L__enc_copy_end: 156.L__enc_copy_end:
149ENDPROC(__enc_copy) 157ENDPROC(__enc_copy)
diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c
index f9042bcc27a4..7b14d6280e44 100644
--- a/drivers/gpio/gpio-mmio.c
+++ b/drivers/gpio/gpio-mmio.c
@@ -152,14 +152,13 @@ static int bgpio_get_set_multiple(struct gpio_chip *gc, unsigned long *mask,
152{ 152{
153 unsigned long get_mask = 0; 153 unsigned long get_mask = 0;
154 unsigned long set_mask = 0; 154 unsigned long set_mask = 0;
155 int bit = 0;
156 155
157 while ((bit = find_next_bit(mask, gc->ngpio, bit)) != gc->ngpio) { 156 /* Make sure we first clear any bits that are zero when we read the register */
158 if (gc->bgpio_dir & BIT(bit)) 157 *bits &= ~*mask;
159 set_mask |= BIT(bit); 158
160 else 159 /* Exploit the fact that we know which directions are set */
161 get_mask |= BIT(bit); 160 set_mask = *mask & gc->bgpio_dir;
162 } 161 get_mask = *mask & ~gc->bgpio_dir;
163 162
164 if (set_mask) 163 if (set_mask)
165 *bits |= gc->read_reg(gc->reg_set) & set_mask; 164 *bits |= gc->read_reg(gc->reg_set) & set_mask;
@@ -176,13 +175,13 @@ static int bgpio_get(struct gpio_chip *gc, unsigned int gpio)
176 175
177/* 176/*
178 * This only works if the bits in the GPIO register are in native endianness. 177 * This only works if the bits in the GPIO register are in native endianness.
179 * It is dirt simple and fast in this case. (Also the most common case.)
180 */ 178 */
181static int bgpio_get_multiple(struct gpio_chip *gc, unsigned long *mask, 179static int bgpio_get_multiple(struct gpio_chip *gc, unsigned long *mask,
182 unsigned long *bits) 180 unsigned long *bits)
183{ 181{
184 182 /* Make sure we first clear any bits that are zero when we read the register */
185 *bits = gc->read_reg(gc->reg_dat) & *mask; 183 *bits &= ~*mask;
184 *bits |= gc->read_reg(gc->reg_dat) & *mask;
186 return 0; 185 return 0;
187} 186}
188 187
@@ -196,9 +195,12 @@ static int bgpio_get_multiple_be(struct gpio_chip *gc, unsigned long *mask,
196 unsigned long val; 195 unsigned long val;
197 int bit; 196 int bit;
198 197
198 /* Make sure we first clear any bits that are zero when we read the register */
199 *bits &= ~*mask;
200
199 /* Create a mirrored mask */ 201 /* Create a mirrored mask */
200 bit = 0; 202 bit = -1;
201 while ((bit = find_next_bit(mask, gc->ngpio, bit)) != gc->ngpio) 203 while ((bit = find_next_bit(mask, gc->ngpio, bit + 1)) < gc->ngpio)
202 readmask |= bgpio_line2mask(gc, bit); 204 readmask |= bgpio_line2mask(gc, bit);
203 205
204 /* Read the register */ 206 /* Read the register */
@@ -208,8 +210,8 @@ static int bgpio_get_multiple_be(struct gpio_chip *gc, unsigned long *mask,
208 * Mirror the result into the "bits" result, this will give line 0 210 * Mirror the result into the "bits" result, this will give line 0
209 * in bit 0 ... line 31 in bit 31 for a 32bit register. 211 * in bit 0 ... line 31 in bit 31 for a 32bit register.
210 */ 212 */
211 bit = 0; 213 bit = -1;
212 while ((bit = find_next_bit(&val, gc->ngpio, bit)) != gc->ngpio) 214 while ((bit = find_next_bit(&val, gc->ngpio, bit + 1)) < gc->ngpio)
213 *bits |= bgpio_line2mask(gc, bit); 215 *bits |= bgpio_line2mask(gc, bit);
214 216
215 return 0; 217 return 0;
diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c
index 85140c9af581..8b941f814472 100644
--- a/drivers/mmc/host/sdhci-esdhc-imx.c
+++ b/drivers/mmc/host/sdhci-esdhc-imx.c
@@ -687,6 +687,20 @@ static inline void esdhc_pltfm_set_clock(struct sdhci_host *host,
687 return; 687 return;
688 } 688 }
689 689
690 /* For i.MX53 eSDHCv3, SYSCTL.SDCLKFS may not be set to 0. */
691 if (is_imx53_esdhc(imx_data)) {
692 /*
693 * According to the i.MX53 reference manual, if DLLCTRL[10] can
694 * be set, then the controller is eSDHCv3, else it is eSDHCv2.
695 */
696 val = readl(host->ioaddr + ESDHC_DLL_CTRL);
697 writel(val | BIT(10), host->ioaddr + ESDHC_DLL_CTRL);
698 temp = readl(host->ioaddr + ESDHC_DLL_CTRL);
699 writel(val, host->ioaddr + ESDHC_DLL_CTRL);
700 if (temp & BIT(10))
701 pre_div = 2;
702 }
703
690 temp = sdhci_readl(host, ESDHC_SYSTEM_CONTROL); 704 temp = sdhci_readl(host, ESDHC_SYSTEM_CONTROL);
691 temp &= ~(ESDHC_CLOCK_IPGEN | ESDHC_CLOCK_HCKEN | ESDHC_CLOCK_PEREN 705 temp &= ~(ESDHC_CLOCK_IPGEN | ESDHC_CLOCK_HCKEN | ESDHC_CLOCK_PEREN
692 | ESDHC_CLOCK_MASK); 706 | ESDHC_CLOCK_MASK);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d53550e612bc..4276ebfff22b 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -451,10 +451,13 @@ static void **nvme_pci_iod_list(struct request *req)
451static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) 451static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
452{ 452{
453 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 453 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
454 int nseg = blk_rq_nr_phys_segments(req);
454 unsigned int avg_seg_size; 455 unsigned int avg_seg_size;
455 456
456 avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), 457 if (nseg == 0)
457 blk_rq_nr_phys_segments(req)); 458 return false;
459
460 avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
458 461
459 if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) 462 if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
460 return false; 463 return false;
@@ -722,20 +725,19 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
722} 725}
723 726
724static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, 727static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
725 struct request *req, struct nvme_rw_command *cmd) 728 struct request *req, struct nvme_rw_command *cmd, int entries)
726{ 729{
727 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 730 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
728 int length = blk_rq_payload_bytes(req);
729 struct dma_pool *pool; 731 struct dma_pool *pool;
730 struct nvme_sgl_desc *sg_list; 732 struct nvme_sgl_desc *sg_list;
731 struct scatterlist *sg = iod->sg; 733 struct scatterlist *sg = iod->sg;
732 int entries = iod->nents, i = 0;
733 dma_addr_t sgl_dma; 734 dma_addr_t sgl_dma;
735 int i = 0;
734 736
735 /* setting the transfer type as SGL */ 737 /* setting the transfer type as SGL */
736 cmd->flags = NVME_CMD_SGL_METABUF; 738 cmd->flags = NVME_CMD_SGL_METABUF;
737 739
738 if (length == sg_dma_len(sg)) { 740 if (entries == 1) {
739 nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); 741 nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
740 return BLK_STS_OK; 742 return BLK_STS_OK;
741 } 743 }
@@ -775,13 +777,9 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
775 } 777 }
776 778
777 nvme_pci_sgl_set_data(&sg_list[i++], sg); 779 nvme_pci_sgl_set_data(&sg_list[i++], sg);
778
779 length -= sg_dma_len(sg);
780 sg = sg_next(sg); 780 sg = sg_next(sg);
781 entries--; 781 } while (--entries > 0);
782 } while (length > 0);
783 782
784 WARN_ON(entries > 0);
785 return BLK_STS_OK; 783 return BLK_STS_OK;
786} 784}
787 785
@@ -793,6 +791,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
793 enum dma_data_direction dma_dir = rq_data_dir(req) ? 791 enum dma_data_direction dma_dir = rq_data_dir(req) ?
794 DMA_TO_DEVICE : DMA_FROM_DEVICE; 792 DMA_TO_DEVICE : DMA_FROM_DEVICE;
795 blk_status_t ret = BLK_STS_IOERR; 793 blk_status_t ret = BLK_STS_IOERR;
794 int nr_mapped;
796 795
797 sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); 796 sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
798 iod->nents = blk_rq_map_sg(q, req, iod->sg); 797 iod->nents = blk_rq_map_sg(q, req, iod->sg);
@@ -800,12 +799,13 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
800 goto out; 799 goto out;
801 800
802 ret = BLK_STS_RESOURCE; 801 ret = BLK_STS_RESOURCE;
803 if (!dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, 802 nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
804 DMA_ATTR_NO_WARN)) 803 DMA_ATTR_NO_WARN);
804 if (!nr_mapped)
805 goto out; 805 goto out;
806 806
807 if (iod->use_sgl) 807 if (iod->use_sgl)
808 ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); 808 ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
809 else 809 else
810 ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); 810 ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
811 811
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 4178d2493547..5e335b6203f4 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -71,7 +71,7 @@ extern void delayacct_init(void);
71extern void __delayacct_tsk_init(struct task_struct *); 71extern void __delayacct_tsk_init(struct task_struct *);
72extern void __delayacct_tsk_exit(struct task_struct *); 72extern void __delayacct_tsk_exit(struct task_struct *);
73extern void __delayacct_blkio_start(void); 73extern void __delayacct_blkio_start(void);
74extern void __delayacct_blkio_end(void); 74extern void __delayacct_blkio_end(struct task_struct *);
75extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); 75extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
76extern __u64 __delayacct_blkio_ticks(struct task_struct *); 76extern __u64 __delayacct_blkio_ticks(struct task_struct *);
77extern void __delayacct_freepages_start(void); 77extern void __delayacct_freepages_start(void);
@@ -122,10 +122,10 @@ static inline void delayacct_blkio_start(void)
122 __delayacct_blkio_start(); 122 __delayacct_blkio_start();
123} 123}
124 124
125static inline void delayacct_blkio_end(void) 125static inline void delayacct_blkio_end(struct task_struct *p)
126{ 126{
127 if (current->delays) 127 if (current->delays)
128 __delayacct_blkio_end(); 128 __delayacct_blkio_end(p);
129 delayacct_clear_flag(DELAYACCT_PF_BLKIO); 129 delayacct_clear_flag(DELAYACCT_PF_BLKIO);
130} 130}
131 131
@@ -169,7 +169,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk)
169{} 169{}
170static inline void delayacct_blkio_start(void) 170static inline void delayacct_blkio_start(void)
171{} 171{}
172static inline void delayacct_blkio_end(void) 172static inline void delayacct_blkio_end(struct task_struct *p)
173{} 173{}
174static inline int delayacct_add_tsk(struct taskstats *d, 174static inline int delayacct_add_tsk(struct taskstats *d,
175 struct task_struct *tsk) 175 struct task_struct *tsk)
diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h
index bae807eb2933..853291714ae0 100644
--- a/include/linux/vermagic.h
+++ b/include/linux/vermagic.h
@@ -31,11 +31,17 @@
31#else 31#else
32#define MODULE_RANDSTRUCT_PLUGIN 32#define MODULE_RANDSTRUCT_PLUGIN
33#endif 33#endif
34#ifdef RETPOLINE
35#define MODULE_VERMAGIC_RETPOLINE "retpoline "
36#else
37#define MODULE_VERMAGIC_RETPOLINE ""
38#endif
34 39
35#define VERMAGIC_STRING \ 40#define VERMAGIC_STRING \
36 UTS_RELEASE " " \ 41 UTS_RELEASE " " \
37 MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ 42 MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \
38 MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \ 43 MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \
39 MODULE_ARCH_VERMAGIC \ 44 MODULE_ARCH_VERMAGIC \
40 MODULE_RANDSTRUCT_PLUGIN 45 MODULE_RANDSTRUCT_PLUGIN \
46 MODULE_VERMAGIC_RETPOLINE
41 47
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 4a1c33416b6a..e2764d767f18 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk)
51 * Finish delay accounting for a statistic using its timestamps (@start), 51 * Finish delay accounting for a statistic using its timestamps (@start),
52 * accumalator (@total) and @count 52 * accumalator (@total) and @count
53 */ 53 */
54static void delayacct_end(u64 *start, u64 *total, u32 *count) 54static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
55{ 55{
56 s64 ns = ktime_get_ns() - *start; 56 s64 ns = ktime_get_ns() - *start;
57 unsigned long flags; 57 unsigned long flags;
58 58
59 if (ns > 0) { 59 if (ns > 0) {
60 spin_lock_irqsave(&current->delays->lock, flags); 60 spin_lock_irqsave(lock, flags);
61 *total += ns; 61 *total += ns;
62 (*count)++; 62 (*count)++;
63 spin_unlock_irqrestore(&current->delays->lock, flags); 63 spin_unlock_irqrestore(lock, flags);
64 } 64 }
65} 65}
66 66
@@ -69,17 +69,25 @@ void __delayacct_blkio_start(void)
69 current->delays->blkio_start = ktime_get_ns(); 69 current->delays->blkio_start = ktime_get_ns();
70} 70}
71 71
72void __delayacct_blkio_end(void) 72/*
73 * We cannot rely on the `current` macro, as we haven't yet switched back to
74 * the process being woken.
75 */
76void __delayacct_blkio_end(struct task_struct *p)
73{ 77{
74 if (current->delays->flags & DELAYACCT_PF_SWAPIN) 78 struct task_delay_info *delays = p->delays;
75 /* Swapin block I/O */ 79 u64 *total;
76 delayacct_end(&current->delays->blkio_start, 80 u32 *count;
77 &current->delays->swapin_delay, 81
78 &current->delays->swapin_count); 82 if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
79 else /* Other block I/O */ 83 total = &delays->swapin_delay;
80 delayacct_end(&current->delays->blkio_start, 84 count = &delays->swapin_count;
81 &current->delays->blkio_delay, 85 } else {
82 &current->delays->blkio_count); 86 total = &delays->blkio_delay;
87 count = &delays->blkio_count;
88 }
89
90 delayacct_end(&delays->lock, &delays->blkio_start, total, count);
83} 91}
84 92
85int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) 93int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -153,8 +161,10 @@ void __delayacct_freepages_start(void)
153 161
154void __delayacct_freepages_end(void) 162void __delayacct_freepages_end(void)
155{ 163{
156 delayacct_end(&current->delays->freepages_start, 164 delayacct_end(
157 &current->delays->freepages_delay, 165 &current->delays->lock,
158 &current->delays->freepages_count); 166 &current->delays->freepages_start,
167 &current->delays->freepages_delay,
168 &current->delays->freepages_count);
159} 169}
160 170
diff --git a/kernel/futex.c b/kernel/futex.c
index 57d0b3657e16..8c5424dd5924 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1878 struct futex_q *this, *next; 1878 struct futex_q *this, *next;
1879 DEFINE_WAKE_Q(wake_q); 1879 DEFINE_WAKE_Q(wake_q);
1880 1880
1881 if (nr_wake < 0 || nr_requeue < 0)
1882 return -EINVAL;
1883
1881 /* 1884 /*
1882 * When PI not supported: return -ENOSYS if requeue_pi is true, 1885 * When PI not supported: return -ENOSYS if requeue_pi is true,
1883 * consequently the compiler knows requeue_pi is always false past 1886 * consequently the compiler knows requeue_pi is always false past
@@ -2294,21 +2297,17 @@ static void unqueue_me_pi(struct futex_q *q)
2294 spin_unlock(q->lock_ptr); 2297 spin_unlock(q->lock_ptr);
2295} 2298}
2296 2299
2297/*
2298 * Fixup the pi_state owner with the new owner.
2299 *
2300 * Must be called with hash bucket lock held and mm->sem held for non
2301 * private futexes.
2302 */
2303static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 2300static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2304 struct task_struct *newowner) 2301 struct task_struct *argowner)
2305{ 2302{
2306 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2307 struct futex_pi_state *pi_state = q->pi_state; 2303 struct futex_pi_state *pi_state = q->pi_state;
2308 u32 uval, uninitialized_var(curval), newval; 2304 u32 uval, uninitialized_var(curval), newval;
2309 struct task_struct *oldowner; 2305 struct task_struct *oldowner, *newowner;
2306 u32 newtid;
2310 int ret; 2307 int ret;
2311 2308
2309 lockdep_assert_held(q->lock_ptr);
2310
2312 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2311 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2313 2312
2314 oldowner = pi_state->owner; 2313 oldowner = pi_state->owner;
@@ -2317,11 +2316,17 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2317 newtid |= FUTEX_OWNER_DIED; 2316 newtid |= FUTEX_OWNER_DIED;
2318 2317
2319 /* 2318 /*
2320 * We are here either because we stole the rtmutex from the 2319 * We are here because either:
2321 * previous highest priority waiter or we are the highest priority 2320 *
2322 * waiter but have failed to get the rtmutex the first time. 2321 * - we stole the lock and pi_state->owner needs updating to reflect
2322 * that (@argowner == current),
2323 *
2324 * or:
2325 *
2326 * - someone stole our lock and we need to fix things to point to the
2327 * new owner (@argowner == NULL).
2323 * 2328 *
2324 * We have to replace the newowner TID in the user space variable. 2329 * Either way, we have to replace the TID in the user space variable.
2325 * This must be atomic as we have to preserve the owner died bit here. 2330 * This must be atomic as we have to preserve the owner died bit here.
2326 * 2331 *
2327 * Note: We write the user space value _before_ changing the pi_state 2332 * Note: We write the user space value _before_ changing the pi_state
@@ -2334,6 +2339,42 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2334 * in the PID check in lookup_pi_state. 2339 * in the PID check in lookup_pi_state.
2335 */ 2340 */
2336retry: 2341retry:
2342 if (!argowner) {
2343 if (oldowner != current) {
2344 /*
2345 * We raced against a concurrent self; things are
2346 * already fixed up. Nothing to do.
2347 */
2348 ret = 0;
2349 goto out_unlock;
2350 }
2351
2352 if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
2353 /* We got the lock after all, nothing to fix. */
2354 ret = 0;
2355 goto out_unlock;
2356 }
2357
2358 /*
2359 * Since we just failed the trylock; there must be an owner.
2360 */
2361 newowner = rt_mutex_owner(&pi_state->pi_mutex);
2362 BUG_ON(!newowner);
2363 } else {
2364 WARN_ON_ONCE(argowner != current);
2365 if (oldowner == current) {
2366 /*
2367 * We raced against a concurrent self; things are
2368 * already fixed up. Nothing to do.
2369 */
2370 ret = 0;
2371 goto out_unlock;
2372 }
2373 newowner = argowner;
2374 }
2375
2376 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2377
2337 if (get_futex_value_locked(&uval, uaddr)) 2378 if (get_futex_value_locked(&uval, uaddr))
2338 goto handle_fault; 2379 goto handle_fault;
2339 2380
@@ -2434,9 +2475,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2434 * Got the lock. We might not be the anticipated owner if we 2475 * Got the lock. We might not be the anticipated owner if we
2435 * did a lock-steal - fix up the PI-state in that case: 2476 * did a lock-steal - fix up the PI-state in that case:
2436 * 2477 *
2437 * We can safely read pi_state->owner without holding wait_lock 2478 * Speculative pi_state->owner read (we don't hold wait_lock);
2438 * because we now own the rt_mutex, only the owner will attempt 2479 * since we own the lock pi_state->owner == current is the
2439 * to change it. 2480 * stable state, anything else needs more attention.
2440 */ 2481 */
2441 if (q->pi_state->owner != current) 2482 if (q->pi_state->owner != current)
2442 ret = fixup_pi_state_owner(uaddr, q, current); 2483 ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2444,6 +2485,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2444 } 2485 }
2445 2486
2446 /* 2487 /*
2488 * If we didn't get the lock; check if anybody stole it from us. In
2489 * that case, we need to fix up the uval to point to them instead of
2490 * us, otherwise bad things happen. [10]
2491 *
2492 * Another speculative read; pi_state->owner == current is unstable
2493 * but needs our attention.
2494 */
2495 if (q->pi_state->owner == current) {
2496 ret = fixup_pi_state_owner(uaddr, q, NULL);
2497 goto out;
2498 }
2499
2500 /*
2447 * Paranoia check. If we did not take the lock, then we should not be 2501 * Paranoia check. If we did not take the lock, then we should not be
2448 * the owner of the rt_mutex. 2502 * the owner of the rt_mutex.
2449 */ 2503 */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6f3dba6e4e9e..65cc0cb984e6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
1290 return ret; 1290 return ret;
1291} 1291}
1292 1292
1293static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
1294{
1295 int ret = try_to_take_rt_mutex(lock, current, NULL);
1296
1297 /*
1298 * try_to_take_rt_mutex() sets the lock waiters bit
1299 * unconditionally. Clean this up.
1300 */
1301 fixup_rt_mutex_waiters(lock);
1302
1303 return ret;
1304}
1305
1293/* 1306/*
1294 * Slow path try-lock function: 1307 * Slow path try-lock function:
1295 */ 1308 */
@@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
1312 */ 1325 */
1313 raw_spin_lock_irqsave(&lock->wait_lock, flags); 1326 raw_spin_lock_irqsave(&lock->wait_lock, flags);
1314 1327
1315 ret = try_to_take_rt_mutex(lock, current, NULL); 1328 ret = __rt_mutex_slowtrylock(lock);
1316
1317 /*
1318 * try_to_take_rt_mutex() sets the lock waiters bit
1319 * unconditionally. Clean this up.
1320 */
1321 fixup_rt_mutex_waiters(lock);
1322 1329
1323 raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 1330 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1324 1331
@@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
1505 return rt_mutex_slowtrylock(lock); 1512 return rt_mutex_slowtrylock(lock);
1506} 1513}
1507 1514
1515int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
1516{
1517 return __rt_mutex_slowtrylock(lock);
1518}
1519
1508/** 1520/**
1509 * rt_mutex_timed_lock - lock a rt_mutex interruptible 1521 * rt_mutex_timed_lock - lock a rt_mutex interruptible
1510 * the timeout structure is provided 1522 * the timeout structure is provided
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 124e98ca0b17..68686b3ec3c1 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
148 struct rt_mutex_waiter *waiter); 148 struct rt_mutex_waiter *waiter);
149 149
150extern int rt_mutex_futex_trylock(struct rt_mutex *l); 150extern int rt_mutex_futex_trylock(struct rt_mutex *l);
151extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
151 152
152extern void rt_mutex_futex_unlock(struct rt_mutex *lock); 153extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
153extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, 154extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 644fa2e3d993..a7bf32aabfda 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2056 p->state = TASK_WAKING; 2056 p->state = TASK_WAKING;
2057 2057
2058 if (p->in_iowait) { 2058 if (p->in_iowait) {
2059 delayacct_blkio_end(); 2059 delayacct_blkio_end(p);
2060 atomic_dec(&task_rq(p)->nr_iowait); 2060 atomic_dec(&task_rq(p)->nr_iowait);
2061 } 2061 }
2062 2062
@@ -2069,7 +2069,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2069#else /* CONFIG_SMP */ 2069#else /* CONFIG_SMP */
2070 2070
2071 if (p->in_iowait) { 2071 if (p->in_iowait) {
2072 delayacct_blkio_end(); 2072 delayacct_blkio_end(p);
2073 atomic_dec(&task_rq(p)->nr_iowait); 2073 atomic_dec(&task_rq(p)->nr_iowait);
2074 } 2074 }
2075 2075
@@ -2122,7 +2122,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2122 2122
2123 if (!task_on_rq_queued(p)) { 2123 if (!task_on_rq_queued(p)) {
2124 if (p->in_iowait) { 2124 if (p->in_iowait) {
2125 delayacct_blkio_end(); 2125 delayacct_blkio_end(p);
2126 atomic_dec(&rq->nr_iowait); 2126 atomic_dec(&rq->nr_iowait);
2127 } 2127 }
2128 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); 2128 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 89a9e1b4264a..0bcf00e3ce48 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1696,7 +1696,7 @@ void run_local_timers(void)
1696 hrtimer_run_queues(); 1696 hrtimer_run_queues();
1697 /* Raise the softirq only if required. */ 1697 /* Raise the softirq only if required. */
1698 if (time_before(jiffies, base->clk)) { 1698 if (time_before(jiffies, base->clk)) {
1699 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) 1699 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
1700 return; 1700 return;
1701 /* CPU is awake, so check the deferrable base. */ 1701 /* CPU is awake, so check the deferrable base. */
1702 base++; 1702 base++;
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index cb8997ed0149..47cddf32aeba 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -265,12 +265,18 @@ else
265objtool_args += $(call cc-ifversion, -lt, 0405, --no-unreachable) 265objtool_args += $(call cc-ifversion, -lt, 0405, --no-unreachable)
266endif 266endif
267 267
268ifdef CONFIG_MODVERSIONS
269objtool_o = $(@D)/.tmp_$(@F)
270else
271objtool_o = $(@)
272endif
273
268# 'OBJECT_FILES_NON_STANDARD := y': skip objtool checking for a directory 274# 'OBJECT_FILES_NON_STANDARD := y': skip objtool checking for a directory
269# 'OBJECT_FILES_NON_STANDARD_foo.o := 'y': skip objtool checking for a file 275# 'OBJECT_FILES_NON_STANDARD_foo.o := 'y': skip objtool checking for a file
270# 'OBJECT_FILES_NON_STANDARD_foo.o := 'n': override directory skip for a file 276# 'OBJECT_FILES_NON_STANDARD_foo.o := 'n': override directory skip for a file
271cmd_objtool = $(if $(patsubst y%,, \ 277cmd_objtool = $(if $(patsubst y%,, \
272 $(OBJECT_FILES_NON_STANDARD_$(basetarget).o)$(OBJECT_FILES_NON_STANDARD)n), \ 278 $(OBJECT_FILES_NON_STANDARD_$(basetarget).o)$(OBJECT_FILES_NON_STANDARD)n), \
273 $(__objtool_obj) $(objtool_args) "$(@)";) 279 $(__objtool_obj) $(objtool_args) "$(objtool_o)";)
274objtool_obj = $(if $(patsubst y%,, \ 280objtool_obj = $(if $(patsubst y%,, \
275 $(OBJECT_FILES_NON_STANDARD_$(basetarget).o)$(OBJECT_FILES_NON_STANDARD)n), \ 281 $(OBJECT_FILES_NON_STANDARD_$(basetarget).o)$(OBJECT_FILES_NON_STANDARD)n), \
276 $(__objtool_obj)) 282 $(__objtool_obj))
@@ -286,16 +292,16 @@ objtool_dep = $(objtool_obj) \
286define rule_cc_o_c 292define rule_cc_o_c
287 $(call echo-cmd,checksrc) $(cmd_checksrc) \ 293 $(call echo-cmd,checksrc) $(cmd_checksrc) \
288 $(call cmd_and_fixdep,cc_o_c) \ 294 $(call cmd_and_fixdep,cc_o_c) \
289 $(cmd_modversions_c) \
290 $(cmd_checkdoc) \ 295 $(cmd_checkdoc) \
291 $(call echo-cmd,objtool) $(cmd_objtool) \ 296 $(call echo-cmd,objtool) $(cmd_objtool) \
297 $(cmd_modversions_c) \
292 $(call echo-cmd,record_mcount) $(cmd_record_mcount) 298 $(call echo-cmd,record_mcount) $(cmd_record_mcount)
293endef 299endef
294 300
295define rule_as_o_S 301define rule_as_o_S
296 $(call cmd_and_fixdep,as_o_S) \ 302 $(call cmd_and_fixdep,as_o_S) \
297 $(cmd_modversions_S) \ 303 $(call echo-cmd,objtool) $(cmd_objtool) \
298 $(call echo-cmd,objtool) $(cmd_objtool) 304 $(cmd_modversions_S)
299endef 305endef
300 306
301# List module undefined symbols (or empty line if not enabled) 307# List module undefined symbols (or empty line if not enabled)
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 24460155c82c..c1c338661699 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -26,6 +26,7 @@
26#include <stdlib.h> 26#include <stdlib.h>
27#include <string.h> 27#include <string.h>
28#include <unistd.h> 28#include <unistd.h>
29#include <errno.h>
29 30
30#include "elf.h" 31#include "elf.h"
31#include "warn.h" 32#include "warn.h"
@@ -358,7 +359,8 @@ struct elf *elf_open(const char *name, int flags)
358 359
359 elf->fd = open(name, flags); 360 elf->fd = open(name, flags);
360 if (elf->fd == -1) { 361 if (elf->fd == -1) {
361 perror("open"); 362 fprintf(stderr, "objtool: Can't open '%s': %s\n",
363 name, strerror(errno));
362 goto err; 364 goto err;
363 } 365 }
364 366