aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-05-20 14:28:32 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2018-05-20 14:28:32 -0400
commit8a6bd2f40e96fb4d96749ab029c61f0df218b003 (patch)
tree33bbe31fad29d88c066f91fb577c92496f659122
parentb9aad92236391f681083fa4045083d5b846b59e0 (diff)
parentacf46020012ccbca1172e9c7aeab399c950d9212 (diff)
Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 fixes from Thomas Gleixner: "An unfortunately larger set of fixes, but a large portion is selftests: - Fix the missing clusterid initialization for x2apic cluster management which caused boot failures due to IPIs being sent to the wrong cluster - Drop TS_COMPAT when a 64bit executable is exec()'ed from a compat task - Wrap access to __supported_pte_mask in __startup_64() where clang compile fails due to a non PC relative access being generated. - Two fixes for 5 level paging fallout in the decompressor: - Handle GOT correctly for paging_prepare() and cleanup_trampoline() - Fix the page table handling in cleanup_trampoline() to avoid page table corruption. - Stop special casing protection key 0 as this is inconsistent with the manpage and also inconsistent with the allocation map handling. - Override the protection key when moving away from PROT_EXEC to prevent inaccessible memory. - Fix and update the protection key selftests to address breakage and to cover the above issue - Add a MOV SS self test" [ Part of the x86 fixes were in the earlier core pull due to dependencies ] * 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (21 commits) x86/mm: Drop TS_COMPAT on 64-bit exec() syscall x86/apic/x2apic: Initialize cluster ID properly x86/boot/compressed/64: Fix moving page table out of trampoline memory x86/boot/compressed/64: Set up GOT for paging_prepare() and cleanup_trampoline() x86/pkeys: Do not special case protection key 0 x86/pkeys/selftests: Add a test for pkey 0 x86/pkeys/selftests: Save off 'prot' for allocations x86/pkeys/selftests: Fix pointer math x86/pkeys: Override pkey when moving away from PROT_EXEC x86/pkeys/selftests: Fix pkey exhaustion test off-by-one x86/pkeys/selftests: Add PROT_EXEC test x86/pkeys/selftests: Factor out "instruction page" x86/pkeys/selftests: Allow faults on unknown keys x86/pkeys/selftests: Avoid printf-in-signal deadlocks x86/pkeys/selftests: Remove dead debugging code, fix 
dprint_in_signal x86/pkeys/selftests: Stop using assert() x86/pkeys/selftests: Give better unexpected fault error messages x86/selftests: Add mov_to_ss test x86/mpx/selftests: Adjust the self-test to fresh distros that export the MPX ABI x86/pkeys/selftests: Adjust the self-test to fresh distros that export the pkeys ABI ...
-rw-r--r--arch/x86/boot/compressed/head_64.S79
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c14
-rw-r--r--arch/x86/include/asm/mmu_context.h2
-rw-r--r--arch/x86/include/asm/pkeys.h18
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c1
-rw-r--r--arch/x86/kernel/head64.c10
-rw-r--r--arch/x86/kernel/process_64.c1
-rw-r--r--arch/x86/mm/pkeys.c21
-rw-r--r--tools/testing/selftests/x86/Makefile2
-rw-r--r--tools/testing/selftests/x86/mov_ss_trap.c285
-rw-r--r--tools/testing/selftests/x86/mpx-mini-test.c7
-rw-r--r--tools/testing/selftests/x86/pkey-helpers.h20
-rw-r--r--tools/testing/selftests/x86/protection_keys.c254
13 files changed, 585 insertions, 129 deletions
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fca012baba19..8169e8b7a4dc 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -306,6 +306,25 @@ ENTRY(startup_64)
306 leaq boot_stack_end(%rbx), %rsp 306 leaq boot_stack_end(%rbx), %rsp
307 307
308 /* 308 /*
309 * paging_prepare() and cleanup_trampoline() below can have GOT
310 * references. Adjust the table with address we are running at.
311 *
312 * Zero RAX for adjust_got: the GOT was not adjusted before;
313 * there's no adjustment to undo.
314 */
315 xorq %rax, %rax
316
317 /*
318 * Calculate the address the binary is loaded at and use it as
319 * a GOT adjustment.
320 */
321 call 1f
3221: popq %rdi
323 subq $1b, %rdi
324
325 call adjust_got
326
327 /*
309 * At this point we are in long mode with 4-level paging enabled, 328 * At this point we are in long mode with 4-level paging enabled,
310 * but we might want to enable 5-level paging or vice versa. 329 * but we might want to enable 5-level paging or vice versa.
311 * 330 *
@@ -370,10 +389,14 @@ trampoline_return:
370 /* 389 /*
371 * cleanup_trampoline() would restore trampoline memory. 390 * cleanup_trampoline() would restore trampoline memory.
372 * 391 *
392 * RDI is address of the page table to use instead of page table
393 * in trampoline memory (if required).
394 *
373 * RSI holds real mode data and needs to be preserved across 395 * RSI holds real mode data and needs to be preserved across
374 * this function call. 396 * this function call.
375 */ 397 */
376 pushq %rsi 398 pushq %rsi
399 leaq top_pgtable(%rbx), %rdi
377 call cleanup_trampoline 400 call cleanup_trampoline
378 popq %rsi 401 popq %rsi
379 402
@@ -381,6 +404,21 @@ trampoline_return:
381 pushq $0 404 pushq $0
382 popfq 405 popfq
383 406
407 /*
408 * Previously we've adjusted the GOT with address the binary was
409 * loaded at. Now we need to re-adjust for relocation address.
410 *
411 * Calculate the address the binary is loaded at, so that we can
412 * undo the previous GOT adjustment.
413 */
414 call 1f
4151: popq %rax
416 subq $1b, %rax
417
418 /* The new adjustment is the relocation address */
419 movq %rbx, %rdi
420 call adjust_got
421
384/* 422/*
385 * Copy the compressed kernel to the end of our buffer 423 * Copy the compressed kernel to the end of our buffer
386 * where decompression in place becomes safe. 424 * where decompression in place becomes safe.
@@ -482,19 +520,6 @@ relocated:
482 rep stosq 520 rep stosq
483 521
484/* 522/*
485 * Adjust our own GOT
486 */
487 leaq _got(%rip), %rdx
488 leaq _egot(%rip), %rcx
4891:
490 cmpq %rcx, %rdx
491 jae 2f
492 addq %rbx, (%rdx)
493 addq $8, %rdx
494 jmp 1b
4952:
496
497/*
498 * Do the extraction, and jump to the new kernel.. 523 * Do the extraction, and jump to the new kernel..
499 */ 524 */
500 pushq %rsi /* Save the real mode argument */ 525 pushq %rsi /* Save the real mode argument */
@@ -512,6 +537,27 @@ relocated:
512 */ 537 */
513 jmp *%rax 538 jmp *%rax
514 539
540/*
541 * Adjust the global offset table
542 *
543 * RAX is the previous adjustment of the table to undo (use 0 if it's the
544 * first time we touch GOT).
545 * RDI is the new adjustment to apply.
546 */
547adjust_got:
548 /* Walk through the GOT adding the address to the entries */
549 leaq _got(%rip), %rdx
550 leaq _egot(%rip), %rcx
5511:
552 cmpq %rcx, %rdx
553 jae 2f
554 subq %rax, (%rdx) /* Undo previous adjustment */
555 addq %rdi, (%rdx) /* Apply the new adjustment */
556 addq $8, %rdx
557 jmp 1b
5582:
559 ret
560
515 .code32 561 .code32
516/* 562/*
517 * This is the 32-bit trampoline that will be copied over to low memory. 563 * This is the 32-bit trampoline that will be copied over to low memory.
@@ -649,3 +695,10 @@ boot_stack_end:
649 .balign 4096 695 .balign 4096
650pgtable: 696pgtable:
651 .fill BOOT_PGT_SIZE, 1, 0 697 .fill BOOT_PGT_SIZE, 1, 0
698
699/*
700 * The page table is going to be used instead of page table in the trampoline
701 * memory.
702 */
703top_pgtable:
704 .fill PAGE_SIZE, 1, 0
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 32af1cbcd903..a362fa0b849c 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -23,14 +23,6 @@ struct paging_config {
23static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; 23static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
24 24
25/* 25/*
26 * The page table is going to be used instead of page table in the trampoline
27 * memory.
28 *
29 * It must not be in BSS as BSS is cleared after cleanup_trampoline().
30 */
31static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data);
32
33/*
34 * Trampoline address will be printed by extract_kernel() for debugging 26 * Trampoline address will be printed by extract_kernel() for debugging
35 * purposes. 27 * purposes.
36 * 28 *
@@ -134,7 +126,7 @@ out:
134 return paging_config; 126 return paging_config;
135} 127}
136 128
137void cleanup_trampoline(void) 129void cleanup_trampoline(void *pgtable)
138{ 130{
139 void *trampoline_pgtable; 131 void *trampoline_pgtable;
140 132
@@ -145,8 +137,8 @@ void cleanup_trampoline(void)
145 * if it's there. 137 * if it's there.
146 */ 138 */
147 if ((void *)__native_read_cr3() == trampoline_pgtable) { 139 if ((void *)__native_read_cr3() == trampoline_pgtable) {
148 memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE); 140 memcpy(pgtable, trampoline_pgtable, PAGE_SIZE);
149 native_write_cr3((unsigned long)top_pgtable); 141 native_write_cr3((unsigned long)pgtable);
150 } 142 }
151 143
152 /* Restore trampoline memory */ 144 /* Restore trampoline memory */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 57e3785d0d26..cf9911b5a53c 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -193,7 +193,7 @@ static inline int init_new_context(struct task_struct *tsk,
193 193
194#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 194#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
195 if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { 195 if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
196 /* pkey 0 is the default and always allocated */ 196 /* pkey 0 is the default and allocated implicitly */
197 mm->context.pkey_allocation_map = 0x1; 197 mm->context.pkey_allocation_map = 0x1;
198 /* -1 means unallocated or invalid */ 198 /* -1 means unallocated or invalid */
199 mm->context.execute_only_pkey = -1; 199 mm->context.execute_only_pkey = -1;
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index a0ba1ffda0df..851c04b7a092 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -2,6 +2,8 @@
2#ifndef _ASM_X86_PKEYS_H 2#ifndef _ASM_X86_PKEYS_H
3#define _ASM_X86_PKEYS_H 3#define _ASM_X86_PKEYS_H
4 4
5#define ARCH_DEFAULT_PKEY 0
6
5#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) 7#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
6 8
7extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, 9extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
@@ -15,7 +17,7 @@ extern int __execute_only_pkey(struct mm_struct *mm);
15static inline int execute_only_pkey(struct mm_struct *mm) 17static inline int execute_only_pkey(struct mm_struct *mm)
16{ 18{
17 if (!boot_cpu_has(X86_FEATURE_OSPKE)) 19 if (!boot_cpu_has(X86_FEATURE_OSPKE))
18 return 0; 20 return ARCH_DEFAULT_PKEY;
19 21
20 return __execute_only_pkey(mm); 22 return __execute_only_pkey(mm);
21} 23}
@@ -49,13 +51,21 @@ bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
49{ 51{
50 /* 52 /*
51 * "Allocated" pkeys are those that have been returned 53 * "Allocated" pkeys are those that have been returned
52 * from pkey_alloc(). pkey 0 is special, and never 54 * from pkey_alloc() or pkey 0 which is allocated
53 * returned from pkey_alloc(). 55 * implicitly when the mm is created.
54 */ 56 */
55 if (pkey <= 0) 57 if (pkey < 0)
56 return false; 58 return false;
57 if (pkey >= arch_max_pkey()) 59 if (pkey >= arch_max_pkey())
58 return false; 60 return false;
61 /*
62 * The exec-only pkey is set in the allocation map, but
63 * is not available to any of the user interfaces like
64 * mprotect_pkey().
65 */
66 if (pkey == mm->context.execute_only_pkey)
67 return false;
68
59 return mm_pkey_allocation_map(mm) & (1U << pkey); 69 return mm_pkey_allocation_map(mm) & (1U << pkey);
60} 70}
61 71
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8b04234e010b..7685444a106b 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -116,6 +116,7 @@ static void init_x2apic_ldr(void)
116 goto update; 116 goto update;
117 } 117 }
118 cmsk = cluster_hotplug_mask; 118 cmsk = cluster_hotplug_mask;
119 cmsk->clusterid = cluster;
119 cluster_hotplug_mask = NULL; 120 cluster_hotplug_mask = NULL;
120update: 121update:
121 this_cpu_write(cluster_masks, cmsk); 122 this_cpu_write(cluster_masks, cmsk);
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0c408f8c4ed4..2d29e47c056e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -104,6 +104,12 @@ static bool __head check_la57_support(unsigned long physaddr)
104} 104}
105#endif 105#endif
106 106
107/* Code in __startup_64() can be relocated during execution, but the compiler
108 * doesn't have to generate PC-relative relocations when accessing globals from
109 * that function. Clang actually does not generate them, which leads to
110 * boot-time crashes. To work around this problem, every global pointer must
111 * be adjusted using fixup_pointer().
112 */
107unsigned long __head __startup_64(unsigned long physaddr, 113unsigned long __head __startup_64(unsigned long physaddr,
108 struct boot_params *bp) 114 struct boot_params *bp)
109{ 115{
@@ -113,6 +119,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
113 p4dval_t *p4d; 119 p4dval_t *p4d;
114 pudval_t *pud; 120 pudval_t *pud;
115 pmdval_t *pmd, pmd_entry; 121 pmdval_t *pmd, pmd_entry;
122 pteval_t *mask_ptr;
116 bool la57; 123 bool la57;
117 int i; 124 int i;
118 unsigned int *next_pgt_ptr; 125 unsigned int *next_pgt_ptr;
@@ -196,7 +203,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
196 203
197 pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; 204 pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
198 /* Filter out unsupported __PAGE_KERNEL_* bits: */ 205 /* Filter out unsupported __PAGE_KERNEL_* bits: */
199 pmd_entry &= __supported_pte_mask; 206 mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);
207 pmd_entry &= *mask_ptr;
200 pmd_entry += sme_get_me_mask(); 208 pmd_entry += sme_get_me_mask();
201 pmd_entry += physaddr; 209 pmd_entry += physaddr;
202 210
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4b100fe0f508..12bb445fb98d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -542,6 +542,7 @@ void set_personality_64bit(void)
542 clear_thread_flag(TIF_X32); 542 clear_thread_flag(TIF_X32);
543 /* Pretend that this comes from a 64bit execve */ 543 /* Pretend that this comes from a 64bit execve */
544 task_pt_regs(current)->orig_ax = __NR_execve; 544 task_pt_regs(current)->orig_ax = __NR_execve;
545 current_thread_info()->status &= ~TS_COMPAT;
545 546
546 /* Ensure the corresponding mm is not marked. */ 547 /* Ensure the corresponding mm is not marked. */
547 if (current->mm) 548 if (current->mm)
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index d7bc0eea20a5..6e98e0a7c923 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -94,26 +94,27 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
94 */ 94 */
95 if (pkey != -1) 95 if (pkey != -1)
96 return pkey; 96 return pkey;
97 /* 97
98 * Look for a protection-key-drive execute-only mapping
99 * which is now being given permissions that are not
100 * execute-only. Move it back to the default pkey.
101 */
102 if (vma_is_pkey_exec_only(vma) &&
103 (prot & (PROT_READ|PROT_WRITE))) {
104 return 0;
105 }
106 /* 98 /*
107 * The mapping is execute-only. Go try to get the 99 * The mapping is execute-only. Go try to get the
108 * execute-only protection key. If we fail to do that, 100 * execute-only protection key. If we fail to do that,
109 * fall through as if we do not have execute-only 101 * fall through as if we do not have execute-only
110 * support. 102 * support in this mm.
111 */ 103 */
112 if (prot == PROT_EXEC) { 104 if (prot == PROT_EXEC) {
113 pkey = execute_only_pkey(vma->vm_mm); 105 pkey = execute_only_pkey(vma->vm_mm);
114 if (pkey > 0) 106 if (pkey > 0)
115 return pkey; 107 return pkey;
108 } else if (vma_is_pkey_exec_only(vma)) {
109 /*
110 * Protections are *not* PROT_EXEC, but the mapping
111 * is using the exec-only pkey. This mapping was
112 * PROT_EXEC and will no longer be. Move back to
113 * the default pkey.
114 */
115 return ARCH_DEFAULT_PKEY;
116 } 116 }
117
117 /* 118 /*
118 * This is a vanilla, non-pkey mprotect (or we failed to 119 * This is a vanilla, non-pkey mprotect (or we failed to
119 * setup execute-only), inherit the pkey from the VMA we 120 * setup execute-only), inherit the pkey from the VMA we
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index d744991c0f4f..39f66bc29b82 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -11,7 +11,7 @@ CAN_BUILD_X86_64 := $(shell ./check_cc.sh $(CC) trivial_64bit_program.c)
11 11
12TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ 12TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
13 check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \ 13 check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \
14 protection_keys test_vdso test_vsyscall 14 protection_keys test_vdso test_vsyscall mov_ss_trap
15TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ 15TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
16 test_FCMOV test_FCOMI test_FISTTP \ 16 test_FCMOV test_FCOMI test_FISTTP \
17 vdso_restorer 17 vdso_restorer
diff --git a/tools/testing/selftests/x86/mov_ss_trap.c b/tools/testing/selftests/x86/mov_ss_trap.c
new file mode 100644
index 000000000000..3c3a022654f3
--- /dev/null
+++ b/tools/testing/selftests/x86/mov_ss_trap.c
@@ -0,0 +1,285 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * mov_ss_trap.c: Exercise the bizarre side effects of a watchpoint on MOV SS
4 *
5 * This does MOV SS from a watchpointed address followed by various
6 * types of kernel entries. A MOV SS that hits a watchpoint will queue
7 * up a #DB trap but will not actually deliver that trap. The trap
8 * will be delivered after the next instruction instead. The CPU's logic
9 * seems to be:
10 *
11 * - Any fault: drop the pending #DB trap.
12 * - INT $N, INT3, INTO, SYSCALL, SYSENTER: enter the kernel and then
13 * deliver #DB.
14 * - ICEBP: enter the kernel but do not deliver the watchpoint trap
15 * - breakpoint: only one #DB is delivered (phew!)
16 *
17 * There are plenty of ways for a kernel to handle this incorrectly. This
18 * test tries to exercise all the cases.
19 *
20 * This should mostly cover CVE-2018-1087 and CVE-2018-8897.
21 */
22#define _GNU_SOURCE
23
24#include <stdlib.h>
25#include <sys/ptrace.h>
26#include <sys/types.h>
27#include <sys/wait.h>
28#include <sys/user.h>
29#include <sys/syscall.h>
30#include <unistd.h>
31#include <errno.h>
32#include <stddef.h>
33#include <stdio.h>
34#include <err.h>
35#include <string.h>
36#include <setjmp.h>
37#include <sys/prctl.h>
38
39#define X86_EFLAGS_RF (1UL << 16)
40
41#if __x86_64__
42# define REG_IP REG_RIP
43#else
44# define REG_IP REG_EIP
45#endif
46
47unsigned short ss;
48extern unsigned char breakpoint_insn[];
49sigjmp_buf jmpbuf;
50static unsigned char altstack_data[SIGSTKSZ];
51
52static void enable_watchpoint(void)
53{
54 pid_t parent = getpid();
55 int status;
56
57 pid_t child = fork();
58 if (child < 0)
59 err(1, "fork");
60
61 if (child) {
62 if (waitpid(child, &status, 0) != child)
63 err(1, "waitpid for child");
64 } else {
65 unsigned long dr0, dr1, dr7;
66
67 dr0 = (unsigned long)&ss;
68 dr1 = (unsigned long)breakpoint_insn;
69 dr7 = ((1UL << 1) | /* G0 */
70 (3UL << 16) | /* RW0 = read or write */
71 (1UL << 18) | /* LEN0 = 2 bytes */
72 (1UL << 3)); /* G1, RW1 = insn */
73
74 if (ptrace(PTRACE_ATTACH, parent, NULL, NULL) != 0)
75 err(1, "PTRACE_ATTACH");
76
77 if (waitpid(parent, &status, 0) != parent)
78 err(1, "waitpid for child");
79
80 if (ptrace(PTRACE_POKEUSER, parent, (void *)offsetof(struct user, u_debugreg[0]), dr0) != 0)
81 err(1, "PTRACE_POKEUSER DR0");
82
83 if (ptrace(PTRACE_POKEUSER, parent, (void *)offsetof(struct user, u_debugreg[1]), dr1) != 0)
84 err(1, "PTRACE_POKEUSER DR1");
85
86 if (ptrace(PTRACE_POKEUSER, parent, (void *)offsetof(struct user, u_debugreg[7]), dr7) != 0)
87 err(1, "PTRACE_POKEUSER DR7");
88
89 printf("\tDR0 = %lx, DR1 = %lx, DR7 = %lx\n", dr0, dr1, dr7);
90
91 if (ptrace(PTRACE_DETACH, parent, NULL, NULL) != 0)
92 err(1, "PTRACE_DETACH");
93
94 exit(0);
95 }
96}
97
98static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
99 int flags)
100{
101 struct sigaction sa;
102 memset(&sa, 0, sizeof(sa));
103 sa.sa_sigaction = handler;
104 sa.sa_flags = SA_SIGINFO | flags;
105 sigemptyset(&sa.sa_mask);
106 if (sigaction(sig, &sa, 0))
107 err(1, "sigaction");
108}
109
110static char const * const signames[] = {
111 [SIGSEGV] = "SIGSEGV",
112 [SIGBUS] = "SIBGUS",
113 [SIGTRAP] = "SIGTRAP",
114 [SIGILL] = "SIGILL",
115};
116
117static void sigtrap(int sig, siginfo_t *si, void *ctx_void)
118{
119 ucontext_t *ctx = ctx_void;
120
121 printf("\tGot SIGTRAP with RIP=%lx, EFLAGS.RF=%d\n",
122 (unsigned long)ctx->uc_mcontext.gregs[REG_IP],
123 !!(ctx->uc_mcontext.gregs[REG_EFL] & X86_EFLAGS_RF));
124}
125
126static void handle_and_return(int sig, siginfo_t *si, void *ctx_void)
127{
128 ucontext_t *ctx = ctx_void;
129
130 printf("\tGot %s with RIP=%lx\n", signames[sig],
131 (unsigned long)ctx->uc_mcontext.gregs[REG_IP]);
132}
133
134static void handle_and_longjmp(int sig, siginfo_t *si, void *ctx_void)
135{
136 ucontext_t *ctx = ctx_void;
137
138 printf("\tGot %s with RIP=%lx\n", signames[sig],
139 (unsigned long)ctx->uc_mcontext.gregs[REG_IP]);
140
141 siglongjmp(jmpbuf, 1);
142}
143
144int main()
145{
146 unsigned long nr;
147
148 asm volatile ("mov %%ss, %[ss]" : [ss] "=m" (ss));
149 printf("\tSS = 0x%hx, &SS = 0x%p\n", ss, &ss);
150
151 if (prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0) == 0)
152 printf("\tPR_SET_PTRACER_ANY succeeded\n");
153
154 printf("\tSet up a watchpoint\n");
155 sethandler(SIGTRAP, sigtrap, 0);
156 enable_watchpoint();
157
158 printf("[RUN]\tRead from watched memory (should get SIGTRAP)\n");
159 asm volatile ("mov %[ss], %[tmp]" : [tmp] "=r" (nr) : [ss] "m" (ss));
160
161 printf("[RUN]\tMOV SS; INT3\n");
162 asm volatile ("mov %[ss], %%ss; int3" :: [ss] "m" (ss));
163
164 printf("[RUN]\tMOV SS; INT 3\n");
165 asm volatile ("mov %[ss], %%ss; .byte 0xcd, 0x3" :: [ss] "m" (ss));
166
167 printf("[RUN]\tMOV SS; CS CS INT3\n");
168 asm volatile ("mov %[ss], %%ss; .byte 0x2e, 0x2e; int3" :: [ss] "m" (ss));
169
170 printf("[RUN]\tMOV SS; CSx14 INT3\n");
171 asm volatile ("mov %[ss], %%ss; .fill 14,1,0x2e; int3" :: [ss] "m" (ss));
172
173 printf("[RUN]\tMOV SS; INT 4\n");
174 sethandler(SIGSEGV, handle_and_return, SA_RESETHAND);
175 asm volatile ("mov %[ss], %%ss; int $4" :: [ss] "m" (ss));
176
177#ifdef __i386__
178 printf("[RUN]\tMOV SS; INTO\n");
179 sethandler(SIGSEGV, handle_and_return, SA_RESETHAND);
180 nr = -1;
181 asm volatile ("add $1, %[tmp]; mov %[ss], %%ss; into"
182 : [tmp] "+r" (nr) : [ss] "m" (ss));
183#endif
184
185 if (sigsetjmp(jmpbuf, 1) == 0) {
186 printf("[RUN]\tMOV SS; ICEBP\n");
187
188 /* Some emulators (e.g. QEMU TCG) don't emulate ICEBP. */
189 sethandler(SIGILL, handle_and_longjmp, SA_RESETHAND);
190
191 asm volatile ("mov %[ss], %%ss; .byte 0xf1" :: [ss] "m" (ss));
192 }
193
194 if (sigsetjmp(jmpbuf, 1) == 0) {
195 printf("[RUN]\tMOV SS; CLI\n");
196 sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
197 asm volatile ("mov %[ss], %%ss; cli" :: [ss] "m" (ss));
198 }
199
200 if (sigsetjmp(jmpbuf, 1) == 0) {
201 printf("[RUN]\tMOV SS; #PF\n");
202 sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
203 asm volatile ("mov %[ss], %%ss; mov (-1), %[tmp]"
204 : [tmp] "=r" (nr) : [ss] "m" (ss));
205 }
206
207 /*
208 * INT $1: if #DB has DPL=3 and there isn't special handling,
209 * then the kernel will die.
210 */
211 if (sigsetjmp(jmpbuf, 1) == 0) {
212 printf("[RUN]\tMOV SS; INT 1\n");
213 sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
214 asm volatile ("mov %[ss], %%ss; int $1" :: [ss] "m" (ss));
215 }
216
217#ifdef __x86_64__
218 /*
219 * In principle, we should test 32-bit SYSCALL as well, but
220 * the calling convention is so unpredictable that it's
221 * not obviously worth the effort.
222 */
223 if (sigsetjmp(jmpbuf, 1) == 0) {
224 printf("[RUN]\tMOV SS; SYSCALL\n");
225 sethandler(SIGILL, handle_and_longjmp, SA_RESETHAND);
226 nr = SYS_getpid;
227 /*
228 * Toggle the high bit of RSP to make it noncanonical to
229 * strengthen this test on non-SMAP systems.
230 */
231 asm volatile ("btc $63, %%rsp\n\t"
232 "mov %[ss], %%ss; syscall\n\t"
233 "btc $63, %%rsp"
234 : "+a" (nr) : [ss] "m" (ss)
235 : "rcx"
236#ifdef __x86_64__
237 , "r11"
238#endif
239 );
240 }
241#endif
242
243 printf("[RUN]\tMOV SS; breakpointed NOP\n");
244 asm volatile ("mov %[ss], %%ss; breakpoint_insn: nop" :: [ss] "m" (ss));
245
246 /*
247 * Invoking SYSENTER directly breaks all the rules. Just handle
248 * the SIGSEGV.
249 */
250 if (sigsetjmp(jmpbuf, 1) == 0) {
251 printf("[RUN]\tMOV SS; SYSENTER\n");
252 stack_t stack = {
253 .ss_sp = altstack_data,
254 .ss_size = SIGSTKSZ,
255 };
256 if (sigaltstack(&stack, NULL) != 0)
257 err(1, "sigaltstack");
258 sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND | SA_ONSTACK);
259 nr = SYS_getpid;
260 asm volatile ("mov %[ss], %%ss; SYSENTER" : "+a" (nr)
261 : [ss] "m" (ss) : "flags", "rcx"
262#ifdef __x86_64__
263 , "r11"
264#endif
265 );
266
267 /* We're unreachable here. SYSENTER forgets RIP. */
268 }
269
270 if (sigsetjmp(jmpbuf, 1) == 0) {
271 printf("[RUN]\tMOV SS; INT $0x80\n");
272 sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
273 nr = 20; /* compat getpid */
274 asm volatile ("mov %[ss], %%ss; int $0x80"
275 : "+a" (nr) : [ss] "m" (ss)
276 : "flags"
277#ifdef __x86_64__
278 , "r8", "r9", "r10", "r11"
279#endif
280 );
281 }
282
283 printf("[OK]\tI aten't dead\n");
284 return 0;
285}
diff --git a/tools/testing/selftests/x86/mpx-mini-test.c b/tools/testing/selftests/x86/mpx-mini-test.c
index 9c0325e1ea68..50f7e9272481 100644
--- a/tools/testing/selftests/x86/mpx-mini-test.c
+++ b/tools/testing/selftests/x86/mpx-mini-test.c
@@ -368,6 +368,11 @@ static int expected_bnd_index = -1;
368uint64_t shadow_plb[NR_MPX_BOUNDS_REGISTERS][2]; /* shadow MPX bound registers */ 368uint64_t shadow_plb[NR_MPX_BOUNDS_REGISTERS][2]; /* shadow MPX bound registers */
369unsigned long shadow_map[NR_MPX_BOUNDS_REGISTERS]; 369unsigned long shadow_map[NR_MPX_BOUNDS_REGISTERS];
370 370
371/* Failed address bound checks: */
372#ifndef SEGV_BNDERR
373# define SEGV_BNDERR 3
374#endif
375
371/* 376/*
372 * The kernel is supposed to provide some information about the bounds 377 * The kernel is supposed to provide some information about the bounds
373 * exception in the siginfo. It should match what we have in the bounds 378 * exception in the siginfo. It should match what we have in the bounds
@@ -419,8 +424,6 @@ void handler(int signum, siginfo_t *si, void *vucontext)
419 br_count++; 424 br_count++;
420 dprintf1("#BR 0x%jx (total seen: %d)\n", status, br_count); 425 dprintf1("#BR 0x%jx (total seen: %d)\n", status, br_count);
421 426
422#define SEGV_BNDERR 3 /* failed address bound checks */
423
424 dprintf2("Saw a #BR! status 0x%jx at %016lx br_reason: %jx\n", 427 dprintf2("Saw a #BR! status 0x%jx at %016lx br_reason: %jx\n",
425 status, ip, br_reason); 428 status, ip, br_reason);
426 dprintf2("si_signo: %d\n", si->si_signo); 429 dprintf2("si_signo: %d\n", si->si_signo);
diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/x86/pkey-helpers.h
index b3cb7670e026..254e5436bdd9 100644
--- a/tools/testing/selftests/x86/pkey-helpers.h
+++ b/tools/testing/selftests/x86/pkey-helpers.h
@@ -26,30 +26,26 @@ static inline void sigsafe_printf(const char *format, ...)
26{ 26{
27 va_list ap; 27 va_list ap;
28 28
29 va_start(ap, format);
30 if (!dprint_in_signal) { 29 if (!dprint_in_signal) {
30 va_start(ap, format);
31 vprintf(format, ap); 31 vprintf(format, ap);
32 va_end(ap);
32 } else { 33 } else {
33 int ret; 34 int ret;
34 int len = vsnprintf(dprint_in_signal_buffer,
35 DPRINT_IN_SIGNAL_BUF_SIZE,
36 format, ap);
37 /* 35 /*
38 * len is amount that would have been printed, 36 * No printf() functions are signal-safe.
39 * but actual write is truncated at BUF_SIZE. 37 * They deadlock easily. Write the format
38 * string to get some output, even if
39 * incomplete.
40 */ 40 */
41 if (len > DPRINT_IN_SIGNAL_BUF_SIZE) 41 ret = write(1, format, strlen(format));
42 len = DPRINT_IN_SIGNAL_BUF_SIZE;
43 ret = write(1, dprint_in_signal_buffer, len);
44 if (ret < 0) 42 if (ret < 0)
45 abort(); 43 exit(1);
46 } 44 }
47 va_end(ap);
48} 45}
49#define dprintf_level(level, args...) do { \ 46#define dprintf_level(level, args...) do { \
50 if (level <= DEBUG_LEVEL) \ 47 if (level <= DEBUG_LEVEL) \
51 sigsafe_printf(args); \ 48 sigsafe_printf(args); \
52 fflush(NULL); \
53} while (0) 49} while (0)
54#define dprintf0(args...) dprintf_level(0, args) 50#define dprintf0(args...) dprintf_level(0, args)
55#define dprintf1(args...) dprintf_level(1, args) 51#define dprintf1(args...) dprintf_level(1, args)
diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index f15aa5a76fe3..460b4bdf4c1e 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -72,10 +72,9 @@ extern void abort_hooks(void);
72 test_nr, iteration_nr); \ 72 test_nr, iteration_nr); \
73 dprintf0("errno at assert: %d", errno); \ 73 dprintf0("errno at assert: %d", errno); \
74 abort_hooks(); \ 74 abort_hooks(); \
75 assert(condition); \ 75 exit(__LINE__); \
76 } \ 76 } \
77} while (0) 77} while (0)
78#define raw_assert(cond) assert(cond)
79 78
80void cat_into_file(char *str, char *file) 79void cat_into_file(char *str, char *file)
81{ 80{
@@ -87,12 +86,17 @@ void cat_into_file(char *str, char *file)
87 * these need to be raw because they are called under 86 * these need to be raw because they are called under
88 * pkey_assert() 87 * pkey_assert()
89 */ 88 */
90 raw_assert(fd >= 0); 89 if (fd < 0) {
90 fprintf(stderr, "error opening '%s'\n", str);
91 perror("error: ");
92 exit(__LINE__);
93 }
94
91 ret = write(fd, str, strlen(str)); 95 ret = write(fd, str, strlen(str));
92 if (ret != strlen(str)) { 96 if (ret != strlen(str)) {
93 perror("write to file failed"); 97 perror("write to file failed");
94 fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); 98 fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
95 raw_assert(0); 99 exit(__LINE__);
96 } 100 }
97 close(fd); 101 close(fd);
98} 102}
@@ -191,26 +195,30 @@ void lots_o_noops_around_write(int *write_to_me)
191#ifdef __i386__ 195#ifdef __i386__
192 196
193#ifndef SYS_mprotect_key 197#ifndef SYS_mprotect_key
194# define SYS_mprotect_key 380 198# define SYS_mprotect_key 380
195#endif 199#endif
200
196#ifndef SYS_pkey_alloc 201#ifndef SYS_pkey_alloc
197# define SYS_pkey_alloc 381 202# define SYS_pkey_alloc 381
198# define SYS_pkey_free 382 203# define SYS_pkey_free 382
199#endif 204#endif
200#define REG_IP_IDX REG_EIP 205
201#define si_pkey_offset 0x14 206#define REG_IP_IDX REG_EIP
207#define si_pkey_offset 0x14
202 208
203#else 209#else
204 210
205#ifndef SYS_mprotect_key 211#ifndef SYS_mprotect_key
206# define SYS_mprotect_key 329 212# define SYS_mprotect_key 329
207#endif 213#endif
214
208#ifndef SYS_pkey_alloc 215#ifndef SYS_pkey_alloc
209# define SYS_pkey_alloc 330 216# define SYS_pkey_alloc 330
210# define SYS_pkey_free 331 217# define SYS_pkey_free 331
211#endif 218#endif
212#define REG_IP_IDX REG_RIP 219
213#define si_pkey_offset 0x20 220#define REG_IP_IDX REG_RIP
221#define si_pkey_offset 0x20
214 222
215#endif 223#endif
216 224
@@ -225,8 +233,14 @@ void dump_mem(void *dumpme, int len_bytes)
225 } 233 }
226} 234}
227 235
228#define SEGV_BNDERR 3 /* failed address bound checks */ 236/* Failed address bound checks: */
229#define SEGV_PKUERR 4 237#ifndef SEGV_BNDERR
238# define SEGV_BNDERR 3
239#endif
240
241#ifndef SEGV_PKUERR
242# define SEGV_PKUERR 4
243#endif
230 244
231static char *si_code_str(int si_code) 245static char *si_code_str(int si_code)
232{ 246{
@@ -289,13 +303,6 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext)
289 dump_mem(pkru_ptr - 128, 256); 303 dump_mem(pkru_ptr - 128, 256);
290 pkey_assert(*pkru_ptr); 304 pkey_assert(*pkru_ptr);
291 305
292 si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
293 dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
294 dump_mem(si_pkey_ptr - 8, 24);
295 siginfo_pkey = *si_pkey_ptr;
296 pkey_assert(siginfo_pkey < NR_PKEYS);
297 last_si_pkey = siginfo_pkey;
298
299 if ((si->si_code == SEGV_MAPERR) || 306 if ((si->si_code == SEGV_MAPERR) ||
300 (si->si_code == SEGV_ACCERR) || 307 (si->si_code == SEGV_ACCERR) ||
301 (si->si_code == SEGV_BNDERR)) { 308 (si->si_code == SEGV_BNDERR)) {
@@ -303,6 +310,13 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext)
303 exit(4); 310 exit(4);
304 } 311 }
305 312
313 si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
314 dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
315 dump_mem((u8 *)si_pkey_ptr - 8, 24);
316 siginfo_pkey = *si_pkey_ptr;
317 pkey_assert(siginfo_pkey < NR_PKEYS);
318 last_si_pkey = siginfo_pkey;
319
306 dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr); 320 dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr);
307 /* need __rdpkru() version so we do not do shadow_pkru checking */ 321 /* need __rdpkru() version so we do not do shadow_pkru checking */
308 dprintf1("signal pkru from pkru: %08x\n", __rdpkru()); 322 dprintf1("signal pkru from pkru: %08x\n", __rdpkru());
@@ -311,22 +325,6 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext)
311 dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n"); 325 dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n");
312 pkru_faults++; 326 pkru_faults++;
313 dprintf1("<<<<==================================================\n"); 327 dprintf1("<<<<==================================================\n");
314 return;
315 if (trapno == 14) {
316 fprintf(stderr,
317 "ERROR: In signal handler, page fault, trapno = %d, ip = %016lx\n",
318 trapno, ip);
319 fprintf(stderr, "si_addr %p\n", si->si_addr);
320 fprintf(stderr, "REG_ERR: %lx\n",
321 (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
322 exit(1);
323 } else {
324 fprintf(stderr, "unexpected trap %d! at 0x%lx\n", trapno, ip);
325 fprintf(stderr, "si_addr %p\n", si->si_addr);
326 fprintf(stderr, "REG_ERR: %lx\n",
327 (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
328 exit(2);
329 }
330 dprint_in_signal = 0; 328 dprint_in_signal = 0;
331} 329}
332 330
@@ -393,10 +391,15 @@ pid_t fork_lazy_child(void)
393 return forkret; 391 return forkret;
394} 392}
395 393
396#define PKEY_DISABLE_ACCESS 0x1 394#ifndef PKEY_DISABLE_ACCESS
397#define PKEY_DISABLE_WRITE 0x2 395# define PKEY_DISABLE_ACCESS 0x1
396#endif
397
398#ifndef PKEY_DISABLE_WRITE
399# define PKEY_DISABLE_WRITE 0x2
400#endif
398 401
399u32 pkey_get(int pkey, unsigned long flags) 402static u32 hw_pkey_get(int pkey, unsigned long flags)
400{ 403{
401 u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); 404 u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
402 u32 pkru = __rdpkru(); 405 u32 pkru = __rdpkru();
@@ -418,7 +421,7 @@ u32 pkey_get(int pkey, unsigned long flags)
418 return masked_pkru; 421 return masked_pkru;
419} 422}
420 423
421int pkey_set(int pkey, unsigned long rights, unsigned long flags) 424static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
422{ 425{
423 u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); 426 u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
424 u32 old_pkru = __rdpkru(); 427 u32 old_pkru = __rdpkru();
@@ -452,15 +455,15 @@ void pkey_disable_set(int pkey, int flags)
452 pkey, flags); 455 pkey, flags);
453 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 456 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
454 457
455 pkey_rights = pkey_get(pkey, syscall_flags); 458 pkey_rights = hw_pkey_get(pkey, syscall_flags);
456 459
457 dprintf1("%s(%d) pkey_get(%d): %x\n", __func__, 460 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
458 pkey, pkey, pkey_rights); 461 pkey, pkey, pkey_rights);
459 pkey_assert(pkey_rights >= 0); 462 pkey_assert(pkey_rights >= 0);
460 463
461 pkey_rights |= flags; 464 pkey_rights |= flags;
462 465
463 ret = pkey_set(pkey, pkey_rights, syscall_flags); 466 ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
464 assert(!ret); 467 assert(!ret);
465 /*pkru and flags have the same format */ 468 /*pkru and flags have the same format */
466 shadow_pkru |= flags << (pkey * 2); 469 shadow_pkru |= flags << (pkey * 2);
@@ -468,8 +471,8 @@ void pkey_disable_set(int pkey, int flags)
468 471
469 pkey_assert(ret >= 0); 472 pkey_assert(ret >= 0);
470 473
471 pkey_rights = pkey_get(pkey, syscall_flags); 474 pkey_rights = hw_pkey_get(pkey, syscall_flags);
472 dprintf1("%s(%d) pkey_get(%d): %x\n", __func__, 475 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
473 pkey, pkey, pkey_rights); 476 pkey, pkey, pkey_rights);
474 477
475 dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); 478 dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
@@ -483,24 +486,24 @@ void pkey_disable_clear(int pkey, int flags)
483{ 486{
484 unsigned long syscall_flags = 0; 487 unsigned long syscall_flags = 0;
485 int ret; 488 int ret;
486 int pkey_rights = pkey_get(pkey, syscall_flags); 489 int pkey_rights = hw_pkey_get(pkey, syscall_flags);
487 u32 orig_pkru = rdpkru(); 490 u32 orig_pkru = rdpkru();
488 491
489 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 492 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
490 493
491 dprintf1("%s(%d) pkey_get(%d): %x\n", __func__, 494 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
492 pkey, pkey, pkey_rights); 495 pkey, pkey, pkey_rights);
493 pkey_assert(pkey_rights >= 0); 496 pkey_assert(pkey_rights >= 0);
494 497
495 pkey_rights |= flags; 498 pkey_rights |= flags;
496 499
497 ret = pkey_set(pkey, pkey_rights, 0); 500 ret = hw_pkey_set(pkey, pkey_rights, 0);
498 /* pkru and flags have the same format */ 501 /* pkru and flags have the same format */
499 shadow_pkru &= ~(flags << (pkey * 2)); 502 shadow_pkru &= ~(flags << (pkey * 2));
500 pkey_assert(ret >= 0); 503 pkey_assert(ret >= 0);
501 504
502 pkey_rights = pkey_get(pkey, syscall_flags); 505 pkey_rights = hw_pkey_get(pkey, syscall_flags);
503 dprintf1("%s(%d) pkey_get(%d): %x\n", __func__, 506 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
504 pkey, pkey, pkey_rights); 507 pkey, pkey, pkey_rights);
505 508
506 dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); 509 dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
@@ -674,10 +677,12 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
674struct pkey_malloc_record { 677struct pkey_malloc_record {
675 void *ptr; 678 void *ptr;
676 long size; 679 long size;
680 int prot;
677}; 681};
678struct pkey_malloc_record *pkey_malloc_records; 682struct pkey_malloc_record *pkey_malloc_records;
683struct pkey_malloc_record *pkey_last_malloc_record;
679long nr_pkey_malloc_records; 684long nr_pkey_malloc_records;
680void record_pkey_malloc(void *ptr, long size) 685void record_pkey_malloc(void *ptr, long size, int prot)
681{ 686{
682 long i; 687 long i;
683 struct pkey_malloc_record *rec = NULL; 688 struct pkey_malloc_record *rec = NULL;
@@ -709,6 +714,8 @@ void record_pkey_malloc(void *ptr, long size)
709 (int)(rec - pkey_malloc_records), rec, ptr, size); 714 (int)(rec - pkey_malloc_records), rec, ptr, size);
710 rec->ptr = ptr; 715 rec->ptr = ptr;
711 rec->size = size; 716 rec->size = size;
717 rec->prot = prot;
718 pkey_last_malloc_record = rec;
712 nr_pkey_malloc_records++; 719 nr_pkey_malloc_records++;
713} 720}
714 721
@@ -753,7 +760,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
753 pkey_assert(ptr != (void *)-1); 760 pkey_assert(ptr != (void *)-1);
754 ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); 761 ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
755 pkey_assert(!ret); 762 pkey_assert(!ret);
756 record_pkey_malloc(ptr, size); 763 record_pkey_malloc(ptr, size, prot);
757 rdpkru(); 764 rdpkru();
758 765
759 dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); 766 dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
@@ -774,7 +781,7 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
774 size = ALIGN_UP(size, HPAGE_SIZE * 2); 781 size = ALIGN_UP(size, HPAGE_SIZE * 2);
775 ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 782 ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
776 pkey_assert(ptr != (void *)-1); 783 pkey_assert(ptr != (void *)-1);
777 record_pkey_malloc(ptr, size); 784 record_pkey_malloc(ptr, size, prot);
778 mprotect_pkey(ptr, size, prot, pkey); 785 mprotect_pkey(ptr, size, prot, pkey);
779 786
780 dprintf1("unaligned ptr: %p\n", ptr); 787 dprintf1("unaligned ptr: %p\n", ptr);
@@ -847,7 +854,7 @@ void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
847 pkey_assert(ptr != (void *)-1); 854 pkey_assert(ptr != (void *)-1);
848 mprotect_pkey(ptr, size, prot, pkey); 855 mprotect_pkey(ptr, size, prot, pkey);
849 856
850 record_pkey_malloc(ptr, size); 857 record_pkey_malloc(ptr, size, prot);
851 858
852 dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr); 859 dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
853 return ptr; 860 return ptr;
@@ -869,7 +876,7 @@ void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
869 876
870 mprotect_pkey(ptr, size, prot, pkey); 877 mprotect_pkey(ptr, size, prot, pkey);
871 878
872 record_pkey_malloc(ptr, size); 879 record_pkey_malloc(ptr, size, prot);
873 880
874 dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); 881 dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
875 close(fd); 882 close(fd);
@@ -918,13 +925,21 @@ void *malloc_pkey(long size, int prot, u16 pkey)
918} 925}
919 926
920int last_pkru_faults; 927int last_pkru_faults;
928#define UNKNOWN_PKEY -2
921void expected_pk_fault(int pkey) 929void expected_pk_fault(int pkey)
922{ 930{
923 dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n", 931 dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n",
924 __func__, last_pkru_faults, pkru_faults); 932 __func__, last_pkru_faults, pkru_faults);
925 dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); 933 dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
926 pkey_assert(last_pkru_faults + 1 == pkru_faults); 934 pkey_assert(last_pkru_faults + 1 == pkru_faults);
927 pkey_assert(last_si_pkey == pkey); 935
936 /*
937 * For exec-only memory, we do not know the pkey in
938 * advance, so skip this check.
939 */
940 if (pkey != UNKNOWN_PKEY)
941 pkey_assert(last_si_pkey == pkey);
942
928 /* 943 /*
929 * The signal handler shold have cleared out PKRU to let the 944 * The signal handler shold have cleared out PKRU to let the
930 * test program continue. We now have to restore it. 945 * test program continue. We now have to restore it.
@@ -939,10 +954,11 @@ void expected_pk_fault(int pkey)
939 last_si_pkey = -1; 954 last_si_pkey = -1;
940} 955}
941 956
942void do_not_expect_pk_fault(void) 957#define do_not_expect_pk_fault(msg) do { \
943{ 958 if (last_pkru_faults != pkru_faults) \
944 pkey_assert(last_pkru_faults == pkru_faults); 959 dprintf0("unexpected PK fault: %s\n", msg); \
945} 960 pkey_assert(last_pkru_faults == pkru_faults); \
961} while (0)
946 962
947int test_fds[10] = { -1 }; 963int test_fds[10] = { -1 };
948int nr_test_fds; 964int nr_test_fds;
@@ -1151,12 +1167,15 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
1151 pkey_assert(i < NR_PKEYS*2); 1167 pkey_assert(i < NR_PKEYS*2);
1152 1168
1153 /* 1169 /*
1154 * There are 16 pkeys supported in hardware. One is taken 1170 * There are 16 pkeys supported in hardware. Three are
1155 * up for the default (0) and another can be taken up by 1171 * allocated by the time we get here:
1156 * an execute-only mapping. Ensure that we can allocate 1172 * 1. The default key (0)
1157 * at least 14 (16-2). 1173 * 2. One possibly consumed by an execute-only mapping.
1174 * 3. One allocated by the test code and passed in via
1175 * 'pkey' to this function.
1176 * Ensure that we can allocate at least another 13 (16-3).
1158 */ 1177 */
1159 pkey_assert(i >= NR_PKEYS-2); 1178 pkey_assert(i >= NR_PKEYS-3);
1160 1179
1161 for (i = 0; i < nr_allocated_pkeys; i++) { 1180 for (i = 0; i < nr_allocated_pkeys; i++) {
1162 err = sys_pkey_free(allocated_pkeys[i]); 1181 err = sys_pkey_free(allocated_pkeys[i]);
@@ -1165,6 +1184,35 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
1165 } 1184 }
1166} 1185}
1167 1186
1187/*
1188 * pkey 0 is special. It is allocated by default, so you do not
1189 * have to call pkey_alloc() to use it first. Make sure that it
1190 * is usable.
1191 */
1192void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
1193{
1194 long size;
1195 int prot;
1196
1197 assert(pkey_last_malloc_record);
1198 size = pkey_last_malloc_record->size;
1199 /*
1200 * This is a bit of a hack. But mprotect() requires
1201 * huge-page-aligned sizes when operating on hugetlbfs.
1202 * So, make sure that we use something that's a multiple
1203 * of a huge page when we can.
1204 */
1205 if (size >= HPAGE_SIZE)
1206 size = HPAGE_SIZE;
1207 prot = pkey_last_malloc_record->prot;
1208
1209 /* Use pkey 0 */
1210 mprotect_pkey(ptr, size, prot, 0);
1211
1212 /* Make sure that we can set it back to the original pkey. */
1213 mprotect_pkey(ptr, size, prot, pkey);
1214}
1215
1168void test_ptrace_of_child(int *ptr, u16 pkey) 1216void test_ptrace_of_child(int *ptr, u16 pkey)
1169{ 1217{
1170 __attribute__((__unused__)) int peek_result; 1218 __attribute__((__unused__)) int peek_result;
@@ -1228,7 +1276,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey)
1228 pkey_assert(ret != -1); 1276 pkey_assert(ret != -1);
1229 /* Now access from the current task, and expect NO exception: */ 1277 /* Now access from the current task, and expect NO exception: */
1230 peek_result = read_ptr(plain_ptr); 1278 peek_result = read_ptr(plain_ptr);
1231 do_not_expect_pk_fault(); 1279 do_not_expect_pk_fault("read plain pointer after ptrace");
1232 1280
1233 ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); 1281 ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
1234 pkey_assert(ret != -1); 1282 pkey_assert(ret != -1);
@@ -1241,12 +1289,9 @@ void test_ptrace_of_child(int *ptr, u16 pkey)
1241 free(plain_ptr_unaligned); 1289 free(plain_ptr_unaligned);
1242} 1290}
1243 1291
1244void test_executing_on_unreadable_memory(int *ptr, u16 pkey) 1292void *get_pointer_to_instructions(void)
1245{ 1293{
1246 void *p1; 1294 void *p1;
1247 int scratch;
1248 int ptr_contents;
1249 int ret;
1250 1295
1251 p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); 1296 p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
1252 dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); 1297 dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
@@ -1256,7 +1301,23 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
1256 /* Point 'p1' at the *second* page of the function: */ 1301 /* Point 'p1' at the *second* page of the function: */
1257 p1 += PAGE_SIZE; 1302 p1 += PAGE_SIZE;
1258 1303
1304 /*
1305 * Try to ensure we fault this in on next touch to ensure
1306 * we get an instruction fault as opposed to a data one
1307 */
1259 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1308 madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1309
1310 return p1;
1311}
1312
1313void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
1314{
1315 void *p1;
1316 int scratch;
1317 int ptr_contents;
1318 int ret;
1319
1320 p1 = get_pointer_to_instructions();
1260 lots_o_noops_around_write(&scratch); 1321 lots_o_noops_around_write(&scratch);
1261 ptr_contents = read_ptr(p1); 1322 ptr_contents = read_ptr(p1);
1262 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); 1323 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
@@ -1272,12 +1333,55 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
1272 */ 1333 */
1273 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1334 madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1274 lots_o_noops_around_write(&scratch); 1335 lots_o_noops_around_write(&scratch);
1275 do_not_expect_pk_fault(); 1336 do_not_expect_pk_fault("executing on PROT_EXEC memory");
1276 ptr_contents = read_ptr(p1); 1337 ptr_contents = read_ptr(p1);
1277 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); 1338 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1278 expected_pk_fault(pkey); 1339 expected_pk_fault(pkey);
1279} 1340}
1280 1341
1342void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
1343{
1344 void *p1;
1345 int scratch;
1346 int ptr_contents;
1347 int ret;
1348
1349 dprintf1("%s() start\n", __func__);
1350
1351 p1 = get_pointer_to_instructions();
1352 lots_o_noops_around_write(&scratch);
1353 ptr_contents = read_ptr(p1);
1354 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1355
1356 /* Use a *normal* mprotect(), not mprotect_pkey(): */
1357 ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
1358 pkey_assert(!ret);
1359
1360 dprintf2("pkru: %x\n", rdpkru());
1361
1362 /* Make sure this is an *instruction* fault */
1363 madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1364 lots_o_noops_around_write(&scratch);
1365 do_not_expect_pk_fault("executing on PROT_EXEC memory");
1366 ptr_contents = read_ptr(p1);
1367 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1368 expected_pk_fault(UNKNOWN_PKEY);
1369
1370 /*
1371 * Put the memory back to non-PROT_EXEC. Should clear the
1372 * exec-only pkey off the VMA and allow it to be readable
1373 * again. Go to PROT_NONE first to check for a kernel bug
1374 * that did not clear the pkey when doing PROT_NONE.
1375 */
1376 ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
1377 pkey_assert(!ret);
1378
1379 ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
1380 pkey_assert(!ret);
1381 ptr_contents = read_ptr(p1);
1382 do_not_expect_pk_fault("plain read on recently PROT_EXEC area");
1383}
1384
1281void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) 1385void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
1282{ 1386{
1283 int size = PAGE_SIZE; 1387 int size = PAGE_SIZE;
@@ -1302,6 +1406,8 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = {
1302 test_kernel_gup_of_access_disabled_region, 1406 test_kernel_gup_of_access_disabled_region,
1303 test_kernel_gup_write_to_write_disabled_region, 1407 test_kernel_gup_write_to_write_disabled_region,
1304 test_executing_on_unreadable_memory, 1408 test_executing_on_unreadable_memory,
1409 test_implicit_mprotect_exec_only_memory,
1410 test_mprotect_with_pkey_0,
1305 test_ptrace_of_child, 1411 test_ptrace_of_child,
1306 test_pkey_syscalls_on_non_allocated_pkey, 1412 test_pkey_syscalls_on_non_allocated_pkey,
1307 test_pkey_syscalls_bad_args, 1413 test_pkey_syscalls_bad_args,