aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-10-10 14:01:51 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-10-10 14:01:51 -0400
commit93c26d7dc02380fe11e57ff0d152368743762169 (patch)
tree5234bc86561c6b8c7fd698a2d456add3c960db1f
parent5fa0eb0b4d4780fbd6d8a09850cc4fd539e9fe65 (diff)
parent6679dac513fd612f34d3a3d99d7b84ed6d5eb5cc (diff)
Merge branch 'mm-pkeys-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull protection keys syscall interface from Thomas Gleixner: "This is the final step of Protection Keys support which adds the syscalls so user space can actually allocate keys and protect memory areas with them. Details and usage examples can be found in the documentation. The mm side of this has been acked by Mel" * 'mm-pkeys-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/pkeys: Update documentation x86/mm/pkeys: Do not skip PKRU register if debug registers are not used x86/pkeys: Fix pkeys build breakage for some non-x86 arches x86/pkeys: Add self-tests x86/pkeys: Allow configuration of init_pkru x86/pkeys: Default to a restrictive init PKRU pkeys: Add details of system call use to Documentation/ generic syscalls: Wire up memory protection keys syscalls x86: Wire up protection keys system calls x86/pkeys: Allocation/free syscalls x86/pkeys: Make mprotect_key() mask off additional vm_flags mm: Implement new pkey_mprotect() system call x86/pkeys: Add fault handling for PF_PK page fault bit
-rw-r--r--Documentation/kernel-parameters.txt5
-rw-r--r--Documentation/x86/protection-keys.txt70
-rw-r--r--arch/alpha/include/uapi/asm/mman.h5
-rw-r--r--arch/mips/include/uapi/asm/mman.h5
-rw-r--r--arch/parisc/include/uapi/asm/mman.h5
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl5
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl5
-rw-r--r--arch/x86/include/asm/mmu.h8
-rw-r--r--arch/x86/include/asm/mmu_context.h25
-rw-r--r--arch/x86/include/asm/pkeys.h73
-rw-r--r--arch/x86/kernel/fpu/core.c4
-rw-r--r--arch/x86/kernel/fpu/xstate.c5
-rw-r--r--arch/x86/kernel/process_64.c13
-rw-r--r--arch/x86/mm/fault.c9
-rw-r--r--arch/x86/mm/pkeys.c142
-rw-r--r--arch/xtensa/include/uapi/asm/mman.h5
-rw-r--r--include/linux/pkeys.h41
-rw-r--r--include/linux/syscalls.h8
-rw-r--r--include/uapi/asm-generic/mman-common.h5
-rw-r--r--include/uapi/asm-generic/unistd.h12
-rw-r--r--kernel/sys_ni.c5
-rw-r--r--mm/mprotect.c90
-rw-r--r--tools/testing/selftests/x86/Makefile3
-rw-r--r--tools/testing/selftests/x86/pkey-helpers.h219
-rw-r--r--tools/testing/selftests/x86/protection_keys.c1410
25 files changed, 2127 insertions, 50 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index ec8d81417dc8..705fb915cbf7 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1666,6 +1666,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1666 1666
1667 initrd= [BOOT] Specify the location of the initial ramdisk 1667 initrd= [BOOT] Specify the location of the initial ramdisk
1668 1668
1669 init_pkru= [x86] Specify the default memory protection keys rights
1670 register contents for all processes. 0x55555554 by
1671 default (disallow access to all but pkey 0). Can
1672 override in debugfs after boot.
1673
1669 inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver 1674 inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver
1670 Format: <irq> 1675 Format: <irq>
1671 1676
diff --git a/Documentation/x86/protection-keys.txt b/Documentation/x86/protection-keys.txt
index c281ded1ba16..b64304540821 100644
--- a/Documentation/x86/protection-keys.txt
+++ b/Documentation/x86/protection-keys.txt
@@ -18,10 +18,68 @@ even though there is theoretically space in the PAE PTEs. These
18permissions are enforced on data access only and have no effect on 18permissions are enforced on data access only and have no effect on
19instruction fetches. 19instruction fetches.
20 20
21=========================== Config Option =========================== 21=========================== Syscalls ===========================
22 22
23This config option adds approximately 1.5kb of text. and 50 bytes of 23There are 3 system calls which directly interact with pkeys:
24data to the executable. A workload which does large O_DIRECT reads 24
25of holes in XFS files was run to exercise get_user_pages_fast(). No 25 int pkey_alloc(unsigned long flags, unsigned long init_access_rights)
26performance delta was observed with the config option 26 int pkey_free(int pkey);
27enabled or disabled. 27 int pkey_mprotect(unsigned long start, size_t len,
28 unsigned long prot, int pkey);
29
30Before a pkey can be used, it must first be allocated with
31pkey_alloc(). An application calls the WRPKRU instruction
32directly in order to change access permissions to memory covered
33with a key. In this example WRPKRU is wrapped by a C function
34called pkey_set().
35
36 int real_prot = PROT_READ|PROT_WRITE;
37 pkey = pkey_alloc(0, PKEY_DENY_WRITE);
38 ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
39 ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
40 ... application runs here
41
42Now, if the application needs to update the data at 'ptr', it can
43gain access, do the update, then remove its write access:
44
45 pkey_set(pkey, 0); // clear PKEY_DENY_WRITE
46 *ptr = foo; // assign something
47 pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again
48
49Now when it frees the memory, it will also free the pkey since it
50is no longer in use:
51
52 munmap(ptr, PAGE_SIZE);
53 pkey_free(pkey);
54
55(Note: pkey_set() is a wrapper for the RDPKRU and WRPKRU instructions.
56 An example implementation can be found in
57 tools/testing/selftests/x86/protection_keys.c)
58
59=========================== Behavior ===========================
60
61The kernel attempts to make protection keys consistent with the
62behavior of a plain mprotect(). For instance if you do this:
63
64 mprotect(ptr, size, PROT_NONE);
65 something(ptr);
66
67you can expect the same effects with protection keys when doing this:
68
69 pkey = pkey_alloc(0, PKEY_DISABLE_WRITE | PKEY_DISABLE_READ);
70 pkey_mprotect(ptr, size, PROT_READ|PROT_WRITE, pkey);
71 something(ptr);
72
73That should be true whether something() is a direct access to 'ptr'
74like:
75
76 *ptr = foo;
77
78or when the kernel does the access on the application's behalf like
79with a read():
80
81 read(fd, ptr, 1);
82
83The kernel will send a SIGSEGV in both cases, but si_code will be set
84to SEGV_PKERR when violating protection keys versus SEGV_ACCERR when
85the plain mprotect() permissions are violated.
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index fec1947b8dbc..02760f6e6ca4 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -78,4 +78,9 @@
78#define MAP_HUGE_SHIFT 26 78#define MAP_HUGE_SHIFT 26
79#define MAP_HUGE_MASK 0x3f 79#define MAP_HUGE_MASK 0x3f
80 80
81#define PKEY_DISABLE_ACCESS 0x1
82#define PKEY_DISABLE_WRITE 0x2
83#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
84 PKEY_DISABLE_WRITE)
85
81#endif /* __ALPHA_MMAN_H__ */ 86#endif /* __ALPHA_MMAN_H__ */
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index ccdcfcbb24aa..655e2fb5395b 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -105,4 +105,9 @@
105#define MAP_HUGE_SHIFT 26 105#define MAP_HUGE_SHIFT 26
106#define MAP_HUGE_MASK 0x3f 106#define MAP_HUGE_MASK 0x3f
107 107
108#define PKEY_DISABLE_ACCESS 0x1
109#define PKEY_DISABLE_WRITE 0x2
110#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
111 PKEY_DISABLE_WRITE)
112
108#endif /* _ASM_MMAN_H */ 113#endif /* _ASM_MMAN_H */
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index f3db7d8eb0c2..5979745815a5 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -75,4 +75,9 @@
75#define MAP_HUGE_SHIFT 26 75#define MAP_HUGE_SHIFT 26
76#define MAP_HUGE_MASK 0x3f 76#define MAP_HUGE_MASK 0x3f
77 77
78#define PKEY_DISABLE_ACCESS 0x1
79#define PKEY_DISABLE_WRITE 0x2
80#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
81 PKEY_DISABLE_WRITE)
82
78#endif /* __PARISC_MMAN_H__ */ 83#endif /* __PARISC_MMAN_H__ */
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index f848572169ea..ff6ef7b30822 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -386,3 +386,8 @@
386377 i386 copy_file_range sys_copy_file_range 386377 i386 copy_file_range sys_copy_file_range
387378 i386 preadv2 sys_preadv2 compat_sys_preadv2 387378 i386 preadv2 sys_preadv2 compat_sys_preadv2
388379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2 388379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2
389380 i386 pkey_mprotect sys_pkey_mprotect
390381 i386 pkey_alloc sys_pkey_alloc
391382 i386 pkey_free sys_pkey_free
392#383 i386 pkey_get sys_pkey_get
393#384 i386 pkey_set sys_pkey_set
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index e9ce9c7c39b4..2f024d02511d 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -335,6 +335,11 @@
335326 common copy_file_range sys_copy_file_range 335326 common copy_file_range sys_copy_file_range
336327 64 preadv2 sys_preadv2 336327 64 preadv2 sys_preadv2
337328 64 pwritev2 sys_pwritev2 337328 64 pwritev2 sys_pwritev2
338329 common pkey_mprotect sys_pkey_mprotect
339330 common pkey_alloc sys_pkey_alloc
340331 common pkey_free sys_pkey_free
341#332 common pkey_get sys_pkey_get
342#333 common pkey_set sys_pkey_set
338 343
339# 344#
340# x32-specific system call numbers start at 512 to avoid cache impact 345# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 1ea0baef1175..72198c64e646 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -23,6 +23,14 @@ typedef struct {
23 const struct vdso_image *vdso_image; /* vdso image in use */ 23 const struct vdso_image *vdso_image; /* vdso image in use */
24 24
25 atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ 25 atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */
26#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
27 /*
28 * One bit per protection key says whether userspace can
29 * use it or not. protected by mmap_sem.
30 */
31 u16 pkey_allocation_map;
32 s16 execute_only_pkey;
33#endif
26} mm_context_t; 34} mm_context_t;
27 35
28#ifdef CONFIG_SMP 36#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index d8abfcf524d1..8e0a9fe86de4 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -4,6 +4,7 @@
4#include <asm/desc.h> 4#include <asm/desc.h>
5#include <linux/atomic.h> 5#include <linux/atomic.h>
6#include <linux/mm_types.h> 6#include <linux/mm_types.h>
7#include <linux/pkeys.h>
7 8
8#include <trace/events/tlb.h> 9#include <trace/events/tlb.h>
9 10
@@ -107,7 +108,16 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
107static inline int init_new_context(struct task_struct *tsk, 108static inline int init_new_context(struct task_struct *tsk,
108 struct mm_struct *mm) 109 struct mm_struct *mm)
109{ 110{
111 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
112 if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
113 /* pkey 0 is the default and always allocated */
114 mm->context.pkey_allocation_map = 0x1;
115 /* -1 means unallocated or invalid */
116 mm->context.execute_only_pkey = -1;
117 }
118 #endif
110 init_new_context_ldt(tsk, mm); 119 init_new_context_ldt(tsk, mm);
120
111 return 0; 121 return 0;
112} 122}
113static inline void destroy_context(struct mm_struct *mm) 123static inline void destroy_context(struct mm_struct *mm)
@@ -195,16 +205,20 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
195 mpx_notify_unmap(mm, vma, start, end); 205 mpx_notify_unmap(mm, vma, start, end);
196} 206}
197 207
208#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
198static inline int vma_pkey(struct vm_area_struct *vma) 209static inline int vma_pkey(struct vm_area_struct *vma)
199{ 210{
200 u16 pkey = 0;
201#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
202 unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | 211 unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
203 VM_PKEY_BIT2 | VM_PKEY_BIT3; 212 VM_PKEY_BIT2 | VM_PKEY_BIT3;
204 pkey = (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT; 213
205#endif 214 return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
206 return pkey; 215}
216#else
217static inline int vma_pkey(struct vm_area_struct *vma)
218{
219 return 0;
207} 220}
221#endif
208 222
209static inline bool __pkru_allows_pkey(u16 pkey, bool write) 223static inline bool __pkru_allows_pkey(u16 pkey, bool write)
210{ 224{
@@ -258,5 +272,4 @@ static inline bool arch_pte_access_permitted(pte_t pte, bool write)
258{ 272{
259 return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write); 273 return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
260} 274}
261
262#endif /* _ASM_X86_MMU_CONTEXT_H */ 275#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 7b84565c916c..34684adb6899 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -10,7 +10,6 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
10 * Try to dedicate one of the protection keys to be used as an 10 * Try to dedicate one of the protection keys to be used as an
11 * execute-only protection key. 11 * execute-only protection key.
12 */ 12 */
13#define PKEY_DEDICATED_EXECUTE_ONLY 15
14extern int __execute_only_pkey(struct mm_struct *mm); 13extern int __execute_only_pkey(struct mm_struct *mm);
15static inline int execute_only_pkey(struct mm_struct *mm) 14static inline int execute_only_pkey(struct mm_struct *mm)
16{ 15{
@@ -31,4 +30,76 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
31 return __arch_override_mprotect_pkey(vma, prot, pkey); 30 return __arch_override_mprotect_pkey(vma, prot, pkey);
32} 31}
33 32
33extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
34 unsigned long init_val);
35
36#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)
37
38#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map)
39#define mm_set_pkey_allocated(mm, pkey) do { \
40 mm_pkey_allocation_map(mm) |= (1U << pkey); \
41} while (0)
42#define mm_set_pkey_free(mm, pkey) do { \
43 mm_pkey_allocation_map(mm) &= ~(1U << pkey); \
44} while (0)
45
46static inline
47bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
48{
49 return mm_pkey_allocation_map(mm) & (1U << pkey);
50}
51
52/*
53 * Returns a positive, 4-bit key on success, or -1 on failure.
54 */
55static inline
56int mm_pkey_alloc(struct mm_struct *mm)
57{
58 /*
59 * Note: this is the one and only place we make sure
60 * that the pkey is valid as far as the hardware is
61 * concerned. The rest of the kernel trusts that
62 * only good, valid pkeys come out of here.
63 */
64 u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1);
65 int ret;
66
67 /*
68 * Are we out of pkeys? We must handle this specially
69 * because ffz() behavior is undefined if there are no
70 * zeros.
71 */
72 if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
73 return -1;
74
75 ret = ffz(mm_pkey_allocation_map(mm));
76
77 mm_set_pkey_allocated(mm, ret);
78
79 return ret;
80}
81
82static inline
83int mm_pkey_free(struct mm_struct *mm, int pkey)
84{
85 /*
86 * pkey 0 is special, always allocated and can never
87 * be freed.
88 */
89 if (!pkey)
90 return -EINVAL;
91 if (!mm_pkey_is_allocated(mm, pkey))
92 return -EINVAL;
93
94 mm_set_pkey_free(mm, pkey);
95
96 return 0;
97}
98
99extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
100 unsigned long init_val);
101extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
102 unsigned long init_val);
103extern void copy_init_pkru_to_fpregs(void);
104
34#endif /*_ASM_X86_PKEYS_H */ 105#endif /*_ASM_X86_PKEYS_H */
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 3fc03a09a93b..47004010ad5d 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -12,6 +12,7 @@
12#include <asm/traps.h> 12#include <asm/traps.h>
13 13
14#include <linux/hardirq.h> 14#include <linux/hardirq.h>
15#include <linux/pkeys.h>
15 16
16#define CREATE_TRACE_POINTS 17#define CREATE_TRACE_POINTS
17#include <asm/trace/fpu.h> 18#include <asm/trace/fpu.h>
@@ -505,6 +506,9 @@ static inline void copy_init_fpstate_to_fpregs(void)
505 copy_kernel_to_fxregs(&init_fpstate.fxsave); 506 copy_kernel_to_fxregs(&init_fpstate.fxsave);
506 else 507 else
507 copy_kernel_to_fregs(&init_fpstate.fsave); 508 copy_kernel_to_fregs(&init_fpstate.fsave);
509
510 if (boot_cpu_has(X86_FEATURE_OSPKE))
511 copy_init_pkru_to_fpregs();
508} 512}
509 513
510/* 514/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 01567aa87503..124aa5c593f8 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -5,6 +5,7 @@
5 */ 5 */
6#include <linux/compat.h> 6#include <linux/compat.h>
7#include <linux/cpu.h> 7#include <linux/cpu.h>
8#include <linux/mman.h>
8#include <linux/pkeys.h> 9#include <linux/pkeys.h>
9 10
10#include <asm/fpu/api.h> 11#include <asm/fpu/api.h>
@@ -866,9 +867,10 @@ const void *get_xsave_field_ptr(int xsave_state)
866 return get_xsave_addr(&fpu->state.xsave, xsave_state); 867 return get_xsave_addr(&fpu->state.xsave, xsave_state);
867} 868}
868 869
870#ifdef CONFIG_ARCH_HAS_PKEYS
871
869#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) 872#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
870#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) 873#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
871
872/* 874/*
873 * This will go out and modify PKRU register to set the access 875 * This will go out and modify PKRU register to set the access
874 * rights for @pkey to @init_val. 876 * rights for @pkey to @init_val.
@@ -914,6 +916,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
914 916
915 return 0; 917 return 0;
916} 918}
919#endif /* ! CONFIG_ARCH_HAS_PKEYS */
917 920
918/* 921/*
919 * This is similar to user_regset_copyout(), but will not add offset to 922 * This is similar to user_regset_copyout(), but will not add offset to
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ee944bd2310d..b3760b3c1ca0 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -109,12 +109,13 @@ void __show_regs(struct pt_regs *regs, int all)
109 get_debugreg(d7, 7); 109 get_debugreg(d7, 7);
110 110
111 /* Only print out debug registers if they are in their non-default state. */ 111 /* Only print out debug registers if they are in their non-default state. */
112 if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && 112 if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
113 (d6 == DR6_RESERVED) && (d7 == 0x400)) 113 (d6 == DR6_RESERVED) && (d7 == 0x400))) {
114 return; 114 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
115 115 d0, d1, d2);
116 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 116 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
117 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 117 d3, d6, d7);
118 }
118 119
119 if (boot_cpu_has(X86_FEATURE_OSPKE)) 120 if (boot_cpu_has(X86_FEATURE_OSPKE))
120 printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); 121 printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 1e525122cbe4..4dc13340653e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1144,6 +1144,15 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
1144{ 1144{
1145 /* This is only called for the current mm, so: */ 1145 /* This is only called for the current mm, so: */
1146 bool foreign = false; 1146 bool foreign = false;
1147
1148 /*
1149 * Read or write was blocked by protection keys. This is
1150 * always an unconditional error and can never result in
1151 * a follow-up action to resolve the fault, like a COW.
1152 */
1153 if (error_code & PF_PK)
1154 return 1;
1155
1147 /* 1156 /*
1148 * Make sure to check the VMA so that we do not perform 1157 * Make sure to check the VMA so that we do not perform
1149 * faults just to hit a PF_PK as soon as we fill in a 1158 * faults just to hit a PF_PK as soon as we fill in a
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index e8c474451928..f88ce0e5efd9 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -11,6 +11,7 @@
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details. 12 * more details.
13 */ 13 */
14#include <linux/debugfs.h> /* debugfs_create_u32() */
14#include <linux/mm_types.h> /* mm_struct, vma, etc... */ 15#include <linux/mm_types.h> /* mm_struct, vma, etc... */
15#include <linux/pkeys.h> /* PKEY_* */ 16#include <linux/pkeys.h> /* PKEY_* */
16#include <uapi/asm-generic/mman-common.h> 17#include <uapi/asm-generic/mman-common.h>
@@ -21,8 +22,19 @@
21 22
22int __execute_only_pkey(struct mm_struct *mm) 23int __execute_only_pkey(struct mm_struct *mm)
23{ 24{
25 bool need_to_set_mm_pkey = false;
26 int execute_only_pkey = mm->context.execute_only_pkey;
24 int ret; 27 int ret;
25 28
29 /* Do we need to assign a pkey for mm's execute-only maps? */
30 if (execute_only_pkey == -1) {
31 /* Go allocate one to use, which might fail */
32 execute_only_pkey = mm_pkey_alloc(mm);
33 if (execute_only_pkey < 0)
34 return -1;
35 need_to_set_mm_pkey = true;
36 }
37
26 /* 38 /*
27 * We do not want to go through the relatively costly 39 * We do not want to go through the relatively costly
28 * dance to set PKRU if we do not need to. Check it 40 * dance to set PKRU if we do not need to. Check it
@@ -32,22 +44,33 @@ int __execute_only_pkey(struct mm_struct *mm)
32 * can make fpregs inactive. 44 * can make fpregs inactive.
33 */ 45 */
34 preempt_disable(); 46 preempt_disable();
35 if (fpregs_active() && 47 if (!need_to_set_mm_pkey &&
36 !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) { 48 fpregs_active() &&
49 !__pkru_allows_read(read_pkru(), execute_only_pkey)) {
37 preempt_enable(); 50 preempt_enable();
38 return PKEY_DEDICATED_EXECUTE_ONLY; 51 return execute_only_pkey;
39 } 52 }
40 preempt_enable(); 53 preempt_enable();
41 ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY, 54
55 /*
56 * Set up PKRU so that it denies access for everything
57 * other than execution.
58 */
59 ret = arch_set_user_pkey_access(current, execute_only_pkey,
42 PKEY_DISABLE_ACCESS); 60 PKEY_DISABLE_ACCESS);
43 /* 61 /*
44 * If the PKRU-set operation failed somehow, just return 62 * If the PKRU-set operation failed somehow, just return
45 * 0 and effectively disable execute-only support. 63 * 0 and effectively disable execute-only support.
46 */ 64 */
47 if (ret) 65 if (ret) {
48 return 0; 66 mm_set_pkey_free(mm, execute_only_pkey);
67 return -1;
68 }
49 69
50 return PKEY_DEDICATED_EXECUTE_ONLY; 70 /* We got one, store it and use it from here on out */
71 if (need_to_set_mm_pkey)
72 mm->context.execute_only_pkey = execute_only_pkey;
73 return execute_only_pkey;
51} 74}
52 75
53static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) 76static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
@@ -55,7 +78,7 @@ static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
55 /* Do this check first since the vm_flags should be hot */ 78 /* Do this check first since the vm_flags should be hot */
56 if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) 79 if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
57 return false; 80 return false;
58 if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY) 81 if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey)
59 return false; 82 return false;
60 83
61 return true; 84 return true;
@@ -99,3 +122,106 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
99 */ 122 */
100 return vma_pkey(vma); 123 return vma_pkey(vma);
101} 124}
125
126#define PKRU_AD_KEY(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
127
128/*
129 * Make the default PKRU value (at execve() time) as restrictive
130 * as possible. This ensures that any threads clone()'d early
131 * in the process's lifetime will not accidentally get access
132 * to data which is pkey-protected later on.
133 */
134u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
135 PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
136 PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
137 PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
138 PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
139
140/*
141 * Called from the FPU code when creating a fresh set of FPU
142 * registers. This is called from a very specific context where
143 * we know the FPU regstiers are safe for use and we can use PKRU
144 * directly. The fact that PKRU is only available when we are
145 * using eagerfpu mode makes this possible.
146 */
147void copy_init_pkru_to_fpregs(void)
148{
149 u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value);
150 /*
151 * Any write to PKRU takes it out of the XSAVE 'init
152 * state' which increases context switch cost. Avoid
153 * writing 0 when PKRU was already 0.
154 */
155 if (!init_pkru_value_snapshot && !read_pkru())
156 return;
157 /*
158 * Override the PKRU state that came from 'init_fpstate'
159 * with the baseline from the process.
160 */
161 write_pkru(init_pkru_value_snapshot);
162}
163
164static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
165 size_t count, loff_t *ppos)
166{
167 char buf[32];
168 unsigned int len;
169
170 len = sprintf(buf, "0x%x\n", init_pkru_value);
171 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
172}
173
174static ssize_t init_pkru_write_file(struct file *file,
175 const char __user *user_buf, size_t count, loff_t *ppos)
176{
177 char buf[32];
178 ssize_t len;
179 u32 new_init_pkru;
180
181 len = min(count, sizeof(buf) - 1);
182 if (copy_from_user(buf, user_buf, len))
183 return -EFAULT;
184
185 /* Make the buffer a valid string that we can not overrun */
186 buf[len] = '\0';
187 if (kstrtouint(buf, 0, &new_init_pkru))
188 return -EINVAL;
189
190 /*
191 * Don't allow insane settings that will blow the system
192 * up immediately if someone attempts to disable access
193 * or writes to pkey 0.
194 */
195 if (new_init_pkru & (PKRU_AD_BIT|PKRU_WD_BIT))
196 return -EINVAL;
197
198 WRITE_ONCE(init_pkru_value, new_init_pkru);
199 return count;
200}
201
202static const struct file_operations fops_init_pkru = {
203 .read = init_pkru_read_file,
204 .write = init_pkru_write_file,
205 .llseek = default_llseek,
206};
207
208static int __init create_init_pkru_value(void)
209{
210 debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR,
211 arch_debugfs_dir, NULL, &fops_init_pkru);
212 return 0;
213}
214late_initcall(create_init_pkru_value);
215
216static __init int setup_init_pkru(char *opt)
217{
218 u32 new_init_pkru;
219
220 if (kstrtouint(opt, 0, &new_init_pkru))
221 return 1;
222
223 WRITE_ONCE(init_pkru_value, new_init_pkru);
224
225 return 1;
226}
227__setup("init_pkru=", setup_init_pkru);
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 9e079d49e7f2..24365b30aae9 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -117,4 +117,9 @@
117#define MAP_HUGE_SHIFT 26 117#define MAP_HUGE_SHIFT 26
118#define MAP_HUGE_MASK 0x3f 118#define MAP_HUGE_MASK 0x3f
119 119
120#define PKEY_DISABLE_ACCESS 0x1
121#define PKEY_DISABLE_WRITE 0x2
122#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
123 PKEY_DISABLE_WRITE)
124
120#endif /* _XTENSA_MMAN_H */ 125#endif /* _XTENSA_MMAN_H */
diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
index 1d405a2b7272..e4c08c1ff0c5 100644
--- a/include/linux/pkeys.h
+++ b/include/linux/pkeys.h
@@ -4,11 +4,6 @@
4#include <linux/mm_types.h> 4#include <linux/mm_types.h>
5#include <asm/mmu_context.h> 5#include <asm/mmu_context.h>
6 6
7#define PKEY_DISABLE_ACCESS 0x1
8#define PKEY_DISABLE_WRITE 0x2
9#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
10 PKEY_DISABLE_WRITE)
11
12#ifdef CONFIG_ARCH_HAS_PKEYS 7#ifdef CONFIG_ARCH_HAS_PKEYS
13#include <asm/pkeys.h> 8#include <asm/pkeys.h>
14#else /* ! CONFIG_ARCH_HAS_PKEYS */ 9#else /* ! CONFIG_ARCH_HAS_PKEYS */
@@ -16,18 +11,34 @@
16#define execute_only_pkey(mm) (0) 11#define execute_only_pkey(mm) (0)
17#define arch_override_mprotect_pkey(vma, prot, pkey) (0) 12#define arch_override_mprotect_pkey(vma, prot, pkey) (0)
18#define PKEY_DEDICATED_EXECUTE_ONLY 0 13#define PKEY_DEDICATED_EXECUTE_ONLY 0
19#endif /* ! CONFIG_ARCH_HAS_PKEYS */ 14#define ARCH_VM_PKEY_FLAGS 0
15
16static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
17{
18 return (pkey == 0);
19}
20
21static inline int mm_pkey_alloc(struct mm_struct *mm)
22{
23 return -1;
24}
20 25
21/* 26static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
22 * This is called from mprotect_pkey().
23 *
24 * Returns true if the protection keys is valid.
25 */
26static inline bool validate_pkey(int pkey)
27{ 27{
28 if (pkey < 0) 28 WARN_ONCE(1, "free of protection key when disabled");
29 return false; 29 return -EINVAL;
30 return (pkey < arch_max_pkey());
31} 30}
32 31
32static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
33 unsigned long init_val)
34{
35 return 0;
36}
37
38static inline void copy_init_pkru_to_fpregs(void)
39{
40}
41
42#endif /* ! CONFIG_ARCH_HAS_PKEYS */
43
33#endif /* _LINUX_PKEYS_H */ 44#endif /* _LINUX_PKEYS_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d02239022bd0..0d7abb8b7315 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -898,4 +898,12 @@ asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
898 898
899asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags); 899asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
900 900
901asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
902 unsigned long prot, int pkey);
903asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
904asmlinkage long sys_pkey_free(int pkey);
905//asmlinkage long sys_pkey_get(int pkey, unsigned long flags);
906//asmlinkage long sys_pkey_set(int pkey, unsigned long access_rights,
907// unsigned long flags);
908
901#endif 909#endif
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 58274382a616..8c27db0c5c08 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -72,4 +72,9 @@
72#define MAP_HUGE_SHIFT 26 72#define MAP_HUGE_SHIFT 26
73#define MAP_HUGE_MASK 0x3f 73#define MAP_HUGE_MASK 0x3f
74 74
75#define PKEY_DISABLE_ACCESS 0x1
76#define PKEY_DISABLE_WRITE 0x2
77#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
78 PKEY_DISABLE_WRITE)
79
75#endif /* __ASM_GENERIC_MMAN_COMMON_H */ 80#endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index a26415b5151c..dbfee7e86ba6 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -724,9 +724,19 @@ __SYSCALL(__NR_copy_file_range, sys_copy_file_range)
724__SC_COMP(__NR_preadv2, sys_preadv2, compat_sys_preadv2) 724__SC_COMP(__NR_preadv2, sys_preadv2, compat_sys_preadv2)
725#define __NR_pwritev2 287 725#define __NR_pwritev2 287
726__SC_COMP(__NR_pwritev2, sys_pwritev2, compat_sys_pwritev2) 726__SC_COMP(__NR_pwritev2, sys_pwritev2, compat_sys_pwritev2)
727#define __NR_pkey_mprotect 288
728__SYSCALL(__NR_pkey_mprotect, sys_pkey_mprotect)
729#define __NR_pkey_alloc 289
730__SYSCALL(__NR_pkey_alloc, sys_pkey_alloc)
731#define __NR_pkey_free 290
732__SYSCALL(__NR_pkey_free, sys_pkey_free)
733#define __NR_pkey_get 291
734//__SYSCALL(__NR_pkey_get, sys_pkey_get)
735#define __NR_pkey_set 292
736//__SYSCALL(__NR_pkey_set, sys_pkey_set)
727 737
728#undef __NR_syscalls 738#undef __NR_syscalls
729#define __NR_syscalls 288 739#define __NR_syscalls 291
730 740
731/* 741/*
732 * All syscalls below here should go away really, 742 * All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2c5e3a8e00d7..635482e60ca3 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -250,3 +250,8 @@ cond_syscall(sys_execveat);
250 250
251/* membarrier */ 251/* membarrier */
252cond_syscall(sys_membarrier); 252cond_syscall(sys_membarrier);
253
254/* memory protection keys */
255cond_syscall(sys_pkey_mprotect);
256cond_syscall(sys_pkey_alloc);
257cond_syscall(sys_pkey_free);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ec91dfd3f900..bcdbe62f3e6d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,11 +23,13 @@
23#include <linux/mmu_notifier.h> 23#include <linux/mmu_notifier.h>
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/perf_event.h> 25#include <linux/perf_event.h>
26#include <linux/pkeys.h>
26#include <linux/ksm.h> 27#include <linux/ksm.h>
27#include <linux/pkeys.h> 28#include <linux/pkeys.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include <asm/pgtable.h> 30#include <asm/pgtable.h>
30#include <asm/cacheflush.h> 31#include <asm/cacheflush.h>
32#include <asm/mmu_context.h>
31#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
32 34
33#include "internal.h" 35#include "internal.h"
@@ -353,8 +355,11 @@ fail:
353 return error; 355 return error;
354} 356}
355 357
356SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, 358/*
357 unsigned long, prot) 359 * pkey==-1 when doing a legacy mprotect()
360 */
361static int do_mprotect_pkey(unsigned long start, size_t len,
362 unsigned long prot, int pkey)
358{ 363{
359 unsigned long nstart, end, tmp, reqprot; 364 unsigned long nstart, end, tmp, reqprot;
360 struct vm_area_struct *vma, *prev; 365 struct vm_area_struct *vma, *prev;
@@ -383,6 +388,14 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
383 if (down_write_killable(&current->mm->mmap_sem)) 388 if (down_write_killable(&current->mm->mmap_sem))
384 return -EINTR; 389 return -EINTR;
385 390
391 /*
392 * If userspace did not allocate the pkey, do not let
393 * them use it here.
394 */
395 error = -EINVAL;
396 if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
397 goto out;
398
386 vma = find_vma(current->mm, start); 399 vma = find_vma(current->mm, start);
387 error = -ENOMEM; 400 error = -ENOMEM;
388 if (!vma) 401 if (!vma)
@@ -409,8 +422,9 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
409 prev = vma; 422 prev = vma;
410 423
411 for (nstart = start ; ; ) { 424 for (nstart = start ; ; ) {
425 unsigned long mask_off_old_flags;
412 unsigned long newflags; 426 unsigned long newflags;
413 int pkey = arch_override_mprotect_pkey(vma, prot, -1); 427 int new_vma_pkey;
414 428
415 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 429 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
416 430
@@ -418,8 +432,17 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
418 if (rier && (vma->vm_flags & VM_MAYEXEC)) 432 if (rier && (vma->vm_flags & VM_MAYEXEC))
419 prot |= PROT_EXEC; 433 prot |= PROT_EXEC;
420 434
421 newflags = calc_vm_prot_bits(prot, pkey); 435 /*
422 newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); 436 * Each mprotect() call explicitly passes r/w/x permissions.
437 * If a permission is not passed to mprotect(), it must be
438 * cleared from the VMA.
439 */
440 mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
441 ARCH_VM_PKEY_FLAGS;
442
443 new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
444 newflags = calc_vm_prot_bits(prot, new_vma_pkey);
445 newflags |= (vma->vm_flags & ~mask_off_old_flags);
423 446
424 /* newflags >> 4 shift VM_MAY% in place of VM_% */ 447 /* newflags >> 4 shift VM_MAY% in place of VM_% */
425 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { 448 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
@@ -455,3 +478,60 @@ out:
455 up_write(&current->mm->mmap_sem); 478 up_write(&current->mm->mmap_sem);
456 return error; 479 return error;
457} 480}
481
482SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
483 unsigned long, prot)
484{
485 return do_mprotect_pkey(start, len, prot, -1);
486}
487
488SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
489 unsigned long, prot, int, pkey)
490{
491 return do_mprotect_pkey(start, len, prot, pkey);
492}
493
494SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
495{
496 int pkey;
497 int ret;
498
499 /* No flags supported yet. */
500 if (flags)
501 return -EINVAL;
502 /* check for unsupported init values */
503 if (init_val & ~PKEY_ACCESS_MASK)
504 return -EINVAL;
505
506 down_write(&current->mm->mmap_sem);
507 pkey = mm_pkey_alloc(current->mm);
508
509 ret = -ENOSPC;
510 if (pkey == -1)
511 goto out;
512
513 ret = arch_set_user_pkey_access(current, pkey, init_val);
514 if (ret) {
515 mm_pkey_free(current->mm, pkey);
516 goto out;
517 }
518 ret = pkey;
519out:
520 up_write(&current->mm->mmap_sem);
521 return ret;
522}
523
524SYSCALL_DEFINE1(pkey_free, int, pkey)
525{
526 int ret;
527
528 down_write(&current->mm->mmap_sem);
529 ret = mm_pkey_free(current->mm, pkey);
530 up_write(&current->mm->mmap_sem);
531
532 /*
533 * We could provie warnings or errors if any VMA still
534 * has the pkey set here.
535 */
536 return ret;
537}
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 4f747ee07f10..a89f80a5b711 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -5,7 +5,8 @@ include ../lib.mk
5.PHONY: all all_32 all_64 warn_32bit_failure clean 5.PHONY: all all_32 all_64 warn_32bit_failure clean
6 6
7TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \ 7TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \
8 check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test 8 check_initial_reg_state sigreturn ldt_gdt iopl \
9 protection_keys
9TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ 10TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
10 test_FCMOV test_FCOMI test_FISTTP \ 11 test_FCMOV test_FCOMI test_FISTTP \
11 vdso_restorer 12 vdso_restorer
diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/x86/pkey-helpers.h
new file mode 100644
index 000000000000..b20293956eec
--- /dev/null
+++ b/tools/testing/selftests/x86/pkey-helpers.h
@@ -0,0 +1,219 @@
1#ifndef _PKEYS_HELPER_H
2#define _PKEYS_HELPER_H
3#define _GNU_SOURCE
4#include <string.h>
5#include <stdarg.h>
6#include <stdio.h>
7#include <stdint.h>
8#include <stdbool.h>
9#include <signal.h>
10#include <assert.h>
11#include <stdlib.h>
12#include <ucontext.h>
13#include <sys/mman.h>
14
15#define NR_PKEYS 16
16#define PKRU_BITS_PER_PKEY 2
17
18#ifndef DEBUG_LEVEL
19#define DEBUG_LEVEL 0
20#endif
21#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
22extern int dprint_in_signal;
23extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
24static inline void sigsafe_printf(const char *format, ...)
25{
26 va_list ap;
27
28 va_start(ap, format);
29 if (!dprint_in_signal) {
30 vprintf(format, ap);
31 } else {
32 int len = vsnprintf(dprint_in_signal_buffer,
33 DPRINT_IN_SIGNAL_BUF_SIZE,
34 format, ap);
35 /*
36 * len is amount that would have been printed,
37 * but actual write is truncated at BUF_SIZE.
38 */
39 if (len > DPRINT_IN_SIGNAL_BUF_SIZE)
40 len = DPRINT_IN_SIGNAL_BUF_SIZE;
41 write(1, dprint_in_signal_buffer, len);
42 }
43 va_end(ap);
44}
45#define dprintf_level(level, args...) do { \
46 if (level <= DEBUG_LEVEL) \
47 sigsafe_printf(args); \
48 fflush(NULL); \
49} while (0)
50#define dprintf0(args...) dprintf_level(0, args)
51#define dprintf1(args...) dprintf_level(1, args)
52#define dprintf2(args...) dprintf_level(2, args)
53#define dprintf3(args...) dprintf_level(3, args)
54#define dprintf4(args...) dprintf_level(4, args)
55
56extern unsigned int shadow_pkru;
57static inline unsigned int __rdpkru(void)
58{
59 unsigned int eax, edx;
60 unsigned int ecx = 0;
61 unsigned int pkru;
62
63 asm volatile(".byte 0x0f,0x01,0xee\n\t"
64 : "=a" (eax), "=d" (edx)
65 : "c" (ecx));
66 pkru = eax;
67 return pkru;
68}
69
70static inline unsigned int _rdpkru(int line)
71{
72 unsigned int pkru = __rdpkru();
73
74 dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n",
75 line, pkru, shadow_pkru);
76 assert(pkru == shadow_pkru);
77
78 return pkru;
79}
80
81#define rdpkru() _rdpkru(__LINE__)
82
83static inline void __wrpkru(unsigned int pkru)
84{
85 unsigned int eax = pkru;
86 unsigned int ecx = 0;
87 unsigned int edx = 0;
88
89 dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
90 asm volatile(".byte 0x0f,0x01,0xef\n\t"
91 : : "a" (eax), "c" (ecx), "d" (edx));
92 assert(pkru == __rdpkru());
93}
94
95static inline void wrpkru(unsigned int pkru)
96{
97 dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
98 /* will do the shadow check for us: */
99 rdpkru();
100 __wrpkru(pkru);
101 shadow_pkru = pkru;
102 dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru());
103}
104
105/*
106 * These are technically racy. since something could
107 * change PKRU between the read and the write.
108 */
109static inline void __pkey_access_allow(int pkey, int do_allow)
110{
111 unsigned int pkru = rdpkru();
112 int bit = pkey * 2;
113
114 if (do_allow)
115 pkru &= (1<<bit);
116 else
117 pkru |= (1<<bit);
118
119 dprintf4("pkru now: %08x\n", rdpkru());
120 wrpkru(pkru);
121}
122
123static inline void __pkey_write_allow(int pkey, int do_allow_write)
124{
125 long pkru = rdpkru();
126 int bit = pkey * 2 + 1;
127
128 if (do_allow_write)
129 pkru &= (1<<bit);
130 else
131 pkru |= (1<<bit);
132
133 wrpkru(pkru);
134 dprintf4("pkru now: %08x\n", rdpkru());
135}
136
137#define PROT_PKEY0 0x10 /* protection key value (bit 0) */
138#define PROT_PKEY1 0x20 /* protection key value (bit 1) */
139#define PROT_PKEY2 0x40 /* protection key value (bit 2) */
140#define PROT_PKEY3 0x80 /* protection key value (bit 3) */
141
142#define PAGE_SIZE 4096
143#define MB (1<<20)
144
145static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
146 unsigned int *ecx, unsigned int *edx)
147{
148 /* ecx is often an input as well as an output. */
149 asm volatile(
150 "cpuid;"
151 : "=a" (*eax),
152 "=b" (*ebx),
153 "=c" (*ecx),
154 "=d" (*edx)
155 : "0" (*eax), "2" (*ecx));
156}
157
158/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
159#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */
160#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */
161
162static inline int cpu_has_pku(void)
163{
164 unsigned int eax;
165 unsigned int ebx;
166 unsigned int ecx;
167 unsigned int edx;
168
169 eax = 0x7;
170 ecx = 0x0;
171 __cpuid(&eax, &ebx, &ecx, &edx);
172
173 if (!(ecx & X86_FEATURE_PKU)) {
174 dprintf2("cpu does not have PKU\n");
175 return 0;
176 }
177 if (!(ecx & X86_FEATURE_OSPKE)) {
178 dprintf2("cpu does not have OSPKE\n");
179 return 0;
180 }
181 return 1;
182}
183
184#define XSTATE_PKRU_BIT (9)
185#define XSTATE_PKRU 0x200
186
187int pkru_xstate_offset(void)
188{
189 unsigned int eax;
190 unsigned int ebx;
191 unsigned int ecx;
192 unsigned int edx;
193 int xstate_offset;
194 int xstate_size;
195 unsigned long XSTATE_CPUID = 0xd;
196 int leaf;
197
198 /* assume that XSTATE_PKRU is set in XCR0 */
199 leaf = XSTATE_PKRU_BIT;
200 {
201 eax = XSTATE_CPUID;
202 ecx = leaf;
203 __cpuid(&eax, &ebx, &ecx, &edx);
204
205 if (leaf == XSTATE_PKRU_BIT) {
206 xstate_offset = ebx;
207 xstate_size = eax;
208 }
209 }
210
211 if (xstate_size == 0) {
212 printf("could not find size/offset of PKRU in xsave state\n");
213 return 0;
214 }
215
216 return xstate_offset;
217}
218
219#endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
new file mode 100644
index 000000000000..bdd58c78902e
--- /dev/null
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -0,0 +1,1410 @@
1/*
2 * Tests x86 Memory Protection Keys (see Documentation/x86/protection-keys.txt)
3 *
4 * There are examples in here of:
5 * * how to set protection keys on memory
6 * * how to set/clear bits in PKRU (the rights register)
7 * * how to handle SEGV_PKRU signals and extract pkey-relevant
8 * information from the siginfo
9 *
10 * Things to add:
11 * make sure KSM and KSM COW breaking works
12 * prefault pages in at malloc, or not
13 * protect MPX bounds tables with protection keys?
14 * make sure VMA splitting/merging is working correctly
15 * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
16 * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
17 * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
18 *
19 * Compile like this:
20 * gcc -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
21 * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
22 */
23#define _GNU_SOURCE
24#include <errno.h>
25#include <linux/futex.h>
26#include <sys/time.h>
27#include <sys/syscall.h>
28#include <string.h>
29#include <stdio.h>
30#include <stdint.h>
31#include <stdbool.h>
32#include <signal.h>
33#include <assert.h>
34#include <stdlib.h>
35#include <ucontext.h>
36#include <sys/mman.h>
37#include <sys/types.h>
38#include <sys/wait.h>
39#include <sys/stat.h>
40#include <fcntl.h>
41#include <unistd.h>
42#include <sys/ptrace.h>
43#include <setjmp.h>
44
45#include "pkey-helpers.h"
46
47int iteration_nr = 1;
48int test_nr;
49
50unsigned int shadow_pkru;
51
52#define HPAGE_SIZE (1UL<<21)
53#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
54#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1))
55#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
56#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
57#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
58#define __stringify_1(x...) #x
59#define __stringify(x...) __stringify_1(x)
60
61#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
62
63int dprint_in_signal;
64char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
65
66extern void abort_hooks(void);
67#define pkey_assert(condition) do { \
68 if (!(condition)) { \
69 dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
70 __FILE__, __LINE__, \
71 test_nr, iteration_nr); \
72 dprintf0("errno at assert: %d", errno); \
73 abort_hooks(); \
74 assert(condition); \
75 } \
76} while (0)
77#define raw_assert(cond) assert(cond)
78
79void cat_into_file(char *str, char *file)
80{
81 int fd = open(file, O_RDWR);
82 int ret;
83
84 dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
85 /*
86 * these need to be raw because they are called under
87 * pkey_assert()
88 */
89 raw_assert(fd >= 0);
90 ret = write(fd, str, strlen(str));
91 if (ret != strlen(str)) {
92 perror("write to file failed");
93 fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
94 raw_assert(0);
95 }
96 close(fd);
97}
98
99#if CONTROL_TRACING > 0
100static int warned_tracing;
101int tracing_root_ok(void)
102{
103 if (geteuid() != 0) {
104 if (!warned_tracing)
105 fprintf(stderr, "WARNING: not run as root, "
106 "can not do tracing control\n");
107 warned_tracing = 1;
108 return 0;
109 }
110 return 1;
111}
112#endif
113
114void tracing_on(void)
115{
116#if CONTROL_TRACING > 0
117#define TRACEDIR "/sys/kernel/debug/tracing"
118 char pidstr[32];
119
120 if (!tracing_root_ok())
121 return;
122
123 sprintf(pidstr, "%d", getpid());
124 cat_into_file("0", TRACEDIR "/tracing_on");
125 cat_into_file("\n", TRACEDIR "/trace");
126 if (1) {
127 cat_into_file("function_graph", TRACEDIR "/current_tracer");
128 cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
129 } else {
130 cat_into_file("nop", TRACEDIR "/current_tracer");
131 }
132 cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
133 cat_into_file("1", TRACEDIR "/tracing_on");
134 dprintf1("enabled tracing\n");
135#endif
136}
137
138void tracing_off(void)
139{
140#if CONTROL_TRACING > 0
141 if (!tracing_root_ok())
142 return;
143 cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
144#endif
145}
146
147void abort_hooks(void)
148{
149 fprintf(stderr, "running %s()...\n", __func__);
150 tracing_off();
151#ifdef SLEEP_ON_ABORT
152 sleep(SLEEP_ON_ABORT);
153#endif
154}
155
156static inline void __page_o_noops(void)
157{
158 /* 8-bytes of instruction * 512 bytes = 1 page */
159 asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
160}
161
162/*
163 * This attempts to have roughly a page of instructions followed by a few
164 * instructions that do a write, and another page of instructions. That
165 * way, we are pretty sure that the write is in the second page of
166 * instructions and has at least a page of padding behind it.
167 *
168 * *That* lets us be sure to madvise() away the write instruction, which
169 * will then fault, which makes sure that the fault code handles
170 * execute-only memory properly.
171 */
172__attribute__((__aligned__(PAGE_SIZE)))
173void lots_o_noops_around_write(int *write_to_me)
174{
175 dprintf3("running %s()\n", __func__);
176 __page_o_noops();
177 /* Assume this happens in the second page of instructions: */
178 *write_to_me = __LINE__;
179 /* pad out by another page: */
180 __page_o_noops();
181 dprintf3("%s() done\n", __func__);
182}
183
184/* Define some kernel-like types */
185#define u8 uint8_t
186#define u16 uint16_t
187#define u32 uint32_t
188#define u64 uint64_t
189
190#ifdef __i386__
191#define SYS_mprotect_key 380
192#define SYS_pkey_alloc 381
193#define SYS_pkey_free 382
194#define REG_IP_IDX REG_EIP
195#define si_pkey_offset 0x18
196#else
197#define SYS_mprotect_key 329
198#define SYS_pkey_alloc 330
199#define SYS_pkey_free 331
200#define REG_IP_IDX REG_RIP
201#define si_pkey_offset 0x20
202#endif
203
204void dump_mem(void *dumpme, int len_bytes)
205{
206 char *c = (void *)dumpme;
207 int i;
208
209 for (i = 0; i < len_bytes; i += sizeof(u64)) {
210 u64 *ptr = (u64 *)(c + i);
211 dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr);
212 }
213}
214
215#define __SI_FAULT (3 << 16)
216#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */
217#define SEGV_PKUERR (__SI_FAULT|4)
218
219static char *si_code_str(int si_code)
220{
221 if (si_code & SEGV_MAPERR)
222 return "SEGV_MAPERR";
223 if (si_code & SEGV_ACCERR)
224 return "SEGV_ACCERR";
225 if (si_code & SEGV_BNDERR)
226 return "SEGV_BNDERR";
227 if (si_code & SEGV_PKUERR)
228 return "SEGV_PKUERR";
229 return "UNKNOWN";
230}
231
232int pkru_faults;
233int last_si_pkey = -1;
234void signal_handler(int signum, siginfo_t *si, void *vucontext)
235{
236 ucontext_t *uctxt = vucontext;
237 int trapno;
238 unsigned long ip;
239 char *fpregs;
240 u32 *pkru_ptr;
241 u64 si_pkey;
242 u32 *si_pkey_ptr;
243 int pkru_offset;
244 fpregset_t fpregset;
245
246 dprint_in_signal = 1;
247 dprintf1(">>>>===============SIGSEGV============================\n");
248 dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__,
249 __rdpkru(), shadow_pkru);
250
251 trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
252 ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
253 fpregset = uctxt->uc_mcontext.fpregs;
254 fpregs = (void *)fpregset;
255
256 dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__,
257 trapno, ip, si_code_str(si->si_code), si->si_code);
258#ifdef __i386__
259 /*
260 * 32-bit has some extra padding so that userspace can tell whether
261 * the XSTATE header is present in addition to the "legacy" FPU
262 * state. We just assume that it is here.
263 */
264 fpregs += 0x70;
265#endif
266 pkru_offset = pkru_xstate_offset();
267 pkru_ptr = (void *)(&fpregs[pkru_offset]);
268
269 dprintf1("siginfo: %p\n", si);
270 dprintf1(" fpregs: %p\n", fpregs);
271 /*
272 * If we got a PKRU fault, we *HAVE* to have at least one bit set in
273 * here.
274 */
275 dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset());
276 if (DEBUG_LEVEL > 4)
277 dump_mem(pkru_ptr - 128, 256);
278 pkey_assert(*pkru_ptr);
279
280 si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
281 dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
282 dump_mem(si_pkey_ptr - 8, 24);
283 si_pkey = *si_pkey_ptr;
284 pkey_assert(si_pkey < NR_PKEYS);
285 last_si_pkey = si_pkey;
286
287 if ((si->si_code == SEGV_MAPERR) ||
288 (si->si_code == SEGV_ACCERR) ||
289 (si->si_code == SEGV_BNDERR)) {
290 printf("non-PK si_code, exiting...\n");
291 exit(4);
292 }
293
294 dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr);
295 /* need __rdpkru() version so we do not do shadow_pkru checking */
296 dprintf1("signal pkru from pkru: %08x\n", __rdpkru());
297 dprintf1("si_pkey from siginfo: %jx\n", si_pkey);
298 *(u64 *)pkru_ptr = 0x00000000;
299 dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n");
300 pkru_faults++;
301 dprintf1("<<<<==================================================\n");
302 return;
303 if (trapno == 14) {
304 fprintf(stderr,
305 "ERROR: In signal handler, page fault, trapno = %d, ip = %016lx\n",
306 trapno, ip);
307 fprintf(stderr, "si_addr %p\n", si->si_addr);
308 fprintf(stderr, "REG_ERR: %lx\n",
309 (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
310 exit(1);
311 } else {
312 fprintf(stderr, "unexpected trap %d! at 0x%lx\n", trapno, ip);
313 fprintf(stderr, "si_addr %p\n", si->si_addr);
314 fprintf(stderr, "REG_ERR: %lx\n",
315 (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
316 exit(2);
317 }
318 dprint_in_signal = 0;
319}
320
321int wait_all_children(void)
322{
323 int status;
324 return waitpid(-1, &status, 0);
325}
326
327void sig_chld(int x)
328{
329 dprint_in_signal = 1;
330 dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
331 dprint_in_signal = 0;
332}
333
334void setup_sigsegv_handler(void)
335{
336 int r, rs;
337 struct sigaction newact;
338 struct sigaction oldact;
339
340 /* #PF is mapped to sigsegv */
341 int signum = SIGSEGV;
342
343 newact.sa_handler = 0;
344 newact.sa_sigaction = signal_handler;
345
346 /*sigset_t - signals to block while in the handler */
347 /* get the old signal mask. */
348 rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
349 pkey_assert(rs == 0);
350
351 /* call sa_sigaction, not sa_handler*/
352 newact.sa_flags = SA_SIGINFO;
353
354 newact.sa_restorer = 0; /* void(*)(), obsolete */
355 r = sigaction(signum, &newact, &oldact);
356 r = sigaction(SIGALRM, &newact, &oldact);
357 pkey_assert(r == 0);
358}
359
360void setup_handlers(void)
361{
362 signal(SIGCHLD, &sig_chld);
363 setup_sigsegv_handler();
364}
365
366pid_t fork_lazy_child(void)
367{
368 pid_t forkret;
369
370 forkret = fork();
371 pkey_assert(forkret >= 0);
372 dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
373
374 if (!forkret) {
375 /* in the child */
376 while (1) {
377 dprintf1("child sleeping...\n");
378 sleep(30);
379 }
380 }
381 return forkret;
382}
383
384void davecmp(void *_a, void *_b, int len)
385{
386 int i;
387 unsigned long *a = _a;
388 unsigned long *b = _b;
389
390 for (i = 0; i < len / sizeof(*a); i++) {
391 if (a[i] == b[i])
392 continue;
393
394 dprintf3("[%3d]: a: %016lx b: %016lx\n", i, a[i], b[i]);
395 }
396}
397
398void dumpit(char *f)
399{
400 int fd = open(f, O_RDONLY);
401 char buf[100];
402 int nr_read;
403
404 dprintf2("maps fd: %d\n", fd);
405 do {
406 nr_read = read(fd, &buf[0], sizeof(buf));
407 write(1, buf, nr_read);
408 } while (nr_read > 0);
409 close(fd);
410}
411
412#define PKEY_DISABLE_ACCESS 0x1
413#define PKEY_DISABLE_WRITE 0x2
414
415u32 pkey_get(int pkey, unsigned long flags)
416{
417 u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
418 u32 pkru = __rdpkru();
419 u32 shifted_pkru;
420 u32 masked_pkru;
421
422 dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
423 __func__, pkey, flags, 0, 0);
424 dprintf2("%s() raw pkru: %x\n", __func__, pkru);
425
426 shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY));
427 dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru);
428 masked_pkru = shifted_pkru & mask;
429 dprintf2("%s() masked pkru: %x\n", __func__, masked_pkru);
430 /*
431 * shift down the relevant bits to the lowest two, then
432 * mask off all the other high bits.
433 */
434 return masked_pkru;
435}
436
437int pkey_set(int pkey, unsigned long rights, unsigned long flags)
438{
439 u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
440 u32 old_pkru = __rdpkru();
441 u32 new_pkru;
442
443 /* make sure that 'rights' only contains the bits we expect: */
444 assert(!(rights & ~mask));
445
446 /* copy old pkru */
447 new_pkru = old_pkru;
448 /* mask out bits from pkey in old value: */
449 new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY));
450 /* OR in new bits for pkey: */
451 new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY));
452
453 __wrpkru(new_pkru);
454
455 dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n",
456 __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru);
457 return 0;
458}
459
460void pkey_disable_set(int pkey, int flags)
461{
462 unsigned long syscall_flags = 0;
463 int ret;
464 int pkey_rights;
465 u32 orig_pkru;
466
467 dprintf1("START->%s(%d, 0x%x)\n", __func__,
468 pkey, flags);
469 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
470
471 pkey_rights = pkey_get(pkey, syscall_flags);
472
473 dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
474 pkey, pkey, pkey_rights);
475 pkey_assert(pkey_rights >= 0);
476
477 pkey_rights |= flags;
478
479 ret = pkey_set(pkey, pkey_rights, syscall_flags);
480 assert(!ret);
481 /*pkru and flags have the same format */
482 shadow_pkru |= flags << (pkey * 2);
483 dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru);
484
485 pkey_assert(ret >= 0);
486
487 pkey_rights = pkey_get(pkey, syscall_flags);
488 dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
489 pkey, pkey, pkey_rights);
490
491 dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
492 if (flags)
493 pkey_assert(rdpkru() > orig_pkru);
494 dprintf1("END<---%s(%d, 0x%x)\n", __func__,
495 pkey, flags);
496}
497
498void pkey_disable_clear(int pkey, int flags)
499{
500 unsigned long syscall_flags = 0;
501 int ret;
502 int pkey_rights = pkey_get(pkey, syscall_flags);
503 u32 orig_pkru = rdpkru();
504
505 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
506
507 dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
508 pkey, pkey, pkey_rights);
509 pkey_assert(pkey_rights >= 0);
510
511 pkey_rights |= flags;
512
513 ret = pkey_set(pkey, pkey_rights, 0);
514 /* pkru and flags have the same format */
515 shadow_pkru &= ~(flags << (pkey * 2));
516 pkey_assert(ret >= 0);
517
518 pkey_rights = pkey_get(pkey, syscall_flags);
519 dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
520 pkey, pkey, pkey_rights);
521
522 dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
523 if (flags)
524 assert(rdpkru() > orig_pkru);
525}
526
527void pkey_write_allow(int pkey)
528{
529 pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
530}
531void pkey_write_deny(int pkey)
532{
533 pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
534}
535void pkey_access_allow(int pkey)
536{
537 pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
538}
539void pkey_access_deny(int pkey)
540{
541 pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
542}
543
544int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
545 unsigned long pkey)
546{
547 int sret;
548
549 dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
550 ptr, size, orig_prot, pkey);
551
552 errno = 0;
553 sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
554 if (errno) {
555 dprintf2("SYS_mprotect_key sret: %d\n", sret);
556 dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
557 dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
558 if (DEBUG_LEVEL >= 2)
559 perror("SYS_mprotect_pkey");
560 }
561 return sret;
562}
563
564int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
565{
566 int ret = syscall(SYS_pkey_alloc, flags, init_val);
567 dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
568 __func__, flags, init_val, ret, errno);
569 return ret;
570}
571
572int alloc_pkey(void)
573{
574 int ret;
575 unsigned long init_val = 0x0;
576
577 dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n",
578 __LINE__, __rdpkru(), shadow_pkru);
579 ret = sys_pkey_alloc(0, init_val);
580 /*
581 * pkey_alloc() sets PKRU, so we need to reflect it in
582 * shadow_pkru:
583 */
584 dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
585 __LINE__, ret, __rdpkru(), shadow_pkru);
586 if (ret) {
587 /* clear both the bits: */
588 shadow_pkru &= ~(0x3 << (ret * 2));
589 dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
590 __LINE__, ret, __rdpkru(), shadow_pkru);
591 /*
592 * move the new state in from init_val
593 * (remember, we cheated and init_val == pkru format)
594 */
595 shadow_pkru |= (init_val << (ret * 2));
596 }
597 dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
598 __LINE__, ret, __rdpkru(), shadow_pkru);
599 dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno);
600 /* for shadow checking: */
601 rdpkru();
602 dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
603 __LINE__, ret, __rdpkru(), shadow_pkru);
604 return ret;
605}
606
607int sys_pkey_free(unsigned long pkey)
608{
609 int ret = syscall(SYS_pkey_free, pkey);
610 dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
611 return ret;
612}
613
614/*
615 * I had a bug where pkey bits could be set by mprotect() but
616 * not cleared. This ensures we get lots of random bit sets
617 * and clears on the vma and pte pkey bits.
618 */
619int alloc_random_pkey(void)
620{
621 int max_nr_pkey_allocs;
622 int ret;
623 int i;
624 int alloced_pkeys[NR_PKEYS];
625 int nr_alloced = 0;
626 int random_index;
627 memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
628
629 /* allocate every possible key and make a note of which ones we got */
630 max_nr_pkey_allocs = NR_PKEYS;
631 max_nr_pkey_allocs = 1;
632 for (i = 0; i < max_nr_pkey_allocs; i++) {
633 int new_pkey = alloc_pkey();
634 if (new_pkey < 0)
635 break;
636 alloced_pkeys[nr_alloced++] = new_pkey;
637 }
638
639 pkey_assert(nr_alloced > 0);
640 /* select a random one out of the allocated ones */
641 random_index = rand() % nr_alloced;
642 ret = alloced_pkeys[random_index];
643 /* now zero it out so we don't free it next */
644 alloced_pkeys[random_index] = 0;
645
646 /* go through the allocated ones that we did not want and free them */
647 for (i = 0; i < nr_alloced; i++) {
648 int free_ret;
649 if (!alloced_pkeys[i])
650 continue;
651 free_ret = sys_pkey_free(alloced_pkeys[i]);
652 pkey_assert(!free_ret);
653 }
654 dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
655 __LINE__, ret, __rdpkru(), shadow_pkru);
656 return ret;
657}
658
659int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
660 unsigned long pkey)
661{
662 int nr_iterations = random() % 100;
663 int ret;
664
665 while (0) {
666 int rpkey = alloc_random_pkey();
667 ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
668 dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
669 ptr, size, orig_prot, pkey, ret);
670 if (nr_iterations-- < 0)
671 break;
672
673 dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
674 __LINE__, ret, __rdpkru(), shadow_pkru);
675 sys_pkey_free(rpkey);
676 dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
677 __LINE__, ret, __rdpkru(), shadow_pkru);
678 }
679 pkey_assert(pkey < NR_PKEYS);
680
681 ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
682 dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
683 ptr, size, orig_prot, pkey, ret);
684 pkey_assert(!ret);
685 dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
686 __LINE__, ret, __rdpkru(), shadow_pkru);
687 return ret;
688}
689
690struct pkey_malloc_record {
691 void *ptr;
692 long size;
693};
694struct pkey_malloc_record *pkey_malloc_records;
695long nr_pkey_malloc_records;
696void record_pkey_malloc(void *ptr, long size)
697{
698 long i;
699 struct pkey_malloc_record *rec = NULL;
700
701 for (i = 0; i < nr_pkey_malloc_records; i++) {
702 rec = &pkey_malloc_records[i];
703 /* find a free record */
704 if (rec)
705 break;
706 }
707 if (!rec) {
708 /* every record is full */
709 size_t old_nr_records = nr_pkey_malloc_records;
710 size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
711 size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
712 dprintf2("new_nr_records: %zd\n", new_nr_records);
713 dprintf2("new_size: %zd\n", new_size);
714 pkey_malloc_records = realloc(pkey_malloc_records, new_size);
715 pkey_assert(pkey_malloc_records != NULL);
716 rec = &pkey_malloc_records[nr_pkey_malloc_records];
717 /*
718 * realloc() does not initialize memory, so zero it from
719 * the first new record all the way to the end.
720 */
721 for (i = 0; i < new_nr_records - old_nr_records; i++)
722 memset(rec + i, 0, sizeof(*rec));
723 }
724 dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
725 (int)(rec - pkey_malloc_records), rec, ptr, size);
726 rec->ptr = ptr;
727 rec->size = size;
728 nr_pkey_malloc_records++;
729}
730
731void free_pkey_malloc(void *ptr)
732{
733 long i;
734 int ret;
735 dprintf3("%s(%p)\n", __func__, ptr);
736 for (i = 0; i < nr_pkey_malloc_records; i++) {
737 struct pkey_malloc_record *rec = &pkey_malloc_records[i];
738 dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
739 ptr, i, rec, rec->ptr, rec->size);
740 if ((ptr < rec->ptr) ||
741 (ptr >= rec->ptr + rec->size))
742 continue;
743
744 dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
745 ptr, i, rec, rec->ptr, rec->size);
746 nr_pkey_malloc_records--;
747 ret = munmap(rec->ptr, rec->size);
748 dprintf3("munmap ret: %d\n", ret);
749 pkey_assert(!ret);
750 dprintf3("clearing rec->ptr, rec: %p\n", rec);
751 rec->ptr = NULL;
752 dprintf3("done clearing rec->ptr, rec: %p\n", rec);
753 return;
754 }
755 pkey_assert(false);
756}
757
758
759void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
760{
761 void *ptr;
762 int ret;
763
764 rdpkru();
765 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
766 size, prot, pkey);
767 pkey_assert(pkey < NR_PKEYS);
768 ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
769 pkey_assert(ptr != (void *)-1);
770 ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
771 pkey_assert(!ret);
772 record_pkey_malloc(ptr, size);
773 rdpkru();
774
775 dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
776 return ptr;
777}
778
779void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
780{
781 int ret;
782 void *ptr;
783
784 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
785 size, prot, pkey);
786 /*
787 * Guarantee we can fit at least one huge page in the resulting
788 * allocation by allocating space for 2:
789 */
790 size = ALIGN_UP(size, HPAGE_SIZE * 2);
791 ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
792 pkey_assert(ptr != (void *)-1);
793 record_pkey_malloc(ptr, size);
794 mprotect_pkey(ptr, size, prot, pkey);
795
796 dprintf1("unaligned ptr: %p\n", ptr);
797 ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
798 dprintf1(" aligned ptr: %p\n", ptr);
799 ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
800 dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
801 ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
802 dprintf1("MADV_WILLNEED ret: %d\n", ret);
803 memset(ptr, 0, HPAGE_SIZE);
804
805 dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
806 return ptr;
807}
808
809int hugetlb_setup_ok;
810#define GET_NR_HUGE_PAGES 10
811void setup_hugetlbfs(void)
812{
813 int err;
814 int fd;
815 int validated_nr_pages;
816 int i;
817 char buf[] = "123";
818
819 if (geteuid() != 0) {
820 fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
821 return;
822 }
823
824 cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
825
826 /*
827 * Now go make sure that we got the pages and that they
828 * are 2M pages. Someone might have made 1G the default.
829 */
830 fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY);
831 if (fd < 0) {
832 perror("opening sysfs 2M hugetlb config");
833 return;
834 }
835
836 /* -1 to guarantee leaving the trailing \0 */
837 err = read(fd, buf, sizeof(buf)-1);
838 close(fd);
839 if (err <= 0) {
840 perror("reading sysfs 2M hugetlb config");
841 return;
842 }
843
844 if (atoi(buf) != GET_NR_HUGE_PAGES) {
845 fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n",
846 buf, GET_NR_HUGE_PAGES);
847 return;
848 }
849
850 hugetlb_setup_ok = 1;
851}
852
853void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
854{
855 void *ptr;
856 int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
857
858 if (!hugetlb_setup_ok)
859 return PTR_ERR_ENOTSUP;
860
861 dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
862 size = ALIGN_UP(size, HPAGE_SIZE * 2);
863 pkey_assert(pkey < NR_PKEYS);
864 ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
865 pkey_assert(ptr != (void *)-1);
866 mprotect_pkey(ptr, size, prot, pkey);
867
868 record_pkey_malloc(ptr, size);
869
870 dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
871 return ptr;
872}
873
874void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
875{
876 void *ptr;
877 int fd;
878
879 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
880 size, prot, pkey);
881 pkey_assert(pkey < NR_PKEYS);
882 fd = open("/dax/foo", O_RDWR);
883 pkey_assert(fd >= 0);
884
885 ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
886 pkey_assert(ptr != (void *)-1);
887
888 mprotect_pkey(ptr, size, prot, pkey);
889
890 record_pkey_malloc(ptr, size);
891
892 dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
893 close(fd);
894 return ptr;
895}
896
897void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
898
899 malloc_pkey_with_mprotect,
900 malloc_pkey_anon_huge,
901 malloc_pkey_hugetlb
902/* can not do direct with the pkey_mprotect() API:
903 malloc_pkey_mmap_direct,
904 malloc_pkey_mmap_dax,
905*/
906};
907
908void *malloc_pkey(long size, int prot, u16 pkey)
909{
910 void *ret;
911 static int malloc_type;
912 int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
913
914 pkey_assert(pkey < NR_PKEYS);
915
916 while (1) {
917 pkey_assert(malloc_type < nr_malloc_types);
918
919 ret = pkey_malloc[malloc_type](size, prot, pkey);
920 pkey_assert(ret != (void *)-1);
921
922 malloc_type++;
923 if (malloc_type >= nr_malloc_types)
924 malloc_type = (random()%nr_malloc_types);
925
926 /* try again if the malloc_type we tried is unsupported */
927 if (ret == PTR_ERR_ENOTSUP)
928 continue;
929
930 break;
931 }
932
933 dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
934 size, prot, pkey, ret);
935 return ret;
936}
937
938int last_pkru_faults;
939void expected_pk_fault(int pkey)
940{
941 dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n",
942 __func__, last_pkru_faults, pkru_faults);
943 dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
944 pkey_assert(last_pkru_faults + 1 == pkru_faults);
945 pkey_assert(last_si_pkey == pkey);
946 /*
947 * The signal handler shold have cleared out PKRU to let the
948 * test program continue. We now have to restore it.
949 */
950 if (__rdpkru() != 0)
951 pkey_assert(0);
952
953 __wrpkru(shadow_pkru);
954 dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n",
955 __func__, shadow_pkru);
956 last_pkru_faults = pkru_faults;
957 last_si_pkey = -1;
958}
959
960void do_not_expect_pk_fault(void)
961{
962 pkey_assert(last_pkru_faults == pkru_faults);
963}
964
965int test_fds[10] = { -1 };
966int nr_test_fds;
967void __save_test_fd(int fd)
968{
969 pkey_assert(fd >= 0);
970 pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
971 test_fds[nr_test_fds] = fd;
972 nr_test_fds++;
973}
974
975int get_test_read_fd(void)
976{
977 int test_fd = open("/etc/passwd", O_RDONLY);
978 __save_test_fd(test_fd);
979 return test_fd;
980}
981
982void close_test_fds(void)
983{
984 int i;
985
986 for (i = 0; i < nr_test_fds; i++) {
987 if (test_fds[i] < 0)
988 continue;
989 close(test_fds[i]);
990 test_fds[i] = -1;
991 }
992 nr_test_fds = 0;
993}
994
995#define barrier() __asm__ __volatile__("": : :"memory")
996__attribute__((noinline)) int read_ptr(int *ptr)
997{
998 /*
999 * Keep GCC from optimizing this away somehow
1000 */
1001 barrier();
1002 return *ptr;
1003}
1004
1005void test_read_of_write_disabled_region(int *ptr, u16 pkey)
1006{
1007 int ptr_contents;
1008
1009 dprintf1("disabling write access to PKEY[1], doing read\n");
1010 pkey_write_deny(pkey);
1011 ptr_contents = read_ptr(ptr);
1012 dprintf1("*ptr: %d\n", ptr_contents);
1013 dprintf1("\n");
1014}
1015void test_read_of_access_disabled_region(int *ptr, u16 pkey)
1016{
1017 int ptr_contents;
1018
1019 dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
1020 rdpkru();
1021 pkey_access_deny(pkey);
1022 ptr_contents = read_ptr(ptr);
1023 dprintf1("*ptr: %d\n", ptr_contents);
1024 expected_pk_fault(pkey);
1025}
1026void test_write_of_write_disabled_region(int *ptr, u16 pkey)
1027{
1028 dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
1029 pkey_write_deny(pkey);
1030 *ptr = __LINE__;
1031 expected_pk_fault(pkey);
1032}
1033void test_write_of_access_disabled_region(int *ptr, u16 pkey)
1034{
1035 dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
1036 pkey_access_deny(pkey);
1037 *ptr = __LINE__;
1038 expected_pk_fault(pkey);
1039}
1040void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
1041{
1042 int ret;
1043 int test_fd = get_test_read_fd();
1044
1045 dprintf1("disabling access to PKEY[%02d], "
1046 "having kernel read() to buffer\n", pkey);
1047 pkey_access_deny(pkey);
1048 ret = read(test_fd, ptr, 1);
1049 dprintf1("read ret: %d\n", ret);
1050 pkey_assert(ret);
1051}
1052void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
1053{
1054 int ret;
1055 int test_fd = get_test_read_fd();
1056
1057 pkey_write_deny(pkey);
1058 ret = read(test_fd, ptr, 100);
1059 dprintf1("read ret: %d\n", ret);
1060 if (ret < 0 && (DEBUG_LEVEL > 0))
1061 perror("verbose read result (OK for this to be bad)");
1062 pkey_assert(ret);
1063}
1064
1065void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
1066{
1067 int pipe_ret, vmsplice_ret;
1068 struct iovec iov;
1069 int pipe_fds[2];
1070
1071 pipe_ret = pipe(pipe_fds);
1072
1073 pkey_assert(pipe_ret == 0);
1074 dprintf1("disabling access to PKEY[%02d], "
1075 "having kernel vmsplice from buffer\n", pkey);
1076 pkey_access_deny(pkey);
1077 iov.iov_base = ptr;
1078 iov.iov_len = PAGE_SIZE;
1079 vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
1080 dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
1081 pkey_assert(vmsplice_ret == -1);
1082
1083 close(pipe_fds[0]);
1084 close(pipe_fds[1]);
1085}
1086
1087void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
1088{
1089 int ignored = 0xdada;
1090 int futex_ret;
1091 int some_int = __LINE__;
1092
1093 dprintf1("disabling write to PKEY[%02d], "
1094 "doing futex gunk in buffer\n", pkey);
1095 *ptr = some_int;
1096 pkey_write_deny(pkey);
1097 futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
1098 &ignored, ignored);
1099 if (DEBUG_LEVEL > 0)
1100 perror("futex");
1101 dprintf1("futex() ret: %d\n", futex_ret);
1102}
1103
1104/* Assumes that all pkeys other than 'pkey' are unallocated */
1105void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
1106{
1107 int err;
1108 int i;
1109
1110 /* Note: 0 is the default pkey, so don't mess with it */
1111 for (i = 1; i < NR_PKEYS; i++) {
1112 if (pkey == i)
1113 continue;
1114
1115 dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
1116 err = sys_pkey_free(i);
1117 pkey_assert(err);
1118
1119 /* not enforced when pkey_get() is not a syscall
1120 err = pkey_get(i, 0);
1121 pkey_assert(err < 0);
1122 */
1123
1124 err = sys_pkey_free(i);
1125 pkey_assert(err);
1126
1127 err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
1128 pkey_assert(err);
1129 }
1130}
1131
1132/* Assumes that all pkeys other than 'pkey' are unallocated */
1133void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
1134{
1135 int err;
1136 int bad_flag = (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) + 1;
1137 int bad_pkey = NR_PKEYS+99;
1138
1139 /* not enforced when pkey_get() is not a syscall
1140 err = pkey_get(bad_pkey, bad_flag);
1141 pkey_assert(err < 0);
1142 */
1143
1144 /* pass a known-invalid pkey in: */
1145 err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
1146 pkey_assert(err);
1147}
1148
1149/* Assumes that all pkeys other than 'pkey' are unallocated */
1150void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
1151{
1152 unsigned long flags;
1153 unsigned long init_val;
1154 int err;
1155 int allocated_pkeys[NR_PKEYS] = {0};
1156 int nr_allocated_pkeys = 0;
1157 int i;
1158
1159 for (i = 0; i < NR_PKEYS*2; i++) {
1160 int new_pkey;
1161 dprintf1("%s() alloc loop: %d\n", __func__, i);
1162 new_pkey = alloc_pkey();
1163 dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__,
1164 __LINE__, err, __rdpkru(), shadow_pkru);
1165 rdpkru(); /* for shadow checking */
1166 dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
1167 if ((new_pkey == -1) && (errno == ENOSPC)) {
1168 dprintf2("%s() failed to allocate pkey after %d tries\n",
1169 __func__, nr_allocated_pkeys);
1170 break;
1171 }
1172 pkey_assert(nr_allocated_pkeys < NR_PKEYS);
1173 allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
1174 }
1175
1176 dprintf3("%s()::%d\n", __func__, __LINE__);
1177
1178 /*
1179 * ensure it did not reach the end of the loop without
1180 * failure:
1181 */
1182 pkey_assert(i < NR_PKEYS*2);
1183
1184 /*
1185 * There are 16 pkeys supported in hardware. One is taken
1186 * up for the default (0) and another can be taken up by
1187 * an execute-only mapping. Ensure that we can allocate
1188 * at least 14 (16-2).
1189 */
1190 pkey_assert(i >= NR_PKEYS-2);
1191
1192 for (i = 0; i < nr_allocated_pkeys; i++) {
1193 err = sys_pkey_free(allocated_pkeys[i]);
1194 pkey_assert(!err);
1195 rdpkru(); /* for shadow checking */
1196 }
1197}
1198
1199void test_ptrace_of_child(int *ptr, u16 pkey)
1200{
1201 __attribute__((__unused__)) int peek_result;
1202 pid_t child_pid;
1203 void *ignored = 0;
1204 long ret;
1205 int status;
1206 /*
1207 * This is the "control" for our little expermient. Make sure
1208 * we can always access it when ptracing.
1209 */
1210 int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
1211 int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
1212
1213 /*
1214 * Fork a child which is an exact copy of this process, of course.
1215 * That means we can do all of our tests via ptrace() and then plain
1216 * memory access and ensure they work differently.
1217 */
1218 child_pid = fork_lazy_child();
1219 dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
1220
1221 ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
1222 if (ret)
1223 perror("attach");
1224 dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
1225 pkey_assert(ret != -1);
1226 ret = waitpid(child_pid, &status, WUNTRACED);
1227 if ((ret != child_pid) || !(WIFSTOPPED(status))) {
1228 fprintf(stderr, "weird waitpid result %ld stat %x\n",
1229 ret, status);
1230 pkey_assert(0);
1231 }
1232 dprintf2("waitpid ret: %ld\n", ret);
1233 dprintf2("waitpid status: %d\n", status);
1234
1235 pkey_access_deny(pkey);
1236 pkey_write_deny(pkey);
1237
1238 /* Write access, untested for now:
1239 ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
1240 pkey_assert(ret != -1);
1241 dprintf1("poke at %p: %ld\n", peek_at, ret);
1242 */
1243
1244 /*
1245 * Try to access the pkey-protected "ptr" via ptrace:
1246 */
1247 ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
1248 /* expect it to work, without an error: */
1249 pkey_assert(ret != -1);
1250 /* Now access from the current task, and expect an exception: */
1251 peek_result = read_ptr(ptr);
1252 expected_pk_fault(pkey);
1253
1254 /*
1255 * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
1256 */
1257 ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
1258 /* expect it to work, without an error: */
1259 pkey_assert(ret != -1);
1260 /* Now access from the current task, and expect NO exception: */
1261 peek_result = read_ptr(plain_ptr);
1262 do_not_expect_pk_fault();
1263
1264 ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
1265 pkey_assert(ret != -1);
1266
1267 ret = kill(child_pid, SIGKILL);
1268 pkey_assert(ret != -1);
1269
1270 wait(&status);
1271
1272 free(plain_ptr_unaligned);
1273}
1274
1275void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
1276{
1277 void *p1;
1278 int scratch;
1279 int ptr_contents;
1280 int ret;
1281
1282 p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
1283 dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
1284 /* lots_o_noops_around_write should be page-aligned already */
1285 assert(p1 == &lots_o_noops_around_write);
1286
1287 /* Point 'p1' at the *second* page of the function: */
1288 p1 += PAGE_SIZE;
1289
1290 madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1291 lots_o_noops_around_write(&scratch);
1292 ptr_contents = read_ptr(p1);
1293 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1294
1295 ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
1296 pkey_assert(!ret);
1297 pkey_access_deny(pkey);
1298
1299 dprintf2("pkru: %x\n", rdpkru());
1300
1301 /*
1302 * Make sure this is an *instruction* fault
1303 */
1304 madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1305 lots_o_noops_around_write(&scratch);
1306 do_not_expect_pk_fault();
1307 ptr_contents = read_ptr(p1);
1308 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1309 expected_pk_fault(pkey);
1310}
1311
1312void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
1313{
1314 int size = PAGE_SIZE;
1315 int sret;
1316
1317 if (cpu_has_pku()) {
1318 dprintf1("SKIP: %s: no CPU support\n", __func__);
1319 return;
1320 }
1321
1322 sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
1323 pkey_assert(sret < 0);
1324}
1325
1326void (*pkey_tests[])(int *ptr, u16 pkey) = {
1327 test_read_of_write_disabled_region,
1328 test_read_of_access_disabled_region,
1329 test_write_of_write_disabled_region,
1330 test_write_of_access_disabled_region,
1331 test_kernel_write_of_access_disabled_region,
1332 test_kernel_write_of_write_disabled_region,
1333 test_kernel_gup_of_access_disabled_region,
1334 test_kernel_gup_write_to_write_disabled_region,
1335 test_executing_on_unreadable_memory,
1336 test_ptrace_of_child,
1337 test_pkey_syscalls_on_non_allocated_pkey,
1338 test_pkey_syscalls_bad_args,
1339 test_pkey_alloc_exhaust,
1340};
1341
1342void run_tests_once(void)
1343{
1344 int *ptr;
1345 int prot = PROT_READ|PROT_WRITE;
1346
1347 for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
1348 int pkey;
1349 int orig_pkru_faults = pkru_faults;
1350
1351 dprintf1("======================\n");
1352 dprintf1("test %d preparing...\n", test_nr);
1353
1354 tracing_on();
1355 pkey = alloc_random_pkey();
1356 dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
1357 ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
1358 dprintf1("test %d starting...\n", test_nr);
1359 pkey_tests[test_nr](ptr, pkey);
1360 dprintf1("freeing test memory: %p\n", ptr);
1361 free_pkey_malloc(ptr);
1362 sys_pkey_free(pkey);
1363
1364 dprintf1("pkru_faults: %d\n", pkru_faults);
1365 dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults);
1366
1367 tracing_off();
1368 close_test_fds();
1369
1370 printf("test %2d PASSED (itertation %d)\n", test_nr, iteration_nr);
1371 dprintf1("======================\n\n");
1372 }
1373 iteration_nr++;
1374}
1375
1376void pkey_setup_shadow(void)
1377{
1378 shadow_pkru = __rdpkru();
1379}
1380
1381int main(void)
1382{
1383 int nr_iterations = 22;
1384
1385 setup_handlers();
1386
1387 printf("has pku: %d\n", cpu_has_pku());
1388
1389 if (!cpu_has_pku()) {
1390 int size = PAGE_SIZE;
1391 int *ptr;
1392
1393 printf("running PKEY tests for unsupported CPU/OS\n");
1394
1395 ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
1396 assert(ptr != (void *)-1);
1397 test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
1398 exit(0);
1399 }
1400
1401 pkey_setup_shadow();
1402 printf("startup pkru: %x\n", rdpkru());
1403 setup_hugetlbfs();
1404
1405 while (nr_iterations-- > 0)
1406 run_tests_once();
1407
1408 printf("done (all tests OK)\n");
1409 return 0;
1410}