author    Linus Torvalds <torvalds@linux-foundation.org>	2016-03-20 22:08:56 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>	2016-03-20 22:08:56 -0400
commit    643ad15d47410d37d43daf3ef1c8ac52c281efa5 (patch)
tree      a864860cfe04c994c03d7946e12b3351e38a168b
parent    24b5e20f11a75866bbffc46c30a22fa50612a769 (diff)
parent    0d47638f80a02b15869f1fe1fc09e5bf996750fd (diff)
Merge branch 'mm-pkeys-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 protection key support from Ingo Molnar:
 "This tree adds support for a new memory protection hardware feature
  that is available in upcoming Intel CPUs: 'protection keys' (pkeys).

  There's a background article at LWN.net:

      https://lwn.net/Articles/643797/

  The gist is that protection keys allow the encoding of
  user-controllable permission masks in the pte.  So instead of having
  a fixed protection mask in the pte (which needs a system call to
  change and works on a per page basis), the user can map a (handful
  of) protection mask variants and can change the masks runtime
  relatively cheaply, without having to change every single page in
  the affected virtual memory range.

  This allows the dynamic switching of the protection bits of large
  amounts of virtual memory, via user-space instructions.  It also
  allows more precise control of MMU permission bits: for example the
  executable bit is separate from the read bit (see more about that
  below).

  This tree adds the MM infrastructure and low level x86 glue needed
  for that, plus it adds a high level API to make use of protection
  keys - if a user-space application calls:

        mmap(..., PROT_EXEC);

  or

        mprotect(ptr, sz, PROT_EXEC);

  (note PROT_EXEC-only, without PROT_READ/WRITE), the kernel will
  notice this special case, and will set a special protection key on
  this memory range.  It also sets the appropriate bits in the
  Protection Keys User Rights (PKRU) register so that the memory
  becomes unreadable and unwritable.

  So using protection keys the kernel is able to implement 'true'
  PROT_EXEC on x86 CPUs: without protection keys PROT_EXEC implies
  PROT_READ as well.  Unreadable executable mappings have security
  advantages: they cannot be read via information leaks to figure out
  ASLR details, nor can they be scanned for ROP gadgets - and they
  cannot be used by exploits for data purposes either.

  We know about no user-space code that relies on pure PROT_EXEC
  mappings today, but binary loaders could start making use of this
  new feature to map binaries and libraries in a more secure fashion.

  There is other pending pkeys work that offers more high level system
  call APIs to manage protection keys - but those are not part of this
  pull request.

  Right now there's a Kconfig that controls this feature
  (CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) that is default enabled
  (like most x86 CPU feature enablement code that has no runtime
  overhead), but it's not user-configurable at the moment.  If there's
  any serious problem with this then we can make it configurable and/or
  flip the default"

* 'mm-pkeys-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (38 commits)
  x86/mm/pkeys: Fix mismerge of protection keys CPUID bits
  mm/pkeys: Fix siginfo ABI breakage caused by new u64 field
  x86/mm/pkeys: Fix access_error() denial of writes to write-only VMA
  mm/core, x86/mm/pkeys: Add execute-only protection keys support
  x86/mm/pkeys: Create an x86 arch_calc_vm_prot_bits() for VMA flags
  x86/mm/pkeys: Allow kernel to modify user pkey rights register
  x86/fpu: Allow setting of XSAVE state
  x86/mm: Factor out LDT init from context init
  mm/core, x86/mm/pkeys: Add arch_validate_pkey()
  mm/core, arch, powerpc: Pass a protection key in to calc_vm_flag_bits()
  x86/mm/pkeys: Actually enable Memory Protection Keys in the CPU
  x86/mm/pkeys: Add Kconfig prompt to existing config option
  x86/mm/pkeys: Dump pkey from VMA in /proc/pid/smaps
  x86/mm/pkeys: Dump PKRU with other kernel registers
  mm/core, x86/mm/pkeys: Differentiate instruction fetches
  x86/mm/pkeys: Optimize fault handling in access_error()
  mm/core: Do not enforce PKEY permissions on remote mm access
  um, pkeys: Add UML arch_*_access_permitted() methods
  mm/gup, x86/mm/pkeys: Check VMAs and PTEs for protection keys
  x86/mm/gup: Simplify get_user_pages() PTE bit handling
  ...
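As an aside, here is what the PROT_EXEC-only special case looks like from userspace. This is a minimal illustrative sketch written for this summary, not code from the pull request; on pkeys hardware the read below is expected to fault with SIGSEGV (si_code=SEGV_PKUERR), while on older CPUs, where PROT_EXEC implies PROT_READ, it reads zeroes:

    /* Illustrative only: observe execute-only mapping behavior. */
    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
    	/* Ask for an execute-only anonymous mapping. */
    	char *p = mmap(NULL, 4096, PROT_EXEC,
    		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    	if (p == MAP_FAILED) {
    		perror("mmap");
    		return 1;
    	}
    	/*
    	 * When pkeys are available the kernel assigns the
    	 * execute-only protection key to this VMA, so this data
    	 * read is denied by PKRU and delivers SIGSEGV.
    	 */
    	printf("first byte: %d\n", p[0]);
    	return 0;
    }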
-rw-r--r--  Documentation/kernel-parameters.txt          |   3
-rw-r--r--  arch/cris/arch-v32/drivers/cryptocop.c       |   8
-rw-r--r--  arch/ia64/include/uapi/asm/siginfo.h         |  13
-rw-r--r--  arch/ia64/kernel/err_inject.c                |   3
-rw-r--r--  arch/mips/include/uapi/asm/siginfo.h         |  13
-rw-r--r--  arch/mips/mm/gup.c                           |   3
-rw-r--r--  arch/powerpc/include/asm/mman.h              |   5
-rw-r--r--  arch/powerpc/include/asm/mmu_context.h       |  12
-rw-r--r--  arch/s390/include/asm/mmu_context.h          |  12
-rw-r--r--  arch/s390/mm/gup.c                           |   4
-rw-r--r--  arch/sh/mm/gup.c                             |   2
-rw-r--r--  arch/sparc/mm/gup.c                          |   2
-rw-r--r--  arch/um/include/asm/mmu_context.h            |  14
-rw-r--r--  arch/unicore32/include/asm/mmu_context.h     |  12
-rw-r--r--  arch/x86/Kconfig                             |  16
-rw-r--r--  arch/x86/include/asm/cpufeature.h            |  55
-rw-r--r--  arch/x86/include/asm/cpufeatures.h           |   6
-rw-r--r--  arch/x86/include/asm/disabled-features.h     |  15
-rw-r--r--  arch/x86/include/asm/fpu/internal.h          |   2
-rw-r--r--  arch/x86/include/asm/fpu/types.h             |  12
-rw-r--r--  arch/x86/include/asm/fpu/xstate.h            |   3
-rw-r--r--  arch/x86/include/asm/mmu_context.h           |  85
-rw-r--r--  arch/x86/include/asm/pgtable.h               |  38
-rw-r--r--  arch/x86/include/asm/pgtable_types.h         |  39
-rw-r--r--  arch/x86/include/asm/pkeys.h                 |  34
-rw-r--r--  arch/x86/include/asm/required-features.h     |   7
-rw-r--r--  arch/x86/include/asm/special_insns.h         |  22
-rw-r--r--  arch/x86/include/uapi/asm/mman.h             |  22
-rw-r--r--  arch/x86/include/uapi/asm/processor-flags.h  |   2
-rw-r--r--  arch/x86/kernel/cpu/common.c                 |  44
-rw-r--r--  arch/x86/kernel/fpu/core.c                   |  63
-rw-r--r--  arch/x86/kernel/fpu/xstate.c                 | 185
-rw-r--r--  arch/x86/kernel/ldt.c                        |   4
-rw-r--r--  arch/x86/kernel/process_64.c                 |   2
-rw-r--r--  arch/x86/kernel/setup.c                      |   9
-rw-r--r--  arch/x86/mm/Makefile                         |   2
-rw-r--r--  arch/x86/mm/fault.c                          | 150
-rw-r--r--  arch/x86/mm/gup.c                            |  45
-rw-r--r--  arch/x86/mm/mpx.c                            |   4
-rw-r--r--  arch/x86/mm/pkeys.c                          | 101
-rw-r--r--  drivers/char/agp/frontend.c                  |   2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c      |   3
-rw-r--r--  drivers/gpu/drm/etnaviv/etnaviv_gem.c        |   6
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_userptr.c      |  10
-rw-r--r--  drivers/gpu/drm/radeon/radeon_ttm.c          |   3
-rw-r--r--  drivers/gpu/drm/via/via_dmablit.c            |   3
-rw-r--r--  drivers/infiniband/core/umem.c               |   2
-rw-r--r--  drivers/infiniband/core/umem_odp.c           |   8
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_memfree.c  |   3
-rw-r--r--  drivers/infiniband/hw/qib/qib_user_pages.c   |   3
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_uiom.c     |   2
-rw-r--r--  drivers/iommu/amd_iommu_v2.c                 |   1
-rw-r--r--  drivers/media/pci/ivtv/ivtv-udma.c           |   4
-rw-r--r--  drivers/media/pci/ivtv/ivtv-yuv.c            |  10
-rw-r--r--  drivers/media/v4l2-core/videobuf-dma-sg.c    |   3
-rw-r--r--  drivers/misc/mic/scif/scif_rma.c             |   2
-rw-r--r--  drivers/misc/sgi-gru/grufault.c              |   3
-rw-r--r--  drivers/scsi/st.c                            |   2
-rw-r--r--  drivers/staging/android/ashmem.c             |   4
-rw-r--r--  drivers/video/fbdev/pvr2fb.c                 |   4
-rw-r--r--  drivers/virt/fsl_hypervisor.c                |   5
-rw-r--r--  fs/exec.c                                    |   8
-rw-r--r--  fs/proc/task_mmu.c                           |  14
-rw-r--r--  include/asm-generic/mm_hooks.h               |  12
-rw-r--r--  include/linux/mm.h                           |  99
-rw-r--r--  include/linux/mman.h                         |   6
-rw-r--r--  include/linux/pkeys.h                        |  33
-rw-r--r--  include/uapi/asm-generic/siginfo.h           |  17
-rw-r--r--  kernel/events/uprobes.c                      |  10
-rw-r--r--  kernel/signal.c                              |   4
-rw-r--r--  mm/Kconfig                                   |   5
-rw-r--r--  mm/frame_vector.c                            |   2
-rw-r--r--  mm/gup.c                                     | 127
-rw-r--r--  mm/ksm.c                                     |  12
-rw-r--r--  mm/memory.c                                  |   8
-rw-r--r--  mm/mempolicy.c                               |   6
-rw-r--r--  mm/mmap.c                                    |  10
-rw-r--r--  mm/mprotect.c                                |   8
-rw-r--r--  mm/nommu.c                                   |  66
-rw-r--r--  mm/process_vm_access.c                       |  11
-rw-r--r--  mm/util.c                                    |   4
-rw-r--r--  net/ceph/pagevec.c                           |   2
-rw-r--r--  security/tomoyo/domain.c                     |   9
-rw-r--r--  virt/kvm/async_pf.c                          |   8
-rw-r--r--  virt/kvm/kvm_main.c                          |  10
85 files changed, 1406 insertions(+), 241 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 1f780d907718..ecc74fa4bfde 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -987,6 +987,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			See Documentation/x86/intel_mpx.txt for more
 			information about the feature.
 
+	nopku		[X86] Disable Memory Protection Keys CPU feature found
+			in some Intel CPUs.
+
 	eagerfpu=	[X86]
 			on	enable eager fpu restore
 			off	disable eager fpu restore
diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c
index 877da1908234..617645d21b20 100644
--- a/arch/cris/arch-v32/drivers/cryptocop.c
+++ b/arch/cris/arch-v32/drivers/cryptocop.c
@@ -2719,9 +2719,7 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig
 	/* Acquire the mm page semaphore. */
 	down_read(&current->mm->mmap_sem);
 
-	err = get_user_pages(current,
-			     current->mm,
-			     (unsigned long int)(oper.indata + prev_ix),
+	err = get_user_pages((unsigned long int)(oper.indata + prev_ix),
 			     noinpages,
 			     0,  /* read access only for in data */
 			     0, /* no force */
@@ -2736,9 +2734,7 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig
 	}
 	noinpages = err;
 	if (oper.do_cipher){
-		err = get_user_pages(current,
-				     current->mm,
-				     (unsigned long int)oper.cipher_outdata,
+		err = get_user_pages((unsigned long int)oper.cipher_outdata,
 				     nooutpages,
 				     1, /* write access for out data */
 				     0, /* no force */
diff --git a/arch/ia64/include/uapi/asm/siginfo.h b/arch/ia64/include/uapi/asm/siginfo.h
index bce9bc1a66c4..f72bf0172bb2 100644
--- a/arch/ia64/include/uapi/asm/siginfo.h
+++ b/arch/ia64/include/uapi/asm/siginfo.h
@@ -63,10 +63,15 @@ typedef struct siginfo {
 			unsigned int _flags;	/* see below */
 			unsigned long _isr;	/* isr */
 			short _addr_lsb;	/* lsb of faulting address */
-			struct {
-				void __user *_lower;
-				void __user *_upper;
-			} _addr_bnd;
+			union {
+				/* used when si_code=SEGV_BNDERR */
+				struct {
+					void __user *_lower;
+					void __user *_upper;
+				} _addr_bnd;
+				/* used when si_code=SEGV_PKUERR */
+				__u32 _pkey;
+			};
 		} _sigfault;
 
 	/* SIGPOLL */
diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
index 0c161ed6d18e..09f845793d12 100644
--- a/arch/ia64/kernel/err_inject.c
+++ b/arch/ia64/kernel/err_inject.c
@@ -142,8 +142,7 @@ store_virtual_to_phys(struct device *dev, struct device_attribute *attr,
 	u64 virt_addr=simple_strtoull(buf, NULL, 16);
 	int ret;
 
-	ret = get_user_pages(current, current->mm, virt_addr,
-			1, VM_READ, 0, NULL, NULL);
+	ret = get_user_pages(virt_addr, 1, VM_READ, 0, NULL, NULL);
 	if (ret<=0) {
 #ifdef ERR_INJ_DEBUG
 		printk("Virtual address %lx is not existing.\n",virt_addr);
diff --git a/arch/mips/include/uapi/asm/siginfo.h b/arch/mips/include/uapi/asm/siginfo.h
index 2cb7fdead570..cc49dc240d67 100644
--- a/arch/mips/include/uapi/asm/siginfo.h
+++ b/arch/mips/include/uapi/asm/siginfo.h
@@ -86,10 +86,15 @@ typedef struct siginfo {
 		int _trapno;	/* TRAP # which caused the signal */
 #endif
 		short _addr_lsb;
-		struct {
-			void __user *_lower;
-			void __user *_upper;
-		} _addr_bnd;
+		union {
+			/* used when si_code=SEGV_BNDERR */
+			struct {
+				void __user *_lower;
+				void __user *_upper;
+			} _addr_bnd;
+			/* used when si_code=SEGV_PKUERR */
+			__u32 _pkey;
+		};
 	} _sigfault;
 
 	/* SIGPOLL, SIGXFSZ (To do ...) */
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 6cdffc76735c..42d124fb6474 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -286,8 +286,7 @@ slow_irqon:
 	start += nr << PAGE_SHIFT;
 	pages += nr;
 
-	ret = get_user_pages_unlocked(current, mm, start,
-				      (end - start) >> PAGE_SHIFT,
+	ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT,
 				      write, 0, pages);
 
 	/* Have to be a bit careful with return values */
diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h
index 8565c254151a..2563c435a4b1 100644
--- a/arch/powerpc/include/asm/mman.h
+++ b/arch/powerpc/include/asm/mman.h
@@ -18,11 +18,12 @@
  * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits()
  * here.  How important is the optimization?
  */
-static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot)
+static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+		unsigned long pkey)
 {
 	return (prot & PROT_SAO) ? VM_SAO : 0;
 }
-#define arch_calc_vm_prot_bits(prot) arch_calc_vm_prot_bits(prot)
+#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
 
 static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
 {
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 878c27771717..4eaab40e3ade 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -148,5 +148,17 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 {
 }
 
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+		bool write, bool execute, bool foreign)
+{
+	/* by default, allow everything */
+	return true;
+}
+
+static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+{
+	/* by default, allow everything */
+	return true;
+}
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index e485817f7b1a..d321469eeda7 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -136,4 +136,16 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 {
 }
 
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+		bool write, bool execute, bool foreign)
+{
+	/* by default, allow everything */
+	return true;
+}
+
+static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+{
+	/* by default, allow everything */
+	return true;
+}
 #endif /* __S390_MMU_CONTEXT_H */
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index 13dab0c1645c..49a1c84ed266 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -210,7 +210,6 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages)
 {
-	struct mm_struct *mm = current->mm;
 	int nr, ret;
 
 	might_sleep();
@@ -222,8 +221,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	/* Try to get the remaining pages with get_user_pages */
 	start += nr << PAGE_SHIFT;
 	pages += nr;
-	ret = get_user_pages_unlocked(current, mm, start,
-				      nr_pages - nr, write, 0, pages);
+	ret = get_user_pages_unlocked(start, nr_pages - nr, write, 0, pages);
 	/* Have to be a bit careful with return values */
 	if (nr > 0)
 		ret = (ret < 0) ? nr : ret + nr;
diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c
index e7af6a65baab..40fa6c8adc43 100644
--- a/arch/sh/mm/gup.c
+++ b/arch/sh/mm/gup.c
@@ -257,7 +257,7 @@ slow_irqon:
 	start += nr << PAGE_SHIFT;
 	pages += nr;
 
-	ret = get_user_pages_unlocked(current, mm, start,
+	ret = get_user_pages_unlocked(start,
 		(end - start) >> PAGE_SHIFT, write, 0, pages);
 
 	/* Have to be a bit careful with return values */
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index eb3d8e8ebc6b..4e06750a5d29 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -237,7 +237,7 @@ slow:
 	start += nr << PAGE_SHIFT;
 	pages += nr;
 
-	ret = get_user_pages_unlocked(current, mm, start,
+	ret = get_user_pages_unlocked(start,
 		(end - start) >> PAGE_SHIFT, write, 0, pages);
 
 	/* Have to be a bit careful with return values */
diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h
index 941527e507f7..1a60e1328e2f 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -27,6 +27,20 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 			     struct vm_area_struct *vma)
 {
 }
+
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+		bool write, bool execute, bool foreign)
+{
+	/* by default, allow everything */
+	return true;
+}
+
+static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+{
+	/* by default, allow everything */
+	return true;
+}
+
 /*
  * end asm-generic/mm_hooks.h functions
  */
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index 1cb5220afaf9..e35632ef23c7 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -97,4 +97,16 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 {
 }
 
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+		bool write, bool foreign)
+{
+	/* by default, allow everything */
+	return true;
+}
+
+static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+{
+	/* by default, allow everything */
+	return true;
+}
 #endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d07cca6ad37b..8b680a5cb25b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -156,6 +156,8 @@ config X86
 	select X86_DEV_DMA_OPS			if X86_64
 	select X86_FEATURE_NAMES		if PROC_FS
 	select HAVE_STACK_VALIDATION		if X86_64
+	select ARCH_USES_HIGH_VMA_FLAGS		if X86_INTEL_MEMORY_PROTECTION_KEYS
+	select ARCH_HAS_PKEYS			if X86_INTEL_MEMORY_PROTECTION_KEYS
 
 config INSTRUCTION_DECODER
 	def_bool y
@@ -1719,6 +1721,20 @@ config X86_INTEL_MPX
 
 	  If unsure, say N.
 
+config X86_INTEL_MEMORY_PROTECTION_KEYS
+	prompt "Intel Memory Protection Keys"
+	def_bool y
+	# Note: only available in 64-bit mode
+	depends on CPU_SUP_INTEL && X86_64
+	---help---
+	  Memory Protection Keys provides a mechanism for enforcing
+	  page-based protections, but without requiring modification of the
+	  page tables when an application changes protection domains.
+
+	  For details, see Documentation/x86/protection-keys.txt
+
+	  If unsure, say y.
+
 config EFI
 	bool "EFI runtime service support"
 	depends on ACPI
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 68e4e8258b84..3636ec06c887 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -26,6 +26,7 @@ enum cpuid_leafs
 	CPUID_8000_0008_EBX,
 	CPUID_6_EAX,
 	CPUID_8000_000A_EDX,
+	CPUID_7_ECX,
 };
 
 #ifdef CONFIG_X86_FEATURE_NAMES
@@ -48,28 +49,42 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 	 test_bit(bit, (unsigned long *)((c)->x86_capability))
 
 #define REQUIRED_MASK_BIT_SET(bit)					\
-	 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) ||	\
-	   (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) ||	\
-	   (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) ||	\
-	   (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) ||	\
-	   (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) ||	\
-	   (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) ||	\
-	   (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) ||	\
-	   (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ||	\
-	   (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) ||	\
-	   (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
+	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & REQUIRED_MASK0 )) ||	\
+	   (((bit)>>5)==1  && (1UL<<((bit)&31) & REQUIRED_MASK1 )) ||	\
+	   (((bit)>>5)==2  && (1UL<<((bit)&31) & REQUIRED_MASK2 )) ||	\
+	   (((bit)>>5)==3  && (1UL<<((bit)&31) & REQUIRED_MASK3 )) ||	\
+	   (((bit)>>5)==4  && (1UL<<((bit)&31) & REQUIRED_MASK4 )) ||	\
+	   (((bit)>>5)==5  && (1UL<<((bit)&31) & REQUIRED_MASK5 )) ||	\
+	   (((bit)>>5)==6  && (1UL<<((bit)&31) & REQUIRED_MASK6 )) ||	\
+	   (((bit)>>5)==7  && (1UL<<((bit)&31) & REQUIRED_MASK7 )) ||	\
+	   (((bit)>>5)==8  && (1UL<<((bit)&31) & REQUIRED_MASK8 )) ||	\
+	   (((bit)>>5)==9  && (1UL<<((bit)&31) & REQUIRED_MASK9 )) ||	\
+	   (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) ||	\
+	   (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) ||	\
+	   (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||	\
+	   (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
 
 #define DISABLED_MASK_BIT_SET(bit)					\
-	 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0)) ||	\
-	   (((bit)>>5)==1 && (1UL<<((bit)&31) & DISABLED_MASK1)) ||	\
-	   (((bit)>>5)==2 && (1UL<<((bit)&31) & DISABLED_MASK2)) ||	\
-	   (((bit)>>5)==3 && (1UL<<((bit)&31) & DISABLED_MASK3)) ||	\
-	   (((bit)>>5)==4 && (1UL<<((bit)&31) & DISABLED_MASK4)) ||	\
-	   (((bit)>>5)==5 && (1UL<<((bit)&31) & DISABLED_MASK5)) ||	\
-	   (((bit)>>5)==6 && (1UL<<((bit)&31) & DISABLED_MASK6)) ||	\
-	   (((bit)>>5)==7 && (1UL<<((bit)&31) & DISABLED_MASK7)) ||	\
-	   (((bit)>>5)==8 && (1UL<<((bit)&31) & DISABLED_MASK8)) ||	\
-	   (((bit)>>5)==9 && (1UL<<((bit)&31) & DISABLED_MASK9)) )
+	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) ||	\
+	   (((bit)>>5)==1  && (1UL<<((bit)&31) & DISABLED_MASK1 )) ||	\
+	   (((bit)>>5)==2  && (1UL<<((bit)&31) & DISABLED_MASK2 )) ||	\
+	   (((bit)>>5)==3  && (1UL<<((bit)&31) & DISABLED_MASK3 )) ||	\
+	   (((bit)>>5)==4  && (1UL<<((bit)&31) & DISABLED_MASK4 )) ||	\
+	   (((bit)>>5)==5  && (1UL<<((bit)&31) & DISABLED_MASK5 )) ||	\
+	   (((bit)>>5)==6  && (1UL<<((bit)&31) & DISABLED_MASK6 )) ||	\
+	   (((bit)>>5)==7  && (1UL<<((bit)&31) & DISABLED_MASK7 )) ||	\
+	   (((bit)>>5)==8  && (1UL<<((bit)&31) & DISABLED_MASK8 )) ||	\
+	   (((bit)>>5)==9  && (1UL<<((bit)&31) & DISABLED_MASK9 )) ||	\
+	   (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) ||	\
+	   (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) ||	\
+	   (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||	\
+	   (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
 
 #define cpu_has(c, bit)							\
 	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 074b7604bd51..3d1a84383162 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -12,7 +12,7 @@
 /*
  * Defines x86 CPU feature bits
  */
-#define NCAPINTS	16	/* N 32-bit words worth of info */
+#define NCAPINTS	17	/* N 32-bit words worth of info */
 #define NBUGINTS	1	/* N 32-bit bug flags */
 
 /*
@@ -274,6 +274,10 @@
 #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
 #define X86_FEATURE_AVIC	(15*32+13) /* Virtual Interrupt Controller */
 
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
+#define X86_FEATURE_PKU		(16*32+ 3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE	(16*32+ 4) /* OS Protection Keys Enable */
+
 /*
  * BUG word(s)
  */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index f226df064660..39343be7d4f4 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -28,6 +28,14 @@
 # define DISABLE_CENTAUR_MCR	0
 #endif /* CONFIG_X86_64 */
 
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+# define DISABLE_PKU		(1<<(X86_FEATURE_PKU))
+# define DISABLE_OSPKE		(1<<(X86_FEATURE_OSPKE))
+#else
+# define DISABLE_PKU		0
+# define DISABLE_OSPKE		0
+#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -41,5 +49,12 @@
 #define DISABLED_MASK7	0
 #define DISABLED_MASK8	0
 #define DISABLED_MASK9	(DISABLE_MPX)
+#define DISABLED_MASK10	0
+#define DISABLED_MASK11	0
+#define DISABLED_MASK12	0
+#define DISABLED_MASK13	0
+#define DISABLED_MASK14	0
+#define DISABLED_MASK15	0
+#define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE)
 
 #endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index a2124343edf5..31ac8e6d9f36 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -25,6 +25,8 @@
 extern void fpu__activate_curr(struct fpu *fpu);
 extern void fpu__activate_fpstate_read(struct fpu *fpu);
 extern void fpu__activate_fpstate_write(struct fpu *fpu);
+extern void fpu__current_fpstate_write_begin(void);
+extern void fpu__current_fpstate_write_end(void);
 extern void fpu__save(struct fpu *fpu);
 extern void fpu__restore(struct fpu *fpu);
 extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 1c6f6ac52ad0..36b90bbfc69f 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -108,6 +108,8 @@ enum xfeature {
 	XFEATURE_OPMASK,
 	XFEATURE_ZMM_Hi256,
 	XFEATURE_Hi16_ZMM,
+	XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
+	XFEATURE_PKRU,
 
 	XFEATURE_MAX,
 };
@@ -120,6 +122,7 @@ enum xfeature {
 #define XFEATURE_MASK_OPMASK		(1 << XFEATURE_OPMASK)
 #define XFEATURE_MASK_ZMM_Hi256		(1 << XFEATURE_ZMM_Hi256)
 #define XFEATURE_MASK_Hi16_ZMM		(1 << XFEATURE_Hi16_ZMM)
+#define XFEATURE_MASK_PKRU		(1 << XFEATURE_PKRU)
 
 #define XFEATURE_MASK_FPSSE		(XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
 #define XFEATURE_MASK_AVX512		(XFEATURE_MASK_OPMASK \
@@ -212,6 +215,15 @@ struct avx_512_hi16_state {
 	struct reg_512_bit		hi16_zmm[16];
 } __packed;
 
+/*
+ * State component 9: 32-bit PKRU register.  The state is
+ * 8 bytes long but only 4 bytes is used currently.
+ */
+struct pkru_state {
+	u32				pkru;
+	u32				pad;
+} __packed;
+
 struct xstate_header {
 	u64				xfeatures;
 	u64				xcomp_bv;
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index f23cd8c80b1c..38951b0fcc5a 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -24,7 +24,8 @@
 				 XFEATURE_MASK_YMM |			\
 				 XFEATURE_MASK_OPMASK |			\
 				 XFEATURE_MASK_ZMM_Hi256 |		\
-				 XFEATURE_MASK_Hi16_ZMM)
+				 XFEATURE_MASK_Hi16_ZMM |		\
+				 XFEATURE_MASK_PKRU)
 
 /* Supported features which require eager state saving */
 #define XFEATURE_MASK_EAGER	(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index bfd9b2a35a0b..84280029cafd 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -52,15 +52,15 @@ struct ldt_struct {
 /*
  * Used for LDT copy/destruction.
  */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void destroy_context(struct mm_struct *mm);
+int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context_ldt(struct mm_struct *mm);
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
-static inline int init_new_context(struct task_struct *tsk,
+static inline int init_new_context_ldt(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
 	return 0;
 }
-static inline void destroy_context(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) {}
 #endif
 
 static inline void load_mm_ldt(struct mm_struct *mm)
@@ -104,6 +104,17 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 #endif
 }
 
+static inline int init_new_context(struct task_struct *tsk,
+				   struct mm_struct *mm)
+{
+	init_new_context_ldt(tsk, mm);
+	return 0;
+}
+static inline void destroy_context(struct mm_struct *mm)
+{
+	destroy_context_ldt(mm);
+}
+
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			     struct task_struct *tsk)
 {
@@ -275,4 +286,68 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
 	mpx_notify_unmap(mm, vma, start, end);
 }
 
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+	u16 pkey = 0;
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+	unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
+				      VM_PKEY_BIT2 | VM_PKEY_BIT3;
+	pkey = (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
+#endif
+	return pkey;
+}
+
+static inline bool __pkru_allows_pkey(u16 pkey, bool write)
+{
+	u32 pkru = read_pkru();
+
+	if (!__pkru_allows_read(pkru, pkey))
+		return false;
+	if (write && !__pkru_allows_write(pkru, pkey))
+		return false;
+
+	return true;
+}
+
+/*
+ * We only want to enforce protection keys on the current process
+ * because we effectively have no access to PKRU for other
+ * processes or any way to tell *which * PKRU in a threaded
+ * process we could use.
+ *
+ * So do not enforce things if the VMA is not from the current
+ * mm, or if we are in a kernel thread.
+ */
+static inline bool vma_is_foreign(struct vm_area_struct *vma)
+{
+	if (!current->mm)
+		return true;
+	/*
+	 * Should PKRU be enforced on the access to this VMA?  If
+	 * the VMA is from another process, then PKRU has no
+	 * relevance and should not be enforced.
+	 */
+	if (current->mm != vma->vm_mm)
+		return true;
+
+	return false;
+}
+
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+		bool write, bool execute, bool foreign)
+{
+	/* pkeys never affect instruction fetches */
+	if (execute)
+		return true;
+	/* allow access if the VMA is not one from this process */
+	if (foreign || vma_is_foreign(vma))
+		return true;
+	return __pkru_allows_pkey(vma_pkey(vma), write);
+}
+
+static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+{
+	return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
+}
+
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0687c4748b8f..1ff49ec29ece 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -99,6 +99,14 @@ static inline int pte_dirty(pte_t pte)
 	return pte_flags(pte) & _PAGE_DIRTY;
 }
 
+
+static inline u32 read_pkru(void)
+{
+	if (boot_cpu_has(X86_FEATURE_OSPKE))
+		return __read_pkru();
+	return 0;
+}
+
 static inline int pte_young(pte_t pte)
 {
 	return pte_flags(pte) & _PAGE_ACCESSED;
@@ -911,6 +919,36 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
 }
 #endif
 
+#define PKRU_AD_BIT 0x1
+#define PKRU_WD_BIT 0x2
+#define PKRU_BITS_PER_PKEY 2
+
+static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
+{
+	int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+	return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits));
+}
+
+static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
+{
+	int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+	/*
+	 * Access-disable disables writes too so we need to check
+	 * both bits here.
+	 */
+	return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits));
+}
+
+static inline u16 pte_flags_pkey(unsigned long pte_flags)
+{
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+	/* ifdef to avoid doing 59-bit shift on 32-bit values */
+	return (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0;
+#else
+	return 0;
+#endif
+}
+
 #include <asm-generic/pgtable.h>
 #endif	/* __ASSEMBLY__ */
 
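To make the PKRU layout used by __pkru_allows_read()/__pkru_allows_write() concrete: each key owns two bits, access-disable (AD) in the low bit and write-disable (WD) in the high bit of its pair. A standalone worked example (written for this note, not part of the patch) for PKRU=0x8, which write-protects key 1 but leaves it readable:

    #include <stdbool.h>
    #include <stdio.h>

    #define PKRU_AD_BIT		0x1
    #define PKRU_WD_BIT		0x2
    #define PKRU_BITS_PER_PKEY	2

    static bool pkru_allows_read(unsigned int pkru, unsigned int pkey)
    {
    	return !(pkru & (PKRU_AD_BIT << (pkey * PKRU_BITS_PER_PKEY)));
    }

    static bool pkru_allows_write(unsigned int pkru, unsigned int pkey)
    {
    	/* AD disables writes too, so check both bits. */
    	return !(pkru & ((PKRU_AD_BIT | PKRU_WD_BIT) <<
    			 (pkey * PKRU_BITS_PER_PKEY)));
    }

    int main(void)
    {
    	unsigned int pkru = 0x8;	/* key 1: AD=0, WD=1 */

    	printf("key 1 read:  %d\n", pkru_allows_read(pkru, 1));  /* 1 */
    	printf("key 1 write: %d\n", pkru_allows_write(pkru, 1)); /* 0 */
    	return 0;
    }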
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4432ab7f407c..7b5efe264eff 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -20,13 +20,18 @@
 #define _PAGE_BIT_SOFTW2	10	/* " */
 #define _PAGE_BIT_SOFTW3	11	/* " */
 #define _PAGE_BIT_PAT_LARGE	12	/* On 2MB or 1GB pages */
+#define _PAGE_BIT_SOFTW4	58	/* available for programmer */
+#define _PAGE_BIT_PKEY_BIT0	59	/* Protection Keys, bit 1/4 */
+#define _PAGE_BIT_PKEY_BIT1	60	/* Protection Keys, bit 2/4 */
+#define _PAGE_BIT_PKEY_BIT2	61	/* Protection Keys, bit 3/4 */
+#define _PAGE_BIT_PKEY_BIT3	62	/* Protection Keys, bit 4/4 */
+#define _PAGE_BIT_NX		63	/* No execute: only valid after cpuid check */
+
 #define _PAGE_BIT_SPECIAL	_PAGE_BIT_SOFTW1
 #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
 #define _PAGE_BIT_HIDDEN	_PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
-#define _PAGE_BIT_SOFTW4	58	/* available for programmer */
-#define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
-#define _PAGE_BIT_NX		63	/* No execute: only valid after cpuid check */
+#define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
 
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
 /* - if the user mapped it with PROT_NONE; pte_present gives true */
@@ -47,8 +52,24 @@
 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
 #define _PAGE_SPECIAL	(_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
 #define _PAGE_CPA_TEST	(_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#define _PAGE_PKEY_BIT0	(_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
+#define _PAGE_PKEY_BIT1	(_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
+#define _PAGE_PKEY_BIT2	(_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2)
+#define _PAGE_PKEY_BIT3	(_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3)
+#else
+#define _PAGE_PKEY_BIT0	(_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT1	(_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT2	(_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT3	(_AT(pteval_t, 0))
+#endif
 #define __HAVE_ARCH_PTE_SPECIAL
 
+#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
+			 _PAGE_PKEY_BIT1 | \
+			 _PAGE_PKEY_BIT2 | \
+			 _PAGE_PKEY_BIT3)
+
 #ifdef CONFIG_KMEMCHECK
 #define _PAGE_HIDDEN	(_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
 #else
@@ -99,7 +120,12 @@
 #define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |	\
 			 _PAGE_DIRTY)
 
-/* Set of bits not changed in pte_modify */
+/*
+ * Set of bits not changed in pte_modify.  The pte's
+ * protection key is treated like _PAGE_RW, for
+ * instance, and is *not* included in this mask since
+ * pte_modify() does modify it.
+ */
 #define _PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |		\
 			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY |	\
 			 _PAGE_SOFT_DIRTY)
@@ -215,7 +241,10 @@ enum page_cache_mode {
 /* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
 #define PTE_PFN_MASK		((pteval_t)PHYSICAL_PAGE_MASK)
 
-/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */
+/*
+ * Extracts the flags from a (pte|pmd|pud|pgd)val_t
+ * This includes the protection key value.
+ */
 #define PTE_FLAGS_MASK		(~PTE_PFN_MASK)
 
 typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
new file mode 100644
index 000000000000..7b84565c916c
--- /dev/null
+++ b/arch/x86/include/asm/pkeys.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_PKEYS_H
+#define _ASM_X86_PKEYS_H
+
+#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
+
+extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+		unsigned long init_val);
+
+/*
+ * Try to dedicate one of the protection keys to be used as an
+ * execute-only protection key.
+ */
+#define PKEY_DEDICATED_EXECUTE_ONLY 15
+extern int __execute_only_pkey(struct mm_struct *mm);
+static inline int execute_only_pkey(struct mm_struct *mm)
+{
+	if (!boot_cpu_has(X86_FEATURE_OSPKE))
+		return 0;
+
+	return __execute_only_pkey(mm);
+}
+
+extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
+		int prot, int pkey);
+static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
+		int prot, int pkey)
+{
+	if (!boot_cpu_has(X86_FEATURE_OSPKE))
+		return 0;
+
+	return __arch_override_mprotect_pkey(vma, prot, pkey);
+}
+
+#endif /*_ASM_X86_PKEYS_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 5c6e4fb370f5..4916144e3c42 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -92,5 +92,12 @@
 #define REQUIRED_MASK7	0
 #define REQUIRED_MASK8	0
 #define REQUIRED_MASK9	0
+#define REQUIRED_MASK10	0
+#define REQUIRED_MASK11	0
+#define REQUIRED_MASK12	0
+#define REQUIRED_MASK13	0
+#define REQUIRED_MASK14	0
+#define REQUIRED_MASK15	0
+#define REQUIRED_MASK16	0
 
 #endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 2270e41b32fd..aee6e76e561e 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -98,6 +98,28 @@ static inline void native_write_cr8(unsigned long val)
 }
 #endif
 
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+static inline u32 __read_pkru(void)
+{
+	u32 ecx = 0;
+	u32 edx, pkru;
+
+	/*
+	 * "rdpkru" instruction.  Places PKRU contents in to EAX,
+	 * clears EDX and requires that ecx=0.
+	 */
+	asm volatile(".byte 0x0f,0x01,0xee\n\t"
+		     : "=a" (pkru), "=d" (edx)
+		     : "c" (ecx));
+	return pkru;
+}
+#else
+static inline u32 __read_pkru(void)
+{
+	return 0;
+}
+#endif
+
 static inline void native_wbinvd(void)
 {
 	asm volatile("wbinvd": : :"memory");
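The rdpkru opcode is unprivileged, so the same byte encoding can be exercised from a userspace test; the matching "wrpkru" write (opcode 0x0f,0x01,0xef, which takes the new value in EAX with ECX=EDX=0) is shown as well. This is a hedged sketch written for this note, not code from the series, and it assumes a CPU with OSPKE enabled (otherwise the instructions raise #UD):

    #include <stdio.h>

    static inline unsigned int rdpkru(void)
    {
    	unsigned int pkru, edx;

    	asm volatile(".byte 0x0f,0x01,0xee"	/* rdpkru */
    		     : "=a" (pkru), "=d" (edx)
    		     : "c" (0));
    	return pkru;
    }

    static inline void wrpkru(unsigned int pkru)
    {
    	asm volatile(".byte 0x0f,0x01,0xef"	/* wrpkru */
    		     : : "a" (pkru), "c" (0), "d" (0));
    }

    int main(void)
    {
    	unsigned int pkru = rdpkru();

    	printf("PKRU before: 0x%08x\n", pkru);
    	wrpkru(pkru | (1u << 3));	/* set WD for key 1 */
    	printf("PKRU after:  0x%08x\n", rdpkru());
    	return 0;
    }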
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 513b05f15bb4..39bca7fac087 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -6,6 +6,28 @@
 #define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
 #define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
 
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+/*
+ * Take the 4 protection key bits out of the vma->vm_flags
+ * value and turn them in to the bits that we can put in
+ * to a pte.
+ *
+ * Only override these if Protection Keys are available
+ * (which is only on 64-bit).
+ */
+#define arch_vm_get_page_prot(vm_flags)	__pgprot(	\
+		((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) |	\
+		((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) |	\
+		((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) |	\
+		((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
+
+#define arch_calc_vm_prot_bits(prot, key) (		\
+		((key) & 0x1 ? VM_PKEY_BIT0 : 0) |	\
+		((key) & 0x2 ? VM_PKEY_BIT1 : 0) |	\
+		((key) & 0x4 ? VM_PKEY_BIT2 : 0) |	\
+		((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#endif
+
 #include <asm-generic/mman.h>
 
 #endif /* _ASM_X86_MMAN_H */
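A standalone round trip through the two macros above, for pkey 5 (binary 0101). Illustrative only; the literal bit positions are assumptions taken from this series (VM_PKEY_BIT0 lands at vm_flags bit 32, the first high arch VMA flag, and _PAGE_PKEY_BIT0 at pte bit 59):

    #include <stdio.h>

    int main(void)
    {
    	unsigned long pkey = 0x5;

    	/* arch_calc_vm_prot_bits(): pkey -> VM_PKEY_BIT* flags */
    	unsigned long vm_flags =
    		((pkey & 0x1) ? 1UL << 32 : 0) |	/* VM_PKEY_BIT0 */
    		((pkey & 0x2) ? 1UL << 33 : 0) |	/* VM_PKEY_BIT1 */
    		((pkey & 0x4) ? 1UL << 34 : 0) |	/* VM_PKEY_BIT2 */
    		((pkey & 0x8) ? 1UL << 35 : 0);		/* VM_PKEY_BIT3 */

    	/* arch_vm_get_page_prot(): VM_PKEY_BIT* -> _PAGE_PKEY_BIT* */
    	unsigned long pte_bits =
    		((vm_flags & 1UL << 32) ? 1UL << 59 : 0) |
    		((vm_flags & 1UL << 33) ? 1UL << 60 : 0) |
    		((vm_flags & 1UL << 34) ? 1UL << 61 : 0) |
    		((vm_flags & 1UL << 35) ? 1UL << 62 : 0);

    	/* pte_flags_pkey(): recover the key from the pte bits */
    	printf("pkey back out: %lu\n", (pte_bits >> 59) & 0xf); /* 5 */
    	return 0;
    }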
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 79887abcb5e1..567de50a4c2a 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -118,6 +118,8 @@
 #define X86_CR4_SMEP		_BITUL(X86_CR4_SMEP_BIT)
 #define X86_CR4_SMAP_BIT	21 /* enable SMAP support */
 #define X86_CR4_SMAP		_BITUL(X86_CR4_SMAP_BIT)
+#define X86_CR4_PKE_BIT		22 /* enable Protection Keys support */
+#define X86_CR4_PKE		_BITUL(X86_CR4_PKE_BIT)
 
 /*
  * x86-64 Task Priority Register, CR8
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 249461f95851..06ad72383b4e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -304,6 +304,48 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
 }
 
 /*
+ * Protection Keys are not available in 32-bit mode.
+ */
+static bool pku_disabled;
+
+static __always_inline void setup_pku(struct cpuinfo_x86 *c)
+{
+	if (!cpu_has(c, X86_FEATURE_PKU))
+		return;
+	if (pku_disabled)
+		return;
+
+	cr4_set_bits(X86_CR4_PKE);
+	/*
+	 * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
+	 * cpuid bit to be set.  We need to ensure that we
+	 * update that bit in this CPU's "cpu_info".
+	 */
+	get_cpu_cap(c);
+}
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+static __init int setup_disable_pku(char *arg)
+{
+	/*
+	 * Do not clear the X86_FEATURE_PKU bit.  All of the
+	 * runtime checks are against OSPKE so clearing the
+	 * bit does nothing.
+	 *
+	 * This way, we will see "pku" in cpuinfo, but not
+	 * "ospke", which is exactly what we want.  It shows
+	 * that the CPU has PKU, but the OS has not enabled it.
+	 * This happens to be exactly how a system would look
+	 * if we disabled the config option.
+	 */
+	pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n");
+	pku_disabled = true;
+	return 1;
+}
+__setup("nopku", setup_disable_pku);
+#endif /* CONFIG_X86_64 */
+
+/*
  * Some CPU features depend on higher CPUID levels, which may not always
  * be available due to CPUID level capping or broken virtualization
  * software.  Add those features to this table to auto-disable them.
@@ -625,6 +667,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_capability[CPUID_7_0_EBX] = ebx;
 
 		c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
+		c->x86_capability[CPUID_7_ECX] = ecx;
 	}
 
 	/* Extended state features: level 0x0000000d */
@@ -982,6 +1025,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	init_hypervisor(c);
 	x86_init_rdrand(c);
 	x86_init_cache_qos(c);
+	setup_pku(c);
 
 	/*
 	 * Clear/Set all flags overriden by options, need do it
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 0b1b9abd4d5f..8e37cc8a539a 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -354,6 +354,69 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
 }
 
 /*
+ * This function must be called before we write the current
+ * task's fpstate.
+ *
+ * This call gets the current FPU register state and moves
+ * it in to the 'fpstate'.  Preemption is disabled so that
+ * no writes to the 'fpstate' can occur from context
+ * swiches.
+ *
+ * Must be followed by a fpu__current_fpstate_write_end().
+ */
+void fpu__current_fpstate_write_begin(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	/*
+	 * Ensure that the context-switching code does not write
+	 * over the fpstate while we are doing our update.
+	 */
+	preempt_disable();
+
+	/*
+	 * Move the fpregs in to the fpu's 'fpstate'.
+	 */
+	fpu__activate_fpstate_read(fpu);
+
+	/*
+	 * The caller is about to write to 'fpu'.  Ensure that no
+	 * CPU thinks that its fpregs match the fpstate.  This
+	 * ensures we will not be lazy and skip a XRSTOR in the
+	 * future.
+	 */
+	fpu->last_cpu = -1;
+}
+
+/*
+ * This function must be paired with fpu__current_fpstate_write_begin()
+ *
+ * This will ensure that the modified fpstate gets placed back in
+ * the fpregs if necessary.
+ *
+ * Note: This function may be called whether or not an _actual_
+ * write to the fpstate occurred.
+ */
+void fpu__current_fpstate_write_end(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	/*
+	 * 'fpu' now has an updated copy of the state, but the
+	 * registers may still be out of date.  Update them with
+	 * an XRSTOR if they are active.
+	 */
+	if (fpregs_active())
+		copy_kernel_to_fpregs(&fpu->state);
+
+	/*
+	 * Our update is done and the fpregs/fpstate are in sync
+	 * if necessary.  Context switches can happen again.
+	 */
+	preempt_enable();
+}
+
+/*
  * 'fpu__restore()' is called to copy FPU registers from
  * the FPU fpstate to the live hw registers and to activate
  * access to the hardware registers, so that FPU instructions
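The intended caller pattern for this begin/end pair, as a hypothetical sketch (the helper below is not from this patch): every write to current's fpstate happens inside the preemption-off window the pair establishes:

    /* Hypothetical illustration, not part of this series. */
    static void example_set_fpstate_bytes(void *dst, const void *src,
    				      size_t len)
    {
    	/* Flush fpregs into the fpstate and block context switches. */
    	fpu__current_fpstate_write_begin();

    	/* current's fpstate is now stable; modify it freely. */
    	memcpy(dst, src, len);

    	/* Reload fpregs if active and re-enable preemption. */
    	fpu__current_fpstate_write_end();
    }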
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 6e8354f5a593..b48ef35b28d4 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -5,6 +5,7 @@
5 */ 5 */
6#include <linux/compat.h> 6#include <linux/compat.h>
7#include <linux/cpu.h> 7#include <linux/cpu.h>
8#include <linux/pkeys.h>
8 9
9#include <asm/fpu/api.h> 10#include <asm/fpu/api.h>
10#include <asm/fpu/internal.h> 11#include <asm/fpu/internal.h>
@@ -13,6 +14,11 @@
13 14
14#include <asm/tlbflush.h> 15#include <asm/tlbflush.h>
15 16
17/*
18 * Although we spell it out in here, the Processor Trace
19 * xfeature is completely unused. We use other mechanisms
20 * to save/restore PT state in Linux.
21 */
16static const char *xfeature_names[] = 22static const char *xfeature_names[] =
17{ 23{
18 "x87 floating point registers" , 24 "x87 floating point registers" ,
@@ -23,6 +29,8 @@ static const char *xfeature_names[] =
23 "AVX-512 opmask" , 29 "AVX-512 opmask" ,
24 "AVX-512 Hi256" , 30 "AVX-512 Hi256" ,
25 "AVX-512 ZMM_Hi256" , 31 "AVX-512 ZMM_Hi256" ,
32 "Processor Trace (unused)" ,
33 "Protection Keys User registers",
26 "unknown xstate feature" , 34 "unknown xstate feature" ,
27}; 35};
28 36
@@ -56,6 +64,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
56 setup_clear_cpu_cap(X86_FEATURE_AVX512VL); 64 setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
57 setup_clear_cpu_cap(X86_FEATURE_MPX); 65 setup_clear_cpu_cap(X86_FEATURE_MPX);
58 setup_clear_cpu_cap(X86_FEATURE_XGETBV1); 66 setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
67 setup_clear_cpu_cap(X86_FEATURE_PKU);
59} 68}
60 69
61/* 70/*
@@ -234,7 +243,7 @@ static void __init print_xstate_feature(u64 xstate_mask)
234 const char *feature_name; 243 const char *feature_name;
235 244
236 if (cpu_has_xfeatures(xstate_mask, &feature_name)) 245 if (cpu_has_xfeatures(xstate_mask, &feature_name))
237 pr_info("x86/fpu: Supporting XSAVE feature 0x%02Lx: '%s'\n", xstate_mask, feature_name); 246 pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
238} 247}
239 248
240/* 249/*
@@ -250,6 +259,7 @@ static void __init print_xstate_features(void)
250 print_xstate_feature(XFEATURE_MASK_OPMASK); 259 print_xstate_feature(XFEATURE_MASK_OPMASK);
251 print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); 260 print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
252 print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); 261 print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
262 print_xstate_feature(XFEATURE_MASK_PKRU);
253} 263}
254 264
255/* 265/*
@@ -466,6 +476,7 @@ static void check_xstate_against_struct(int nr)
466 XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); 476 XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state);
467 XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); 477 XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
468 XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); 478 XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
479 XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
469 480
470 /* 481 /*
471 * Make *SURE* to add any feature numbers in below if 482 * Make *SURE* to add any feature numbers in below if
@@ -473,7 +484,8 @@ static void check_xstate_against_struct(int nr)
473 * numbers. 484 * numbers.
474 */ 485 */
475 if ((nr < XFEATURE_YMM) || 486 if ((nr < XFEATURE_YMM) ||
476 (nr >= XFEATURE_MAX)) { 487 (nr >= XFEATURE_MAX) ||
488 (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
477 WARN_ONCE(1, "no structure for xstate: %d\n", nr); 489 WARN_ONCE(1, "no structure for xstate: %d\n", nr);
478 XSTATE_WARN_ON(1); 490 XSTATE_WARN_ON(1);
479 } 491 }
@@ -671,6 +683,19 @@ void fpu__resume_cpu(void)
671} 683}
672 684
673/* 685/*
686 * Given an xstate feature mask, calculate where in the xsave
687 * buffer the state is. Callers should ensure that the buffer
688 * is valid.
689 *
690 * Note: does not work for compacted buffers.
691 */
692void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
693{
694 int feature_nr = fls64(xstate_feature_mask) - 1;
695
696 return (void *)xsave + xstate_comp_offsets[feature_nr];
697}
698/*
674 * Given the xsave area and a state inside, this function returns the 699 * Given the xsave area and a state inside, this function returns the
675 * address of the state. 700 * address of the state.
676 * 701 *
@@ -690,7 +715,6 @@ void fpu__resume_cpu(void)
690 */ 715 */
691void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) 716void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
692{ 717{
693 int feature_nr = fls64(xstate_feature) - 1;
694 /* 718 /*
695 * Do we even *have* xsave state? 719 * Do we even *have* xsave state?
696 */ 720 */
@@ -718,7 +742,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
718 if (!(xsave->header.xfeatures & xstate_feature)) 742 if (!(xsave->header.xfeatures & xstate_feature))
719 return NULL; 743 return NULL;
720 744
721 return (void *)xsave + xstate_comp_offsets[feature_nr]; 745 return __raw_xsave_addr(xsave, xstate_feature);
722} 746}
723EXPORT_SYMBOL_GPL(get_xsave_addr); 747EXPORT_SYMBOL_GPL(get_xsave_addr);
724 748
@@ -753,3 +777,156 @@ const void *get_xsave_field_ptr(int xsave_state)
753 777
754 return get_xsave_addr(&fpu->state.xsave, xsave_state); 778 return get_xsave_addr(&fpu->state.xsave, xsave_state);
755} 779}
780
781
782/*
783 * Set xfeatures (aka XSTATE_BV) bit for a feature that we want
784 * to take out of its "init state". This will ensure that an
785 * XRSTOR actually restores the state.
786 */
787static void fpu__xfeature_set_non_init(struct xregs_state *xsave,
788 int xstate_feature_mask)
789{
790 xsave->header.xfeatures |= xstate_feature_mask;
791}
792
793/*
794 * This function is safe to call whether the FPU is in use or not.
795 *
796 * Note that this only works on the current task.
797 *
798 * Inputs:
 799 * @xstate_feature_mask: state mask which is defined in xsave.h (e.g.
 800 * XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...)
 801 * @xstate_feature_src: a pointer to a copy of the state that you would
 802 * like written in to the current task's FPU xsave state. This pointer
 803 * must not be located in the current task's xsave area.
 804 * @len: size in bytes of the state pointed to by @xstate_feature_src
 805 * Output:
 806 * none; the state is written directly into the xsave buffer.
807 */
808static void fpu__xfeature_set_state(int xstate_feature_mask,
809 void *xstate_feature_src, size_t len)
810{
811 struct xregs_state *xsave = &current->thread.fpu.state.xsave;
812 struct fpu *fpu = &current->thread.fpu;
813 void *dst;
814
815 if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
816 WARN_ONCE(1, "%s() attempted with no xsave support", __func__);
817 return;
818 }
819
820 /*
821 * Tell the FPU code that we need the FPU state to be in
822 * 'fpu' (not in the registers), and that we need it to
823 * be stable while we write to it.
824 */
825 fpu__current_fpstate_write_begin();
826
827 /*
828 * This method *WILL* *NOT* work for compact-format
829 * buffers. If the 'xstate_feature_mask' is unset in
830 * xcomp_bv then we may need to move other feature state
831 * "up" in the buffer.
832 */
833 if (xsave->header.xcomp_bv & xstate_feature_mask) {
834 WARN_ON_ONCE(1);
835 goto out;
836 }
837
838 /* find the location in the xsave buffer of the desired state */
839 dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask);
840
841 /*
842 * Make sure that the pointer being passed in did not
843 * come from the xsave buffer itself.
844 */
845 WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself");
846
847 /* put the caller-provided data in the location */
848 memcpy(dst, xstate_feature_src, len);
849
850 /*
851 * Mark the xfeature so that the CPU knows there is state
852 * in the buffer now.
853 */
854 fpu__xfeature_set_non_init(xsave, xstate_feature_mask);
855out:
856 /*
 857 * We are done writing to the 'fpu'. Re-enable preemption
 858 * and (possibly) move the fpstate back into the fpregs.
859 */
860 fpu__current_fpstate_write_end();
861}
862
863#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
864#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
865
866/*
867 * This will go out and modify the XSAVE buffer so that PKRU is
868 * set to a particular state for access to 'pkey'.
869 *
870 * PKRU state does affect kernel access to user memory. We do
 871 * not modify PKRU *itself* here, only the XSAVE state that will
 872 * be restored into PKRU when we return to userspace.
873 */
874int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
875 unsigned long init_val)
876{
877 struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
878 struct pkru_state *old_pkru_state;
879 struct pkru_state new_pkru_state;
880 int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
881 u32 new_pkru_bits = 0;
882
883 /*
884 * This check implies XSAVE support. OSPKE only gets
885 * set if we enable XSAVE and we enable PKU in XCR0.
886 */
887 if (!boot_cpu_has(X86_FEATURE_OSPKE))
888 return -EINVAL;
889
890 /* Set the bits we need in PKRU */
891 if (init_val & PKEY_DISABLE_ACCESS)
892 new_pkru_bits |= PKRU_AD_BIT;
893 if (init_val & PKEY_DISABLE_WRITE)
894 new_pkru_bits |= PKRU_WD_BIT;
895
896 /* Shift the bits in to the correct place in PKRU for pkey. */
897 new_pkru_bits <<= pkey_shift;
898
899 /* Locate old copy of the state in the xsave buffer */
900 old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU);
901
902 /*
903 * When state is not in the buffer, it is in the init
904 * state, set it manually. Otherwise, copy out the old
905 * state.
906 */
907 if (!old_pkru_state)
908 new_pkru_state.pkru = 0;
909 else
910 new_pkru_state.pkru = old_pkru_state->pkru;
911
912 /* mask off any old bits in place */
913 new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
914 /* Set the newly-requested bits */
915 new_pkru_state.pkru |= new_pkru_bits;
916
917 /*
918 * We could theoretically live without zeroing pkru.pad.
919 * The current XSAVE feature state definition says that
920 * only bytes 0->3 are used. But we do not want to
921 * chance leaking kernel stack out to userspace in case a
922 * memcpy() of the whole xsave buffer was done.
923 *
924 * They're in the same cacheline anyway.
925 */
926 new_pkru_state.pad = 0;
927
928 fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state,
929 sizeof(new_pkru_state));
930
931 return 0;
932}
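To make the bit arithmetic above concrete: PKRU holds two bits per key, access-disable (AD) in the low bit of each pair and write-disable (WD) in the high one. A hedged user-space sketch of the field construction (pkru_bits_for() is illustrative, not a kernel or libc API):

    #include <stdint.h>

    #define PKRU_BITS_PER_PKEY      2
    #define PKRU_AD_BIT             0x1     /* access (read) disable */
    #define PKRU_WD_BIT             0x2     /* write disable */

    /* Build the two-bit PKRU field for 'pkey' from PKEY_DISABLE_* style flags. */
    static inline uint32_t pkru_bits_for(int pkey, int disable_access,
                                         int disable_write)
    {
            uint32_t bits = 0;

            if (disable_access)
                    bits |= PKRU_AD_BIT;
            if (disable_write)
                    bits |= PKRU_WD_BIT;
            return bits << (pkey * PKRU_BITS_PER_PKEY);
    }

arch_set_user_pkey_access() performs exactly this shift, then splices the pair into the copy of PKRU kept in the XSAVE buffer.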
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 6acc9dd91f36..6707039b9032 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -103,7 +103,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
103 * we do not have to muck with descriptors here, that is 103 * we do not have to muck with descriptors here, that is
104 * done in switch_mm() as needed. 104 * done in switch_mm() as needed.
105 */ 105 */
106int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 106int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
107{ 107{
108 struct ldt_struct *new_ldt; 108 struct ldt_struct *new_ldt;
109 struct mm_struct *old_mm; 109 struct mm_struct *old_mm;
@@ -144,7 +144,7 @@ out_unlock:
144 * 144 *
145 * 64bit: Don't touch the LDT register - we're already in the next thread. 145 * 64bit: Don't touch the LDT register - we're already in the next thread.
146 */ 146 */
147void destroy_context(struct mm_struct *mm) 147void destroy_context_ldt(struct mm_struct *mm)
148{ 148{
149 free_ldt_struct(mm->context.ldt); 149 free_ldt_struct(mm->context.ldt);
150 mm->context.ldt = NULL; 150 mm->context.ldt = NULL;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b9d99e0f82c4..776229e98202 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -116,6 +116,8 @@ void __show_regs(struct pt_regs *regs, int all)
116 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 116 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
117 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 117 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
118 118
119 if (boot_cpu_has(X86_FEATURE_OSPKE))
120 printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
119} 121}
120 122
121void release_thread(struct task_struct *dead_task) 123void release_thread(struct task_struct *dead_task)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index aa52c1009475..2367ae07eb76 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,7 @@
112#include <asm/alternative.h> 112#include <asm/alternative.h>
113#include <asm/prom.h> 113#include <asm/prom.h>
114#include <asm/microcode.h> 114#include <asm/microcode.h>
115#include <asm/mmu_context.h>
115 116
116/* 117/*
117 * max_low_pfn_mapped: highest direct mapped pfn under 4GB 118 * max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -1282,3 +1283,11 @@ static int __init register_kernel_offset_dumper(void)
1282 return 0; 1283 return 0;
1283} 1284}
1284__initcall(register_kernel_offset_dumper); 1285__initcall(register_kernel_offset_dumper);
1286
1287void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
1288{
1289 if (!boot_cpu_has(X86_FEATURE_OSPKE))
1290 return;
1291
1292 seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
1293}
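With OSPKE present, each VMA entry in /proc/<pid>/smaps then gains one extra line; the key value below is illustrative:

    ProtectionKey:        4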
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index f9d38a48e3c8..67cf2e1e557b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -34,3 +34,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
34obj-$(CONFIG_NUMA_EMU) += numa_emulation.o 34obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
35 35
36obj-$(CONFIG_X86_INTEL_MPX) += mpx.o 36obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
37obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
38
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 03898aea6e0f..5ce1ed02f7e8 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -15,12 +15,14 @@
15#include <linux/context_tracking.h> /* exception_enter(), ... */ 15#include <linux/context_tracking.h> /* exception_enter(), ... */
16#include <linux/uaccess.h> /* faulthandler_disabled() */ 16#include <linux/uaccess.h> /* faulthandler_disabled() */
17 17
18#include <asm/cpufeature.h> /* boot_cpu_has, ... */
18#include <asm/traps.h> /* dotraplinkage, ... */ 19#include <asm/traps.h> /* dotraplinkage, ... */
19#include <asm/pgalloc.h> /* pgd_*(), ... */ 20#include <asm/pgalloc.h> /* pgd_*(), ... */
20#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ 21#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
21#include <asm/fixmap.h> /* VSYSCALL_ADDR */ 22#include <asm/fixmap.h> /* VSYSCALL_ADDR */
22#include <asm/vsyscall.h> /* emulate_vsyscall */ 23#include <asm/vsyscall.h> /* emulate_vsyscall */
23#include <asm/vm86.h> /* struct vm86 */ 24#include <asm/vm86.h> /* struct vm86 */
25#include <asm/mmu_context.h> /* vma_pkey() */
24 26
25#define CREATE_TRACE_POINTS 27#define CREATE_TRACE_POINTS
26#include <asm/trace/exceptions.h> 28#include <asm/trace/exceptions.h>
@@ -33,6 +35,7 @@
33 * bit 2 == 0: kernel-mode access 1: user-mode access 35 * bit 2 == 0: kernel-mode access 1: user-mode access
34 * bit 3 == 1: use of reserved bit detected 36 * bit 3 == 1: use of reserved bit detected
35 * bit 4 == 1: fault was an instruction fetch 37 * bit 4 == 1: fault was an instruction fetch
38 * bit 5 == 1: protection keys block access
36 */ 39 */
37enum x86_pf_error_code { 40enum x86_pf_error_code {
38 41
@@ -41,6 +44,7 @@ enum x86_pf_error_code {
41 PF_USER = 1 << 2, 44 PF_USER = 1 << 2,
42 PF_RSVD = 1 << 3, 45 PF_RSVD = 1 << 3,
43 PF_INSTR = 1 << 4, 46 PF_INSTR = 1 << 4,
47 PF_PK = 1 << 5,
44}; 48};
45 49
46/* 50/*
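Putting the flag bits together: a hedged sketch, valid only in this file's context, of how a hardware error code decomposes (PF_PROT and PF_WRITE are the two low enum values, outside the hunk above):

    /* Illustrative decode of a page-fault hardware error code. */
    static void decode_pf_error_code(unsigned long error_code)
    {
            pr_info("fault: %s %s in %s mode%s%s%s\n",
                    (error_code & PF_PROT)  ? "protection" : "not-present",
                    (error_code & PF_WRITE) ? "write" : "read",
                    (error_code & PF_USER)  ? "user" : "kernel",
                    (error_code & PF_RSVD)  ? ", reserved bit set" : "",
                    (error_code & PF_INSTR) ? ", instruction fetch" : "",
                    (error_code & PF_PK)    ? ", protection key" : "");
    }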
@@ -167,9 +171,60 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
167 return prefetch; 171 return prefetch;
168} 172}
169 173
174/*
175 * A protection key fault means that the PKRU value did not allow
176 * access to some PTE. Userspace can figure out what PKRU was
177 * from the XSAVE state, and this function fills out a field in
178 * siginfo so userspace can discover which protection key was set
179 * on the PTE.
180 *
181 * If we get here, we know that the hardware signaled a PF_PK
182 * fault and that there was a VMA once we got in the fault
183 * handler. It does *not* guarantee that the VMA we find here
184 * was the one that we faulted on.
185 *
186 * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
187 * 2. T1 : set PKRU to deny access to pkey=4, touches page
188 * 3. T1 : faults...
189 * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
190 * 5. T1 : enters fault handler, takes mmap_sem, etc...
191 * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
192 * faulted on a pte with its pkey=4.
193 */
194static void fill_sig_info_pkey(int si_code, siginfo_t *info,
195 struct vm_area_struct *vma)
196{
197 /* This is effectively an #ifdef */
198 if (!boot_cpu_has(X86_FEATURE_OSPKE))
199 return;
200
201 /* Fault not from Protection Keys: nothing to do */
202 if (si_code != SEGV_PKUERR)
203 return;
204 /*
205 * force_sig_info_fault() is called from a number of
206 * contexts, some of which have a VMA and some of which
 207 * do not. The PF_PK handling happens after we have a
208 * valid VMA, so we should never reach this without a
209 * valid VMA.
210 */
211 if (!vma) {
212 WARN_ONCE(1, "PKU fault with no VMA passed in");
213 info->si_pkey = 0;
214 return;
215 }
216 /*
217 * si_pkey should be thought of as a strong hint, but not
 218 * absolutely guaranteed to be 100% accurate because of
219 * the race explained above.
220 */
221 info->si_pkey = vma_pkey(vma);
222}
223
170static void 224static void
171force_sig_info_fault(int si_signo, int si_code, unsigned long address, 225force_sig_info_fault(int si_signo, int si_code, unsigned long address,
172 struct task_struct *tsk, int fault) 226 struct task_struct *tsk, struct vm_area_struct *vma,
227 int fault)
173{ 228{
174 unsigned lsb = 0; 229 unsigned lsb = 0;
175 siginfo_t info; 230 siginfo_t info;
@@ -184,6 +239,8 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
184 lsb = PAGE_SHIFT; 239 lsb = PAGE_SHIFT;
185 info.si_addr_lsb = lsb; 240 info.si_addr_lsb = lsb;
186 241
242 fill_sig_info_pkey(si_code, &info, vma);
243
187 force_sig_info(si_signo, &info, tsk); 244 force_sig_info(si_signo, &info, tsk);
188} 245}
189 246
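From user space the new field surfaces through the usual SIGSEGV plumbing. A hedged sketch, assuming a libc whose siginfo_t already exposes si_pkey (older libcs require reading it out of the raw _sifields layout):

    #include <signal.h>
    #include <stdio.h>

    static void segv_handler(int sig, siginfo_t *info, void *ucontext)
    {
            if (info->si_code == SEGV_PKUERR)
                    fprintf(stderr, "pkey fault at %p, pkey hint=%d\n",
                            info->si_addr, (int)info->si_pkey);
    }

    /* installed via sigaction() with SA_SIGINFO set in sa_flags */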
@@ -661,6 +718,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
661 struct task_struct *tsk = current; 718 struct task_struct *tsk = current;
662 unsigned long flags; 719 unsigned long flags;
663 int sig; 720 int sig;
721 /* No context means no VMA to pass down */
722 struct vm_area_struct *vma = NULL;
664 723
665 /* Are we prepared to handle this kernel fault? */ 724 /* Are we prepared to handle this kernel fault? */
666 if (fixup_exception(regs, X86_TRAP_PF)) { 725 if (fixup_exception(regs, X86_TRAP_PF)) {
@@ -684,7 +743,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
684 tsk->thread.cr2 = address; 743 tsk->thread.cr2 = address;
685 744
686 /* XXX: hwpoison faults will set the wrong code. */ 745 /* XXX: hwpoison faults will set the wrong code. */
687 force_sig_info_fault(signal, si_code, address, tsk, 0); 746 force_sig_info_fault(signal, si_code, address,
747 tsk, vma, 0);
688 } 748 }
689 749
690 /* 750 /*
@@ -761,7 +821,8 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
761 821
762static void 822static void
763__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, 823__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
764 unsigned long address, int si_code) 824 unsigned long address, struct vm_area_struct *vma,
825 int si_code)
765{ 826{
766 struct task_struct *tsk = current; 827 struct task_struct *tsk = current;
767 828
@@ -804,7 +865,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
804 tsk->thread.error_code = error_code; 865 tsk->thread.error_code = error_code;
805 tsk->thread.trap_nr = X86_TRAP_PF; 866 tsk->thread.trap_nr = X86_TRAP_PF;
806 867
807 force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); 868 force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
808 869
809 return; 870 return;
810 } 871 }
@@ -817,14 +878,14 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
817 878
818static noinline void 879static noinline void
819bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, 880bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
820 unsigned long address) 881 unsigned long address, struct vm_area_struct *vma)
821{ 882{
822 __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); 883 __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
823} 884}
824 885
825static void 886static void
826__bad_area(struct pt_regs *regs, unsigned long error_code, 887__bad_area(struct pt_regs *regs, unsigned long error_code,
827 unsigned long address, int si_code) 888 unsigned long address, struct vm_area_struct *vma, int si_code)
828{ 889{
829 struct mm_struct *mm = current->mm; 890 struct mm_struct *mm = current->mm;
830 891
@@ -834,25 +895,50 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
834 */ 895 */
835 up_read(&mm->mmap_sem); 896 up_read(&mm->mmap_sem);
836 897
837 __bad_area_nosemaphore(regs, error_code, address, si_code); 898 __bad_area_nosemaphore(regs, error_code, address, vma, si_code);
838} 899}
839 900
840static noinline void 901static noinline void
841bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) 902bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
842{ 903{
843 __bad_area(regs, error_code, address, SEGV_MAPERR); 904 __bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
905}
906
907static inline bool bad_area_access_from_pkeys(unsigned long error_code,
908 struct vm_area_struct *vma)
909{
910 /* This code is always called on the current mm */
911 bool foreign = false;
912
913 if (!boot_cpu_has(X86_FEATURE_OSPKE))
914 return false;
915 if (error_code & PF_PK)
916 return true;
917 /* this checks permission keys on the VMA: */
918 if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
919 (error_code & PF_INSTR), foreign))
920 return true;
921 return false;
844} 922}
845 923
846static noinline void 924static noinline void
847bad_area_access_error(struct pt_regs *regs, unsigned long error_code, 925bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
848 unsigned long address) 926 unsigned long address, struct vm_area_struct *vma)
849{ 927{
850 __bad_area(regs, error_code, address, SEGV_ACCERR); 928 /*
929 * This OSPKE check is not strictly necessary at runtime.
930 * But, doing it this way allows compiler optimizations
931 * if pkeys are compiled out.
932 */
933 if (bad_area_access_from_pkeys(error_code, vma))
934 __bad_area(regs, error_code, address, vma, SEGV_PKUERR);
935 else
936 __bad_area(regs, error_code, address, vma, SEGV_ACCERR);
851} 937}
852 938
853static void 939static void
854do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, 940do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
855 unsigned int fault) 941 struct vm_area_struct *vma, unsigned int fault)
856{ 942{
857 struct task_struct *tsk = current; 943 struct task_struct *tsk = current;
858 int code = BUS_ADRERR; 944 int code = BUS_ADRERR;
@@ -879,12 +965,13 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
879 code = BUS_MCEERR_AR; 965 code = BUS_MCEERR_AR;
880 } 966 }
881#endif 967#endif
882 force_sig_info_fault(SIGBUS, code, address, tsk, fault); 968 force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
883} 969}
884 970
885static noinline void 971static noinline void
886mm_fault_error(struct pt_regs *regs, unsigned long error_code, 972mm_fault_error(struct pt_regs *regs, unsigned long error_code,
887 unsigned long address, unsigned int fault) 973 unsigned long address, struct vm_area_struct *vma,
974 unsigned int fault)
888{ 975{
889 if (fatal_signal_pending(current) && !(error_code & PF_USER)) { 976 if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
890 no_context(regs, error_code, address, 0, 0); 977 no_context(regs, error_code, address, 0, 0);
@@ -908,9 +995,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
908 } else { 995 } else {
909 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| 996 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
910 VM_FAULT_HWPOISON_LARGE)) 997 VM_FAULT_HWPOISON_LARGE))
911 do_sigbus(regs, error_code, address, fault); 998 do_sigbus(regs, error_code, address, vma, fault);
912 else if (fault & VM_FAULT_SIGSEGV) 999 else if (fault & VM_FAULT_SIGSEGV)
913 bad_area_nosemaphore(regs, error_code, address); 1000 bad_area_nosemaphore(regs, error_code, address, vma);
914 else 1001 else
915 BUG(); 1002 BUG();
916 } 1003 }
@@ -923,6 +1010,12 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
923 1010
924 if ((error_code & PF_INSTR) && !pte_exec(*pte)) 1011 if ((error_code & PF_INSTR) && !pte_exec(*pte))
925 return 0; 1012 return 0;
1013 /*
1014 * Note: We do not do lazy flushing on protection key
1015 * changes, so no spurious fault will ever set PF_PK.
1016 */
1017 if ((error_code & PF_PK))
1018 return 1;
926 1019
927 return 1; 1020 return 1;
928} 1021}
@@ -1012,6 +1105,17 @@ int show_unhandled_signals = 1;
1012static inline int 1105static inline int
1013access_error(unsigned long error_code, struct vm_area_struct *vma) 1106access_error(unsigned long error_code, struct vm_area_struct *vma)
1014{ 1107{
1108 /* This is only called for the current mm, so: */
1109 bool foreign = false;
1110 /*
1111 * Make sure to check the VMA so that we do not perform
1112 * faults just to hit a PF_PK as soon as we fill in a
1113 * page.
1114 */
1115 if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
1116 (error_code & PF_INSTR), foreign))
1117 return 1;
1118
1015 if (error_code & PF_WRITE) { 1119 if (error_code & PF_WRITE) {
1016 /* write, present and write, not present: */ 1120 /* write, present and write, not present: */
1017 if (unlikely(!(vma->vm_flags & VM_WRITE))) 1121 if (unlikely(!(vma->vm_flags & VM_WRITE)))
@@ -1118,7 +1222,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1118 * Don't take the mm semaphore here. If we fixup a prefetch 1222 * Don't take the mm semaphore here. If we fixup a prefetch
1119 * fault we could otherwise deadlock: 1223 * fault we could otherwise deadlock:
1120 */ 1224 */
1121 bad_area_nosemaphore(regs, error_code, address); 1225 bad_area_nosemaphore(regs, error_code, address, NULL);
1122 1226
1123 return; 1227 return;
1124 } 1228 }
@@ -1131,7 +1235,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1131 pgtable_bad(regs, error_code, address); 1235 pgtable_bad(regs, error_code, address);
1132 1236
1133 if (unlikely(smap_violation(error_code, regs))) { 1237 if (unlikely(smap_violation(error_code, regs))) {
1134 bad_area_nosemaphore(regs, error_code, address); 1238 bad_area_nosemaphore(regs, error_code, address, NULL);
1135 return; 1239 return;
1136 } 1240 }
1137 1241
@@ -1140,7 +1244,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1140 * in a region with pagefaults disabled then we must not take the fault 1244 * in a region with pagefaults disabled then we must not take the fault
1141 */ 1245 */
1142 if (unlikely(faulthandler_disabled() || !mm)) { 1246 if (unlikely(faulthandler_disabled() || !mm)) {
1143 bad_area_nosemaphore(regs, error_code, address); 1247 bad_area_nosemaphore(regs, error_code, address, NULL);
1144 return; 1248 return;
1145 } 1249 }
1146 1250
@@ -1164,6 +1268,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1164 1268
1165 if (error_code & PF_WRITE) 1269 if (error_code & PF_WRITE)
1166 flags |= FAULT_FLAG_WRITE; 1270 flags |= FAULT_FLAG_WRITE;
1271 if (error_code & PF_INSTR)
1272 flags |= FAULT_FLAG_INSTRUCTION;
1167 1273
1168 /* 1274 /*
1169 * When running in the kernel we expect faults to occur only to 1275 * When running in the kernel we expect faults to occur only to
@@ -1184,7 +1290,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1184 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 1290 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
1185 if ((error_code & PF_USER) == 0 && 1291 if ((error_code & PF_USER) == 0 &&
1186 !search_exception_tables(regs->ip)) { 1292 !search_exception_tables(regs->ip)) {
1187 bad_area_nosemaphore(regs, error_code, address); 1293 bad_area_nosemaphore(regs, error_code, address, NULL);
1188 return; 1294 return;
1189 } 1295 }
1190retry: 1296retry:
@@ -1232,7 +1338,7 @@ retry:
1232 */ 1338 */
1233good_area: 1339good_area:
1234 if (unlikely(access_error(error_code, vma))) { 1340 if (unlikely(access_error(error_code, vma))) {
1235 bad_area_access_error(regs, error_code, address); 1341 bad_area_access_error(regs, error_code, address, vma);
1236 return; 1342 return;
1237 } 1343 }
1238 1344
@@ -1270,7 +1376,7 @@ good_area:
1270 1376
1271 up_read(&mm->mmap_sem); 1377 up_read(&mm->mmap_sem);
1272 if (unlikely(fault & VM_FAULT_ERROR)) { 1378 if (unlikely(fault & VM_FAULT_ERROR)) {
1273 mm_fault_error(regs, error_code, address, fault); 1379 mm_fault_error(regs, error_code, address, vma, fault);
1274 return; 1380 return;
1275 } 1381 }
1276 1382
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index f8d0b5e8bdfd..b8b6a60b32cf 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -11,6 +11,7 @@
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/memremap.h> 12#include <linux/memremap.h>
13 13
14#include <asm/mmu_context.h>
14#include <asm/pgtable.h> 15#include <asm/pgtable.h>
15 16
16static inline pte_t gup_get_pte(pte_t *ptep) 17static inline pte_t gup_get_pte(pte_t *ptep)
@@ -75,6 +76,28 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
75} 76}
76 77
77/* 78/*
79 * 'pteval' can come from a pte, pmd or pud. We only check
80 * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
81 * same value on all 3 types.
82 */
83static inline int pte_allows_gup(unsigned long pteval, int write)
84{
85 unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
86
87 if (write)
88 need_pte_bits |= _PAGE_RW;
89
90 if ((pteval & need_pte_bits) != need_pte_bits)
91 return 0;
92
93 /* Check memory protection keys permissions. */
94 if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
95 return 0;
96
97 return 1;
98}
99
100/*
78 * The performance critical leaf functions are made noinline otherwise gcc 101 * The performance critical leaf functions are made noinline otherwise gcc
79 * inlines everything into a single function which results in too much 102 * inlines everything into a single function which results in too much
80 * register pressure. 103 * register pressure.
@@ -83,14 +106,9 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
83 unsigned long end, int write, struct page **pages, int *nr) 106 unsigned long end, int write, struct page **pages, int *nr)
84{ 107{
85 struct dev_pagemap *pgmap = NULL; 108 struct dev_pagemap *pgmap = NULL;
86 unsigned long mask;
87 int nr_start = *nr; 109 int nr_start = *nr;
88 pte_t *ptep; 110 pte_t *ptep;
89 111
90 mask = _PAGE_PRESENT|_PAGE_USER;
91 if (write)
92 mask |= _PAGE_RW;
93
94 ptep = pte_offset_map(&pmd, addr); 112 ptep = pte_offset_map(&pmd, addr);
95 do { 113 do {
96 pte_t pte = gup_get_pte(ptep); 114 pte_t pte = gup_get_pte(ptep);
@@ -109,7 +127,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
109 pte_unmap(ptep); 127 pte_unmap(ptep);
110 return 0; 128 return 0;
111 } 129 }
112 } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { 130 } else if (!pte_allows_gup(pte_val(pte), write) ||
131 pte_special(pte)) {
113 pte_unmap(ptep); 132 pte_unmap(ptep);
114 return 0; 133 return 0;
115 } 134 }
@@ -164,14 +183,10 @@ static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
164static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 183static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
165 unsigned long end, int write, struct page **pages, int *nr) 184 unsigned long end, int write, struct page **pages, int *nr)
166{ 185{
167 unsigned long mask;
168 struct page *head, *page; 186 struct page *head, *page;
169 int refs; 187 int refs;
170 188
171 mask = _PAGE_PRESENT|_PAGE_USER; 189 if (!pte_allows_gup(pmd_val(pmd), write))
172 if (write)
173 mask |= _PAGE_RW;
174 if ((pmd_flags(pmd) & mask) != mask)
175 return 0; 190 return 0;
176 191
177 VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); 192 VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
@@ -231,14 +246,10 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
231static noinline int gup_huge_pud(pud_t pud, unsigned long addr, 246static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
232 unsigned long end, int write, struct page **pages, int *nr) 247 unsigned long end, int write, struct page **pages, int *nr)
233{ 248{
234 unsigned long mask;
235 struct page *head, *page; 249 struct page *head, *page;
236 int refs; 250 int refs;
237 251
238 mask = _PAGE_PRESENT|_PAGE_USER; 252 if (!pte_allows_gup(pud_val(pud), write))
239 if (write)
240 mask |= _PAGE_RW;
241 if ((pud_flags(pud) & mask) != mask)
242 return 0; 253 return 0;
243 /* hugepages are never "special" */ 254 /* hugepages are never "special" */
244 VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); 255 VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
@@ -422,7 +433,7 @@ slow_irqon:
422 start += nr << PAGE_SHIFT; 433 start += nr << PAGE_SHIFT;
423 pages += nr; 434 pages += nr;
424 435
425 ret = get_user_pages_unlocked(current, mm, start, 436 ret = get_user_pages_unlocked(start,
426 (end - start) >> PAGE_SHIFT, 437 (end - start) >> PAGE_SHIFT,
427 write, 0, pages); 438 write, 0, pages);
428 439
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index ef05755a1900..a0a0b9861902 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -546,8 +546,8 @@ static int mpx_resolve_fault(long __user *addr, int write)
546 int nr_pages = 1; 546 int nr_pages = 1;
547 int force = 0; 547 int force = 0;
548 548
549 gup_ret = get_user_pages(current, current->mm, (unsigned long)addr, 549 gup_ret = get_user_pages((unsigned long)addr, nr_pages, write,
550 nr_pages, write, force, NULL, NULL); 550 force, NULL, NULL);
551 /* 551 /*
552 * get_user_pages() returns number of pages gotten. 552 * get_user_pages() returns number of pages gotten.
553 * 0 means we failed to fault in and get anything, 553 * 0 means we failed to fault in and get anything,
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
new file mode 100644
index 000000000000..e8c474451928
--- /dev/null
+++ b/arch/x86/mm/pkeys.c
@@ -0,0 +1,101 @@
1/*
2 * Intel Memory Protection Keys management
3 * Copyright (c) 2015, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#include <linux/mm_types.h> /* mm_struct, vma, etc... */
15#include <linux/pkeys.h> /* PKEY_* */
16#include <uapi/asm-generic/mman-common.h>
17
18#include <asm/cpufeature.h> /* boot_cpu_has, ... */
19#include <asm/mmu_context.h> /* vma_pkey() */
20#include <asm/fpu/internal.h> /* fpregs_active() */
21
22int __execute_only_pkey(struct mm_struct *mm)
23{
24 int ret;
25
26 /*
27 * We do not want to go through the relatively costly
28 * dance to set PKRU if we do not need to. Check it
29 * first and assume that if the execute-only pkey is
 30 * already access-disabled, we do not have to set it
31 * ourselves. We need preempt off so that nobody
32 * can make fpregs inactive.
33 */
34 preempt_disable();
35 if (fpregs_active() &&
36 !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) {
37 preempt_enable();
38 return PKEY_DEDICATED_EXECUTE_ONLY;
39 }
40 preempt_enable();
41 ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY,
42 PKEY_DISABLE_ACCESS);
43 /*
44 * If the PKRU-set operation failed somehow, just return
45 * 0 and effectively disable execute-only support.
46 */
47 if (ret)
48 return 0;
49
50 return PKEY_DEDICATED_EXECUTE_ONLY;
51}
52
53static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
54{
55 /* Do this check first since the vm_flags should be hot */
56 if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
57 return false;
58 if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY)
59 return false;
60
61 return true;
62}
63
64/*
65 * This is only called for *plain* mprotect calls.
66 */
67int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey)
68{
69 /*
70 * Is this an mprotect_pkey() call? If so, never
71 * override the value that came from the user.
72 */
73 if (pkey != -1)
74 return pkey;
75 /*
 76 * Look for a protection-key-driven execute-only mapping
77 * which is now being given permissions that are not
78 * execute-only. Move it back to the default pkey.
79 */
80 if (vma_is_pkey_exec_only(vma) &&
81 (prot & (PROT_READ|PROT_WRITE))) {
82 return 0;
83 }
84 /*
85 * The mapping is execute-only. Go try to get the
86 * execute-only protection key. If we fail to do that,
87 * fall through as if we do not have execute-only
88 * support.
89 */
90 if (prot == PROT_EXEC) {
91 pkey = execute_only_pkey(vma->vm_mm);
92 if (pkey > 0)
93 return pkey;
94 }
95 /*
96 * This is a vanilla, non-pkey mprotect (or we failed to
 97 * set up execute-only); inherit the pkey from the VMA we
98 * are working on.
99 */
100 return vma_pkey(vma);
101}
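Taken together, a hedged user-space sketch of the execute-only behavior this file enables (error handling omitted; on hardware without pkeys the final read simply succeeds, since PROT_EXEC then implies read):

    #include <sys/mman.h>
    #include <string.h>

    int main(void)
    {
            char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            memset(p, 0xc3, 4096);          /* fill with x86 'ret' opcodes */
            mprotect(p, 4096, PROT_EXEC);   /* kernel assigns the exec-only pkey */
            ((void (*)(void))p)();          /* executing still works... */
            return p[0];                    /* ...but this read faults, SEGV_PKUERR */
    }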
diff --git a/drivers/char/agp/frontend.c b/drivers/char/agp/frontend.c
index 09f17eb73486..0f64d149c98d 100644
--- a/drivers/char/agp/frontend.c
+++ b/drivers/char/agp/frontend.c
@@ -156,7 +156,7 @@ static pgprot_t agp_convert_mmap_flags(int prot)
156{ 156{
157 unsigned long prot_bits; 157 unsigned long prot_bits;
158 158
159 prot_bits = calc_vm_prot_bits(prot) | VM_SHARED; 159 prot_bits = calc_vm_prot_bits(prot, 0) | VM_SHARED;
160 return vm_get_page_prot(prot_bits); 160 return vm_get_page_prot(prot_bits);
161} 161}
162 162
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 1cbb16e15307..7b82e57aa09c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -518,8 +518,7 @@ static int amdgpu_ttm_tt_pin_userptr(struct ttm_tt *ttm)
518 uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE; 518 uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE;
519 struct page **pages = ttm->pages + pinned; 519 struct page **pages = ttm->pages + pinned;
520 520
521 r = get_user_pages(current, current->mm, userptr, num_pages, 521 r = get_user_pages(userptr, num_pages, write, 0, pages, NULL);
522 write, 0, pages, NULL);
523 if (r < 0) 522 if (r < 0)
524 goto release_pages; 523 goto release_pages;
525 524
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index 4b519e4309b2..97d4457be8d2 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -753,9 +753,9 @@ static struct page **etnaviv_gem_userptr_do_get_pages(
753 753
754 down_read(&mm->mmap_sem); 754 down_read(&mm->mmap_sem);
755 while (pinned < npages) { 755 while (pinned < npages) {
756 ret = get_user_pages(task, mm, ptr, npages - pinned, 756 ret = get_user_pages_remote(task, mm, ptr, npages - pinned,
757 !etnaviv_obj->userptr.ro, 0, 757 !etnaviv_obj->userptr.ro, 0,
758 pvec + pinned, NULL); 758 pvec + pinned, NULL);
759 if (ret < 0) 759 if (ret < 0)
760 break; 760 break;
761 761
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 59e45b3a6937..90dbf8121210 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -584,11 +584,11 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
584 584
585 down_read(&mm->mmap_sem); 585 down_read(&mm->mmap_sem);
586 while (pinned < npages) { 586 while (pinned < npages) {
587 ret = get_user_pages(work->task, mm, 587 ret = get_user_pages_remote(work->task, mm,
588 obj->userptr.ptr + pinned * PAGE_SIZE, 588 obj->userptr.ptr + pinned * PAGE_SIZE,
589 npages - pinned, 589 npages - pinned,
590 !obj->userptr.read_only, 0, 590 !obj->userptr.read_only, 0,
591 pvec + pinned, NULL); 591 pvec + pinned, NULL);
592 if (ret < 0) 592 if (ret < 0)
593 break; 593 break;
594 594
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index e06ac546a90f..6d8c32377c6f 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -554,8 +554,7 @@ static int radeon_ttm_tt_pin_userptr(struct ttm_tt *ttm)
554 uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE; 554 uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE;
555 struct page **pages = ttm->pages + pinned; 555 struct page **pages = ttm->pages + pinned;
556 556
557 r = get_user_pages(current, current->mm, userptr, num_pages, 557 r = get_user_pages(userptr, num_pages, write, 0, pages, NULL);
558 write, 0, pages, NULL);
559 if (r < 0) 558 if (r < 0)
560 goto release_pages; 559 goto release_pages;
561 560
diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c
index d0cbd5ecd7f0..e797dfc07ae3 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -239,8 +239,7 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer)
239 if (NULL == vsg->pages) 239 if (NULL == vsg->pages)
240 return -ENOMEM; 240 return -ENOMEM;
241 down_read(&current->mm->mmap_sem); 241 down_read(&current->mm->mmap_sem);
242 ret = get_user_pages(current, current->mm, 242 ret = get_user_pages((unsigned long)xfer->mem_addr,
243 (unsigned long)xfer->mem_addr,
244 vsg->num_pages, 243 vsg->num_pages,
245 (vsg->direction == DMA_FROM_DEVICE), 244 (vsg->direction == DMA_FROM_DEVICE),
246 0, vsg->pages, NULL); 245 0, vsg->pages, NULL);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 38acb3cfc545..fe4d2e1a8b58 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -188,7 +188,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
188 sg_list_start = umem->sg_head.sgl; 188 sg_list_start = umem->sg_head.sgl;
189 189
190 while (npages) { 190 while (npages) {
191 ret = get_user_pages(current, current->mm, cur_base, 191 ret = get_user_pages(cur_base,
192 min_t(unsigned long, npages, 192 min_t(unsigned long, npages,
193 PAGE_SIZE / sizeof (struct page *)), 193 PAGE_SIZE / sizeof (struct page *)),
194 1, !umem->writable, page_list, vma_list); 194 1, !umem->writable, page_list, vma_list);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index e69bf266049d..75077a018675 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -572,10 +572,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
572 * complex (and doesn't gain us much performance in most use 572 * complex (and doesn't gain us much performance in most use
573 * cases). 573 * cases).
574 */ 574 */
575 npages = get_user_pages(owning_process, owning_mm, user_virt, 575 npages = get_user_pages_remote(owning_process, owning_mm,
576 gup_num_pages, 576 user_virt, gup_num_pages,
577 access_mask & ODP_WRITE_ALLOWED_BIT, 0, 577 access_mask & ODP_WRITE_ALLOWED_BIT,
578 local_page_list, NULL); 578 0, local_page_list, NULL);
579 up_read(&owning_mm->mmap_sem); 579 up_read(&owning_mm->mmap_sem);
580 580
581 if (npages < 0) 581 if (npages < 0)
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index 7d2e42dd6926..6c00d04b8b28 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -472,8 +472,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
472 goto out; 472 goto out;
473 } 473 }
474 474
475 ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0, 475 ret = get_user_pages(uaddr & PAGE_MASK, 1, 1, 0, pages, NULL);
476 pages, NULL);
477 if (ret < 0) 476 if (ret < 0)
478 goto out; 477 goto out;
479 478
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index 74f90b2619f6..2d2b94fd3633 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -66,8 +66,7 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages,
66 } 66 }
67 67
68 for (got = 0; got < num_pages; got += ret) { 68 for (got = 0; got < num_pages; got += ret) {
69 ret = get_user_pages(current, current->mm, 69 ret = get_user_pages(start_page + got * PAGE_SIZE,
70 start_page + got * PAGE_SIZE,
71 num_pages - got, 1, 1, 70 num_pages - got, 1, 1,
72 p + got, NULL); 71 p + got, NULL);
73 if (ret < 0) 72 if (ret < 0)
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 645a5f6e6c88..7209fbc03ccb 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -144,7 +144,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
144 ret = 0; 144 ret = 0;
145 145
146 while (npages) { 146 while (npages) {
147 ret = get_user_pages(current, current->mm, cur_base, 147 ret = get_user_pages(cur_base,
148 min_t(unsigned long, npages, 148 min_t(unsigned long, npages,
149 PAGE_SIZE / sizeof(struct page *)), 149 PAGE_SIZE / sizeof(struct page *)),
150 1, !writable, page_list, NULL); 150 1, !writable, page_list, NULL);
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index c865737326e1..56999d2fac07 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -526,6 +526,7 @@ static void do_fault(struct work_struct *work)
526 flags |= FAULT_FLAG_USER; 526 flags |= FAULT_FLAG_USER;
527 if (fault->flags & PPR_FAULT_WRITE) 527 if (fault->flags & PPR_FAULT_WRITE)
528 flags |= FAULT_FLAG_WRITE; 528 flags |= FAULT_FLAG_WRITE;
529 flags |= FAULT_FLAG_REMOTE;
529 530
530 down_read(&mm->mmap_sem); 531 down_read(&mm->mmap_sem);
531 vma = find_extend_vma(mm, address); 532 vma = find_extend_vma(mm, address);
diff --git a/drivers/media/pci/ivtv/ivtv-udma.c b/drivers/media/pci/ivtv/ivtv-udma.c
index 24152accc66c..4769469fe842 100644
--- a/drivers/media/pci/ivtv/ivtv-udma.c
+++ b/drivers/media/pci/ivtv/ivtv-udma.c
@@ -124,8 +124,8 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr,
124 } 124 }
125 125
126 /* Get user pages for DMA Xfer */ 126 /* Get user pages for DMA Xfer */
127 err = get_user_pages_unlocked(current, current->mm, 127 err = get_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, 0,
128 user_dma.uaddr, user_dma.page_count, 0, 1, dma->map); 128 1, dma->map);
129 129
130 if (user_dma.page_count != err) { 130 if (user_dma.page_count != err) {
131 IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n", 131 IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n",
diff --git a/drivers/media/pci/ivtv/ivtv-yuv.c b/drivers/media/pci/ivtv/ivtv-yuv.c
index 2b8e7b2f2b86..b094054cda6e 100644
--- a/drivers/media/pci/ivtv/ivtv-yuv.c
+++ b/drivers/media/pci/ivtv/ivtv-yuv.c
@@ -75,14 +75,12 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma,
75 ivtv_udma_get_page_info (&uv_dma, (unsigned long)args->uv_source, 360 * uv_decode_height); 75 ivtv_udma_get_page_info (&uv_dma, (unsigned long)args->uv_source, 360 * uv_decode_height);
76 76
77 /* Get user pages for DMA Xfer */ 77 /* Get user pages for DMA Xfer */
78 y_pages = get_user_pages_unlocked(current, current->mm, 78 y_pages = get_user_pages_unlocked(y_dma.uaddr,
79 y_dma.uaddr, y_dma.page_count, 0, 1, 79 y_dma.page_count, 0, 1, &dma->map[0]);
80 &dma->map[0]);
81 uv_pages = 0; /* silence gcc. value is set and consumed only if: */ 80 uv_pages = 0; /* silence gcc. value is set and consumed only if: */
82 if (y_pages == y_dma.page_count) { 81 if (y_pages == y_dma.page_count) {
83 uv_pages = get_user_pages_unlocked(current, current->mm, 82 uv_pages = get_user_pages_unlocked(uv_dma.uaddr,
84 uv_dma.uaddr, uv_dma.page_count, 0, 1, 83 uv_dma.page_count, 0, 1, &dma->map[y_pages]);
85 &dma->map[y_pages]);
86 } 84 }
87 85
88 if (y_pages != y_dma.page_count || uv_pages != uv_dma.page_count) { 86 if (y_pages != y_dma.page_count || uv_pages != uv_dma.page_count) {
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index f669cedca8bd..df4c052c6bd6 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -181,8 +181,7 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma,
181 dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n", 181 dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",
182 data, size, dma->nr_pages); 182 data, size, dma->nr_pages);
183 183
184 err = get_user_pages(current, current->mm, 184 err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
185 data & PAGE_MASK, dma->nr_pages,
186 rw == READ, 1, /* force */ 185 rw == READ, 1, /* force */
187 dma->pages, NULL); 186 dma->pages, NULL);
188 187
diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c
index 6a451bd65bf3..e0203b1a20fd 100644
--- a/drivers/misc/mic/scif/scif_rma.c
+++ b/drivers/misc/mic/scif/scif_rma.c
@@ -1394,8 +1394,6 @@ retry:
1394 } 1394 }
1395 1395
1396 pinned_pages->nr_pages = get_user_pages( 1396 pinned_pages->nr_pages = get_user_pages(
1397 current,
1398 mm,
1399 (u64)addr, 1397 (u64)addr,
1400 nr_pages, 1398 nr_pages,
1401 !!(prot & SCIF_PROT_WRITE), 1399 !!(prot & SCIF_PROT_WRITE),
diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c
index f74fc0ca2ef9..a2d97b9b17e3 100644
--- a/drivers/misc/sgi-gru/grufault.c
+++ b/drivers/misc/sgi-gru/grufault.c
@@ -198,8 +198,7 @@ static int non_atomic_pte_lookup(struct vm_area_struct *vma,
198#else 198#else
199 *pageshift = PAGE_SHIFT; 199 *pageshift = PAGE_SHIFT;
200#endif 200#endif
201 if (get_user_pages 201 if (get_user_pages(vaddr, 1, write, 0, &page, NULL) <= 0)
202 (current, current->mm, vaddr, 1, write, 0, &page, NULL) <= 0)
203 return -EFAULT; 202 return -EFAULT;
204 *paddr = page_to_phys(page); 203 *paddr = page_to_phys(page);
205 put_page(page); 204 put_page(page);
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 607b0a505844..71c5138ddf94 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -4917,8 +4917,6 @@ static int sgl_map_user_pages(struct st_buffer *STbp,
4917 /* Try to fault in all of the necessary pages */ 4917 /* Try to fault in all of the necessary pages */
4918 /* rw==READ means read from drive, write into memory area */ 4918 /* rw==READ means read from drive, write into memory area */
4919 res = get_user_pages_unlocked( 4919 res = get_user_pages_unlocked(
4920 current,
4921 current->mm,
4922 uaddr, 4920 uaddr,
4923 nr_pages, 4921 nr_pages,
4924 rw == READ, 4922 rw == READ,
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index 8a8078f954d5..ca9a53c03f0f 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -385,8 +385,8 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
385 } 385 }
386 386
387 /* requested protection bits must match our allowed protection mask */ 387 /* requested protection bits must match our allowed protection mask */
388 if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) & 388 if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask, 0)) &
389 calc_vm_prot_bits(PROT_MASK))) { 389 calc_vm_prot_bits(PROT_MASK, 0))) {
390 ret = -EPERM; 390 ret = -EPERM;
391 goto out; 391 goto out;
392 } 392 }
diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c
index 0e24eb9c219c..71a923e53f93 100644
--- a/drivers/video/fbdev/pvr2fb.c
+++ b/drivers/video/fbdev/pvr2fb.c
@@ -686,8 +686,8 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf,
686 if (!pages) 686 if (!pages)
687 return -ENOMEM; 687 return -ENOMEM;
688 688
689 ret = get_user_pages_unlocked(current, current->mm, (unsigned long)buf, 689 ret = get_user_pages_unlocked((unsigned long)buf, nr_pages, WRITE,
690 nr_pages, WRITE, 0, pages); 690 0, pages);
691 691
692 if (ret < nr_pages) { 692 if (ret < nr_pages) {
693 nr_pages = ret; 693 nr_pages = ret;
diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c
index 32c8fc5f7a5c..60bdad3a689b 100644
--- a/drivers/virt/fsl_hypervisor.c
+++ b/drivers/virt/fsl_hypervisor.c
@@ -244,9 +244,8 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p)
244 244
245 /* Get the physical addresses of the source buffer */ 245 /* Get the physical addresses of the source buffer */
246 down_read(&current->mm->mmap_sem); 246 down_read(&current->mm->mmap_sem);
247 num_pinned = get_user_pages(current, current->mm, 247 num_pinned = get_user_pages(param.local_vaddr - lb_offset,
248 param.local_vaddr - lb_offset, num_pages, 248 num_pages, (param.source == -1) ? READ : WRITE,
249 (param.source == -1) ? READ : WRITE,
250 0, pages, NULL); 249 0, pages, NULL);
251 up_read(&current->mm->mmap_sem); 250 up_read(&current->mm->mmap_sem);
252 251
diff --git a/fs/exec.c b/fs/exec.c
index 9bdf0edf570d..c4010b8207a1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -199,8 +199,12 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
199 return NULL; 199 return NULL;
200 } 200 }
201#endif 201#endif
202 ret = get_user_pages(current, bprm->mm, pos, 202 /*
203 1, write, 1, &page, NULL); 203 * We are doing an exec(). 'current' is the process
204 * doing the exec and bprm->mm is the new process's mm.
205 */
206 ret = get_user_pages_remote(current, bprm->mm, pos, 1, write,
207 1, &page, NULL);
204 if (ret <= 0) 208 if (ret <= 0)
205 return NULL; 209 return NULL;
206 210
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fa95ab2d3674..9df431642042 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -660,11 +660,20 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
660 [ilog2(VM_MERGEABLE)] = "mg", 660 [ilog2(VM_MERGEABLE)] = "mg",
661 [ilog2(VM_UFFD_MISSING)]= "um", 661 [ilog2(VM_UFFD_MISSING)]= "um",
662 [ilog2(VM_UFFD_WP)] = "uw", 662 [ilog2(VM_UFFD_WP)] = "uw",
663#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
664 /* These come out via ProtectionKey: */
665 [ilog2(VM_PKEY_BIT0)] = "",
666 [ilog2(VM_PKEY_BIT1)] = "",
667 [ilog2(VM_PKEY_BIT2)] = "",
668 [ilog2(VM_PKEY_BIT3)] = "",
669#endif
663 }; 670 };
664 size_t i; 671 size_t i;
665 672
666 seq_puts(m, "VmFlags: "); 673 seq_puts(m, "VmFlags: ");
667 for (i = 0; i < BITS_PER_LONG; i++) { 674 for (i = 0; i < BITS_PER_LONG; i++) {
675 if (!mnemonics[i][0])
676 continue;
668 if (vma->vm_flags & (1UL << i)) { 677 if (vma->vm_flags & (1UL << i)) {
669 seq_printf(m, "%c%c ", 678 seq_printf(m, "%c%c ",
670 mnemonics[i][0], mnemonics[i][1]); 679 mnemonics[i][0], mnemonics[i][1]);
@@ -702,6 +711,10 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
702} 711}
703#endif /* HUGETLB_PAGE */ 712#endif /* HUGETLB_PAGE */
704 713
714void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
715{
716}
717
705static int show_smap(struct seq_file *m, void *v, int is_pid) 718static int show_smap(struct seq_file *m, void *v, int is_pid)
706{ 719{
707 struct vm_area_struct *vma = v; 720 struct vm_area_struct *vma = v;
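The __weak annotation is what lets x86 override the empty default above simply by defining the symbol. A minimal stand-alone illustration of the pattern (GCC attribute syntax):

    #include <stdio.h>

    /* Fallback definition, used only when no strong definition is linked in. */
    __attribute__((weak)) void arch_hook(void)
    {
            puts("generic no-op");
    }

    int main(void)
    {
            arch_hook();    /* resolves to the arch override when one exists */
            return 0;
    }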
@@ -783,6 +796,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
783 (vma->vm_flags & VM_LOCKED) ? 796 (vma->vm_flags & VM_LOCKED) ?
784 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 797 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
785 798
799 arch_show_smap(m, vma);
786 show_smap_vma_flags(m, vma); 800 show_smap_vma_flags(m, vma);
787 m_cache_vma(m, vma); 801 m_cache_vma(m, vma);
788 return 0; 802 return 0;
diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h
index 866aa461efa5..cc5d9a1405df 100644
--- a/include/asm-generic/mm_hooks.h
+++ b/include/asm-generic/mm_hooks.h
@@ -26,4 +26,16 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
26{ 26{
27} 27}
28 28
29static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
30 bool write, bool execute, bool foreign)
31{
32 /* by default, allow everything */
33 return true;
34}
35
36static inline bool arch_pte_access_permitted(pte_t pte, bool write)
37{
38 /* by default, allow everything */
39 return true;
40}
29#endif /* _ASM_GENERIC_MM_HOOKS_H */ 41#endif /* _ASM_GENERIC_MM_HOOKS_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7d42501c8bb4..450fc977ed02 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -193,8 +193,26 @@ extern unsigned int kobjsize(const void *objp);
193#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ 193#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */
194#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ 194#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
195 195
196#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
197#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */
198#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */
199#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
200#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
201#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
202#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
203#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
204#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
205#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
206
196#if defined(CONFIG_X86) 207#if defined(CONFIG_X86)
197# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ 208# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
209#if defined (CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)
210# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
211# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */
212# define VM_PKEY_BIT1 VM_HIGH_ARCH_1
213# define VM_PKEY_BIT2 VM_HIGH_ARCH_2
214# define VM_PKEY_BIT3 VM_HIGH_ARCH_3
215#endif
198#elif defined(CONFIG_PPC) 216#elif defined(CONFIG_PPC)
199# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ 217# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
200#elif defined(CONFIG_PARISC) 218#elif defined(CONFIG_PARISC)
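These four bits carry the pkey value itself in vm_flags; a hedged sketch of the extraction (the real vma_pkey() helper lives in arch/x86/include/asm/mmu_context.h):

    #define VM_PKEY_MASK (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)

    /* Recover the 4-bit protection key packed into vma->vm_flags. */
    static inline int vma_pkey_sketch(struct vm_area_struct *vma)
    {
            return (vma->vm_flags & VM_PKEY_MASK) >> VM_PKEY_SHIFT;
    }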
@@ -256,6 +274,8 @@ extern pgprot_t protection_map[16];
256#define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */ 274#define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */
257#define FAULT_FLAG_TRIED 0x20 /* Second try */ 275#define FAULT_FLAG_TRIED 0x20 /* Second try */
258#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */ 276#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */
277#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */
278#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */
259 279
260/* 280/*
 261 * vm_fault is filled by the pagefault handler and passed to the vma's 281
@@ -1224,24 +1244,82 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1224 unsigned long start, unsigned long nr_pages, 1244 unsigned long start, unsigned long nr_pages,
1225 unsigned int foll_flags, struct page **pages, 1245 unsigned int foll_flags, struct page **pages,
1226 struct vm_area_struct **vmas, int *nonblocking); 1246 struct vm_area_struct **vmas, int *nonblocking);
1227long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1247long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
1228 unsigned long start, unsigned long nr_pages, 1248 unsigned long start, unsigned long nr_pages,
1229 int write, int force, struct page **pages, 1249 int write, int force, struct page **pages,
1230 struct vm_area_struct **vmas); 1250 struct vm_area_struct **vmas);
1231long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, 1251long get_user_pages6(unsigned long start, unsigned long nr_pages,
1232 unsigned long start, unsigned long nr_pages, 1252 int write, int force, struct page **pages,
1233 int write, int force, struct page **pages, 1253 struct vm_area_struct **vmas);
1234 int *locked); 1254long get_user_pages_locked6(unsigned long start, unsigned long nr_pages,
1255 int write, int force, struct page **pages, int *locked);
1235long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, 1256long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
1236 unsigned long start, unsigned long nr_pages, 1257 unsigned long start, unsigned long nr_pages,
1237 int write, int force, struct page **pages, 1258 int write, int force, struct page **pages,
1238 unsigned int gup_flags); 1259 unsigned int gup_flags);
1239long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, 1260long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages,
1240 unsigned long start, unsigned long nr_pages,
1241 int write, int force, struct page **pages); 1261 int write, int force, struct page **pages);
1242int get_user_pages_fast(unsigned long start, int nr_pages, int write, 1262int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1243 struct page **pages); 1263 struct page **pages);
1244 1264
1265/* suppress warnings from use in EXPORT_SYMBOL() */
1266#ifndef __DISABLE_GUP_DEPRECATED
1267#define __gup_deprecated __deprecated
1268#else
1269#define __gup_deprecated
1270#endif
1271/*
1272 * These macros provide backward-compatibility with the old
1273 * get_user_pages() variants which took tsk/mm. These
 1274 * functions/macros are marked compile-time __deprecated so we
 1275 * can catch old-style use without breaking the build. The actual
1276 * functions also have WARN_ON()s to let us know at runtime if
1277 * the get_user_pages() should have been the "remote" variant.
1278 *
1279 * These are hideous, but temporary.
1280 *
1281 * If you run into one of these __deprecated warnings, look
1282 * at how you are calling get_user_pages(). If you are calling
1283 * it with current/current->mm as the first two arguments,
1284 * simply remove those arguments. The behavior will be the same
1285 * as it is now. If you are calling it on another task, use
1286 * get_user_pages_remote() instead.
1287 *
1288 * Any questions? Ask Dave Hansen <dave@sr71.net>
1289 */
1290long
1291__gup_deprecated
1292get_user_pages8(struct task_struct *tsk, struct mm_struct *mm,
1293 unsigned long start, unsigned long nr_pages,
1294 int write, int force, struct page **pages,
1295 struct vm_area_struct **vmas);
1296#define GUP_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, get_user_pages, ...) \
1297 get_user_pages
1298#define get_user_pages(...) GUP_MACRO(__VA_ARGS__, \
1299 get_user_pages8, x, \
1300 get_user_pages6, x, x, x, x, x)(__VA_ARGS__)
1301
1302__gup_deprecated
1303long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm,
1304 unsigned long start, unsigned long nr_pages,
1305 int write, int force, struct page **pages,
1306 int *locked);
1307#define GUPL_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, get_user_pages_locked, ...) \
1308 get_user_pages_locked
1309#define get_user_pages_locked(...) GUPL_MACRO(__VA_ARGS__, \
1310 get_user_pages_locked8, x, \
1311 get_user_pages_locked6, x, x, x, x)(__VA_ARGS__)
1312
1313__gup_deprecated
1314long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm,
1315 unsigned long start, unsigned long nr_pages,
1316 int write, int force, struct page **pages);
1317#define GUPU_MACRO(_1, _2, _3, _4, _5, _6, _7, get_user_pages_unlocked, ...) \
1318 get_user_pages_unlocked
1319#define get_user_pages_unlocked(...) GUPU_MACRO(__VA_ARGS__, \
1320 get_user_pages_unlocked7, x, \
1321 get_user_pages_unlocked5, x, x, x, x)(__VA_ARGS__)
1322
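
The macros above dispatch on argument count: the trailing names are positioned so that the caller's arity decides which function name lands in the selector slot, letting old 8-argument callers and new 6-argument callers share one spelling. A minimal stand-alone sketch of the same idiom (the names here are illustrative, not kernel code):

	#include <stdio.h>

	static long add2(int a, int b)        { return a + b; }
	static long add3(int a, int b, int c) { return a + b + c; }

	/* The caller's argument count pushes the right name into NAME;
	 * 'filler' plays the role of the 'x' padding in the kernel macro. */
	#define ADD_MACRO(_1, _2, _3, NAME, ...) NAME
	#define add(...) ADD_MACRO(__VA_ARGS__, add3, add2, filler)(__VA_ARGS__)

	int main(void)
	{
		printf("%ld\n", add(1, 2));    /* expands to add2(1, 2)    -> 3 */
		printf("%ld\n", add(1, 2, 3)); /* expands to add3(1, 2, 3) -> 6 */
		return 0;
	}
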
1245/* Container for pinned pfns / pages */ 1323/* Container for pinned pfns / pages */
1246struct frame_vector { 1324struct frame_vector {
1247 unsigned int nr_allocated; /* Number of frames we have space for */ 1325 unsigned int nr_allocated; /* Number of frames we have space for */
@@ -2169,6 +2247,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
2169#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ 2247#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
2170#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ 2248#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
2171#define FOLL_MLOCK 0x1000 /* lock present pages */ 2249#define FOLL_MLOCK 0x1000 /* lock present pages */
2250#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
2172 2251
2173typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, 2252typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
2174 void *data); 2253 void *data);
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 16373c8f5f57..33e17f6a327a 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -35,7 +35,7 @@ static inline void vm_unacct_memory(long pages)
35 */ 35 */
36 36
37#ifndef arch_calc_vm_prot_bits 37#ifndef arch_calc_vm_prot_bits
38#define arch_calc_vm_prot_bits(prot) 0 38#define arch_calc_vm_prot_bits(prot, pkey) 0
39#endif 39#endif
40 40
41#ifndef arch_vm_get_page_prot 41#ifndef arch_vm_get_page_prot
@@ -70,12 +70,12 @@ static inline int arch_validate_prot(unsigned long prot)
70 * Combine the mmap "prot" argument into "vm_flags" used internally. 70 * Combine the mmap "prot" argument into "vm_flags" used internally.
71 */ 71 */
72static inline unsigned long 72static inline unsigned long
73calc_vm_prot_bits(unsigned long prot) 73calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
74{ 74{
75 return _calc_vm_trans(prot, PROT_READ, VM_READ ) | 75 return _calc_vm_trans(prot, PROT_READ, VM_READ ) |
76 _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) | 76 _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) |
77 _calc_vm_trans(prot, PROT_EXEC, VM_EXEC) | 77 _calc_vm_trans(prot, PROT_EXEC, VM_EXEC) |
78 arch_calc_vm_prot_bits(prot); 78 arch_calc_vm_prot_bits(prot, pkey);
79} 79}
80 80
81/* 81/*
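
calc_vm_prot_bits() leans on _calc_vm_trans(), which relocates a single flag bit by multiplying or dividing by the (power-of-two) ratio of the two masks. A user-space restatement of that idiom, with made-up bit values (SRC_FLAG/DST_FLAG are demo names, not kernel constants) so the relocation is visible:

	#include <stdio.h>

	/* Restatement of the _calc_vm_trans() idiom: move the bit 'from'
	 * in x to the position of 'to'. Both masks must be single bits. */
	#define calc_trans(x, from, to) \
		((from) <= (to) ? ((x) & (from)) * ((to) / (from)) \
				: ((x) & (from)) / ((from) / (to)))

	#define SRC_FLAG 0x1  /* stands in for a PROT_* bit */
	#define DST_FLAG 0x10 /* stands in for the matching VM_* bit */

	int main(void)
	{
		unsigned long in = SRC_FLAG | 0x4; /* 0x4 is masked away */

		printf("0x%lx\n", (unsigned long)calc_trans(in,  SRC_FLAG, DST_FLAG)); /* 0x10 */
		printf("0x%lx\n", (unsigned long)calc_trans(0x0, SRC_FLAG, DST_FLAG)); /* 0x0  */
		return 0;
	}
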
diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
new file mode 100644
index 000000000000..1d405a2b7272
--- /dev/null
+++ b/include/linux/pkeys.h
@@ -0,0 +1,33 @@
1#ifndef _LINUX_PKEYS_H
2#define _LINUX_PKEYS_H
3
4#include <linux/mm_types.h>
5#include <asm/mmu_context.h>
6
7#define PKEY_DISABLE_ACCESS 0x1
8#define PKEY_DISABLE_WRITE 0x2
9#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
10 PKEY_DISABLE_WRITE)
11
12#ifdef CONFIG_ARCH_HAS_PKEYS
13#include <asm/pkeys.h>
14#else /* ! CONFIG_ARCH_HAS_PKEYS */
15#define arch_max_pkey() (1)
16#define execute_only_pkey(mm) (0)
17#define arch_override_mprotect_pkey(vma, prot, pkey) (0)
18#define PKEY_DEDICATED_EXECUTE_ONLY 0
19#endif /* ! CONFIG_ARCH_HAS_PKEYS */
20
21/*
22 * This is called from mprotect_pkey().
23 *
24 * Returns true if the protection key is valid.
25 */
26static inline bool validate_pkey(int pkey)
27{
28 if (pkey < 0)
29 return false;
30 return (pkey < arch_max_pkey());
31}
32
33#endif /* _LINUX_PKEYS_H */
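
The check is a plain range test: negative keys are rejected and the upper bound comes from the architecture. A user-space sketch of the same logic, with arch_max_pkey() stubbed to the 16 keys that x86's PKRU register can express (an assumption made for the demo):

	#include <stdbool.h>
	#include <stdio.h>

	#define arch_max_pkey() 16 /* stub: x86 pkeys hardware exposes 16 keys */

	static bool validate_pkey(int pkey)
	{
		if (pkey < 0)
			return false;
		return pkey < arch_max_pkey();
	}

	int main(void)
	{
		printf("%d %d %d\n",
		       validate_pkey(-1),  /* 0: negative keys are invalid   */
		       validate_pkey(0),   /* 1: key 0 is the default key    */
		       validate_pkey(16)); /* 0: one past the last valid key */
		return 0;
	}
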
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index 1e3552037a5a..1abaf62c86fc 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -91,10 +91,15 @@ typedef struct siginfo {
91 int _trapno; /* TRAP # which caused the signal */ 91 int _trapno; /* TRAP # which caused the signal */
92#endif 92#endif
93 short _addr_lsb; /* LSB of the reported address */ 93 short _addr_lsb; /* LSB of the reported address */
94 struct { 94 union {
95 void __user *_lower; 95 /* used when si_code=SEGV_BNDERR */
96 void __user *_upper; 96 struct {
97 } _addr_bnd; 97 void __user *_lower;
98 void __user *_upper;
99 } _addr_bnd;
100 /* used when si_code=SEGV_PKUERR */
101 __u32 _pkey;
102 };
98 } _sigfault; 103 } _sigfault;
99 104
100 /* SIGPOLL */ 105 /* SIGPOLL */
@@ -137,6 +142,7 @@ typedef struct siginfo {
137#define si_addr_lsb _sifields._sigfault._addr_lsb 142#define si_addr_lsb _sifields._sigfault._addr_lsb
138#define si_lower _sifields._sigfault._addr_bnd._lower 143#define si_lower _sifields._sigfault._addr_bnd._lower
139#define si_upper _sifields._sigfault._addr_bnd._upper 144#define si_upper _sifields._sigfault._addr_bnd._upper
145#define si_pkey _sifields._sigfault._pkey
140#define si_band _sifields._sigpoll._band 146#define si_band _sifields._sigpoll._band
141#define si_fd _sifields._sigpoll._fd 147#define si_fd _sifields._sigpoll._fd
142#ifdef __ARCH_SIGSYS 148#ifdef __ARCH_SIGSYS
@@ -206,7 +212,8 @@ typedef struct siginfo {
206#define SEGV_MAPERR (__SI_FAULT|1) /* address not mapped to object */ 212#define SEGV_MAPERR (__SI_FAULT|1) /* address not mapped to object */
207#define SEGV_ACCERR (__SI_FAULT|2) /* invalid permissions for mapped object */ 213#define SEGV_ACCERR (__SI_FAULT|2) /* invalid permissions for mapped object */
208#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */ 214#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */
209#define NSIGSEGV 3 215#define SEGV_PKUERR (__SI_FAULT|4) /* failed protection key checks */
216#define NSIGSEGV 4
210 217
211/* 218/*
212 * SIGBUS si_codes 219 * SIGBUS si_codes
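
From user space, this surfaces as a SIGSEGV whose si_code is SEGV_PKUERR, with the faulting key delivered in the new si_pkey field. A hedged sketch of a handler: SEGV_PKUERR is defined locally because libc headers of this era do not carry it yet, and glibc's siginfo_t has no si_pkey accessor, so the sketch only reports the code and address:

	#define _GNU_SOURCE
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	#ifndef SEGV_PKUERR
	#define SEGV_PKUERR 4 /* assumed to match the kernel definition above */
	#endif

	static void segv_handler(int sig, siginfo_t *si, void *ctx)
	{
		if (si->si_code == SEGV_PKUERR)
			fprintf(stderr, "protection-key fault at %p\n", si->si_addr);
		else
			fprintf(stderr, "segv code %d at %p\n", si->si_code, si->si_addr);
		_exit(1);
	}

	int main(void)
	{
		struct sigaction sa = {
			.sa_sigaction = segv_handler,
			.sa_flags = SA_SIGINFO,
		};

		sigaction(SIGSEGV, &sa, NULL);
		/* Touching an execute-only mapping on pkeys hardware would
		 * land in the handler with si_code == SEGV_PKUERR. */
		return 0;
	}
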
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 5f6ce931f1ea..220fc17b9718 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -299,7 +299,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
299 299
300retry: 300retry:
301 /* Read the page with vaddr into memory */ 301 /* Read the page with vaddr into memory */
302 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); 302 ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
303 if (ret <= 0) 303 if (ret <= 0)
304 return ret; 304 return ret;
305 305
@@ -1701,7 +1701,13 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
1701 if (likely(result == 0)) 1701 if (likely(result == 0))
1702 goto out; 1702 goto out;
1703 1703
1704 result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); 1704 /*
1705 * The NULL 'tsk' here ensures that any faults that occur here
1706 * will not be accounted to the task. 'mm' *is* current->mm,
1707 * but we treat this as a 'remote' access since it is
1708 * essentially a kernel access to the memory.
1709 */
1710 result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
1705 if (result < 0) 1711 if (result < 0)
1706 return result; 1712 return result;
1707 1713
diff --git a/kernel/signal.c b/kernel/signal.c
index 0508544c8ced..fe8ed298373c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2709,6 +2709,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2709 err |= __put_user(from->si_upper, &to->si_upper); 2709 err |= __put_user(from->si_upper, &to->si_upper);
2710 } 2710 }
2711#endif 2711#endif
2712#ifdef SEGV_PKUERR
2713 if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR)
2714 err |= __put_user(from->si_pkey, &to->si_pkey);
2715#endif
2712 break; 2716 break;
2713 case __SI_CHLD: 2717 case __SI_CHLD:
2714 err |= __put_user(from->si_pid, &to->si_pid); 2718 err |= __put_user(from->si_pid, &to->si_pid);
diff --git a/mm/Kconfig b/mm/Kconfig
index 05efa6a5199e..989f8f3d77e0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -667,3 +667,8 @@ config ZONE_DEVICE
667 667
668config FRAME_VECTOR 668config FRAME_VECTOR
669 bool 669 bool
670
671config ARCH_USES_HIGH_VMA_FLAGS
672 bool
673config ARCH_HAS_PKEYS
674 bool
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index 7cf2b7163222..381bb07ed14f 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -58,7 +58,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
58 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { 58 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
59 vec->got_ref = true; 59 vec->got_ref = true;
60 vec->is_pfns = false; 60 vec->is_pfns = false;
61 ret = get_user_pages_locked(current, mm, start, nr_frames, 61 ret = get_user_pages_locked(start, nr_frames,
62 write, force, (struct page **)(vec->ptrs), &locked); 62 write, force, (struct page **)(vec->ptrs), &locked);
63 goto out; 63 goto out;
64 } 64 }
diff --git a/mm/gup.c b/mm/gup.c
index 7bf19ffa2199..7f1c4fb77cfa 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1,3 +1,4 @@
1#define __DISABLE_GUP_DEPRECATED 1
1#include <linux/kernel.h> 2#include <linux/kernel.h>
2#include <linux/errno.h> 3#include <linux/errno.h>
3#include <linux/err.h> 4#include <linux/err.h>
@@ -14,6 +15,7 @@
14#include <linux/rwsem.h> 15#include <linux/rwsem.h>
15#include <linux/hugetlb.h> 16#include <linux/hugetlb.h>
16 17
18#include <asm/mmu_context.h>
17#include <asm/pgtable.h> 19#include <asm/pgtable.h>
18#include <asm/tlbflush.h> 20#include <asm/tlbflush.h>
19 21
@@ -363,6 +365,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
363 return -ENOENT; 365 return -ENOENT;
364 if (*flags & FOLL_WRITE) 366 if (*flags & FOLL_WRITE)
365 fault_flags |= FAULT_FLAG_WRITE; 367 fault_flags |= FAULT_FLAG_WRITE;
368 if (*flags & FOLL_REMOTE)
369 fault_flags |= FAULT_FLAG_REMOTE;
366 if (nonblocking) 370 if (nonblocking)
367 fault_flags |= FAULT_FLAG_ALLOW_RETRY; 371 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
368 if (*flags & FOLL_NOWAIT) 372 if (*flags & FOLL_NOWAIT)
@@ -413,11 +417,13 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
413static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) 417static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
414{ 418{
415 vm_flags_t vm_flags = vma->vm_flags; 419 vm_flags_t vm_flags = vma->vm_flags;
420 int write = (gup_flags & FOLL_WRITE);
421 int foreign = (gup_flags & FOLL_REMOTE);
416 422
417 if (vm_flags & (VM_IO | VM_PFNMAP)) 423 if (vm_flags & (VM_IO | VM_PFNMAP))
418 return -EFAULT; 424 return -EFAULT;
419 425
420 if (gup_flags & FOLL_WRITE) { 426 if (write) {
421 if (!(vm_flags & VM_WRITE)) { 427 if (!(vm_flags & VM_WRITE)) {
422 if (!(gup_flags & FOLL_FORCE)) 428 if (!(gup_flags & FOLL_FORCE))
423 return -EFAULT; 429 return -EFAULT;
@@ -443,6 +449,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
443 if (!(vm_flags & VM_MAYREAD)) 449 if (!(vm_flags & VM_MAYREAD))
444 return -EFAULT; 450 return -EFAULT;
445 } 451 }
452 /*
453 * gups are always data accesses, not instruction
454 * fetches, so execute=false here
455 */
456 if (!arch_vma_access_permitted(vma, write, false, foreign))
457 return -EFAULT;
446 return 0; 458 return 0;
447} 459}
448 460
@@ -609,6 +621,28 @@ next_page:
609} 621}
610EXPORT_SYMBOL(__get_user_pages); 622EXPORT_SYMBOL(__get_user_pages);
611 623
624bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags)
625{
626 bool write = !!(fault_flags & FAULT_FLAG_WRITE);
627 bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
628 vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
629
630 if (!(vm_flags & vma->vm_flags))
631 return false;
632
633 /*
634 * The architecture might have a hardware protection
635 * mechanism other than read/write that can deny access.
636 *
637 * gup always represents data access, not instruction
638 * fetches, so execute=false here:
639 */
640 if (!arch_vma_access_permitted(vma, write, false, foreign))
641 return false;
642
643 return true;
644}
645
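
vma_permits_fault() layers the architecture hook on top of the classic VM_READ/VM_WRITE test. Architectures without a pkeys-like mechanism are expected to provide a permissive stub; a kernel-side sketch of such a fallback (this series adds one along these lines to the generic headers):

	/* Sketch of a no-op fallback for architectures that have no hardware
	 * permission mechanism beyond read/write: every access is allowed,
	 * so only the normal vm_flags checks remain in effect. */
	static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
						     bool write, bool execute,
						     bool foreign)
	{
		return true;
	}
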
612/* 646/*
613 * fixup_user_fault() - manually resolve a user page fault 647 * fixup_user_fault() - manually resolve a user page fault
614 * @tsk: the task_struct to use for page fault accounting, or 648 * @tsk: the task_struct to use for page fault accounting, or
@@ -644,7 +678,6 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
644 bool *unlocked) 678 bool *unlocked)
645{ 679{
646 struct vm_area_struct *vma; 680 struct vm_area_struct *vma;
647 vm_flags_t vm_flags;
648 int ret, major = 0; 681 int ret, major = 0;
649 682
650 if (unlocked) 683 if (unlocked)
@@ -655,8 +688,7 @@ retry:
655 if (!vma || address < vma->vm_start) 688 if (!vma || address < vma->vm_start)
656 return -EFAULT; 689 return -EFAULT;
657 690
658 vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; 691 if (!vma_permits_fault(vma, fault_flags))
659 if (!(vm_flags & vma->vm_flags))
660 return -EFAULT; 692 return -EFAULT;
661 693
662 ret = handle_mm_fault(mm, vma, address, fault_flags); 694 ret = handle_mm_fault(mm, vma, address, fault_flags);
@@ -807,15 +839,15 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
807 * if (locked) 839 * if (locked)
808 * up_read(&mm->mmap_sem); 840 * up_read(&mm->mmap_sem);
809 */ 841 */
810long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, 842long get_user_pages_locked6(unsigned long start, unsigned long nr_pages,
811 unsigned long start, unsigned long nr_pages,
812 int write, int force, struct page **pages, 843 int write, int force, struct page **pages,
813 int *locked) 844 int *locked)
814{ 845{
815 return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, 846 return __get_user_pages_locked(current, current->mm, start, nr_pages,
816 pages, NULL, locked, true, FOLL_TOUCH); 847 write, force, pages, NULL, locked, true,
848 FOLL_TOUCH);
817} 849}
818EXPORT_SYMBOL(get_user_pages_locked); 850EXPORT_SYMBOL(get_user_pages_locked6);
819 851
820/* 852/*
821 * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to 853 * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
@@ -860,17 +892,16 @@ EXPORT_SYMBOL(__get_user_pages_unlocked);
860 * or if "force" shall be set to 1 (get_user_pages_fast misses the 892 * or if "force" shall be set to 1 (get_user_pages_fast misses the
861 * "force" parameter). 893 * "force" parameter).
862 */ 894 */
863long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, 895long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages,
864 unsigned long start, unsigned long nr_pages,
865 int write, int force, struct page **pages) 896 int write, int force, struct page **pages)
866{ 897{
867 return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, 898 return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
868 force, pages, FOLL_TOUCH); 899 write, force, pages, FOLL_TOUCH);
869} 900}
870EXPORT_SYMBOL(get_user_pages_unlocked); 901EXPORT_SYMBOL(get_user_pages_unlocked5);
871 902
872/* 903/*
873 * get_user_pages() - pin user pages in memory 904 * get_user_pages_remote() - pin user pages in memory
874 * @tsk: the task_struct to use for page fault accounting, or 905 * @tsk: the task_struct to use for page fault accounting, or
875 * NULL if faults are not to be recorded. 906 * NULL if faults are not to be recorded.
876 * @mm: mm_struct of target mm 907 * @mm: mm_struct of target mm
@@ -924,14 +955,32 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
924 * should use get_user_pages because it cannot pass 955 * should use get_user_pages because it cannot pass
925 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. 956 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
926 */ 957 */
927long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 958long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
928 unsigned long start, unsigned long nr_pages, int write, 959 unsigned long start, unsigned long nr_pages,
929 int force, struct page **pages, struct vm_area_struct **vmas) 960 int write, int force, struct page **pages,
961 struct vm_area_struct **vmas)
930{ 962{
931 return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, 963 return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
932 pages, vmas, NULL, false, FOLL_TOUCH); 964 pages, vmas, NULL, false,
965 FOLL_TOUCH | FOLL_REMOTE);
966}
967EXPORT_SYMBOL(get_user_pages_remote);
968
969/*
970 * This is the same as get_user_pages_remote(), just with a
971 * less-flexible calling convention where we assume that the task
972 * and mm being operated on are the current task's. We also
973 * obviously don't pass FOLL_REMOTE in here.
974 */
975long get_user_pages6(unsigned long start, unsigned long nr_pages,
976 int write, int force, struct page **pages,
977 struct vm_area_struct **vmas)
978{
979 return __get_user_pages_locked(current, current->mm, start, nr_pages,
980 write, force, pages, vmas, NULL, false,
981 FOLL_TOUCH);
933} 982}
934EXPORT_SYMBOL(get_user_pages); 983EXPORT_SYMBOL(get_user_pages6);
935 984
936/** 985/**
937 * populate_vma_page_range() - populate a range of pages in the vma. 986 * populate_vma_page_range() - populate a range of pages in the vma.
@@ -1144,6 +1193,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1144 pte_protnone(pte) || (write && !pte_write(pte))) 1193 pte_protnone(pte) || (write && !pte_write(pte)))
1145 goto pte_unmap; 1194 goto pte_unmap;
1146 1195
1196 if (!arch_pte_access_permitted(pte, write))
1197 goto pte_unmap;
1198
1147 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 1199 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1148 page = pte_page(pte); 1200 page = pte_page(pte);
1149 head = compound_head(page); 1201 head = compound_head(page);
@@ -1467,3 +1519,38 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1467} 1519}
1468 1520
1469#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ 1521#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
1522
1523long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm,
1524 unsigned long start, unsigned long nr_pages,
1525 int write, int force, struct page **pages,
1526 struct vm_area_struct **vmas)
1527{
1528 WARN_ONCE(tsk != current, "get_user_pages() called on remote task");
1529 WARN_ONCE(mm != current->mm, "get_user_pages() called on remote mm");
1530
1531 return get_user_pages6(start, nr_pages, write, force, pages, vmas);
1532}
1533EXPORT_SYMBOL(get_user_pages8);
1534
1535long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm,
1536 unsigned long start, unsigned long nr_pages,
1537 int write, int force, struct page **pages, int *locked)
1538{
1539 WARN_ONCE(tsk != current, "get_user_pages_locked() called on remote task");
1540 WARN_ONCE(mm != current->mm, "get_user_pages_locked() called on remote mm");
1541
1542 return get_user_pages_locked6(start, nr_pages, write, force, pages, locked);
1543}
1544EXPORT_SYMBOL(get_user_pages_locked8);
1545
1546long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm,
1547 unsigned long start, unsigned long nr_pages,
1548 int write, int force, struct page **pages)
1549{
1550 WARN_ONCE(tsk != current, "get_user_pages_unlocked() called on remote task");
1551 WARN_ONCE(mm != current->mm, "get_user_pages_unlocked() called on remote mm");
1552
1553 return get_user_pages_unlocked5(start, nr_pages, write, force, pages);
1554}
1555EXPORT_SYMBOL(get_user_pages_unlocked7);
1556
diff --git a/mm/ksm.c b/mm/ksm.c
index ca6d2a06a615..b99e828172f6 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -352,13 +352,17 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
352/* 352/*
353 * We use break_ksm to break COW on a ksm page: it's a stripped down 353 * We use break_ksm to break COW on a ksm page: it's a stripped down
354 * 354 *
355 * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) 355 * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
356 * put_page(page); 356 * put_page(page);
357 * 357 *
358 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, 358 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
359 * in case the application has unmapped and remapped mm,addr meanwhile. 359 * in case the application has unmapped and remapped mm,addr meanwhile.
360 * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP 360 * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
361 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. 361 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
362 *
363 * FAULT_FLAG_REMOTE/FOLL_REMOTE are used because we do this outside the context
364 * of the process that owns 'vma'. We also do not want to enforce
365 * protection keys here anyway.
362 */ 366 */
363static int break_ksm(struct vm_area_struct *vma, unsigned long addr) 367static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
364{ 368{
@@ -367,12 +371,14 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
367 371
368 do { 372 do {
369 cond_resched(); 373 cond_resched();
370 page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION); 374 page = follow_page(vma, addr,
375 FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
371 if (IS_ERR_OR_NULL(page)) 376 if (IS_ERR_OR_NULL(page))
372 break; 377 break;
373 if (PageKsm(page)) 378 if (PageKsm(page))
374 ret = handle_mm_fault(vma->vm_mm, vma, addr, 379 ret = handle_mm_fault(vma->vm_mm, vma, addr,
375 FAULT_FLAG_WRITE); 380 FAULT_FLAG_WRITE |
381 FAULT_FLAG_REMOTE);
376 else 382 else
377 ret = VM_FAULT_WRITE; 383 ret = VM_FAULT_WRITE;
378 put_page(page); 384 put_page(page);
diff --git a/mm/memory.c b/mm/memory.c
index ac6bc15c19be..81dca0083fcd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -65,6 +65,7 @@
65#include <linux/userfaultfd_k.h> 65#include <linux/userfaultfd_k.h>
66 66
67#include <asm/io.h> 67#include <asm/io.h>
68#include <asm/mmu_context.h>
68#include <asm/pgalloc.h> 69#include <asm/pgalloc.h>
69#include <asm/uaccess.h> 70#include <asm/uaccess.h>
70#include <asm/tlb.h> 71#include <asm/tlb.h>
@@ -3375,6 +3376,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3375 pmd_t *pmd; 3376 pmd_t *pmd;
3376 pte_t *pte; 3377 pte_t *pte;
3377 3378
3379 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
3380 flags & FAULT_FLAG_INSTRUCTION,
3381 flags & FAULT_FLAG_REMOTE))
3382 return VM_FAULT_SIGSEGV;
3383
3378 if (unlikely(is_vm_hugetlb_page(vma))) 3384 if (unlikely(is_vm_hugetlb_page(vma)))
3379 return hugetlb_fault(mm, vma, address, flags); 3385 return hugetlb_fault(mm, vma, address, flags);
3380 3386
@@ -3691,7 +3697,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3691 void *maddr; 3697 void *maddr;
3692 struct page *page = NULL; 3698 struct page *page = NULL;
3693 3699
3694 ret = get_user_pages(tsk, mm, addr, 1, 3700 ret = get_user_pages_remote(tsk, mm, addr, 1,
3695 write, 1, &page, &vma); 3701 write, 1, &page, &vma);
3696 if (ret <= 0) { 3702 if (ret <= 0) {
3697#ifndef CONFIG_HAVE_IOREMAP_PROT 3703#ifndef CONFIG_HAVE_IOREMAP_PROT
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b25de27b83d0..36cc01bc950a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -846,12 +846,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
846 } 846 }
847} 847}
848 848
849static int lookup_node(struct mm_struct *mm, unsigned long addr) 849static int lookup_node(unsigned long addr)
850{ 850{
851 struct page *p; 851 struct page *p;
852 int err; 852 int err;
853 853
854 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); 854 err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL);
855 if (err >= 0) { 855 if (err >= 0) {
856 err = page_to_nid(p); 856 err = page_to_nid(p);
857 put_page(p); 857 put_page(p);
@@ -906,7 +906,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
906 906
907 if (flags & MPOL_F_NODE) { 907 if (flags & MPOL_F_NODE) {
908 if (flags & MPOL_F_ADDR) { 908 if (flags & MPOL_F_ADDR) {
909 err = lookup_node(mm, addr); 909 err = lookup_node(addr);
910 if (err < 0) 910 if (err < 0)
911 goto out; 911 goto out;
912 *policy = err; 912 *policy = err;
diff --git a/mm/mmap.c b/mm/mmap.c
index e06345aafa03..bd2e1a533bc1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -42,6 +42,7 @@
42#include <linux/printk.h> 42#include <linux/printk.h>
43#include <linux/userfaultfd_k.h> 43#include <linux/userfaultfd_k.h>
44#include <linux/moduleparam.h> 44#include <linux/moduleparam.h>
45#include <linux/pkeys.h>
45 46
46#include <asm/uaccess.h> 47#include <asm/uaccess.h>
47#include <asm/cacheflush.h> 48#include <asm/cacheflush.h>
@@ -1145,6 +1146,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
1145 unsigned long pgoff, unsigned long *populate) 1146 unsigned long pgoff, unsigned long *populate)
1146{ 1147{
1147 struct mm_struct *mm = current->mm; 1148 struct mm_struct *mm = current->mm;
1149 int pkey = 0;
1148 1150
1149 *populate = 0; 1151 *populate = 0;
1150 1152
@@ -1184,11 +1186,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
1184 if (offset_in_page(addr)) 1186 if (offset_in_page(addr))
1185 return addr; 1187 return addr;
1186 1188
1189 if (prot == PROT_EXEC) {
1190 pkey = execute_only_pkey(mm);
1191 if (pkey < 0)
1192 pkey = 0;
1193 }
1194
1187 /* Do simple checking here so the lower-level routines won't have 1195 /* Do simple checking here so the lower-level routines won't have
1188 * to. we assume access permissions have been handled by the open 1196 * to. we assume access permissions have been handled by the open
1189 * of the memory object, so we don't do any here. 1197 * of the memory object, so we don't do any here.
1190 */ 1198 */
1191 vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 1199 vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
1192 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 1200 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1193 1201
1194 if (flags & MAP_LOCKED) 1202 if (flags & MAP_LOCKED)
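
The user-visible trigger for execute_only_pkey() is an mmap() with PROT_EXEC and nothing else. A sketch of such a caller; on pkeys hardware the mapping becomes genuinely unreadable, while on older CPUs PROT_EXEC still implies read:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		/* PROT_EXEC alone, no PROT_READ/PROT_WRITE: the exact
		 * condition the do_mmap() hunk above tests for. */
		void *p = mmap(NULL, 4096, PROT_EXEC,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		printf("execute-only mapping at %p\n", p);
		/* On a pkeys CPU, reading from p here would raise SIGSEGV
		 * with si_code == SEGV_PKUERR. */
		munmap(p, 4096);
		return 0;
	}
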
diff --git a/mm/mprotect.c b/mm/mprotect.c
index f7cb3d4d9c2e..fa37c4cd973a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -24,6 +24,7 @@
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/perf_event.h> 25#include <linux/perf_event.h>
26#include <linux/ksm.h> 26#include <linux/ksm.h>
27#include <linux/pkeys.h>
27#include <asm/uaccess.h> 28#include <asm/uaccess.h>
28#include <asm/pgtable.h> 29#include <asm/pgtable.h>
29#include <asm/cacheflush.h> 30#include <asm/cacheflush.h>
@@ -354,7 +355,7 @@ fail:
354SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, 355SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
355 unsigned long, prot) 356 unsigned long, prot)
356{ 357{
357 unsigned long vm_flags, nstart, end, tmp, reqprot; 358 unsigned long nstart, end, tmp, reqprot;
358 struct vm_area_struct *vma, *prev; 359 struct vm_area_struct *vma, *prev;
359 int error = -EINVAL; 360 int error = -EINVAL;
360 const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); 361 const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
@@ -380,8 +381,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
380 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) 381 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
381 prot |= PROT_EXEC; 382 prot |= PROT_EXEC;
382 383
383 vm_flags = calc_vm_prot_bits(prot);
384
385 down_write(&current->mm->mmap_sem); 384 down_write(&current->mm->mmap_sem);
386 385
387 vma = find_vma(current->mm, start); 386 vma = find_vma(current->mm, start);
@@ -411,10 +410,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
411 410
412 for (nstart = start ; ; ) { 411 for (nstart = start ; ; ) {
413 unsigned long newflags; 412 unsigned long newflags;
413 int pkey = arch_override_mprotect_pkey(vma, prot, -1);
414 414
415 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 415 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
416 416
417 newflags = vm_flags; 417 newflags = calc_vm_prot_bits(prot, pkey);
418 newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); 418 newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
419 419
420 /* newflags >> 4 shift VM_MAY% in place of VM_% */ 420 /* newflags >> 4 shift VM_MAY% in place of VM_% */
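
mprotect() now recomputes the protection bits per VMA so that arch_override_mprotect_pkey() can pick a key for each one, which makes execute-only reachable after the fact as well. A sketch, assuming x86 so a single 0xc3 byte serves as a callable function:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		unsigned char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED)
			return 1;

		p[0] = 0xc3; /* x86 'ret', written while the page is writable */

		/* PROT_EXEC only: on pkeys hardware the loop above attaches
		 * the execute-only key, leaving the page callable but
		 * unreadable. */
		if (mprotect(p, 4096, PROT_EXEC)) {
			perror("mprotect");
			return 1;
		}
		((void (*)(void))p)();
		puts("returned from execute-only page");
		return 0;
	}
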
diff --git a/mm/nommu.c b/mm/nommu.c
index 6402f2715d48..de8b6b6580c1 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -15,6 +15,8 @@
15 15
16#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
17 17
18#define __DISABLE_GUP_DEPRECATED
19
18#include <linux/export.h> 20#include <linux/export.h>
19#include <linux/mm.h> 21#include <linux/mm.h>
20#include <linux/vmacache.h> 22#include <linux/vmacache.h>
@@ -159,8 +161,7 @@ finish_or_fault:
159 * slab page or a secondary page from a compound page 161 * slab page or a secondary page from a compound page
160 * - don't permit access to VMAs that don't support it, such as I/O mappings 162 * - don't permit access to VMAs that don't support it, such as I/O mappings
161 */ 163 */
162long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 164long get_user_pages6(unsigned long start, unsigned long nr_pages,
163 unsigned long start, unsigned long nr_pages,
164 int write, int force, struct page **pages, 165 int write, int force, struct page **pages,
165 struct vm_area_struct **vmas) 166 struct vm_area_struct **vmas)
166{ 167{
@@ -171,20 +172,18 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
171 if (force) 172 if (force)
172 flags |= FOLL_FORCE; 173 flags |= FOLL_FORCE;
173 174
174 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, 175 return __get_user_pages(current, current->mm, start, nr_pages, flags,
175 NULL); 176 pages, vmas, NULL);
176} 177}
177EXPORT_SYMBOL(get_user_pages); 178EXPORT_SYMBOL(get_user_pages6);
178 179
179long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, 180long get_user_pages_locked6(unsigned long start, unsigned long nr_pages,
180 unsigned long start, unsigned long nr_pages, 181 int write, int force, struct page **pages,
181 int write, int force, struct page **pages, 182 int *locked)
182 int *locked)
183{ 183{
184 return get_user_pages(tsk, mm, start, nr_pages, write, force, 184 return get_user_pages6(start, nr_pages, write, force, pages, NULL);
185 pages, NULL);
186} 185}
187EXPORT_SYMBOL(get_user_pages_locked); 186EXPORT_SYMBOL(get_user_pages_locked6);
188 187
189long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, 188long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
190 unsigned long start, unsigned long nr_pages, 189 unsigned long start, unsigned long nr_pages,
@@ -193,21 +192,20 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
193{ 192{
194 long ret; 193 long ret;
195 down_read(&mm->mmap_sem); 194 down_read(&mm->mmap_sem);
196 ret = get_user_pages(tsk, mm, start, nr_pages, write, force, 195 ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages,
197 pages, NULL); 196 NULL, NULL);
198 up_read(&mm->mmap_sem); 197 up_read(&mm->mmap_sem);
199 return ret; 198 return ret;
200} 199}
201EXPORT_SYMBOL(__get_user_pages_unlocked); 200EXPORT_SYMBOL(__get_user_pages_unlocked);
202 201
203long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, 202long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages,
204 unsigned long start, unsigned long nr_pages,
205 int write, int force, struct page **pages) 203 int write, int force, struct page **pages)
206{ 204{
207 return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, 205 return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
208 force, pages, 0); 206 write, force, pages, 0);
209} 207}
210EXPORT_SYMBOL(get_user_pages_unlocked); 208EXPORT_SYMBOL(get_user_pages_unlocked5);
211 209
212/** 210/**
213 * follow_pfn - look up PFN at a user virtual address 211 * follow_pfn - look up PFN at a user virtual address
@@ -1061,7 +1059,7 @@ static unsigned long determine_vm_flags(struct file *file,
1061{ 1059{
1062 unsigned long vm_flags; 1060 unsigned long vm_flags;
1063 1061
1064 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); 1062 vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
1065 /* vm_flags |= mm->def_flags; */ 1063 /* vm_flags |= mm->def_flags; */
1066 1064
1067 if (!(capabilities & NOMMU_MAP_DIRECT)) { 1065 if (!(capabilities & NOMMU_MAP_DIRECT)) {
@@ -1991,3 +1989,31 @@ static int __meminit init_admin_reserve(void)
1991 return 0; 1989 return 0;
1992} 1990}
1993subsys_initcall(init_admin_reserve); 1991subsys_initcall(init_admin_reserve);
1992
1993long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm,
1994 unsigned long start, unsigned long nr_pages,
1995 int write, int force, struct page **pages,
1996 struct vm_area_struct **vmas)
1997{
1998 return get_user_pages6(start, nr_pages, write, force, pages, vmas);
1999}
2000EXPORT_SYMBOL(get_user_pages8);
2001
2002long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm,
2003 unsigned long start, unsigned long nr_pages,
2004 int write, int force, struct page **pages,
2005 int *locked)
2006{
2007 return get_user_pages_locked6(start, nr_pages, write,
2008 force, pages, locked);
2009}
2010EXPORT_SYMBOL(get_user_pages_locked8);
2011
2012long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm,
2013 unsigned long start, unsigned long nr_pages,
2014 int write, int force, struct page **pages)
2015{
2016 return get_user_pages_unlocked5(start, nr_pages, write, force, pages);
2017}
2018EXPORT_SYMBOL(get_user_pages_unlocked7);
2019
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 5d453e58ddbf..07514d41ebcc 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -98,9 +98,14 @@ static int process_vm_rw_single_vec(unsigned long addr,
98 int pages = min(nr_pages, max_pages_per_loop); 98 int pages = min(nr_pages, max_pages_per_loop);
99 size_t bytes; 99 size_t bytes;
100 100
101 /* Get the pages we're interested in */ 101 /*
102 pages = get_user_pages_unlocked(task, mm, pa, pages, 102 * Get the pages we're interested in. We must
103 vm_write, 0, process_pages); 103 * add FOLL_REMOTE because task/mm might not
104 * be current/current->mm
105 */
106 pages = __get_user_pages_unlocked(task, mm, pa, pages,
107 vm_write, 0, process_pages,
108 FOLL_REMOTE);
104 if (pages <= 0) 109 if (pages <= 0)
105 return -EFAULT; 110 return -EFAULT;
106 111
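
The caller behind this path is the process_vm_readv()/process_vm_writev() syscall pair, which by definition operates on another process's mm, hence FOLL_REMOTE. A self-contained sketch that reads from its own pid so it stays runnable:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/uio.h>
	#include <unistd.h>

	int main(void)
	{
		char src[16] = "hello, remote";
		char dst[16] = "";
		struct iovec local  = { .iov_base = dst, .iov_len = sizeof(dst) };
		struct iovec remote = { .iov_base = src, .iov_len = sizeof(src) };

		/* Targeting our own pid keeps the demo self-contained; a real
		 * user (a debugger, say) would pass another process's pid. */
		if (process_vm_readv(getpid(), &local, 1, &remote, 1, 0) < 0) {
			perror("process_vm_readv");
			return 1;
		}
		printf("%s\n", dst);
		return 0;
	}
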
diff --git a/mm/util.c b/mm/util.c
index 47a57e557614..6cc81e7b8705 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -283,9 +283,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
283int __weak get_user_pages_fast(unsigned long start, 283int __weak get_user_pages_fast(unsigned long start,
284 int nr_pages, int write, struct page **pages) 284 int nr_pages, int write, struct page **pages)
285{ 285{
286 struct mm_struct *mm = current->mm; 286 return get_user_pages_unlocked(start, nr_pages, write, 0, pages);
287 return get_user_pages_unlocked(current, mm, start, nr_pages,
288 write, 0, pages);
289} 287}
290EXPORT_SYMBOL_GPL(get_user_pages_fast); 288EXPORT_SYMBOL_GPL(get_user_pages_fast);
291 289
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index d4f5f220a8e5..10297f7a89ba 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -24,7 +24,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data,
24 return ERR_PTR(-ENOMEM); 24 return ERR_PTR(-ENOMEM);
25 25
26 while (got < num_pages) { 26 while (got < num_pages) {
27 rc = get_user_pages_unlocked(current, current->mm, 27 rc = get_user_pages_unlocked(
28 (unsigned long)data + ((unsigned long)got * PAGE_SIZE), 28 (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
29 num_pages - got, write_page, 0, pages + got); 29 num_pages - got, write_page, 0, pages + got);
30 if (rc < 0) 30 if (rc < 0)
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 38651454ed08..ade7c6cad172 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -874,7 +874,14 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
874 } 874 }
875 /* Same with get_arg_page(bprm, pos, 0) in fs/exec.c */ 875 /* Same with get_arg_page(bprm, pos, 0) in fs/exec.c */
876#ifdef CONFIG_MMU 876#ifdef CONFIG_MMU
877 if (get_user_pages(current, bprm->mm, pos, 1, 0, 1, &page, NULL) <= 0) 877 /*
878 * This is called at execve() time in order to dig around
879 * in the argv/environment of the new process
880 * (represented by bprm). 'current' is the process doing
881 * the execve().
882 */
883 if (get_user_pages_remote(current, bprm->mm, pos, 1,
884 0, 1, &page, NULL) <= 0)
878 return false; 885 return false;
879#else 886#else
880 page = bprm->page[pos / PAGE_SIZE]; 887 page = bprm->page[pos / PAGE_SIZE];
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index f0d061f92674..db9668869f6f 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -79,7 +79,13 @@ static void async_pf_execute(struct work_struct *work)
79 79
80 might_sleep(); 80 might_sleep();
81 81
82 get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL); 82 /*
83 * This work is run asynchronously to the task which owns
84 * mm and might be done in another context, so we must
85 * use FOLL_REMOTE.
86 */
87 __get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL, FOLL_REMOTE);
88
83 kvm_async_page_present_sync(vcpu, apf); 89 kvm_async_page_present_sync(vcpu, apf);
84 90
85 spin_lock(&vcpu->async_pf.lock); 91 spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7ba1d10ffed2..99ee4b1ce2db 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1260,15 +1260,16 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w
1260 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1260 return gfn_to_hva_memslot_prot(slot, gfn, writable);
1261} 1261}
1262 1262
1263static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, 1263static int get_user_page_nowait(unsigned long start, int write,
1264 unsigned long start, int write, struct page **page) 1264 struct page **page)
1265{ 1265{
1266 int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; 1266 int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
1267 1267
1268 if (write) 1268 if (write)
1269 flags |= FOLL_WRITE; 1269 flags |= FOLL_WRITE;
1270 1270
1271 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); 1271 return __get_user_pages(current, current->mm, start, 1, flags, page,
1272 NULL, NULL);
1272} 1273}
1273 1274
1274static inline int check_user_page_hwpoison(unsigned long addr) 1275static inline int check_user_page_hwpoison(unsigned long addr)
@@ -1330,8 +1331,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1330 1331
1331 if (async) { 1332 if (async) {
1332 down_read(&current->mm->mmap_sem); 1333 down_read(&current->mm->mmap_sem);
1333 npages = get_user_page_nowait(current, current->mm, 1334 npages = get_user_page_nowait(addr, write_fault, page);
1334 addr, write_fault, page);
1335 up_read(&current->mm->mmap_sem); 1335 up_read(&current->mm->mmap_sem);
1336 } else 1336 } else
1337 npages = __get_user_pages_unlocked(current, current->mm, addr, 1, 1337 npages = __get_user_pages_unlocked(current, current->mm, addr, 1,