| author | Ingo Molnar <mingo@kernel.org> | 2017-04-23 05:37:17 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2017-04-23 05:45:20 -0400 |
| commit | 6dd29b3df975582ef429b5b93c899e6575785940 | |
| tree | f7f214935c45eb7ea9096fccd4cdace3baa99e68 | |
| parent | ace2fb5a8b65d6aba530068ea9331f18e10ef565 | |
Revert "x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation"
This reverts commit 2947ba054a4dabbd82848728d765346886050029.
Dan Williams reported dax-pmem kernel warnings with the following signature:
WARNING: CPU: 8 PID: 245 at lib/percpu-refcount.c:155 percpu_ref_switch_to_atomic_rcu+0x1f5/0x200
percpu ref (dax_pmem_percpu_release [dax_pmem]) <= 0 (0) after switching to atomic
... and bisected it to this commit, which suggests possible memory corruption
caused by the x86 fast-GUP conversion.
He also pointed out:
"
This is similar to the backtrace when we were not properly handling
pud faults and was fixed with this commit: 220ced1676c4 "mm: fix
get_user_pages() vs device-dax pud mappings"
I've found some missing _devmap checks in the generic
get_user_pages_fast() path, but this does not fix the regression
[...]
"
So given that there are known bugs, and a pretty robust-looking bisection
points to this commit suggesting that there are unknown bugs in the conversion
as well, revert it for the time being - we'll re-try in v4.13.
Reported-by: Dan Williams <dan.j.williams@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: aneesh.kumar@linux.vnet.ibm.com
Cc: dann.frazier@canonical.com
Cc: dave.hansen@intel.com
Cc: steve.capper@linaro.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
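For context on the "missing _devmap checks" Dan mentions above: in a lockless fast-GUP walker, a ZONE_DEVICE page may only be pinned after taking a reference on its dev_pagemap, otherwise the device mapping (and the percpu ref seen in the warning) can be torn down while the page is still in use. The sketch below condenses the pte-level handling that the restored arch/x86/mm/gup.c performs in gup_pte_range(); it is illustrative only - the helper name is made up, and in the real code this logic lives inside the walker loop with the caller doing the unwinding:

```c
#include <linux/mm.h>
#include <linux/memremap.h>

/*
 * Illustrative sketch, not the exact kernel code: before pinning a
 * ZONE_DEVICE page, a lockless GUP walker must take a reference on the
 * owning dev_pagemap; if that fails, the whole fast path must bail out
 * (the caller unwinds any pages already collected, cf. undo_dev_pagemap()).
 */
static int sketch_try_pin_pte(pte_t pte, struct dev_pagemap **pgmap,
			      struct page **pages, int *nr)
{
	struct page *page;

	if (pte_devmap(pte)) {
		/* Pin the hosting dev_pagemap before touching the page. */
		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
		if (unlikely(!*pgmap))
			return 0;	/* caller unwinds pages[0..*nr) */
	} else if (pte_special(pte)) {
		return 0;		/* special ptes take the slow path */
	}

	page = pte_page(pte);
	get_page(page);
	put_dev_pagemap(*pgmap);	/* tolerates a NULL pgmap */
	SetPageReferenced(page);
	pages[(*nr)++] = page;
	return 1;
}
```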
| -rw-r--r-- | arch/arm/Kconfig | 2 |
| -rw-r--r-- | arch/arm64/Kconfig | 2 |
| -rw-r--r-- | arch/powerpc/Kconfig | 2 |
| -rw-r--r-- | arch/x86/Kconfig | 3 |
| -rw-r--r-- | arch/x86/include/asm/mmu_context.h | 12 |
| -rw-r--r-- | arch/x86/include/asm/pgtable-3level.h | 47 |
| -rw-r--r-- | arch/x86/include/asm/pgtable.h | 53 |
| -rw-r--r-- | arch/x86/include/asm/pgtable_64.h | 16 |
| -rw-r--r-- | arch/x86/mm/Makefile | 2 |
| -rw-r--r-- | arch/x86/mm/gup.c | 496 |
| -rw-r--r-- | mm/Kconfig | 2 |
| -rw-r--r-- | mm/gup.c | 10 |
12 files changed, 519 insertions, 128 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 454fadd077ad..0d4e71b42c77 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1666,7 +1666,7 @@ config ARCH_SELECT_MEMORY_MODEL
| 1666 | config HAVE_ARCH_PFN_VALID | 1666 | config HAVE_ARCH_PFN_VALID |
| 1667 | def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM | 1667 | def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM |
| 1668 | 1668 | ||
| 1669 | config HAVE_GENERIC_GUP | 1669 | config HAVE_GENERIC_RCU_GUP |
| 1670 | def_bool y | 1670 | def_bool y |
| 1671 | depends on ARM_LPAE | 1671 | depends on ARM_LPAE |
| 1672 | 1672 | ||
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index af62bf79721a..3741859765cf 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -205,7 +205,7 @@ config GENERIC_CALIBRATE_DELAY
| 205 | config ZONE_DMA | 205 | config ZONE_DMA |
| 206 | def_bool y | 206 | def_bool y |
| 207 | 207 | ||
| 208 | config HAVE_GENERIC_GUP | 208 | config HAVE_GENERIC_RCU_GUP |
| 209 | def_bool y | 209 | def_bool y |
| 210 | 210 | ||
| 211 | config ARCH_DMA_ADDR_T_64BIT | 211 | config ARCH_DMA_ADDR_T_64BIT |
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3a716b2dcde9..97a8bc8a095c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -135,7 +135,7 @@ config PPC
| 135 | select HAVE_FUNCTION_GRAPH_TRACER | 135 | select HAVE_FUNCTION_GRAPH_TRACER |
| 136 | select HAVE_FUNCTION_TRACER | 136 | select HAVE_FUNCTION_TRACER |
| 137 | select HAVE_GCC_PLUGINS | 137 | select HAVE_GCC_PLUGINS |
| 138 | select HAVE_GENERIC_GUP | 138 | select HAVE_GENERIC_RCU_GUP |
| 139 | select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) | 139 | select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) |
| 140 | select HAVE_IDE | 140 | select HAVE_IDE |
| 141 | select HAVE_IOREMAP_PROT | 141 | select HAVE_IOREMAP_PROT |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a641b900fc1f..2bde14451e54 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2789,9 +2789,6 @@ config X86_DMA_REMAP
| 2789 | bool | 2789 | bool |
| 2790 | depends on STA2X11 | 2790 | depends on STA2X11 |
| 2791 | 2791 | ||
| 2792 | config HAVE_GENERIC_GUP | ||
| 2793 | def_bool y | ||
| 2794 | |||
| 2795 | source "net/Kconfig" | 2792 | source "net/Kconfig" |
| 2796 | 2793 | ||
| 2797 | source "drivers/Kconfig" | 2794 | source "drivers/Kconfig" |
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 6e933d2d88d9..68b329d77b3a 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -220,6 +220,18 @@ static inline int vma_pkey(struct vm_area_struct *vma)
| 220 | } | 220 | } |
| 221 | #endif | 221 | #endif |
| 222 | 222 | ||
| 223 | static inline bool __pkru_allows_pkey(u16 pkey, bool write) | ||
| 224 | { | ||
| 225 | u32 pkru = read_pkru(); | ||
| 226 | |||
| 227 | if (!__pkru_allows_read(pkru, pkey)) | ||
| 228 | return false; | ||
| 229 | if (write && !__pkru_allows_write(pkru, pkey)) | ||
| 230 | return false; | ||
| 231 | |||
| 232 | return true; | ||
| 233 | } | ||
| 234 | |||
| 223 | /* | 235 | /* |
| 224 | * We only want to enforce protection keys on the current process | 236 | * We only want to enforce protection keys on the current process |
| 225 | * because we effectively have no access to PKRU for other | 237 | * because we effectively have no access to PKRU for other |
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index c8821bab938f..50d35e3185f5 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -212,51 +212,4 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
| 212 | #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) | 212 | #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) |
| 213 | #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) | 213 | #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) |
| 214 | 214 | ||
| 215 | #define gup_get_pte gup_get_pte | ||
| 216 | /* | ||
| 217 | * WARNING: only to be used in the get_user_pages_fast() implementation. | ||
| 218 | * | ||
| 219 | * With get_user_pages_fast(), we walk down the pagetables without taking | ||
| 220 | * any locks. For this we would like to load the pointers atomically, | ||
| 221 | * but that is not possible (without expensive cmpxchg8b) on PAE. What | ||
| 222 | * we do have is the guarantee that a PTE will only either go from not | ||
| 223 | * present to present, or present to not present or both -- it will not | ||
| 224 | * switch to a completely different present page without a TLB flush in | ||
| 225 | * between; something that we are blocking by holding interrupts off. | ||
| 226 | * | ||
| 227 | * Setting ptes from not present to present goes: | ||
| 228 | * | ||
| 229 | * ptep->pte_high = h; | ||
| 230 | * smp_wmb(); | ||
| 231 | * ptep->pte_low = l; | ||
| 232 | * | ||
| 233 | * And present to not present goes: | ||
| 234 | * | ||
| 235 | * ptep->pte_low = 0; | ||
| 236 | * smp_wmb(); | ||
| 237 | * ptep->pte_high = 0; | ||
| 238 | * | ||
| 239 | * We must ensure here that the load of pte_low sees 'l' iff pte_high | ||
| 240 | * sees 'h'. We load pte_high *after* loading pte_low, which ensures we | ||
| 241 | * don't see an older value of pte_high. *Then* we recheck pte_low, | ||
| 242 | * which ensures that we haven't picked up a changed pte high. We might | ||
| 243 | * have gotten rubbish values from pte_low and pte_high, but we are | ||
| 244 | * guaranteed that pte_low will not have the present bit set *unless* | ||
| 245 | * it is 'l'. Because get_user_pages_fast() only operates on present ptes | ||
| 246 | * we're safe. | ||
| 247 | */ | ||
| 248 | static inline pte_t gup_get_pte(pte_t *ptep) | ||
| 249 | { | ||
| 250 | pte_t pte; | ||
| 251 | |||
| 252 | do { | ||
| 253 | pte.pte_low = ptep->pte_low; | ||
| 254 | smp_rmb(); | ||
| 255 | pte.pte_high = ptep->pte_high; | ||
| 256 | smp_rmb(); | ||
| 257 | } while (unlikely(pte.pte_low != ptep->pte_low)); | ||
| 258 | |||
| 259 | return pte; | ||
| 260 | } | ||
| 261 | |||
| 262 | #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ | 215 | #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ |
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 942482ac36a8..f5af95a0c6b8 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -244,11 +244,6 @@ static inline int pud_devmap(pud_t pud)
| 244 | return 0; | 244 | return 0; |
| 245 | } | 245 | } |
| 246 | #endif | 246 | #endif |
| 247 | |||
| 248 | static inline int pgd_devmap(pgd_t pgd) | ||
| 249 | { | ||
| 250 | return 0; | ||
| 251 | } | ||
| 252 | #endif | 247 | #endif |
| 253 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 248 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
| 254 | 249 | ||
@@ -1190,54 +1185,6 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
| 1190 | #endif | 1185 | #endif |
| 1191 | } | 1186 | } |
| 1192 | 1187 | ||
| 1193 | static inline bool __pkru_allows_pkey(u16 pkey, bool write) | ||
| 1194 | { | ||
| 1195 | u32 pkru = read_pkru(); | ||
| 1196 | |||
| 1197 | if (!__pkru_allows_read(pkru, pkey)) | ||
| 1198 | return false; | ||
| 1199 | if (write && !__pkru_allows_write(pkru, pkey)) | ||
| 1200 | return false; | ||
| 1201 | |||
| 1202 | return true; | ||
| 1203 | } | ||
| 1204 | |||
| 1205 | /* | ||
| 1206 | * 'pteval' can come from a PTE, PMD or PUD. We only check | ||
| 1207 | * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the | ||
| 1208 | * same value on all 3 types. | ||
| 1209 | */ | ||
| 1210 | static inline bool __pte_access_permitted(unsigned long pteval, bool write) | ||
| 1211 | { | ||
| 1212 | unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; | ||
| 1213 | |||
| 1214 | if (write) | ||
| 1215 | need_pte_bits |= _PAGE_RW; | ||
| 1216 | |||
| 1217 | if ((pteval & need_pte_bits) != need_pte_bits) | ||
| 1218 | return 0; | ||
| 1219 | |||
| 1220 | return __pkru_allows_pkey(pte_flags_pkey(pteval), write); | ||
| 1221 | } | ||
| 1222 | |||
| 1223 | #define pte_access_permitted pte_access_permitted | ||
| 1224 | static inline bool pte_access_permitted(pte_t pte, bool write) | ||
| 1225 | { | ||
| 1226 | return __pte_access_permitted(pte_val(pte), write); | ||
| 1227 | } | ||
| 1228 | |||
| 1229 | #define pmd_access_permitted pmd_access_permitted | ||
| 1230 | static inline bool pmd_access_permitted(pmd_t pmd, bool write) | ||
| 1231 | { | ||
| 1232 | return __pte_access_permitted(pmd_val(pmd), write); | ||
| 1233 | } | ||
| 1234 | |||
| 1235 | #define pud_access_permitted pud_access_permitted | ||
| 1236 | static inline bool pud_access_permitted(pud_t pud, bool write) | ||
| 1237 | { | ||
| 1238 | return __pte_access_permitted(pud_val(pud), write); | ||
| 1239 | } | ||
| 1240 | |||
| 1241 | #include <asm-generic/pgtable.h> | 1188 | #include <asm-generic/pgtable.h> |
| 1242 | #endif /* __ASSEMBLY__ */ | 1189 | #endif /* __ASSEMBLY__ */ |
| 1243 | 1190 | ||
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 12ea31274eb6..9991224f6238 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -227,20 +227,6 @@ extern void cleanup_highmap(void);
| 227 | extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); | 227 | extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); |
| 228 | extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); | 228 | extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); |
| 229 | 229 | ||
| 230 | #define gup_fast_permitted gup_fast_permitted | ||
| 231 | static inline bool gup_fast_permitted(unsigned long start, int nr_pages, | ||
| 232 | int write) | ||
| 233 | { | ||
| 234 | unsigned long len, end; | ||
| 235 | |||
| 236 | len = (unsigned long)nr_pages << PAGE_SHIFT; | ||
| 237 | end = start + len; | ||
| 238 | if (end < start) | ||
| 239 | return false; | ||
| 240 | if (end >> __VIRTUAL_MASK_SHIFT) | ||
| 241 | return false; | ||
| 242 | return true; | ||
| 243 | } | ||
| 244 | |||
| 245 | #endif /* !__ASSEMBLY__ */ | 230 | #endif /* !__ASSEMBLY__ */ |
| 231 | |||
| 246 | #endif /* _ASM_X86_PGTABLE_64_H */ | 232 | #endif /* _ASM_X86_PGTABLE_64_H */ |
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 0fbdcb64f9f8..96d2b847e09e 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,7 +2,7 @@
| 2 | KCOV_INSTRUMENT_tlb.o := n | 2 | KCOV_INSTRUMENT_tlb.o := n |
| 3 | 3 | ||
| 4 | obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ | 4 | obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ |
| 5 | pat.o pgtable.o physaddr.o setup_nx.o tlb.o | 5 | pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o |
| 6 | 6 | ||
| 7 | # Make sure __phys_addr has no stackprotector | 7 | # Make sure __phys_addr has no stackprotector |
| 8 | nostackp := $(call cc-option, -fno-stack-protector) | 8 | nostackp := $(call cc-option, -fno-stack-protector) |
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..456dfdfd2249
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,496 @@
| 1 | /* | ||
| 2 | * Lockless get_user_pages_fast for x86 | ||
| 3 | * | ||
| 4 | * Copyright (C) 2008 Nick Piggin | ||
| 5 | * Copyright (C) 2008 Novell Inc. | ||
| 6 | */ | ||
| 7 | #include <linux/sched.h> | ||
| 8 | #include <linux/mm.h> | ||
| 9 | #include <linux/vmstat.h> | ||
| 10 | #include <linux/highmem.h> | ||
| 11 | #include <linux/swap.h> | ||
| 12 | #include <linux/memremap.h> | ||
| 13 | |||
| 14 | #include <asm/mmu_context.h> | ||
| 15 | #include <asm/pgtable.h> | ||
| 16 | |||
| 17 | static inline pte_t gup_get_pte(pte_t *ptep) | ||
| 18 | { | ||
| 19 | #ifndef CONFIG_X86_PAE | ||
| 20 | return READ_ONCE(*ptep); | ||
| 21 | #else | ||
| 22 | /* | ||
| 23 | * With get_user_pages_fast, we walk down the pagetables without taking | ||
| 24 | * any locks. For this we would like to load the pointers atomically, | ||
| 25 | * but that is not possible (without expensive cmpxchg8b) on PAE. What | ||
| 26 | * we do have is the guarantee that a pte will only either go from not | ||
| 27 | * present to present, or present to not present or both -- it will not | ||
| 28 | * switch to a completely different present page without a TLB flush in | ||
| 29 | * between; something that we are blocking by holding interrupts off. | ||
| 30 | * | ||
| 31 | * Setting ptes from not present to present goes: | ||
| 32 | * ptep->pte_high = h; | ||
| 33 | * smp_wmb(); | ||
| 34 | * ptep->pte_low = l; | ||
| 35 | * | ||
| 36 | * And present to not present goes: | ||
| 37 | * ptep->pte_low = 0; | ||
| 38 | * smp_wmb(); | ||
| 39 | * ptep->pte_high = 0; | ||
| 40 | * | ||
| 41 | * We must ensure here that the load of pte_low sees l iff pte_high | ||
| 42 | * sees h. We load pte_high *after* loading pte_low, which ensures we | ||
| 43 | * don't see an older value of pte_high. *Then* we recheck pte_low, | ||
| 44 | * which ensures that we haven't picked up a changed pte high. We might | ||
| 45 | * have got rubbish values from pte_low and pte_high, but we are | ||
| 46 | * guaranteed that pte_low will not have the present bit set *unless* | ||
| 47 | * it is 'l'. And get_user_pages_fast only operates on present ptes, so | ||
| 48 | * we're safe. | ||
| 49 | * | ||
| 50 | * gup_get_pte should not be used or copied outside gup.c without being | ||
| 51 | * very careful -- it does not atomically load the pte or anything that | ||
| 52 | * is likely to be useful for you. | ||
| 53 | */ | ||
| 54 | pte_t pte; | ||
| 55 | |||
| 56 | retry: | ||
| 57 | pte.pte_low = ptep->pte_low; | ||
| 58 | smp_rmb(); | ||
| 59 | pte.pte_high = ptep->pte_high; | ||
| 60 | smp_rmb(); | ||
| 61 | if (unlikely(pte.pte_low != ptep->pte_low)) | ||
| 62 | goto retry; | ||
| 63 | |||
| 64 | return pte; | ||
| 65 | #endif | ||
| 66 | } | ||
| 67 | |||
| 68 | static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) | ||
| 69 | { | ||
| 70 | while ((*nr) - nr_start) { | ||
| 71 | struct page *page = pages[--(*nr)]; | ||
| 72 | |||
| 73 | ClearPageReferenced(page); | ||
| 74 | put_page(page); | ||
| 75 | } | ||
| 76 | } | ||
| 77 | |||
| 78 | /* | ||
| 79 | * 'pteval' can come from a pte, pmd, pud or p4d. We only check | ||
| 80 | * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the | ||
| 81 | * same value on all 4 types. | ||
| 82 | */ | ||
| 83 | static inline int pte_allows_gup(unsigned long pteval, int write) | ||
| 84 | { | ||
| 85 | unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; | ||
| 86 | |||
| 87 | if (write) | ||
| 88 | need_pte_bits |= _PAGE_RW; | ||
| 89 | |||
| 90 | if ((pteval & need_pte_bits) != need_pte_bits) | ||
| 91 | return 0; | ||
| 92 | |||
| 93 | /* Check memory protection keys permissions. */ | ||
| 94 | if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write)) | ||
| 95 | return 0; | ||
| 96 | |||
| 97 | return 1; | ||
| 98 | } | ||
| 99 | |||
| 100 | /* | ||
| 101 | * The performance critical leaf functions are made noinline otherwise gcc | ||
| 102 | * inlines everything into a single function which results in too much | ||
| 103 | * register pressure. | ||
| 104 | */ | ||
| 105 | static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, | ||
| 106 | unsigned long end, int write, struct page **pages, int *nr) | ||
| 107 | { | ||
| 108 | struct dev_pagemap *pgmap = NULL; | ||
| 109 | int nr_start = *nr, ret = 0; | ||
| 110 | pte_t *ptep, *ptem; | ||
| 111 | |||
| 112 | /* | ||
| 113 | * Keep the original mapped PTE value (ptem) around since we | ||
| 114 | * might increment ptep off the end of the page when finishing | ||
| 115 | * our loop iteration. | ||
| 116 | */ | ||
| 117 | ptem = ptep = pte_offset_map(&pmd, addr); | ||
| 118 | do { | ||
| 119 | pte_t pte = gup_get_pte(ptep); | ||
| 120 | struct page *page; | ||
| 121 | |||
| 122 | /* Similar to the PMD case, NUMA hinting must take slow path */ | ||
| 123 | if (pte_protnone(pte)) | ||
| 124 | break; | ||
| 125 | |||
| 126 | if (!pte_allows_gup(pte_val(pte), write)) | ||
| 127 | break; | ||
| 128 | |||
| 129 | if (pte_devmap(pte)) { | ||
| 130 | pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); | ||
| 131 | if (unlikely(!pgmap)) { | ||
| 132 | undo_dev_pagemap(nr, nr_start, pages); | ||
| 133 | break; | ||
| 134 | } | ||
| 135 | } else if (pte_special(pte)) | ||
| 136 | break; | ||
| 137 | |||
| 138 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
| 139 | page = pte_page(pte); | ||
| 140 | get_page(page); | ||
| 141 | put_dev_pagemap(pgmap); | ||
| 142 | SetPageReferenced(page); | ||
| 143 | pages[*nr] = page; | ||
| 144 | (*nr)++; | ||
| 145 | |||
| 146 | } while (ptep++, addr += PAGE_SIZE, addr != end); | ||
| 147 | if (addr == end) | ||
| 148 | ret = 1; | ||
| 149 | pte_unmap(ptem); | ||
| 150 | |||
| 151 | return ret; | ||
| 152 | } | ||
| 153 | |||
| 154 | static inline void get_head_page_multiple(struct page *page, int nr) | ||
| 155 | { | ||
| 156 | VM_BUG_ON_PAGE(page != compound_head(page), page); | ||
| 157 | VM_BUG_ON_PAGE(page_count(page) == 0, page); | ||
| 158 | page_ref_add(page, nr); | ||
| 159 | SetPageReferenced(page); | ||
| 160 | } | ||
| 161 | |||
| 162 | static int __gup_device_huge(unsigned long pfn, unsigned long addr, | ||
| 163 | unsigned long end, struct page **pages, int *nr) | ||
| 164 | { | ||
| 165 | int nr_start = *nr; | ||
| 166 | struct dev_pagemap *pgmap = NULL; | ||
| 167 | |||
| 168 | do { | ||
| 169 | struct page *page = pfn_to_page(pfn); | ||
| 170 | |||
| 171 | pgmap = get_dev_pagemap(pfn, pgmap); | ||
| 172 | if (unlikely(!pgmap)) { | ||
| 173 | undo_dev_pagemap(nr, nr_start, pages); | ||
| 174 | return 0; | ||
| 175 | } | ||
| 176 | SetPageReferenced(page); | ||
| 177 | pages[*nr] = page; | ||
| 178 | get_page(page); | ||
| 179 | put_dev_pagemap(pgmap); | ||
| 180 | (*nr)++; | ||
| 181 | pfn++; | ||
| 182 | } while (addr += PAGE_SIZE, addr != end); | ||
| 183 | return 1; | ||
| 184 | } | ||
| 185 | |||
| 186 | static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, | ||
| 187 | unsigned long end, struct page **pages, int *nr) | ||
| 188 | { | ||
| 189 | unsigned long fault_pfn; | ||
| 190 | |||
| 191 | fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | ||
| 192 | return __gup_device_huge(fault_pfn, addr, end, pages, nr); | ||
| 193 | } | ||
| 194 | |||
| 195 | static int __gup_device_huge_pud(pud_t pud, unsigned long addr, | ||
| 196 | unsigned long end, struct page **pages, int *nr) | ||
| 197 | { | ||
| 198 | unsigned long fault_pfn; | ||
| 199 | |||
| 200 | fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); | ||
| 201 | return __gup_device_huge(fault_pfn, addr, end, pages, nr); | ||
| 202 | } | ||
| 203 | |||
| 204 | static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, | ||
| 205 | unsigned long end, int write, struct page **pages, int *nr) | ||
| 206 | { | ||
| 207 | struct page *head, *page; | ||
| 208 | int refs; | ||
| 209 | |||
| 210 | if (!pte_allows_gup(pmd_val(pmd), write)) | ||
| 211 | return 0; | ||
| 212 | |||
| 213 | VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); | ||
| 214 | if (pmd_devmap(pmd)) | ||
| 215 | return __gup_device_huge_pmd(pmd, addr, end, pages, nr); | ||
| 216 | |||
| 217 | /* hugepages are never "special" */ | ||
| 218 | VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); | ||
| 219 | |||
| 220 | refs = 0; | ||
| 221 | head = pmd_page(pmd); | ||
| 222 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | ||
| 223 | do { | ||
| 224 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | ||
| 225 | pages[*nr] = page; | ||
| 226 | (*nr)++; | ||
| 227 | page++; | ||
| 228 | refs++; | ||
| 229 | } while (addr += PAGE_SIZE, addr != end); | ||
| 230 | get_head_page_multiple(head, refs); | ||
| 231 | |||
| 232 | return 1; | ||
| 233 | } | ||
| 234 | |||
| 235 | static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | ||
| 236 | int write, struct page **pages, int *nr) | ||
| 237 | { | ||
| 238 | unsigned long next; | ||
| 239 | pmd_t *pmdp; | ||
| 240 | |||
| 241 | pmdp = pmd_offset(&pud, addr); | ||
| 242 | do { | ||
| 243 | pmd_t pmd = *pmdp; | ||
| 244 | |||
| 245 | next = pmd_addr_end(addr, end); | ||
| 246 | if (pmd_none(pmd)) | ||
| 247 | return 0; | ||
| 248 | if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { | ||
| 249 | /* | ||
| 250 | * NUMA hinting faults need to be handled in the GUP | ||
| 251 | * slowpath for accounting purposes and so that they | ||
| 252 | * can be serialised against THP migration. | ||
| 253 | */ | ||
| 254 | if (pmd_protnone(pmd)) | ||
| 255 | return 0; | ||
| 256 | if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) | ||
| 257 | return 0; | ||
| 258 | } else { | ||
| 259 | if (!gup_pte_range(pmd, addr, next, write, pages, nr)) | ||
| 260 | return 0; | ||
| 261 | } | ||
| 262 | } while (pmdp++, addr = next, addr != end); | ||
| 263 | |||
| 264 | return 1; | ||
| 265 | } | ||
| 266 | |||
| 267 | static noinline int gup_huge_pud(pud_t pud, unsigned long addr, | ||
| 268 | unsigned long end, int write, struct page **pages, int *nr) | ||
| 269 | { | ||
| 270 | struct page *head, *page; | ||
| 271 | int refs; | ||
| 272 | |||
| 273 | if (!pte_allows_gup(pud_val(pud), write)) | ||
| 274 | return 0; | ||
| 275 | |||
| 276 | VM_BUG_ON(!pfn_valid(pud_pfn(pud))); | ||
| 277 | if (pud_devmap(pud)) | ||
| 278 | return __gup_device_huge_pud(pud, addr, end, pages, nr); | ||
| 279 | |||
| 280 | /* hugepages are never "special" */ | ||
| 281 | VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); | ||
| 282 | |||
| 283 | refs = 0; | ||
| 284 | head = pud_page(pud); | ||
| 285 | page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); | ||
| 286 | do { | ||
| 287 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | ||
| 288 | pages[*nr] = page; | ||
| 289 | (*nr)++; | ||
| 290 | page++; | ||
| 291 | refs++; | ||
| 292 | } while (addr += PAGE_SIZE, addr != end); | ||
| 293 | get_head_page_multiple(head, refs); | ||
| 294 | |||
| 295 | return 1; | ||
| 296 | } | ||
| 297 | |||
| 298 | static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, | ||
| 299 | int write, struct page **pages, int *nr) | ||
| 300 | { | ||
| 301 | unsigned long next; | ||
| 302 | pud_t *pudp; | ||
| 303 | |||
| 304 | pudp = pud_offset(&p4d, addr); | ||
| 305 | do { | ||
| 306 | pud_t pud = *pudp; | ||
| 307 | |||
| 308 | next = pud_addr_end(addr, end); | ||
| 309 | if (pud_none(pud)) | ||
| 310 | return 0; | ||
| 311 | if (unlikely(pud_large(pud))) { | ||
| 312 | if (!gup_huge_pud(pud, addr, next, write, pages, nr)) | ||
| 313 | return 0; | ||
| 314 | } else { | ||
| 315 | if (!gup_pmd_range(pud, addr, next, write, pages, nr)) | ||
| 316 | return 0; | ||
| 317 | } | ||
| 318 | } while (pudp++, addr = next, addr != end); | ||
| 319 | |||
| 320 | return 1; | ||
| 321 | } | ||
| 322 | |||
| 323 | static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, | ||
| 324 | int write, struct page **pages, int *nr) | ||
| 325 | { | ||
| 326 | unsigned long next; | ||
| 327 | p4d_t *p4dp; | ||
| 328 | |||
| 329 | p4dp = p4d_offset(&pgd, addr); | ||
| 330 | do { | ||
| 331 | p4d_t p4d = *p4dp; | ||
| 332 | |||
| 333 | next = p4d_addr_end(addr, end); | ||
| 334 | if (p4d_none(p4d)) | ||
| 335 | return 0; | ||
| 336 | BUILD_BUG_ON(p4d_large(p4d)); | ||
| 337 | if (!gup_pud_range(p4d, addr, next, write, pages, nr)) | ||
| 338 | return 0; | ||
| 339 | } while (p4dp++, addr = next, addr != end); | ||
| 340 | |||
| 341 | return 1; | ||
| 342 | } | ||
| 343 | |||
| 344 | /* | ||
| 345 | * Like get_user_pages_fast() except its IRQ-safe in that it won't fall | ||
| 346 | * back to the regular GUP. | ||
| 347 | */ | ||
| 348 | int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | ||
| 349 | struct page **pages) | ||
| 350 | { | ||
| 351 | struct mm_struct *mm = current->mm; | ||
| 352 | unsigned long addr, len, end; | ||
| 353 | unsigned long next; | ||
| 354 | unsigned long flags; | ||
| 355 | pgd_t *pgdp; | ||
| 356 | int nr = 0; | ||
| 357 | |||
| 358 | start &= PAGE_MASK; | ||
| 359 | addr = start; | ||
| 360 | len = (unsigned long) nr_pages << PAGE_SHIFT; | ||
| 361 | end = start + len; | ||
| 362 | if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, | ||
| 363 | (void __user *)start, len))) | ||
| 364 | return 0; | ||
| 365 | |||
| 366 | /* | ||
| 367 | * XXX: batch / limit 'nr', to avoid large irq off latency | ||
| 368 | * needs some instrumenting to determine the common sizes used by | ||
| 369 | * important workloads (eg. DB2), and whether limiting the batch size | ||
| 370 | * will decrease performance. | ||
| 371 | * | ||
| 372 | * It seems like we're in the clear for the moment. Direct-IO is | ||
| 373 | * the main guy that batches up lots of get_user_pages, and even | ||
| 374 | * they are limited to 64-at-a-time which is not so many. | ||
| 375 | */ | ||
| 376 | /* | ||
| 377 | * This doesn't prevent pagetable teardown, but does prevent | ||
| 378 | * the pagetables and pages from being freed on x86. | ||
| 379 | * | ||
| 380 | * So long as we atomically load page table pointers versus teardown | ||
| 381 | * (which we do on x86, with the above PAE exception), we can follow the | ||
| 382 | * address down to the the page and take a ref on it. | ||
| 383 | */ | ||
| 384 | local_irq_save(flags); | ||
| 385 | pgdp = pgd_offset(mm, addr); | ||
| 386 | do { | ||
| 387 | pgd_t pgd = *pgdp; | ||
| 388 | |||
| 389 | next = pgd_addr_end(addr, end); | ||
| 390 | if (pgd_none(pgd)) | ||
| 391 | break; | ||
| 392 | if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) | ||
| 393 | break; | ||
| 394 | } while (pgdp++, addr = next, addr != end); | ||
| 395 | local_irq_restore(flags); | ||
| 396 | |||
| 397 | return nr; | ||
| 398 | } | ||
| 399 | |||
| 400 | /** | ||
| 401 | * get_user_pages_fast() - pin user pages in memory | ||
| 402 | * @start: starting user address | ||
| 403 | * @nr_pages: number of pages from start to pin | ||
| 404 | * @write: whether pages will be written to | ||
| 405 | * @pages: array that receives pointers to the pages pinned. | ||
| 406 | * Should be at least nr_pages long. | ||
| 407 | * | ||
| 408 | * Attempt to pin user pages in memory without taking mm->mmap_sem. | ||
| 409 | * If not successful, it will fall back to taking the lock and | ||
| 410 | * calling get_user_pages(). | ||
| 411 | * | ||
| 412 | * Returns number of pages pinned. This may be fewer than the number | ||
| 413 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
| 414 | * were pinned, returns -errno. | ||
| 415 | */ | ||
| 416 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | ||
| 417 | struct page **pages) | ||
| 418 | { | ||
| 419 | struct mm_struct *mm = current->mm; | ||
| 420 | unsigned long addr, len, end; | ||
| 421 | unsigned long next; | ||
| 422 | pgd_t *pgdp; | ||
| 423 | int nr = 0; | ||
| 424 | |||
| 425 | start &= PAGE_MASK; | ||
| 426 | addr = start; | ||
| 427 | len = (unsigned long) nr_pages << PAGE_SHIFT; | ||
| 428 | |||
| 429 | end = start + len; | ||
| 430 | if (end < start) | ||
| 431 | goto slow_irqon; | ||
| 432 | |||
| 433 | #ifdef CONFIG_X86_64 | ||
| 434 | if (end >> __VIRTUAL_MASK_SHIFT) | ||
| 435 | goto slow_irqon; | ||
| 436 | #endif | ||
| 437 | |||
| 438 | /* | ||
| 439 | * XXX: batch / limit 'nr', to avoid large irq off latency | ||
| 440 | * needs some instrumenting to determine the common sizes used by | ||
| 441 | * important workloads (eg. DB2), and whether limiting the batch size | ||
| 442 | * will decrease performance. | ||
| 443 | * | ||
| 444 | * It seems like we're in the clear for the moment. Direct-IO is | ||
| 445 | * the main guy that batches up lots of get_user_pages, and even | ||
| 446 | * they are limited to 64-at-a-time which is not so many. | ||
| 447 | */ | ||
| 448 | /* | ||
| 449 | * This doesn't prevent pagetable teardown, but does prevent | ||
| 450 | * the pagetables and pages from being freed on x86. | ||
| 451 | * | ||
| 452 | * So long as we atomically load page table pointers versus teardown | ||
| 453 | * (which we do on x86, with the above PAE exception), we can follow the | ||
| 454 | * address down to the the page and take a ref on it. | ||
| 455 | */ | ||
| 456 | local_irq_disable(); | ||
| 457 | pgdp = pgd_offset(mm, addr); | ||
| 458 | do { | ||
| 459 | pgd_t pgd = *pgdp; | ||
| 460 | |||
| 461 | next = pgd_addr_end(addr, end); | ||
| 462 | if (pgd_none(pgd)) | ||
| 463 | goto slow; | ||
| 464 | if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) | ||
| 465 | goto slow; | ||
| 466 | } while (pgdp++, addr = next, addr != end); | ||
| 467 | local_irq_enable(); | ||
| 468 | |||
| 469 | VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); | ||
| 470 | return nr; | ||
| 471 | |||
| 472 | { | ||
| 473 | int ret; | ||
| 474 | |||
| 475 | slow: | ||
| 476 | local_irq_enable(); | ||
| 477 | slow_irqon: | ||
| 478 | /* Try to get the remaining pages with get_user_pages */ | ||
| 479 | start += nr << PAGE_SHIFT; | ||
| 480 | pages += nr; | ||
| 481 | |||
| 482 | ret = get_user_pages_unlocked(start, | ||
| 483 | (end - start) >> PAGE_SHIFT, | ||
| 484 | pages, write ? FOLL_WRITE : 0); | ||
| 485 | |||
| 486 | /* Have to be a bit careful with return values */ | ||
| 487 | if (nr > 0) { | ||
| 488 | if (ret < 0) | ||
| 489 | ret = nr; | ||
| 490 | else | ||
| 491 | ret += nr; | ||
| 492 | } | ||
| 493 | |||
| 494 | return ret; | ||
| 495 | } | ||
| 496 | } | ||
diff --git a/mm/Kconfig b/mm/Kconfig
index c89f472b658c..9b8fccb969dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP
| 137 | config HAVE_MEMBLOCK_PHYS_MAP | 137 | config HAVE_MEMBLOCK_PHYS_MAP |
| 138 | bool | 138 | bool |
| 139 | 139 | ||
| 140 | config HAVE_GENERIC_GUP | 140 | config HAVE_GENERIC_RCU_GUP |
| 141 | bool | 141 | bool |
| 142 | 142 | ||
| 143 | config ARCH_DISCARD_MEMBLOCK | 143 | config ARCH_DISCARD_MEMBLOCK |
diff --git a/mm/gup.c b/mm/gup.c
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1155,7 +1155,7 @@ struct page *get_dump_page(unsigned long addr)
| 1155 | #endif /* CONFIG_ELF_CORE */ | 1155 | #endif /* CONFIG_ELF_CORE */ |
| 1156 | 1156 | ||
| 1157 | /* | 1157 | /* |
| 1158 | * Generic Fast GUP | 1158 | * Generic RCU Fast GUP |
| 1159 | * | 1159 | * |
| 1160 | * get_user_pages_fast attempts to pin user pages by walking the page | 1160 | * get_user_pages_fast attempts to pin user pages by walking the page |
| 1161 | * tables directly and avoids taking locks. Thus the walker needs to be | 1161 | * tables directly and avoids taking locks. Thus the walker needs to be |
@@ -1176,8 +1176,8 @@ struct page *get_dump_page(unsigned long addr)
| 1176 | * Before activating this code, please be aware that the following assumptions | 1176 | * Before activating this code, please be aware that the following assumptions |
| 1177 | * are currently made: | 1177 | * are currently made: |
| 1178 | * | 1178 | * |
| 1179 | * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to | 1179 | * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free |
| 1180 | * free pages containing page tables or TLB flushing requires IPI broadcast. | 1180 | * pages containing page tables. |
| 1181 | * | 1181 | * |
| 1182 | * *) ptes can be read atomically by the architecture. | 1182 | * *) ptes can be read atomically by the architecture. |
| 1183 | * | 1183 | * |
@@ -1187,7 +1187,7 @@ struct page *get_dump_page(unsigned long addr)
| 1187 | * | 1187 | * |
| 1188 | * This code is based heavily on the PowerPC implementation by Nick Piggin. | 1188 | * This code is based heavily on the PowerPC implementation by Nick Piggin. |
| 1189 | */ | 1189 | */ |
| 1190 | #ifdef CONFIG_HAVE_GENERIC_GUP | 1190 | #ifdef CONFIG_HAVE_GENERIC_RCU_GUP |
| 1191 | 1191 | ||
| 1192 | #ifndef gup_get_pte | 1192 | #ifndef gup_get_pte |
| 1193 | /* | 1193 | /* |
@@ -1677,4 +1677,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
| 1677 | return ret; | 1677 | return ret; |
| 1678 | } | 1678 | } |
| 1679 | 1679 | ||
| 1680 | #endif /* CONFIG_HAVE_GENERIC_GUP */ | 1680 | #endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ |
