author	Kirill A. Shutemov <kirill.shutemov@linux.intel.com>	2017-06-06 07:31:20 -0400
committer	Ingo Molnar <mingo@kernel.org>	2017-06-13 02:56:50 -0400
commit	e585513b76f7b05d08ca3fb250fed11f6ba46ee5
tree	3acce851002d4bcff876009b37be0554ba22be3d
parent	6c690ee1039b251e583fc65b28da30e97d6a7385
x86/mm/gup: Switch GUP to the generic get_user_pages_fast() implementation
This patch provides all the callbacks required by the generic
get_user_pages_fast() code, switches x86 over to it, and removes the
old platform-specific implementation.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170606113133.22974-2-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
 arch/arm/Kconfig                      |   2
 arch/arm64/Kconfig                    |   2
 arch/powerpc/Kconfig                  |   2
 arch/x86/Kconfig                      |   3
 arch/x86/include/asm/mmu_context.h    |  12
 arch/x86/include/asm/pgtable-3level.h |  47
 arch/x86/include/asm/pgtable.h        |  53
 arch/x86/include/asm/pgtable_64.h     |  16
 arch/x86/mm/Makefile                  |   2
 arch/x86/mm/gup.c                     | 496
 mm/Kconfig                            |   2
 mm/gup.c                              |  10
 12 files changed, 128 insertions(+), 519 deletions(-)
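
The generic/arch split this patch relies on uses a simple preprocessor idiom, visible in the hunks below: the architecture header defines a macro named after the hook it overrides (for example "#define gup_get_pte gup_get_pte"), and mm/gup.c compiles its fallback only when that macro is absent. A toy standalone model of the idiom (illustrative only, not kernel code):

#include <stdio.h>

/* "arch" side: provide a specialised hook and announce it */
static inline int gup_get_pte(void)
{
	return 42;		/* arch-specific way of reading an entry */
}
#define gup_get_pte gup_get_pte

/* "generic" side: compile the fallback only if no arch override exists */
#ifndef gup_get_pte
static inline int gup_get_pte(void)
{
	return 0;		/* simple default, fine for most architectures */
}
#endif

int main(void)
{
	printf("%d\n", gup_get_pte());	/* prints 42: the arch override won */
	return 0;
}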
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4c1a35f15838..c3c49c9491d5 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1637,7 +1637,7 @@ config ARCH_SELECT_MEMORY_MODEL
 config HAVE_ARCH_PFN_VALID
 	def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
 
-config HAVE_GENERIC_RCU_GUP
+config HAVE_GENERIC_GUP
 	def_bool y
 	depends on ARM_LPAE
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3dcd7ec69bca..a7c5f8c3f13d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -205,7 +205,7 @@ config GENERIC_CALIBRATE_DELAY
 config ZONE_DMA
 	def_bool y
 
-config HAVE_GENERIC_RCU_GUP
+config HAVE_GENERIC_GUP
 	def_bool y
 
 config ARCH_DMA_ADDR_T_64BIT
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 964da1891ea9..e07735a690c6 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -184,7 +184,7 @@ config PPC
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
 	select HAVE_GCC_PLUGINS
-	select HAVE_GENERIC_RCU_GUP
+	select HAVE_GENERIC_GUP
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx)
 	select HAVE_IDE
 	select HAVE_IOREMAP_PROT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7a065d81dc43..de71b6aca0be 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2797,6 +2797,9 @@ config X86_DMA_REMAP
 	bool
 	depends on STA2X11
 
+config HAVE_GENERIC_GUP
+	def_bool y
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index cfe6034ebfc6..1458f530948b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -218,18 +218,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
 }
 #endif
 
-static inline bool __pkru_allows_pkey(u16 pkey, bool write)
-{
-	u32 pkru = read_pkru();
-
-	if (!__pkru_allows_read(pkru, pkey))
-		return false;
-	if (write && !__pkru_allows_write(pkru, pkey))
-		return false;
-
-	return true;
-}
-
 /*
  * We only want to enforce protection keys on the current process
  * because we effectively have no access to PKRU for other
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 50d35e3185f5..c8821bab938f 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
 #define __pte_to_swp_entry(pte)		((swp_entry_t){ (pte).pte_high })
 #define __swp_entry_to_pte(x)		((pte_t){ { .pte_high = (x).val } })
 
+#define gup_get_pte gup_get_pte
+/*
+ * WARNING: only to be used in the get_user_pages_fast() implementation.
+ *
+ * With get_user_pages_fast(), we walk down the pagetables without taking
+ * any locks.  For this we would like to load the pointers atomically,
+ * but that is not possible (without expensive cmpxchg8b) on PAE.  What
+ * we do have is the guarantee that a PTE will only either go from not
+ * present to present, or present to not present or both -- it will not
+ * switch to a completely different present page without a TLB flush in
+ * between; something that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ *
+ * ptep->pte_high = h;
+ * smp_wmb();
+ * ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ *
+ * ptep->pte_low = 0;
+ * smp_wmb();
+ * ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees 'l' iff pte_high
+ * sees 'h'.  We load pte_high *after* loading pte_low, which ensures we
+ * don't see an older value of pte_high.  *Then* we recheck pte_low,
+ * which ensures that we haven't picked up a changed pte high.  We might
+ * have gotten rubbish values from pte_low and pte_high, but we are
+ * guaranteed that pte_low will not have the present bit set *unless*
+ * it is 'l'.  Because get_user_pages_fast() only operates on present ptes
+ * we're safe.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+	pte_t pte;
+
+	do {
+		pte.pte_low = ptep->pte_low;
+		smp_rmb();
+		pte.pte_high = ptep->pte_high;
+		smp_rmb();
+	} while (unlikely(pte.pte_low != ptep->pte_low));
+
+	return pte;
+}
+
 #endif /* _ASM_X86_PGTABLE_3LEVEL_H */
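
The retry protocol above is easiest to see in action outside the kernel. The following standalone program is a rough userspace analogy (not kernel code, built with -pthread): C11 fences stand in for smp_wmb()/smp_rmb(), bit 0 of the low word plays the role of _PAGE_PRESENT, and the reader asserts that a "present" low half is always paired with the high half published together with it.

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t pte_low;	/* bit 0 stands in for _PAGE_PRESENT */
static _Atomic uint32_t pte_high;
static _Atomic int writer_done;

static void publish(uint32_t seq)	/* not present -> present */
{
	atomic_store_explicit(&pte_high, seq, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */
	atomic_store_explicit(&pte_low, (seq << 1) | 1, memory_order_relaxed);
}

static void teardown(void)		/* present -> not present */
{
	atomic_store_explicit(&pte_low, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */
	atomic_store_explicit(&pte_high, 0, memory_order_relaxed);
}

static void *writer(void *arg)
{
	(void)arg;
	for (uint32_t seq = 1; seq <= 200000; seq++) {
		publish(seq);
		teardown();
	}
	atomic_store_explicit(&writer_done, 1, memory_order_release);
	return NULL;
}

static void *reader(void *arg)
{
	(void)arg;
	while (!atomic_load_explicit(&writer_done, memory_order_acquire)) {
		uint32_t lo, hi;

		do {	/* the gup_get_pte()-style retry loop */
			lo = atomic_load_explicit(&pte_low, memory_order_relaxed);
			atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
			hi = atomic_load_explicit(&pte_high, memory_order_relaxed);
			atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
		} while (lo != atomic_load_explicit(&pte_low, memory_order_relaxed));

		/* a "present" low half must pair with the high half published with it */
		if (lo & 1)
			assert(hi == lo >> 1);
	}
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	puts("no torn reads observed");
	return 0;
}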
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f5af95a0c6b8..942482ac36a8 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud)
 	return 0;
 }
 #endif
+
+static inline int pgd_devmap(pgd_t pgd)
+{
+	return 0;
+}
 #endif
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
 #endif
 }
 
+static inline bool __pkru_allows_pkey(u16 pkey, bool write)
+{
+	u32 pkru = read_pkru();
+
+	if (!__pkru_allows_read(pkru, pkey))
+		return false;
+	if (write && !__pkru_allows_write(pkru, pkey))
+		return false;
+
+	return true;
+}
+
+/*
+ * 'pteval' can come from a PTE, PMD or PUD.  We only check
+ * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
+ * same value on all 3 types.
+ */
+static inline bool __pte_access_permitted(unsigned long pteval, bool write)
+{
+	unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
+
+	if (write)
+		need_pte_bits |= _PAGE_RW;
+
+	if ((pteval & need_pte_bits) != need_pte_bits)
+		return 0;
+
+	return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
+}
+
+#define pte_access_permitted pte_access_permitted
+static inline bool pte_access_permitted(pte_t pte, bool write)
+{
+	return __pte_access_permitted(pte_val(pte), write);
+}
+
+#define pmd_access_permitted pmd_access_permitted
+static inline bool pmd_access_permitted(pmd_t pmd, bool write)
+{
+	return __pte_access_permitted(pmd_val(pmd), write);
+}
+
+#define pud_access_permitted pud_access_permitted
+static inline bool pud_access_permitted(pud_t pud, bool write)
+{
+	return __pte_access_permitted(pud_val(pud), write);
+}
+
 #include <asm-generic/pgtable.h>
 #endif /* __ASSEMBLY__ */
 
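
For illustration, the bit test in __pte_access_permitted() can be modelled in a few lines of ordinary C. The PTE bit positions below (present = bit 0, RW = bit 1, user = bit 2) are the real x86 ones; the PKRU lookup is stubbed out to "always allow", since it would need the pkey bits and the PKRU register:

#include <stdbool.h>
#include <stdio.h>

#define _PAGE_PRESENT	(1UL << 0)
#define _PAGE_RW	(1UL << 1)
#define _PAGE_USER	(1UL << 2)

static bool pkru_allows(unsigned long pteval, bool write)
{
	(void)pteval;
	(void)write;
	return true;	/* stand-in for the PKRU protection-key check */
}

static bool pte_access_permitted(unsigned long pteval, bool write)
{
	unsigned long need = _PAGE_PRESENT | _PAGE_USER;

	if (write)
		need |= _PAGE_RW;
	if ((pteval & need) != need)
		return false;
	return pkru_allows(pteval, write);
}

int main(void)
{
	unsigned long ro_user = _PAGE_PRESENT | _PAGE_USER;
	unsigned long rw_kernel = _PAGE_PRESENT | _PAGE_RW;

	printf("%d\n", pte_access_permitted(ro_user, false));	/* 1 */
	printf("%d\n", pte_access_permitted(ro_user, true));	/* 0: no _PAGE_RW */
	printf("%d\n", pte_access_permitted(rw_kernel, false));	/* 0: no _PAGE_USER */
	return 0;
}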
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 9991224f6238..12ea31274eb6 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -227,6 +227,20 @@ extern void cleanup_highmap(void);
 extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
 extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
 
-#endif /* !__ASSEMBLY__ */
+#define gup_fast_permitted gup_fast_permitted
+static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
+		int write)
+{
+	unsigned long len, end;
+
+	len = (unsigned long)nr_pages << PAGE_SHIFT;
+	end = start + len;
+	if (end < start)
+		return false;
+	if (end >> __VIRTUAL_MASK_SHIFT)
+		return false;
+	return true;
+}
 
+#endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_PGTABLE_64_H */
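
A quick way to see what gup_fast_permitted() rejects is to run the same arithmetic in userspace. The sketch below assumes the 4-level-paging value __VIRTUAL_MASK_SHIFT == 47 and drops the unused 'write' argument; on a 64-bit build it rejects ranges that wrap past the top of the address space and ranges that end above the canonical user half:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT		12
#define VIRTUAL_MASK_SHIFT	47	/* x86-64 with 4-level paging */

static bool gup_fast_permitted(unsigned long start, int nr_pages)
{
	unsigned long len = (unsigned long)nr_pages << PAGE_SHIFT;
	unsigned long end = start + len;

	if (end < start)			/* arithmetic wrapped around zero */
		return false;
	if (end >> VIRTUAL_MASK_SHIFT)		/* beyond the user address range */
		return false;
	return true;
}

int main(void)
{
	/* ordinary user range: allowed */
	printf("%d\n", gup_fast_permitted(0x400000UL, 16));		/* 1 */
	/* ends above the 47-bit user range: rejected */
	printf("%d\n", gup_fast_permitted(0x7fffffffe000UL, 16));	/* 0 */
	/* wraps past the top of a 64-bit address space: rejected */
	printf("%d\n", gup_fast_permitted(~0UL & ~0xfffUL, 16));	/* 0 */
	return 0;
}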
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 96d2b847e09e..0fbdcb64f9f8 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,7 +2,7 @@
 KCOV_INSTRUMENT_tlb.o	:= n
 
 obj-y	:= init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-	    pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
+	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o
 
 # Make sure __phys_addr has no stackprotector
 nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
deleted file mode 100644
index 456dfdfd2249..000000000000
--- a/arch/x86/mm/gup.c
+++ /dev/null
@@ -1,496 +0,0 @@
-/*
- * Lockless get_user_pages_fast for x86
- *
- * Copyright (C) 2008 Nick Piggin
- * Copyright (C) 2008 Novell Inc.
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmstat.h>
-#include <linux/highmem.h>
-#include <linux/swap.h>
-#include <linux/memremap.h>
-
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-#ifndef CONFIG_X86_PAE
-	return READ_ONCE(*ptep);
-#else
-	/*
-	 * With get_user_pages_fast, we walk down the pagetables without taking
-	 * any locks.  For this we would like to load the pointers atomically,
-	 * but that is not possible (without expensive cmpxchg8b) on PAE.  What
-	 * we do have is the guarantee that a pte will only either go from not
-	 * present to present, or present to not present or both -- it will not
-	 * switch to a completely different present page without a TLB flush in
-	 * between; something that we are blocking by holding interrupts off.
-	 *
-	 * Setting ptes from not present to present goes:
-	 * ptep->pte_high = h;
-	 * smp_wmb();
-	 * ptep->pte_low = l;
-	 *
-	 * And present to not present goes:
-	 * ptep->pte_low = 0;
-	 * smp_wmb();
-	 * ptep->pte_high = 0;
-	 *
-	 * We must ensure here that the load of pte_low sees l iff pte_high
-	 * sees h. We load pte_high *after* loading pte_low, which ensures we
-	 * don't see an older value of pte_high.  *Then* we recheck pte_low,
-	 * which ensures that we haven't picked up a changed pte high. We might
-	 * have got rubbish values from pte_low and pte_high, but we are
-	 * guaranteed that pte_low will not have the present bit set *unless*
-	 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
-	 * we're safe.
-	 *
-	 * gup_get_pte should not be used or copied outside gup.c without being
-	 * very careful -- it does not atomically load the pte or anything that
-	 * is likely to be useful for you.
-	 */
-	pte_t pte;
-
-retry:
-	pte.pte_low = ptep->pte_low;
-	smp_rmb();
-	pte.pte_high = ptep->pte_high;
-	smp_rmb();
-	if (unlikely(pte.pte_low != ptep->pte_low))
-		goto retry;
-
-	return pte;
-#endif
-}
-
-static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
-{
-	while ((*nr) - nr_start) {
-		struct page *page = pages[--(*nr)];
-
-		ClearPageReferenced(page);
-		put_page(page);
-	}
-}
-
-/*
- * 'pteval' can come from a pte, pmd, pud or p4d.  We only check
- * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
- * same value on all 4 types.
- */
-static inline int pte_allows_gup(unsigned long pteval, int write)
-{
-	unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
-
-	if (write)
-		need_pte_bits |= _PAGE_RW;
-
-	if ((pteval & need_pte_bits) != need_pte_bits)
-		return 0;
-
-	/* Check memory protection keys permissions. */
-	if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
-		return 0;
-
-	return 1;
-}
-
-/*
- * The performance critical leaf functions are made noinline otherwise gcc
- * inlines everything into a single function which results in too much
- * register pressure.
- */
-static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
-{
-	struct dev_pagemap *pgmap = NULL;
-	int nr_start = *nr, ret = 0;
-	pte_t *ptep, *ptem;
-
-	/*
-	 * Keep the original mapped PTE value (ptem) around since we
-	 * might increment ptep off the end of the page when finishing
-	 * our loop iteration.
-	 */
-	ptem = ptep = pte_offset_map(&pmd, addr);
-	do {
-		pte_t pte = gup_get_pte(ptep);
-		struct page *page;
-
-		/* Similar to the PMD case, NUMA hinting must take slow path */
-		if (pte_protnone(pte))
-			break;
-
-		if (!pte_allows_gup(pte_val(pte), write))
-			break;
-
-		if (pte_devmap(pte)) {
-			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
-			if (unlikely(!pgmap)) {
-				undo_dev_pagemap(nr, nr_start, pages);
-				break;
-			}
-		} else if (pte_special(pte))
-			break;
-
-		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-		page = pte_page(pte);
-		get_page(page);
-		put_dev_pagemap(pgmap);
-		SetPageReferenced(page);
-		pages[*nr] = page;
-		(*nr)++;
-
-	} while (ptep++, addr += PAGE_SIZE, addr != end);
-	if (addr == end)
-		ret = 1;
-	pte_unmap(ptem);
-
-	return ret;
-}
-
-static inline void get_head_page_multiple(struct page *page, int nr)
-{
-	VM_BUG_ON_PAGE(page != compound_head(page), page);
-	VM_BUG_ON_PAGE(page_count(page) == 0, page);
-	page_ref_add(page, nr);
-	SetPageReferenced(page);
-}
-
-static int __gup_device_huge(unsigned long pfn, unsigned long addr,
-		unsigned long end, struct page **pages, int *nr)
-{
-	int nr_start = *nr;
-	struct dev_pagemap *pgmap = NULL;
-
-	do {
-		struct page *page = pfn_to_page(pfn);
-
-		pgmap = get_dev_pagemap(pfn, pgmap);
-		if (unlikely(!pgmap)) {
-			undo_dev_pagemap(nr, nr_start, pages);
-			return 0;
-		}
-		SetPageReferenced(page);
-		pages[*nr] = page;
-		get_page(page);
-		put_dev_pagemap(pgmap);
-		(*nr)++;
-		pfn++;
-	} while (addr += PAGE_SIZE, addr != end);
-	return 1;
-}
-
-static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
-		unsigned long end, struct page **pages, int *nr)
-{
-	unsigned long fault_pfn;
-
-	fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-	return __gup_device_huge(fault_pfn, addr, end, pages, nr);
-}
-
-static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
-		unsigned long end, struct page **pages, int *nr)
-{
-	unsigned long fault_pfn;
-
-	fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-	return __gup_device_huge(fault_pfn, addr, end, pages, nr);
-}
-
-static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
-{
-	struct page *head, *page;
-	int refs;
-
-	if (!pte_allows_gup(pmd_val(pmd), write))
-		return 0;
-
-	VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
-	if (pmd_devmap(pmd))
-		return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
-
-	/* hugepages are never "special" */
-	VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
-
-	refs = 0;
-	head = pmd_page(pmd);
-	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-	do {
-		VM_BUG_ON_PAGE(compound_head(page) != head, page);
-		pages[*nr] = page;
-		(*nr)++;
-		page++;
-		refs++;
-	} while (addr += PAGE_SIZE, addr != end);
-	get_head_page_multiple(head, refs);
-
-	return 1;
-}
-
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
-		int write, struct page **pages, int *nr)
-{
-	unsigned long next;
-	pmd_t *pmdp;
-
-	pmdp = pmd_offset(&pud, addr);
-	do {
-		pmd_t pmd = *pmdp;
-
-		next = pmd_addr_end(addr, end);
-		if (pmd_none(pmd))
-			return 0;
-		if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
-			/*
-			 * NUMA hinting faults need to be handled in the GUP
-			 * slowpath for accounting purposes and so that they
-			 * can be serialised against THP migration.
-			 */
-			if (pmd_protnone(pmd))
-				return 0;
-			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
-				return 0;
-		} else {
-			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
-				return 0;
-		}
-	} while (pmdp++, addr = next, addr != end);
-
-	return 1;
-}
-
-static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
-{
-	struct page *head, *page;
-	int refs;
-
-	if (!pte_allows_gup(pud_val(pud), write))
-		return 0;
-
-	VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
-	if (pud_devmap(pud))
-		return __gup_device_huge_pud(pud, addr, end, pages, nr);
-
-	/* hugepages are never "special" */
-	VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
-
-	refs = 0;
-	head = pud_page(pud);
-	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-	do {
-		VM_BUG_ON_PAGE(compound_head(page) != head, page);
-		pages[*nr] = page;
-		(*nr)++;
-		page++;
-		refs++;
-	} while (addr += PAGE_SIZE, addr != end);
-	get_head_page_multiple(head, refs);
-
-	return 1;
-}
-
-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
-			int write, struct page **pages, int *nr)
-{
-	unsigned long next;
-	pud_t *pudp;
-
-	pudp = pud_offset(&p4d, addr);
-	do {
-		pud_t pud = *pudp;
-
-		next = pud_addr_end(addr, end);
-		if (pud_none(pud))
-			return 0;
-		if (unlikely(pud_large(pud))) {
-			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
-				return 0;
-		} else {
-			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
-				return 0;
-		}
-	} while (pudp++, addr = next, addr != end);
-
-	return 1;
-}
-
-static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
-			int write, struct page **pages, int *nr)
-{
-	unsigned long next;
-	p4d_t *p4dp;
-
-	p4dp = p4d_offset(&pgd, addr);
-	do {
-		p4d_t p4d = *p4dp;
-
-		next = p4d_addr_end(addr, end);
-		if (p4d_none(p4d))
-			return 0;
-		BUILD_BUG_ON(p4d_large(p4d));
-		if (!gup_pud_range(p4d, addr, next, write, pages, nr))
-			return 0;
-	} while (p4dp++, addr = next, addr != end);
-
-	return 1;
-}
-
-/*
- * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
- * back to the regular GUP.
- */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
-			  struct page **pages)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr, len, end;
-	unsigned long next;
-	unsigned long flags;
-	pgd_t *pgdp;
-	int nr = 0;
-
-	start &= PAGE_MASK;
-	addr = start;
-	len = (unsigned long) nr_pages << PAGE_SHIFT;
-	end = start + len;
-	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
-					(void __user *)start, len)))
-		return 0;
-
-	/*
-	 * XXX: batch / limit 'nr', to avoid large irq off latency
-	 * needs some instrumenting to determine the common sizes used by
-	 * important workloads (eg. DB2), and whether limiting the batch size
-	 * will decrease performance.
-	 *
-	 * It seems like we're in the clear for the moment. Direct-IO is
-	 * the main guy that batches up lots of get_user_pages, and even
-	 * they are limited to 64-at-a-time which is not so many.
-	 */
-	/*
-	 * This doesn't prevent pagetable teardown, but does prevent
-	 * the pagetables and pages from being freed on x86.
-	 *
-	 * So long as we atomically load page table pointers versus teardown
-	 * (which we do on x86, with the above PAE exception), we can follow the
-	 * address down to the the page and take a ref on it.
-	 */
-	local_irq_save(flags);
-	pgdp = pgd_offset(mm, addr);
-	do {
-		pgd_t pgd = *pgdp;
-
-		next = pgd_addr_end(addr, end);
-		if (pgd_none(pgd))
-			break;
-		if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
-			break;
-	} while (pgdp++, addr = next, addr != end);
-	local_irq_restore(flags);
-
-	return nr;
-}
-
-/**
- * get_user_pages_fast() - pin user pages in memory
- * @start:	starting user address
- * @nr_pages:	number of pages from start to pin
- * @write:	whether pages will be written to
- * @pages:	array that receives pointers to the pages pinned.
- *		Should be at least nr_pages long.
- *
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
- * If not successful, it will fall back to taking the lock and
- * calling get_user_pages().
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
- */
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
-			struct page **pages)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr, len, end;
-	unsigned long next;
-	pgd_t *pgdp;
-	int nr = 0;
-
-	start &= PAGE_MASK;
-	addr = start;
-	len = (unsigned long) nr_pages << PAGE_SHIFT;
-
-	end = start + len;
-	if (end < start)
-		goto slow_irqon;
-
-#ifdef CONFIG_X86_64
-	if (end >> __VIRTUAL_MASK_SHIFT)
-		goto slow_irqon;
-#endif
-
-	/*
-	 * XXX: batch / limit 'nr', to avoid large irq off latency
-	 * needs some instrumenting to determine the common sizes used by
-	 * important workloads (eg. DB2), and whether limiting the batch size
-	 * will decrease performance.
-	 *
-	 * It seems like we're in the clear for the moment. Direct-IO is
-	 * the main guy that batches up lots of get_user_pages, and even
-	 * they are limited to 64-at-a-time which is not so many.
-	 */
-	/*
-	 * This doesn't prevent pagetable teardown, but does prevent
-	 * the pagetables and pages from being freed on x86.
-	 *
-	 * So long as we atomically load page table pointers versus teardown
-	 * (which we do on x86, with the above PAE exception), we can follow the
-	 * address down to the the page and take a ref on it.
-	 */
-	local_irq_disable();
-	pgdp = pgd_offset(mm, addr);
-	do {
-		pgd_t pgd = *pgdp;
-
-		next = pgd_addr_end(addr, end);
-		if (pgd_none(pgd))
-			goto slow;
-		if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
-			goto slow;
-	} while (pgdp++, addr = next, addr != end);
-	local_irq_enable();
-
-	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
-	return nr;
-
-	{
-		int ret;
-
-slow:
-		local_irq_enable();
-slow_irqon:
-		/* Try to get the remaining pages with get_user_pages */
-		start += nr << PAGE_SHIFT;
-		pages += nr;
-
-		ret = get_user_pages_unlocked(start,
-					      (end - start) >> PAGE_SHIFT,
-					      pages, write ? FOLL_WRITE : 0);
-
-		/* Have to be a bit careful with return values */
-		if (nr > 0) {
-			if (ret < 0)
-				ret = nr;
-			else
-				ret += nr;
-		}
-
-		return ret;
-	}
-}
diff --git a/mm/Kconfig b/mm/Kconfig
index beb7a455915d..398b46064544 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP
 config HAVE_MEMBLOCK_PHYS_MAP
 	bool
 
-config HAVE_GENERIC_RCU_GUP
+config HAVE_GENERIC_GUP
 	bool
 
 config ARCH_DISCARD_MEMBLOCK
diff --git a/mm/gup.c b/mm/gup.c
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1151,7 +1151,7 @@ struct page *get_dump_page(unsigned long addr)
 #endif /* CONFIG_ELF_CORE */
 
 /*
- * Generic RCU Fast GUP
+ * Generic Fast GUP
  *
  * get_user_pages_fast attempts to pin user pages by walking the page
  * tables directly and avoids taking locks.  Thus the walker needs to be
@@ -1172,8 +1172,8 @@ struct page *get_dump_page(unsigned long addr)
  * Before activating this code, please be aware that the following assumptions
  * are currently made:
  *
- *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
- *      pages containing page tables.
+ *  *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
+ *      free pages containing page tables or TLB flushing requires IPI broadcast.
  *
  *  *) ptes can be read atomically by the architecture.
  *
@@ -1183,7 +1183,7 @@ struct page *get_dump_page(unsigned long addr)
  *
  * This code is based heavily on the PowerPC implementation by Nick Piggin.
  */
-#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
+#ifdef CONFIG_HAVE_GENERIC_GUP
 
 #ifndef gup_get_pte
 /*
@@ -1673,4 +1673,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	return ret;
 }
 
-#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
+#endif /* CONFIG_HAVE_GENERIC_GUP */