author	Dan Williams <dan.j.williams@intel.com>	2016-01-15 19:56:55 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-15 20:56:32 -0500
commit	3565fce3a6597e91b8dee3e8e36ebf70f8b7ef9b (patch)
tree	54f05861c87cb2c2710552b61a46cb5831b06296 /mm
parent	5c7fb56e5e3f7035dd798a8e1adee639f87043e5 (diff)
mm, x86: get_user_pages() for dax mappings
A dax mapping establishes a pte with _PAGE_DEVMAP set when the driver
has established a devm_memremap_pages() mapping, i.e. when the pfn_t
returned from ->direct_access() has PFN_DEV and PFN_MAP set. Later, when
encountering _PAGE_DEVMAP during a page table walk, we look up and pin a
struct dev_pagemap instance to keep the result of pfn_to_page() valid
until put_page().
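
In short, wherever the walk finds a devmap entry, gup first takes a
reference on the backing dev_pagemap, then pins the page, then drops the
transient pgmap reference. A minimal sketch of that lookup-and-pin
pattern follows; it is illustrative only, not part of the patch, and the
helper name follow_devmap_pte() is hypothetical:

/*
 * Illustrative sketch only -- condenses the pte_devmap() handling that
 * the patch adds inline to follow_page_pte() in mm/gup.c.
 */
static struct page *follow_devmap_pte(pte_t pte, unsigned int flags)
{
	struct dev_pagemap *pgmap;
	struct page *page;

	/* device pages are only safe to return while a reference is held */
	if (!(flags & FOLL_GET))
		return NULL;

	pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
	if (!pgmap)
		return NULL;

	page = pte_page(pte);
	get_page(page);			/* pin the page itself ... */
	put_dev_pagemap(pgmap);		/* ... then drop the transient pgmap ref */

	return page;
}

Once the page is pinned, pfn_to_page() stays valid until put_page(), so
the transient pgmap reference can be released immediately, as the
in-tree comment ("drop the pgmap reference now that we hold the page")
notes below.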
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--	mm/gup.c	30
-rw-r--r--	mm/huge_memory.c	75
-rw-r--r--	mm/swap.c	1
3 files changed, 89 insertions(+), 17 deletions(-)
diff --git a/mm/gup.c b/mm/gup.c
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 
 #include <linux/mm.h>
+#include <linux/memremap.h>
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
@@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd, unsigned int flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
+	struct dev_pagemap *pgmap = NULL;
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t *ptep, pte;
@@ -98,7 +100,17 @@ retry:
 	}
 
 	page = vm_normal_page(vma, address, pte);
-	if (unlikely(!page)) {
+	if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
+		/*
+		 * Only return device mapping pages in the FOLL_GET case since
+		 * they are only valid while holding the pgmap reference.
+		 */
+		pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
+		if (pgmap)
+			page = pte_page(pte);
+		else
+			goto no_page;
+	} else if (unlikely(!page)) {
 		if (flags & FOLL_DUMP) {
 			/* Avoid special (like zero) pages in core dumps */
 			page = ERR_PTR(-EFAULT);
@@ -129,8 +141,15 @@ retry:
 		goto retry;
 	}
 
-	if (flags & FOLL_GET)
+	if (flags & FOLL_GET) {
 		get_page(page);
+
+		/* drop the pgmap reference now that we hold the page */
+		if (pgmap) {
+			put_dev_pagemap(pgmap);
+			pgmap = NULL;
+		}
+	}
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
@@ -237,6 +256,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 	}
 	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
 		return no_page_table(vma, flags);
+	if (pmd_devmap(*pmd)) {
+		ptl = pmd_lock(mm, pmd);
+		page = follow_devmap_pmd(vma, address, pmd, flags);
+		spin_unlock(ptl);
+		if (page)
+			return page;
+	}
 	if (likely(!pmd_trans_huge(*pmd)))
 		return follow_page_pte(vma, address, pmd, flags);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 82bed2bec3ed..b2db98136af9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -23,6 +23,7 @@
 #include <linux/freezer.h>
 #include <linux/pfn_t.h>
 #include <linux/mman.h>
+#include <linux/memremap.h>
 #include <linux/pagemap.h>
 #include <linux/debugfs.h>
 #include <linux/migrate.h>
@@ -974,6 +975,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return VM_FAULT_NOPAGE;
 }
 
+static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd)
+{
+	pmd_t _pmd;
+
+	/*
+	 * We should set the dirty bit only for FOLL_WRITE but for now
+	 * the dirty bit in the pmd is meaningless.  And if the dirty
+	 * bit will become meaningful and we'll only set it with
+	 * FOLL_WRITE, an atomic set_bit will be required on the pmd to
+	 * set the young bit, instead of the current set_pmd_at.
+	 */
+	_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+				pmd, _pmd, 1))
+		update_mmu_cache_pmd(vma, addr, pmd);
+}
+
+struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, int flags)
+{
+	unsigned long pfn = pmd_pfn(*pmd);
+	struct mm_struct *mm = vma->vm_mm;
+	struct dev_pagemap *pgmap;
+	struct page *page;
+
+	assert_spin_locked(pmd_lockptr(mm, pmd));
+
+	if (flags & FOLL_WRITE && !pmd_write(*pmd))
+		return NULL;
+
+	if (pmd_present(*pmd) && pmd_devmap(*pmd))
+		/* pass */;
+	else
+		return NULL;
+
+	if (flags & FOLL_TOUCH)
+		touch_pmd(vma, addr, pmd);
+
+	/*
+	 * device mapped pages can only be returned if the
+	 * caller will manage the page reference count.
+	 */
+	if (!(flags & FOLL_GET))
+		return ERR_PTR(-EEXIST);
+
+	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
+	pgmap = get_dev_pagemap(pfn, NULL);
+	if (!pgmap)
+		return ERR_PTR(-EFAULT);
+	page = pfn_to_page(pfn);
+	get_page(page);
+	put_dev_pagemap(pgmap);
+
+	return page;
+}
+
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		  struct vm_area_struct *vma)
@@ -1331,21 +1389,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 
 	page = pmd_page(*pmd);
 	VM_BUG_ON_PAGE(!PageHead(page), page);
-	if (flags & FOLL_TOUCH) {
-		pmd_t _pmd;
-		/*
-		 * We should set the dirty bit only for FOLL_WRITE but
-		 * for now the dirty bit in the pmd is meaningless.
-		 * And if the dirty bit will become meaningful and
-		 * we'll only set it with FOLL_WRITE, an atomic
-		 * set_bit will be required on the pmd to set the
-		 * young bit, instead of the current set_pmd_at.
-		 */
-		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
-		if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-					  pmd, _pmd, 1))
-			update_mmu_cache_pmd(vma, addr, pmd);
-	}
+	if (flags & FOLL_TOUCH)
+		touch_pmd(vma, addr, pmd);
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*
 		 * We don't mlock() pte-mapped THPs. This way we can avoid
diff --git a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/mm_inline.h>
 #include <linux/percpu_counter.h>
+#include <linux/memremap.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>