author    Dan Williams <dan.j.williams@intel.com>  2017-04-28 13:23:37 -0400
committer Ingo Molnar <mingo@kernel.org>           2017-05-01 03:15:53 -0400
commit    71389703839ebe9cb426c72d5f0bd549592e583c (patch)
tree      ebae9604d3f43ce673a103c4b897ebac57d9aa57
parent    dbd68d8e84c606673ebbcf15862f8c155fa92326 (diff)
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash
The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good.

The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present for regular pages. (The pmem driver does this because it can cover huge memory areas.)

But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative(), which is faster but does not do the get_zone_device_page() call the pmem code relies on.

One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP-using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.)

So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page(), and restructure the pmem completion-wait and teardown machinery:

Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive, and use the final put of the page to drop that reference.

This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_pages_release(), since it now maintains its own elevated reference.

This speeds things up while also making the pmem refcounting more robust going forward.

Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Tested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
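For orientation before the hunks below, here is a minimal sketch of the new reference flow, hand-condensed from this diff. It is not a compilable unit on its own (the surrounding setup and teardown scaffolding is elided), and the grouping of fragments is editorial, not literal kernel code:

	/* devm_memremap_pages(): each ZONE_DEVICE page pins the pagemap once */
	for_each_device_pfn(pfn, page_map) {
		page->pgmap = pgmap;
		percpu_ref_get(ref);
	}

	/* __put_page(): the final put of a ZONE_DEVICE page drops that pin */
	if (is_zone_device_page(page)) {
		put_dev_pagemap(page->pgmap);
		return;		/* device pages never go back to the page allocator */
	}

	/* devm_memremap_pages_release(): drop every per-page pin; only after
	 * this may the driver's wait_for_completion()/percpu_ref_exit() run */
	for_each_device_pfn(pfn, page_map)
		put_page(pfn_to_page(pfn));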
-rw-r--r--  drivers/dax/pmem.c     |  2
-rw-r--r--  drivers/nvdimm/pmem.c  | 13
-rw-r--r--  include/linux/mm.h     | 14
-rw-r--r--  kernel/memremap.c      | 22
-rw-r--r--  mm/swap.c              | 10
5 files changed, 31 insertions(+), 30 deletions(-)
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index 033f49b31fdc..cb0d742fa23f 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -43,6 +43,7 @@ static void dax_pmem_percpu_exit(void *data)
 	struct dax_pmem *dax_pmem = to_dax_pmem(ref);
 
 	dev_dbg(dax_pmem->dev, "%s\n", __func__);
+	wait_for_completion(&dax_pmem->cmp);
 	percpu_ref_exit(ref);
 }
 
@@ -53,7 +54,6 @@ static void dax_pmem_percpu_kill(void *data)
 
 	dev_dbg(dax_pmem->dev, "%s\n", __func__);
 	percpu_ref_kill(ref);
-	wait_for_completion(&dax_pmem->cmp);
 }
 
 static int dax_pmem_probe(struct device *dev)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 5b536be5a12e..fb7bbc79ac26 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -25,6 +25,7 @@
 #include <linux/badblocks.h>
 #include <linux/memremap.h>
 #include <linux/vmalloc.h>
+#include <linux/blk-mq.h>
 #include <linux/pfn_t.h>
 #include <linux/slab.h>
 #include <linux/pmem.h>
@@ -231,6 +232,11 @@ static void pmem_release_queue(void *q)
 	blk_cleanup_queue(q);
 }
 
+static void pmem_freeze_queue(void *q)
+{
+	blk_mq_freeze_queue_start(q);
+}
+
 static void pmem_release_disk(void *disk)
 {
 	del_gendisk(disk);
@@ -284,6 +290,9 @@ static int pmem_attach_disk(struct device *dev,
 	if (!q)
 		return -ENOMEM;
 
+	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
+		return -ENOMEM;
+
 	pmem->pfn_flags = PFN_DEV;
 	if (is_nd_pfn(dev)) {
 		addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
@@ -303,10 +312,10 @@ static int pmem_attach_disk(struct device *dev,
 			pmem->size, ARCH_MEMREMAP_PMEM);
 
 	/*
-	 * At release time the queue must be dead before
+	 * At release time the queue must be frozen before
 	 * devm_memremap_pages is unwound
 	 */
-	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
+	if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
 		return -ENOMEM;
 
 	if (IS_ERR(addr))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a835edd2db34..695da2a19b4c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -762,19 +762,11 @@ static inline enum zone_type page_zonenum(const struct page *page)
 }
 
 #ifdef CONFIG_ZONE_DEVICE
-void get_zone_device_page(struct page *page);
-void put_zone_device_page(struct page *page);
 static inline bool is_zone_device_page(const struct page *page)
 {
 	return page_zonenum(page) == ZONE_DEVICE;
 }
 #else
-static inline void get_zone_device_page(struct page *page)
-{
-}
-static inline void put_zone_device_page(struct page *page)
-{
-}
 static inline bool is_zone_device_page(const struct page *page)
 {
 	return false;
@@ -790,9 +782,6 @@ static inline void get_page(struct page *page)
 	 */
 	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
 	page_ref_inc(page);
-
-	if (unlikely(is_zone_device_page(page)))
-		get_zone_device_page(page);
 }
 
 static inline void put_page(struct page *page)
@@ -801,9 +790,6 @@ static inline void put_page(struct page *page)
 
 	if (put_page_testzero(page))
 		__put_page(page);
-
-	if (unlikely(is_zone_device_page(page)))
-		put_zone_device_page(page);
 }
 
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 07e85e5229da..23a6483c3666 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -182,18 +182,6 @@ struct page_map {
 	struct vmem_altmap altmap;
 };
 
-void get_zone_device_page(struct page *page)
-{
-	percpu_ref_get(page->pgmap->ref);
-}
-EXPORT_SYMBOL(get_zone_device_page);
-
-void put_zone_device_page(struct page *page)
-{
-	put_dev_pagemap(page->pgmap);
-}
-EXPORT_SYMBOL(put_zone_device_page);
-
 static void pgmap_radix_release(struct resource *res)
 {
 	resource_size_t key, align_start, align_size, align_end;
@@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
 	struct resource *res = &page_map->res;
 	resource_size_t align_start, align_size;
 	struct dev_pagemap *pgmap = &page_map->pgmap;
+	unsigned long pfn;
+
+	for_each_device_pfn(pfn, page_map)
+		put_page(pfn_to_page(pfn));
 
 	if (percpu_ref_tryget_live(pgmap->ref)) {
 		dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
@@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
  *
  * Notes:
  * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
- *    (or devm release event).
+ *    (or devm release event). The expected order of events is that @ref has
+ *    been through percpu_ref_kill() before devm_memremap_pages_release(). The
+ *    wait for the completion of all references being dropped and
+ *    percpu_ref_exit() must occur after devm_memremap_pages_release().
  *
  * 2/ @res is expected to be a host memory range that could feasibly be
  *    treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 		 */
 		list_del(&page->lru);
 		page->pgmap = pgmap;
+		percpu_ref_get(ref);
 	}
 	devres_add(dev, page_map);
 	return __va(res->start);
diff --git a/mm/swap.c b/mm/swap.c
index c4910f14f957..a4e6113276b5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page)
 
 void __put_page(struct page *page)
 {
+	if (is_zone_device_page(page)) {
+		put_dev_pagemap(page->pgmap);
+
+		/*
+		 * The page belongs to the device that created pgmap. Do
+		 * not return it to page allocator.
+		 */
+		return;
+	}
+
 	if (unlikely(PageCompound(page)))
 		__put_compound_page(page);
 	else