-rw-r--r--  arch/powerpc/platforms/Kconfig                          |   1
-rw-r--r--  arch/powerpc/sysdev/axonram.c                           |  45
-rw-r--r--  arch/x86/include/asm/pmem.h                             |   5
-rw-r--r--  arch/x86/include/asm/string_64.h                        |   1
-rw-r--r--  block/Kconfig                                           |   1
-rw-r--r--  block/partition-generic.c                               |  17
-rw-r--r--  drivers/Makefile                                        |   2
-rw-r--r--  drivers/acpi/nfit/Kconfig                               |  12
-rw-r--r--  drivers/acpi/nfit/core.c                                | 233
-rw-r--r--  drivers/acpi/nfit/nfit.h                                |   4
-rw-r--r--  drivers/block/Kconfig                                   |   1
-rw-r--r--  drivers/block/brd.c                                     |  48
-rw-r--r--  drivers/dax/Kconfig                                     |  10
-rw-r--r--  drivers/dax/Makefile                                    |   5
-rw-r--r--  drivers/dax/dax-private.h                               |  57
-rw-r--r--  drivers/dax/dax.h                                       |  15
-rw-r--r--  drivers/dax/device-dax.h                                |  25
-rw-r--r--  drivers/dax/device.c (renamed from drivers/dax/dax.c)   | 501
-rw-r--r--  drivers/dax/pmem.c                                      |  10
-rw-r--r--  drivers/dax/super.c                                     | 425
-rw-r--r--  drivers/md/Kconfig                                      |   1
-rw-r--r--  drivers/md/dm-core.h                                    |   1
-rw-r--r--  drivers/md/dm-linear.c                                  |  27
-rw-r--r--  drivers/md/dm-snap.c                                    |   6
-rw-r--r--  drivers/md/dm-stripe.c                                  |  29
-rw-r--r--  drivers/md/dm-target.c                                  |   6
-rw-r--r--  drivers/md/dm.c                                         |  67
-rw-r--r--  drivers/nvdimm/Kconfig                                  |   1
-rw-r--r--  drivers/nvdimm/btt_devs.c                               |   2
-rw-r--r--  drivers/nvdimm/bus.c                                    | 122
-rw-r--r--  drivers/nvdimm/claim.c                                  |  37
-rw-r--r--  drivers/nvdimm/core.c                                   |  51
-rw-r--r--  drivers/nvdimm/dax_devs.c                               |   2
-rw-r--r--  drivers/nvdimm/dimm.c                                   |   2
-rw-r--r--  drivers/nvdimm/dimm_devs.c                              |  19
-rw-r--r--  drivers/nvdimm/namespace_devs.c                         |  17
-rw-r--r--  drivers/nvdimm/nd-core.h                                |   1
-rw-r--r--  drivers/nvdimm/nd.h                                     |   2
-rw-r--r--  drivers/nvdimm/pfn_devs.c                               |   8
-rw-r--r--  drivers/nvdimm/pmem.c                                   |  90
-rw-r--r--  drivers/nvdimm/pmem.h                                   |   7
-rw-r--r--  drivers/nvdimm/region.c                                 |  24
-rw-r--r--  drivers/nvdimm/region_devs.c                            |  83
-rw-r--r--  drivers/s390/block/Kconfig                              |   1
-rw-r--r--  drivers/s390/block/dcssblk.c                            |  45
-rw-r--r--  fs/block_dev.c                                          | 117
-rw-r--r--  fs/dax.c                                                | 297
-rw-r--r--  fs/ext2/inode.c                                         |   9
-rw-r--r--  fs/ext4/inode.c                                         |   9
-rw-r--r--  fs/iomap.c                                              |   3
-rw-r--r--  fs/xfs/xfs_iomap.c                                      |  10
-rw-r--r--  include/linux/blkdev.h                                  |  19
-rw-r--r--  include/linux/dax.h                                     |  34
-rw-r--r--  include/linux/device-mapper.h                           |   8
-rw-r--r--  include/linux/iomap.h                                   |   1
-rw-r--r--  include/linux/libnvdimm.h                               |   8
-rw-r--r--  include/linux/pmem.h                                    |  23
-rw-r--r--  include/linux/string.h                                  |   8
-rw-r--r--  include/uapi/linux/ndctl.h                              |   1
-rw-r--r--  tools/testing/nvdimm/Kbuild                             |  11
-rw-r--r--  tools/testing/nvdimm/dax-dev.c                          |  49
-rw-r--r--  tools/testing/nvdimm/pmem-dax.c                         |  21
-rw-r--r--  tools/testing/nvdimm/test/nfit.c                        |  54
63 files changed, 1807 insertions(+), 944 deletions(-)
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 7e3a2ebba29b..33244e3d9375 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -284,6 +284,7 @@ config CPM2
 config AXON_RAM
 	tristate "Axon DDR2 memory device driver"
 	depends on PPC_IBM_CELL_BLADE && BLOCK
+	select DAX
 	default m
 	help
 	  It registers one block device per Axon's DDR2 memory bank found
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index f523ac883150..a7fe5fee744f 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -25,6 +25,7 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/dax.h>
 #include <linux/device.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -62,6 +63,7 @@ static int azfs_major, azfs_minor;
 struct axon_ram_bank {
 	struct platform_device	*device;
 	struct gendisk		*disk;
+	struct dax_device	*dax_dev;
 	unsigned int		irq_id;
 	unsigned long		ph_addr;
 	unsigned long		io_addr;
@@ -137,25 +139,32 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
 	return BLK_QC_T_NONE;
 }
 
-/**
- * axon_ram_direct_access - direct_access() method for block device
- * @device, @sector, @data: see block_device_operations method
- */
+static const struct block_device_operations axon_ram_devops = {
+	.owner		= THIS_MODULE,
+};
+
 static long
-axon_ram_direct_access(struct block_device *device, sector_t sector,
-		       void **kaddr, pfn_t *pfn, long size)
+__axon_ram_direct_access(struct axon_ram_bank *bank, pgoff_t pgoff, long nr_pages,
+		       void **kaddr, pfn_t *pfn)
 {
-	struct axon_ram_bank *bank = device->bd_disk->private_data;
-	loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
+	resource_size_t offset = pgoff * PAGE_SIZE;
 
 	*kaddr = (void *) bank->io_addr + offset;
 	*pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV);
-	return bank->size - offset;
+	return (bank->size - offset) / PAGE_SIZE;
 }
 
-static const struct block_device_operations axon_ram_devops = {
-	.owner		= THIS_MODULE,
-	.direct_access = axon_ram_direct_access
+static long
+axon_ram_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
+		       void **kaddr, pfn_t *pfn)
+{
+	struct axon_ram_bank *bank = dax_get_private(dax_dev);
+
+	return __axon_ram_direct_access(bank, pgoff, nr_pages, kaddr, pfn);
+}
+
+static const struct dax_operations axon_ram_dax_ops = {
+	.direct_access = axon_ram_dax_direct_access,
 };
 
 /**
@@ -219,6 +228,7 @@ static int axon_ram_probe(struct platform_device *device)
 		goto failed;
 	}
 
+
 	bank->disk->major = azfs_major;
 	bank->disk->first_minor = azfs_minor;
 	bank->disk->fops = &axon_ram_devops;
@@ -227,6 +237,13 @@ static int axon_ram_probe(struct platform_device *device)
 	sprintf(bank->disk->disk_name, "%s%d",
 			AXON_RAM_DEVICE_NAME, axon_ram_bank_id);
 
+	bank->dax_dev = alloc_dax(bank, bank->disk->disk_name,
+			&axon_ram_dax_ops);
+	if (!bank->dax_dev) {
+		rc = -ENOMEM;
+		goto failed;
+	}
+
 	bank->disk->queue = blk_alloc_queue(GFP_KERNEL);
 	if (bank->disk->queue == NULL) {
 		dev_err(&device->dev, "Cannot register disk queue\n");
@@ -278,6 +295,8 @@ failed:
 		del_gendisk(bank->disk);
 		put_disk(bank->disk);
 	}
+	kill_dax(bank->dax_dev);
+	put_dax(bank->dax_dev);
 	device->dev.platform_data = NULL;
 	if (bank->io_addr != 0)
 		iounmap((void __iomem *) bank->io_addr);
@@ -300,6 +319,8 @@ axon_ram_remove(struct platform_device *device)
 
 	device_remove_file(&device->dev, &dev_attr_ecc);
 	free_irq(bank->irq_id, device);
+	kill_dax(bank->dax_dev);
+	put_dax(bank->dax_dev);
 	del_gendisk(bank->disk);
 	put_disk(bank->disk);
 	iounmap((void __iomem *) bank->io_addr);
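
The axonram change above is the template for every DAX-capable block driver in this series: ->direct_access moves off block_device_operations onto a dax_operations table bound to a separately allocated dax_device, and the units change from sector/byte-count to pgoff/page-count. A minimal sketch of that recurring pattern, assuming only the alloc_dax(), dax_get_private(), kill_dax() and put_dax() calls visible in this diff; the foo_* names are hypothetical:

/* Hypothetical driver "foo", sketching the conversion pattern above. */
struct foo_dev {
	void *base;			/* kernel mapping of the memory */
	phys_addr_t phys;		/* physical base address */
	long nr_pages;			/* device size in pages */
	struct dax_device *dax_dev;
};

static long foo_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
		long nr_pages, void **kaddr, pfn_t *pfn)
{
	struct foo_dev *foo = dax_get_private(dax_dev);

	*kaddr = foo->base + pgoff * PAGE_SIZE;
	*pfn = phys_to_pfn_t(foo->phys + pgoff * PAGE_SIZE, PFN_DEV);
	return foo->nr_pages - pgoff;	/* contiguous pages from pgoff */
}

static const struct dax_operations foo_dax_ops = {
	.direct_access = foo_dax_direct_access,
};

Probe paths then call alloc_dax(foo, name, &foo_dax_ops), and teardown paths pair kill_dax() with put_dax(), exactly as axonram does above.
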
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index 529bb4a6487a..d5a22bac9988 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -44,11 +44,6 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
 	BUG();
 }
 
-static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
-{
-	return memcpy_mcsafe(dst, src, n);
-}
-
 /**
  * arch_wb_cache_pmem - write back a cache range with CLWB
  * @vaddr:	virtual start address
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index a164862d77e3..733bae07fb29 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -79,6 +79,7 @@ int strcmp(const char *cs, const char *ct);
 #define memset(s, c, n) __memset(s, c, n)
 #endif
 
+#define __HAVE_ARCH_MEMCPY_MCSAFE 1
 __must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);
 DECLARE_STATIC_KEY_FALSE(mcsafe_key);
 
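
Defining __HAVE_ARCH_MEMCPY_MCSAFE lets common code key off the architecture capability instead of the pmem-local wrapper deleted above; the diffstat shows include/linux/string.h growing by 8 lines, presumably a generic fallback along these lines (a sketch, not the hunk itself, which is not shown in this view):

#ifndef __HAVE_ARCH_MEMCPY_MCSAFE
static inline __must_check int memcpy_mcsafe(void *dst, const void *src,
		size_t cnt)
{
	/* no machine-check recovery available: a plain copy that "succeeds" */
	memcpy(dst, src, cnt);
	return 0;
}
#endif
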
diff --git a/block/Kconfig b/block/Kconfig
index 89cd28f8d051..a8ad7e77db28 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -6,6 +6,7 @@ menuconfig BLOCK
 	default y
 	select SBITMAP
 	select SRCU
+	select DAX
 	help
 	 Provide block layer support for the kernel.
 
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 0171a2faad68..ff07b9143ca4 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -16,7 +16,6 @@
 #include <linux/kmod.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
-#include <linux/dax.h>
 #include <linux/blktrace_api.h>
 
 #include "partitions/check.h"
@@ -630,24 +629,12 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
 	return 0;
 }
 
-static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n)
-{
-	struct address_space *mapping = bdev->bd_inode->i_mapping;
-
-	return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)),
-				 NULL);
-}
-
 unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
 {
+	struct address_space *mapping = bdev->bd_inode->i_mapping;
 	struct page *page;
 
-	/* don't populate page cache for dax capable devices */
-	if (IS_DAX(bdev->bd_inode))
-		page = read_dax_sector(bdev, n);
-	else
-		page = read_pagecache_sector(bdev, n);
-
+	page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)), NULL);
 	if (!IS_ERR(page)) {
 		if (PageError(page))
 			goto fail;
diff --git a/drivers/Makefile b/drivers/Makefile
index 8f8bdc9e3d29..903b19199b69 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_PARPORT) += parport/
 obj-$(CONFIG_NVM)		+= lightnvm/
 obj-y				+= base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM)	+= nvdimm/
-obj-$(CONFIG_DEV_DAX)		+= dax/
+obj-$(CONFIG_DAX)		+= dax/
 obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
 obj-$(CONFIG_NUBUS)		+= nubus/
 obj-y				+= macintosh/
diff --git a/drivers/acpi/nfit/Kconfig b/drivers/acpi/nfit/Kconfig
index dd0d53c52552..6d3351452ea2 100644
--- a/drivers/acpi/nfit/Kconfig
+++ b/drivers/acpi/nfit/Kconfig
@@ -12,15 +12,3 @@ config ACPI_NFIT
 
 	  To compile this driver as a module, choose M here:
 	  the module will be called nfit.
-
-config ACPI_NFIT_DEBUG
-	bool "NFIT DSM debug"
-	depends on ACPI_NFIT
-	depends on DYNAMIC_DEBUG
-	default n
-	help
-	  Enabling this option causes the nfit driver to dump the
-	  input and output buffers of _DSM operations on the ACPI0012
-	  device and its children.  This can be very verbose, so leave
-	  it disabled unless you are debugging a hardware / firmware
-	  issue.
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index c8ea9d698cd0..656acb5d7166 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -49,7 +49,16 @@ MODULE_PARM_DESC(scrub_overflow_abort,
 static bool disable_vendor_specific;
 module_param(disable_vendor_specific, bool, S_IRUGO);
 MODULE_PARM_DESC(disable_vendor_specific,
-		"Limit commands to the publicly specified set\n");
+		"Limit commands to the publicly specified set");
+
+static unsigned long override_dsm_mask;
+module_param(override_dsm_mask, ulong, S_IRUGO);
+MODULE_PARM_DESC(override_dsm_mask, "Bitmask of allowed NVDIMM DSM functions");
+
+static int default_dsm_family = -1;
+module_param(default_dsm_family, int, S_IRUGO);
+MODULE_PARM_DESC(default_dsm_family,
+		"Try this DSM type first when identifying NVDIMM family");
 
 LIST_HEAD(acpi_descs);
 DEFINE_MUTEX(acpi_desc_lock);
@@ -175,14 +184,29 @@ static int xlat_bus_status(void *buf, unsigned int cmd, u32 status)
 	return 0;
 }
 
+static int xlat_nvdimm_status(void *buf, unsigned int cmd, u32 status)
+{
+	switch (cmd) {
+	case ND_CMD_GET_CONFIG_SIZE:
+		if (status >> 16 & ND_CONFIG_LOCKED)
+			return -EACCES;
+		break;
+	default:
+		break;
+	}
+
+	/* all other non-zero status results in an error */
+	if (status)
+		return -EIO;
+	return 0;
+}
+
 static int xlat_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd,
 		u32 status)
 {
 	if (!nvdimm)
 		return xlat_bus_status(buf, cmd, status);
-	if (status)
-		return -EIO;
-	return 0;
+	return xlat_nvdimm_status(buf, cmd, status);
 }
 
 int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
@@ -259,14 +283,11 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 		in_buf.buffer.length = call_pkg->nd_size_in;
 	}
 
-	if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) {
-		dev_dbg(dev, "%s:%s cmd: %d: func: %d input length: %d\n",
-				__func__, dimm_name, cmd, func,
-				in_buf.buffer.length);
-		print_hex_dump_debug("nvdimm in  ", DUMP_PREFIX_OFFSET, 4, 4,
+	dev_dbg(dev, "%s:%s cmd: %d: func: %d input length: %d\n",
+			__func__, dimm_name, cmd, func, in_buf.buffer.length);
+	print_hex_dump_debug("nvdimm in  ", DUMP_PREFIX_OFFSET, 4, 4,
 			in_buf.buffer.pointer,
 			min_t(u32, 256, in_buf.buffer.length), true);
-	}
 
 	out_obj = acpi_evaluate_dsm(handle, uuid, 1, func, &in_obj);
 	if (!out_obj) {
@@ -298,13 +319,11 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 		goto out;
 	}
 
-	if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) {
-		dev_dbg(dev, "%s:%s cmd: %s output length: %d\n", __func__,
-				dimm_name, cmd_name, out_obj->buffer.length);
-		print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4,
-				4, out_obj->buffer.pointer, min_t(u32, 128,
-					out_obj->buffer.length), true);
-	}
+	dev_dbg(dev, "%s:%s cmd: %s output length: %d\n", __func__, dimm_name,
+			cmd_name, out_obj->buffer.length);
+	print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4, 4,
+			out_obj->buffer.pointer,
+			min_t(u32, 128, out_obj->buffer.length), true);
 
 	for (i = 0, offset = 0; i < desc->out_num; i++) {
 		u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, buf,
@@ -448,9 +467,9 @@ static bool add_memdev(struct acpi_nfit_desc *acpi_desc,
 	INIT_LIST_HEAD(&nfit_memdev->list);
 	memcpy(nfit_memdev->memdev, memdev, sizeof(*memdev));
 	list_add_tail(&nfit_memdev->list, &acpi_desc->memdevs);
-	dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d\n",
+	dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d flags: %#x\n",
 			__func__, memdev->device_handle, memdev->range_index,
-			memdev->region_index);
+			memdev->region_index, memdev->flags);
 	return true;
 }
 
@@ -729,28 +748,38 @@ static void nfit_mem_init_bdw(struct acpi_nfit_desc *acpi_desc,
 	}
 }
 
-static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
+static int __nfit_mem_init(struct acpi_nfit_desc *acpi_desc,
 		struct acpi_nfit_system_address *spa)
 {
 	struct nfit_mem *nfit_mem, *found;
 	struct nfit_memdev *nfit_memdev;
-	int type = nfit_spa_type(spa);
+	int type = spa ? nfit_spa_type(spa) : 0;
 
 	switch (type) {
 	case NFIT_SPA_DCR:
 	case NFIT_SPA_PM:
 		break;
 	default:
-		return 0;
+		if (spa)
+			return 0;
 	}
 
+	/*
+	 * This loop runs in two modes, when a dimm is mapped the loop
+	 * adds memdev associations to an existing dimm, or creates a
+	 * dimm. In the unmapped dimm case this loop sweeps for memdev
+	 * instances with an invalid / zero range_index and adds those
+	 * dimms without spa associations.
+	 */
 	list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
 		struct nfit_flush *nfit_flush;
 		struct nfit_dcr *nfit_dcr;
 		u32 device_handle;
 		u16 dcr;
 
-		if (nfit_memdev->memdev->range_index != spa->range_index)
+		if (spa && nfit_memdev->memdev->range_index != spa->range_index)
+			continue;
+		if (!spa && nfit_memdev->memdev->range_index)
 			continue;
 		found = NULL;
 		dcr = nfit_memdev->memdev->region_index;
@@ -835,14 +864,15 @@ static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
 				break;
 			}
 			nfit_mem_init_bdw(acpi_desc, nfit_mem, spa);
-		} else {
+		} else if (type == NFIT_SPA_PM) {
 			/*
 			 * A single dimm may belong to multiple SPA-PM
 			 * ranges, record at least one in addition to
 			 * any SPA-DCR range.
 			 */
 			nfit_mem->memdev_pmem = nfit_memdev->memdev;
-		}
+		} else
+			nfit_mem->memdev_dcr = nfit_memdev->memdev;
 	}
 
 	return 0;
@@ -866,6 +896,8 @@ static int nfit_mem_cmp(void *priv, struct list_head *_a, struct list_head *_b)
 static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc)
 {
 	struct nfit_spa *nfit_spa;
+	int rc;
+
 
 	/*
 	 * For each SPA-DCR or SPA-PMEM address range find its
@@ -876,13 +908,20 @@ static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc)
 	 * BDWs are optional.
 	 */
 	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
-		int rc;
-
-		rc = nfit_mem_dcr_init(acpi_desc, nfit_spa->spa);
+		rc = __nfit_mem_init(acpi_desc, nfit_spa->spa);
 		if (rc)
 			return rc;
 	}
 
+	/*
+	 * If a DIMM has failed to be mapped into SPA there will be no
+	 * SPA entries above. Find and register all the unmapped DIMMs
+	 * for reporting and recovery purposes.
+	 */
+	rc = __nfit_mem_init(acpi_desc, NULL);
+	if (rc)
+		return rc;
+
 	list_sort(NULL, &acpi_desc->dimms, nfit_mem_cmp);
 
 	return 0;
@@ -1237,12 +1276,14 @@ static ssize_t flags_show(struct device *dev,
 {
 	u16 flags = to_nfit_memdev(dev)->flags;
 
-	return sprintf(buf, "%s%s%s%s%s\n",
+	return sprintf(buf, "%s%s%s%s%s%s%s\n",
 		flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save_fail " : "",
 		flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore_fail " : "",
 		flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush_fail " : "",
 		flags & ACPI_NFIT_MEM_NOT_ARMED ? "not_armed " : "",
-		flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart_event " : "");
+		flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart_event " : "",
+		flags & ACPI_NFIT_MEM_MAP_FAILED ? "map_fail " : "",
+		flags & ACPI_NFIT_MEM_HEALTH_ENABLED ? "smart_notify " : "");
 }
 static DEVICE_ATTR_RO(flags);
 
@@ -1290,8 +1331,16 @@ static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj,
 	struct device *dev = container_of(kobj, struct device, kobj);
 	struct nvdimm *nvdimm = to_nvdimm(dev);
 
-	if (!to_nfit_dcr(dev))
+	if (!to_nfit_dcr(dev)) {
+		/* Without a dcr only the memdev attributes can be surfaced */
+		if (a == &dev_attr_handle.attr || a == &dev_attr_phys_id.attr
+				|| a == &dev_attr_flags.attr
+				|| a == &dev_attr_family.attr
+				|| a == &dev_attr_dsm_mask.attr)
+			return a->mode;
 		return 0;
+	}
+
 	if (a == &dev_attr_format1.attr && num_nvdimm_formats(nvdimm) <= 1)
 		return 0;
 	return a->mode;
@@ -1368,6 +1417,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 	unsigned long dsm_mask;
 	const u8 *uuid;
 	int i;
+	int family = -1;
 
 	/* nfit test assumes 1:1 relationship between commands and dsms */
 	nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en;
@@ -1398,11 +1448,14 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 	 */
 	for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_MSFT; i++)
 		if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
-			break;
+			if (family < 0 || i == default_dsm_family)
+				family = i;
 
 	/* limit the supported commands to those that are publicly documented */
-	nfit_mem->family = i;
-	if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
+	nfit_mem->family = family;
+	if (override_dsm_mask && !disable_vendor_specific)
+		dsm_mask = override_dsm_mask;
+	else if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
 		dsm_mask = 0x3fe;
 		if (disable_vendor_specific)
 			dsm_mask &= ~(1 << ND_CMD_VENDOR);
@@ -1462,6 +1515,7 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 	list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
 		struct acpi_nfit_flush_address *flush;
 		unsigned long flags = 0, cmd_mask;
+		struct nfit_memdev *nfit_memdev;
 		u32 device_handle;
 		u16 mem_flags;
 
@@ -1473,11 +1527,22 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		}
 
 		if (nfit_mem->bdw && nfit_mem->memdev_pmem)
-			flags |= NDD_ALIASING;
+			set_bit(NDD_ALIASING, &flags);
+
+		/* collate flags across all memdevs for this dimm */
+		list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
+			struct acpi_nfit_memory_map *dimm_memdev;
+
+			dimm_memdev = __to_nfit_memdev(nfit_mem);
+			if (dimm_memdev->device_handle
+					!= nfit_memdev->memdev->device_handle)
+				continue;
+			dimm_memdev->flags |= nfit_memdev->memdev->flags;
+		}
 
 		mem_flags = __to_nfit_memdev(nfit_mem)->flags;
 		if (mem_flags & ACPI_NFIT_MEM_NOT_ARMED)
-			flags |= NDD_UNARMED;
+			set_bit(NDD_UNARMED, &flags);
 
 		rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle);
 		if (rc)
@@ -1507,12 +1572,13 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0)
 			continue;
 
-		dev_info(acpi_desc->dev, "%s flags:%s%s%s%s\n",
+		dev_info(acpi_desc->dev, "%s flags:%s%s%s%s%s\n",
 			nvdimm_name(nvdimm),
 		  mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? " save_fail" : "",
 		  mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? " restore_fail":"",
 		  mem_flags & ACPI_NFIT_MEM_FLUSH_FAILED ? " flush_fail" : "",
-		  mem_flags & ACPI_NFIT_MEM_NOT_ARMED ? " not_armed" : "");
+		  mem_flags & ACPI_NFIT_MEM_NOT_ARMED ? " not_armed" : "",
+		  mem_flags & ACPI_NFIT_MEM_MAP_FAILED ? " map_fail" : "");
 
 	}
 
@@ -1783,8 +1849,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
 			mmio_flush_range((void __force *)
 				mmio->addr.aperture + offset, c);
 
-			memcpy_from_pmem(iobuf + copied,
-					mmio->addr.aperture + offset, c);
+			memcpy(iobuf + copied, mmio->addr.aperture + offset, c);
 		}
 
 		copied += c;
@@ -2525,6 +2590,7 @@ static void acpi_nfit_scrub(struct work_struct *work)
 			acpi_nfit_register_region(acpi_desc, nfit_spa);
 		}
 	}
+	acpi_desc->init_complete = 1;
 
 	list_for_each_entry(nfit_spa, &acpi_desc->spas, list)
 		acpi_nfit_async_scrub(acpi_desc, nfit_spa);
@@ -2547,7 +2613,8 @@ static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
 			return rc;
 	}
 
-	queue_work(nfit_wq, &acpi_desc->work);
+	if (!acpi_desc->cancel)
+		queue_work(nfit_wq, &acpi_desc->work);
 	return 0;
 }
 
@@ -2593,32 +2660,11 @@ static int acpi_nfit_desc_init_scrub_attr(struct acpi_nfit_desc *acpi_desc)
 	return 0;
 }
 
-static void acpi_nfit_destruct(void *data)
+static void acpi_nfit_unregister(void *data)
 {
 	struct acpi_nfit_desc *acpi_desc = data;
-	struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus);
-
-	/*
-	 * Destruct under acpi_desc_lock so that nfit_handle_mce does not
-	 * race teardown
-	 */
-	mutex_lock(&acpi_desc_lock);
-	acpi_desc->cancel = 1;
-	/*
-	 * Bounce the nvdimm bus lock to make sure any in-flight
-	 * acpi_nfit_ars_rescan() submissions have had a chance to
-	 * either submit or see ->cancel set.
-	 */
-	device_lock(bus_dev);
-	device_unlock(bus_dev);
 
-	flush_workqueue(nfit_wq);
-	if (acpi_desc->scrub_count_state)
-		sysfs_put(acpi_desc->scrub_count_state);
 	nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
-	acpi_desc->nvdimm_bus = NULL;
-	list_del(&acpi_desc->list);
-	mutex_unlock(&acpi_desc_lock);
 }
 
 int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz)
@@ -2636,7 +2682,7 @@ int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz)
 	if (!acpi_desc->nvdimm_bus)
 		return -ENOMEM;
 
-	rc = devm_add_action_or_reset(dev, acpi_nfit_destruct,
+	rc = devm_add_action_or_reset(dev, acpi_nfit_unregister,
 			acpi_desc);
 	if (rc)
 		return rc;
@@ -2728,6 +2774,13 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
 	device_lock(dev);
 	device_unlock(dev);
 
+	/* bounce the init_mutex to make init_complete valid */
+	mutex_lock(&acpi_desc->init_mutex);
+	if (acpi_desc->cancel || acpi_desc->init_complete) {
+		mutex_unlock(&acpi_desc->init_mutex);
+		return 0;
+	}
+
 	/*
 	 * Scrub work could take 10s of seconds, userspace may give up so we
 	 * need to be interruptible while waiting.
@@ -2735,6 +2788,7 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
 	INIT_WORK_ONSTACK(&flush.work, flush_probe);
 	COMPLETION_INITIALIZER_ONSTACK(flush.cmp);
 	queue_work(nfit_wq, &flush.work);
+	mutex_unlock(&acpi_desc->init_mutex);
 
 	rc = wait_for_completion_interruptible(&flush.cmp);
 	cancel_work_sync(&flush.work);
@@ -2771,10 +2825,12 @@ int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc)
 	if (work_busy(&acpi_desc->work))
 		return -EBUSY;
 
-	if (acpi_desc->cancel)
+	mutex_lock(&acpi_desc->init_mutex);
+	if (acpi_desc->cancel) {
+		mutex_unlock(&acpi_desc->init_mutex);
 		return 0;
+	}
 
-	mutex_lock(&acpi_desc->init_mutex);
 	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
 		struct acpi_nfit_system_address *spa = nfit_spa->spa;
 
@@ -2818,6 +2874,40 @@ void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev)
 }
 EXPORT_SYMBOL_GPL(acpi_nfit_desc_init);
 
+static void acpi_nfit_put_table(void *table)
+{
+	acpi_put_table(table);
+}
+
+void acpi_nfit_shutdown(void *data)
+{
+	struct acpi_nfit_desc *acpi_desc = data;
+	struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus);
+
+	/*
+	 * Destruct under acpi_desc_lock so that nfit_handle_mce does not
+	 * race teardown
+	 */
+	mutex_lock(&acpi_desc_lock);
+	list_del(&acpi_desc->list);
+	mutex_unlock(&acpi_desc_lock);
+
+	mutex_lock(&acpi_desc->init_mutex);
+	acpi_desc->cancel = 1;
+	mutex_unlock(&acpi_desc->init_mutex);
+
+	/*
+	 * Bounce the nvdimm bus lock to make sure any in-flight
+	 * acpi_nfit_ars_rescan() submissions have had a chance to
+	 * either submit or see ->cancel set.
+	 */
+	device_lock(bus_dev);
+	device_unlock(bus_dev);
+
+	flush_workqueue(nfit_wq);
+}
+EXPORT_SYMBOL_GPL(acpi_nfit_shutdown);
+
 static int acpi_nfit_add(struct acpi_device *adev)
 {
 	struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
@@ -2834,6 +2924,10 @@ static int acpi_nfit_add(struct acpi_device *adev)
 		dev_dbg(dev, "failed to find NFIT at startup\n");
 		return 0;
 	}
+
+	rc = devm_add_action_or_reset(dev, acpi_nfit_put_table, tbl);
+	if (rc)
+		return rc;
 	sz = tbl->length;
 
 	acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL);
@@ -2861,12 +2955,15 @@ static int acpi_nfit_add(struct acpi_device *adev)
 	rc = acpi_nfit_init(acpi_desc, (void *) tbl
 			+ sizeof(struct acpi_table_nfit),
 			sz - sizeof(struct acpi_table_nfit));
-	return rc;
+
+	if (rc)
+		return rc;
+	return devm_add_action_or_reset(dev, acpi_nfit_shutdown, acpi_desc);
 }
 
 static int acpi_nfit_remove(struct acpi_device *adev)
 {
-	/* see acpi_nfit_destruct */
+	/* see acpi_nfit_unregister */
 	return 0;
 }
 
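
Taken together, acpi_nfit_add()/acpi_nfit_init() now register three devm actions, and devm runs them in reverse on unbind, which is what lets the open-coded acpi_nfit_destruct() above be split up and simplified. A sketch of the resulting ordering, inferred only from the registration points in this diff:

/* Registration order (this diff):
 *   devm_add_action_or_reset(dev, acpi_nfit_put_table, tbl);
 *   devm_add_action_or_reset(dev, acpi_nfit_unregister, acpi_desc);
 *   devm_add_action_or_reset(dev, acpi_nfit_shutdown, acpi_desc);
 * devm unwinds in reverse, so on unbind:
 *   acpi_nfit_shutdown()   - set ->cancel, flush nfit_wq
 *   acpi_nfit_unregister() - nvdimm_bus_unregister()
 *   acpi_nfit_put_table()  - drop the NFIT table reference
 */
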
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index fc29c2e9832e..58fb7d68e04a 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -37,7 +37,7 @@
 
 #define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \
 		| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
-		| ACPI_NFIT_MEM_NOT_ARMED)
+		| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
 
 enum nfit_uuids {
 	/* for simplicity alias the uuid index with the family id */
@@ -163,6 +163,7 @@ struct acpi_nfit_desc {
 	unsigned int scrub_count;
 	unsigned int scrub_mode;
 	unsigned int cancel:1;
+	unsigned int init_complete:1;
 	unsigned long dimm_cmd_force_en;
 	unsigned long bus_cmd_force_en;
 	int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
@@ -238,6 +239,7 @@ static inline struct acpi_nfit_desc *to_acpi_desc(
 
 const u8 *to_nfit_uuid(enum nfit_uuids id);
 int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz);
+void acpi_nfit_shutdown(void *data);
 void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event);
 void __acpi_nvdimm_notify(struct device *dev, u32 event);
 int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index d545abbd5378..8ddc98279c8f 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -323,6 +323,7 @@ config BLK_DEV_SX8
 
 config BLK_DEV_RAM
 	tristate "RAM block device support"
+	select DAX if BLK_DEV_RAM_DAX
 	---help---
 	  Saying Y here will allow you to use a portion of your RAM memory as
 	  a block device, so that you can make file systems on it, read and
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 4ec84d504780..57b574f2f66a 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 #include <linux/pfn_t.h>
+#include <linux/dax.h>
 #endif
 
 #include <linux/uaccess.h>
@@ -41,6 +42,9 @@ struct brd_device {
 
 	struct request_queue	*brd_queue;
 	struct gendisk		*brd_disk;
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+	struct dax_device	*dax_dev;
+#endif
 	struct list_head	brd_list;
 
 	/*
@@ -326,30 +330,38 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 }
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
-static long brd_direct_access(struct block_device *bdev, sector_t sector,
-			void **kaddr, pfn_t *pfn, long size)
+static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff,
+		long nr_pages, void **kaddr, pfn_t *pfn)
 {
-	struct brd_device *brd = bdev->bd_disk->private_data;
 	struct page *page;
 
 	if (!brd)
 		return -ENODEV;
-	page = brd_insert_page(brd, sector);
+	page = brd_insert_page(brd, PFN_PHYS(pgoff) / 512);
 	if (!page)
 		return -ENOSPC;
 	*kaddr = page_address(page);
 	*pfn = page_to_pfn_t(page);
 
-	return PAGE_SIZE;
+	return 1;
 }
-#else
-#define brd_direct_access NULL
+
+static long brd_dax_direct_access(struct dax_device *dax_dev,
+		pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
+{
+	struct brd_device *brd = dax_get_private(dax_dev);
+
+	return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn);
+}
+
+static const struct dax_operations brd_dax_ops = {
+	.direct_access = brd_dax_direct_access,
+};
 #endif
 
 static const struct block_device_operations brd_fops = {
 	.owner =		THIS_MODULE,
 	.rw_page =		brd_rw_page,
-	.direct_access =	brd_direct_access,
 };
 
 /*
@@ -415,9 +427,6 @@ static struct brd_device *brd_alloc(int i)
 	 * is harmless)
 	 */
 	blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-	queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
-#endif
 	disk = brd->brd_disk = alloc_disk(max_part);
 	if (!disk)
 		goto out_free_queue;
@@ -430,8 +439,21 @@ static struct brd_device *brd_alloc(int i)
 	sprintf(disk->disk_name, "ram%d", i);
 	set_capacity(disk, rd_size * 2);
 
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+	queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
+	brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops);
+	if (!brd->dax_dev)
+		goto out_free_inode;
+#endif
+
+
 	return brd;
 
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+out_free_inode:
+	kill_dax(brd->dax_dev);
+	put_dax(brd->dax_dev);
+#endif
 out_free_queue:
 	blk_cleanup_queue(brd->brd_queue);
 out_free_dev:
@@ -471,6 +493,10 @@ out:
 static void brd_del_one(struct brd_device *brd)
 {
 	list_del(&brd->brd_list);
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+	kill_dax(brd->dax_dev);
+	put_dax(brd->dax_dev);
+#endif
 	del_gendisk(brd->brd_disk);
 	brd_free(brd);
 }
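
__brd_direct_access() above converts its page offset back to a 512-byte sector with PFN_PHYS(pgoff) / 512 because brd_insert_page() still indexes pages by sector. The equivalent shift form, shown for clarity (assuming only that PFN_PHYS(pgoff) == pgoff << PAGE_SHIFT):

/* PFN_PHYS(pgoff) / 512 == pgoff << (PAGE_SHIFT - 9):
 * one page covers PAGE_SIZE / 512 sectors (8 on 4K-page systems). */
sector_t sector = (sector_t)pgoff << (PAGE_SHIFT - 9);
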
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 9e95bf94eb13..b7053eafd88e 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -1,8 +1,13 @@
-menuconfig DEV_DAX
+menuconfig DAX
 	tristate "DAX: direct access to differentiated memory"
+	select SRCU
 	default m if NVDIMM_DAX
+
+if DAX
+
+config DEV_DAX
+	tristate "Device DAX: direct access mapping device"
 	depends on TRANSPARENT_HUGEPAGE
-	select SRCU
 	help
 	  Support raw access to differentiated (persistence, bandwidth,
 	  latency...) memory via an mmap(2) capable character
@@ -11,7 +16,6 @@ menuconfig DEV_DAX
 	  baseline memory pool.  Mappings of a /dev/daxX.Y device impose
 	  restrictions that make the mapping behavior deterministic.
 
-if DEV_DAX
 
 config DEV_DAX_PMEM
 	tristate "PMEM DAX: direct access to persistent memory"
diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 27c54e38478a..dc7422530462 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -1,4 +1,7 @@
-obj-$(CONFIG_DEV_DAX) += dax.o
+obj-$(CONFIG_DAX) += dax.o
+obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 
+dax-y := super.o
 dax_pmem-y := pmem.o
+device_dax-y := device.o
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
new file mode 100644
index 000000000000..b6fc4f04636d
--- /dev/null
+++ b/drivers/dax/dax-private.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __DAX_PRIVATE_H__
+#define __DAX_PRIVATE_H__
+
+#include <linux/device.h>
+#include <linux/cdev.h>
+
+/**
+ * struct dax_region - mapping infrastructure for dax devices
+ * @id: kernel-wide unique region for a memory range
+ * @base: linear address corresponding to @res
+ * @kref: to pin while other agents have a need to do lookups
+ * @dev: parent device backing this region
+ * @align: allocation and mapping alignment for child dax devices
+ * @res: physical address range of the region
+ * @pfn_flags: identify whether the pfns are paged back or not
+ */
+struct dax_region {
+	int id;
+	struct ida ida;
+	void *base;
+	struct kref kref;
+	struct device *dev;
+	unsigned int align;
+	struct resource res;
+	unsigned long pfn_flags;
+};
+
+/**
+ * struct dev_dax - instance data for a subdivision of a dax region
+ * @region - parent region
+ * @dax_dev - core dax functionality
+ * @dev - device core
+ * @id - child id in the region
+ * @num_resources - number of physical address extents in this device
+ * @res - array of physical address ranges
+ */
+struct dev_dax {
+	struct dax_region *region;
+	struct dax_device *dax_dev;
+	struct device dev;
+	int id;
+	int num_resources;
+	struct resource res[0];
+};
+#endif
diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h
index ddd829ab58c0..f9e5feea742c 100644
--- a/drivers/dax/dax.h
+++ b/drivers/dax/dax.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -12,14 +12,7 @@
  */
 #ifndef __DAX_H__
 #define __DAX_H__
-struct device;
-struct dax_dev;
-struct resource;
-struct dax_region;
-void dax_region_put(struct dax_region *dax_region);
-struct dax_region *alloc_dax_region(struct device *parent,
-		int region_id, struct resource *res, unsigned int align,
-		void *addr, unsigned long flags);
-struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region,
-		struct resource *res, int count);
+struct dax_device;
+struct dax_device *inode_dax(struct inode *inode);
+struct inode *dax_inode(struct dax_device *dax_dev);
 #endif /* __DAX_H__ */
diff --git a/drivers/dax/device-dax.h b/drivers/dax/device-dax.h
new file mode 100644
index 000000000000..fdcd9769ffde
--- /dev/null
+++ b/drivers/dax/device-dax.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __DEVICE_DAX_H__
+#define __DEVICE_DAX_H__
+struct device;
+struct dev_dax;
+struct resource;
+struct dax_region;
+void dax_region_put(struct dax_region *dax_region);
+struct dax_region *alloc_dax_region(struct device *parent,
+		int region_id, struct resource *res, unsigned int align,
+		void *addr, unsigned long flags);
+struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
+		struct resource *res, int count);
+#endif /* __DEVICE_DAX_H__ */
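
For orientation, a producer such as dax_pmem presumably drives this header's API roughly as follows (a sketch based only on the prototypes above; surrounding names are hypothetical and error handling is elided):

/* Sketch of a device-dax producer; only the declarations above are assumed. */
struct dax_region *region;
struct dev_dax *dev_dax;

region = alloc_dax_region(parent, region_id, &res, align, addr,
		PFN_DEV | PFN_MAP);
dev_dax = devm_create_dev_dax(region, &res, 1);
/* the child device holds its own region reference on success */
dax_region_put(region);
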
diff --git a/drivers/dax/dax.c b/drivers/dax/device.c
index 19795eb35579..006e657dfcb9 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/device.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -13,100 +13,38 @@
 #include <linux/pagemap.h>
 #include <linux/module.h>
 #include <linux/device.h>
-#include <linux/magic.h>
-#include <linux/mount.h>
 #include <linux/pfn_t.h>
-#include <linux/hash.h>
 #include <linux/cdev.h>
 #include <linux/slab.h>
 #include <linux/dax.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include "dax-private.h"
 #include "dax.h"
 
-static dev_t dax_devt;
-DEFINE_STATIC_SRCU(dax_srcu);
 static struct class *dax_class;
-static DEFINE_IDA(dax_minor_ida);
-static int nr_dax = CONFIG_NR_DEV_DAX;
-module_param(nr_dax, int, S_IRUGO);
-static struct vfsmount *dax_mnt;
-static struct kmem_cache *dax_cache __read_mostly;
-static struct super_block *dax_superblock __read_mostly;
-MODULE_PARM_DESC(nr_dax, "max number of device-dax instances");
-
-/**
- * struct dax_region - mapping infrastructure for dax devices
- * @id: kernel-wide unique region for a memory range
- * @base: linear address corresponding to @res
- * @kref: to pin while other agents have a need to do lookups
- * @dev: parent device backing this region
- * @align: allocation and mapping alignment for child dax devices
- * @res: physical address range of the region
- * @pfn_flags: identify whether the pfns are paged back or not
- */
-struct dax_region {
-	int id;
-	struct ida ida;
-	void *base;
-	struct kref kref;
-	struct device *dev;
-	unsigned int align;
-	struct resource res;
-	unsigned long pfn_flags;
-};
 
-/**
- * struct dax_dev - subdivision of a dax region
- * @region - parent region
- * @dev - device backing the character device
- * @cdev - core chardev data
- * @alive - !alive + srcu grace period == no new mappings can be established
- * @id - child id in the region
- * @num_resources - number of physical address extents in this device
- * @res - array of physical address ranges
+/*
+ * Rely on the fact that drvdata is set before the attributes are
+ * registered, and that the attributes are unregistered before drvdata
+ * is cleared to assume that drvdata is always valid.
  */
-struct dax_dev {
-	struct dax_region *region;
-	struct inode *inode;
-	struct device dev;
-	struct cdev cdev;
-	bool alive;
-	int id;
-	int num_resources;
-	struct resource res[0];
-};
-
 static ssize_t id_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
-
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%d\n", dax_region->id);
-	device_unlock(dev);
+	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	return rc;
+	return sprintf(buf, "%d\n", dax_region->id);
 }
 static DEVICE_ATTR_RO(id);
 
 static ssize_t region_size_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
-
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%llu\n", (unsigned long long)
-				resource_size(&dax_region->res));
-	device_unlock(dev);
+	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	return rc;
+	return sprintf(buf, "%llu\n", (unsigned long long)
+			resource_size(&dax_region->res));
 }
 static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
 		region_size_show, NULL);
@@ -114,16 +52,9 @@ static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
 static ssize_t align_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
-
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%u\n", dax_region->align);
-	device_unlock(dev);
+	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	return rc;
+	return sprintf(buf, "%u\n", dax_region->align);
 }
 static DEVICE_ATTR_RO(align);
 
@@ -144,117 +75,6 @@ static const struct attribute_group *dax_region_attribute_groups[] = {
 	NULL,
 };
 
-static struct inode *dax_alloc_inode(struct super_block *sb)
-{
-	return kmem_cache_alloc(dax_cache, GFP_KERNEL);
-}
-
-static void dax_i_callback(struct rcu_head *head)
-{
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-
-	kmem_cache_free(dax_cache, inode);
-}
-
-static void dax_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, dax_i_callback);
-}
-
-static const struct super_operations dax_sops = {
-	.statfs = simple_statfs,
-	.alloc_inode = dax_alloc_inode,
-	.destroy_inode = dax_destroy_inode,
-	.drop_inode = generic_delete_inode,
-};
-
-static struct dentry *dax_mount(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *data)
-{
-	return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
-}
-
-static struct file_system_type dax_type = {
-	.name = "dax",
-	.mount = dax_mount,
-	.kill_sb = kill_anon_super,
-};
-
-static int dax_test(struct inode *inode, void *data)
-{
-	return inode->i_cdev == data;
-}
-
-static int dax_set(struct inode *inode, void *data)
-{
-	inode->i_cdev = data;
-	return 0;
-}
-
-static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt)
-{
-	struct inode *inode;
-
-	inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
-			dax_test, dax_set, cdev);
-
-	if (!inode)
-		return NULL;
-
-	if (inode->i_state & I_NEW) {
-		inode->i_mode = S_IFCHR;
-		inode->i_flags = S_DAX;
-		inode->i_rdev = devt;
-		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
-		unlock_new_inode(inode);
-	}
-	return inode;
-}
-
-static void init_once(void *inode)
-{
-	inode_init_once(inode);
-}
-
-static int dax_inode_init(void)
-{
-	int rc;
-
-	dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0,
-			(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-			 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
-			init_once);
-	if (!dax_cache)
-		return -ENOMEM;
-
-	rc = register_filesystem(&dax_type);
-	if (rc)
-		goto err_register_fs;
-
-	dax_mnt = kern_mount(&dax_type);
-	if (IS_ERR(dax_mnt)) {
-		rc = PTR_ERR(dax_mnt);
-		goto err_mount;
-	}
-	dax_superblock = dax_mnt->mnt_sb;
-
-	return 0;
-
- err_mount:
-	unregister_filesystem(&dax_type);
- err_register_fs:
-	kmem_cache_destroy(dax_cache);
-
-	return rc;
-}
-
-static void dax_inode_exit(void)
-{
-	kern_unmount(dax_mnt);
-	unregister_filesystem(&dax_type);
-	kmem_cache_destroy(dax_cache);
-}
-
 static void dax_region_free(struct kref *kref)
 {
 	struct dax_region *dax_region;
@@ -323,47 +143,47 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
 }
 EXPORT_SYMBOL_GPL(alloc_dax_region);
 
-static struct dax_dev *to_dax_dev(struct device *dev)
+static struct dev_dax *to_dev_dax(struct device *dev)
 {
-	return container_of(dev, struct dax_dev, dev);
+	return container_of(dev, struct dev_dax, dev);
 }
 
 static ssize_t size_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_dev *dax_dev = to_dax_dev(dev);
+	struct dev_dax *dev_dax = to_dev_dax(dev);
 	unsigned long long size = 0;
 	int i;
 
-	for (i = 0; i < dax_dev->num_resources; i++)
-		size += resource_size(&dax_dev->res[i]);
+	for (i = 0; i < dev_dax->num_resources; i++)
+		size += resource_size(&dev_dax->res[i]);
 
 	return sprintf(buf, "%llu\n", size);
 }
 static DEVICE_ATTR_RO(size);
 
-static struct attribute *dax_device_attributes[] = {
+static struct attribute *dev_dax_attributes[] = {
 	&dev_attr_size.attr,
 	NULL,
 };
 
-static const struct attribute_group dax_device_attribute_group = {
-	.attrs = dax_device_attributes,
+static const struct attribute_group dev_dax_attribute_group = {
+	.attrs = dev_dax_attributes,
 };
 
 static const struct attribute_group *dax_attribute_groups[] = {
-	&dax_device_attribute_group,
+	&dev_dax_attribute_group,
 	NULL,
 };
 
-static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
 		const char *func)
 {
-	struct dax_region *dax_region = dax_dev->region;
-	struct device *dev = &dax_dev->dev;
+	struct dax_region *dax_region = dev_dax->region;
+	struct device *dev = &dev_dax->dev;
 	unsigned long mask;
 
-	if (!dax_dev->alive)
+	if (!dax_alive(dev_dax->dax_dev))
 		return -ENXIO;
 
 	/* prevent private mappings from being established */
@@ -397,23 +217,24 @@ static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
397 return 0; 217 return 0;
398} 218}
399 219
400static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, 220/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */
221__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
401 unsigned long size) 222 unsigned long size)
402{ 223{
403 struct resource *res; 224 struct resource *res;
404 phys_addr_t phys; 225 phys_addr_t phys;
405 int i; 226 int i;
406 227
407 for (i = 0; i < dax_dev->num_resources; i++) { 228 for (i = 0; i < dev_dax->num_resources; i++) {
408 res = &dax_dev->res[i]; 229 res = &dev_dax->res[i];
409 phys = pgoff * PAGE_SIZE + res->start; 230 phys = pgoff * PAGE_SIZE + res->start;
410 if (phys >= res->start && phys <= res->end) 231 if (phys >= res->start && phys <= res->end)
411 break; 232 break;
412 pgoff -= PHYS_PFN(resource_size(res)); 233 pgoff -= PHYS_PFN(resource_size(res));
413 } 234 }
414 235
415 if (i < dax_dev->num_resources) { 236 if (i < dev_dax->num_resources) {
416 res = &dax_dev->res[i]; 237 res = &dev_dax->res[i];
417 if (phys + size - 1 <= res->end) 238 if (phys + size - 1 <= res->end)
418 return phys; 239 return phys;
419 } 240 }
@@ -421,28 +242,29 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
421 return -1; 242 return -1;
422} 243}
423 244
424static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) 245static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
425{ 246{
426 struct device *dev = &dax_dev->dev; 247 struct device *dev = &dev_dax->dev;
427 struct dax_region *dax_region; 248 struct dax_region *dax_region;
428 int rc = VM_FAULT_SIGBUS; 249 int rc = VM_FAULT_SIGBUS;
429 phys_addr_t phys; 250 phys_addr_t phys;
430 pfn_t pfn; 251 pfn_t pfn;
431 unsigned int fault_size = PAGE_SIZE; 252 unsigned int fault_size = PAGE_SIZE;
432 253
433 if (check_vma(dax_dev, vmf->vma, __func__)) 254 if (check_vma(dev_dax, vmf->vma, __func__))
434 return VM_FAULT_SIGBUS; 255 return VM_FAULT_SIGBUS;
435 256
436 dax_region = dax_dev->region; 257 dax_region = dev_dax->region;
437 if (dax_region->align > PAGE_SIZE) { 258 if (dax_region->align > PAGE_SIZE) {
438 dev_dbg(dev, "%s: alignment > fault size\n", __func__); 259 dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
260 __func__, dax_region->align, fault_size);
439 return VM_FAULT_SIGBUS; 261 return VM_FAULT_SIGBUS;
440 } 262 }
441 263
442 if (fault_size != dax_region->align) 264 if (fault_size != dax_region->align)
443 return VM_FAULT_SIGBUS; 265 return VM_FAULT_SIGBUS;
444 266
445 phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); 267 phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
446 if (phys == -1) { 268 if (phys == -1) {
447 dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, 269 dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
448 vmf->pgoff); 270 vmf->pgoff);
@@ -461,28 +283,29 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
461 return VM_FAULT_NOPAGE; 283 return VM_FAULT_NOPAGE;
462} 284}
463 285
464static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) 286static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
465{ 287{
466 unsigned long pmd_addr = vmf->address & PMD_MASK; 288 unsigned long pmd_addr = vmf->address & PMD_MASK;
467 struct device *dev = &dax_dev->dev; 289 struct device *dev = &dev_dax->dev;
468 struct dax_region *dax_region; 290 struct dax_region *dax_region;
469 phys_addr_t phys; 291 phys_addr_t phys;
470 pgoff_t pgoff; 292 pgoff_t pgoff;
471 pfn_t pfn; 293 pfn_t pfn;
472 unsigned int fault_size = PMD_SIZE; 294 unsigned int fault_size = PMD_SIZE;
473 295
474 if (check_vma(dax_dev, vmf->vma, __func__)) 296 if (check_vma(dev_dax, vmf->vma, __func__))
475 return VM_FAULT_SIGBUS; 297 return VM_FAULT_SIGBUS;
476 298
477 dax_region = dax_dev->region; 299 dax_region = dev_dax->region;
478 if (dax_region->align > PMD_SIZE) { 300 if (dax_region->align > PMD_SIZE) {
479 dev_dbg(dev, "%s: alignment > fault size\n", __func__); 301 dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
302 __func__, dax_region->align, fault_size);
480 return VM_FAULT_SIGBUS; 303 return VM_FAULT_SIGBUS;
481 } 304 }
482 305
483 /* dax pmd mappings require pfn_t_devmap() */ 306 /* dax pmd mappings require pfn_t_devmap() */
484 if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { 307 if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
485 dev_dbg(dev, "%s: alignment > fault size\n", __func__); 308 dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
486 return VM_FAULT_SIGBUS; 309 return VM_FAULT_SIGBUS;
487 } 310 }
488 311
@@ -497,7 +320,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
497 return VM_FAULT_SIGBUS; 320 return VM_FAULT_SIGBUS;
498 321
499 pgoff = linear_page_index(vmf->vma, pmd_addr); 322 pgoff = linear_page_index(vmf->vma, pmd_addr);
500 phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); 323 phys = dax_pgoff_to_phys(dev_dax, pgoff, PMD_SIZE);
501 if (phys == -1) { 324 if (phys == -1) {
502 dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, 325 dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
503 pgoff); 326 pgoff);
@@ -511,10 +334,10 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
511} 334}
512 335
513#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 336#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
514static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) 337static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
515{ 338{
516 unsigned long pud_addr = vmf->address & PUD_MASK; 339 unsigned long pud_addr = vmf->address & PUD_MASK;
517 struct device *dev = &dax_dev->dev; 340 struct device *dev = &dev_dax->dev;
518 struct dax_region *dax_region; 341 struct dax_region *dax_region;
519 phys_addr_t phys; 342 phys_addr_t phys;
520 pgoff_t pgoff; 343 pgoff_t pgoff;
@@ -522,18 +345,19 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
522 unsigned int fault_size = PUD_SIZE; 345 unsigned int fault_size = PUD_SIZE;
523 346
524 347
525 if (check_vma(dax_dev, vmf->vma, __func__)) 348 if (check_vma(dev_dax, vmf->vma, __func__))
526 return VM_FAULT_SIGBUS; 349 return VM_FAULT_SIGBUS;
527 350
528 dax_region = dax_dev->region; 351 dax_region = dev_dax->region;
529 if (dax_region->align > PUD_SIZE) { 352 if (dax_region->align > PUD_SIZE) {
530 dev_dbg(dev, "%s: alignment > fault size\n", __func__); 353 dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
354 __func__, dax_region->align, fault_size);
531 return VM_FAULT_SIGBUS; 355 return VM_FAULT_SIGBUS;
532 } 356 }
533 357
534 /* dax pud mappings require pfn_t_devmap() */ 358 /* dax pud mappings require pfn_t_devmap() */
535 if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { 359 if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
536 dev_dbg(dev, "%s: alignment > fault size\n", __func__); 360 dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
537 return VM_FAULT_SIGBUS; 361 return VM_FAULT_SIGBUS;
538 } 362 }
539 363
@@ -548,7 +372,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
548 return VM_FAULT_SIGBUS; 372 return VM_FAULT_SIGBUS;
549 373
550 pgoff = linear_page_index(vmf->vma, pud_addr); 374 pgoff = linear_page_index(vmf->vma, pud_addr);
551 phys = pgoff_to_phys(dax_dev, pgoff, PUD_SIZE); 375 phys = dax_pgoff_to_phys(dev_dax, pgoff, PUD_SIZE);
552 if (phys == -1) { 376 if (phys == -1) {
553 dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, 377 dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
554 pgoff); 378 pgoff);
@@ -561,65 +385,71 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
561 vmf->flags & FAULT_FLAG_WRITE); 385 vmf->flags & FAULT_FLAG_WRITE);
562} 386}
563#else 387#else
564static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) 388static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
565{ 389{
566 return VM_FAULT_FALLBACK; 390 return VM_FAULT_FALLBACK;
567} 391}
568#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 392#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
569 393
570static int dax_dev_huge_fault(struct vm_fault *vmf, 394static int dev_dax_huge_fault(struct vm_fault *vmf,
571 enum page_entry_size pe_size) 395 enum page_entry_size pe_size)
572{ 396{
573 int rc, id; 397 int rc, id;
574 struct file *filp = vmf->vma->vm_file; 398 struct file *filp = vmf->vma->vm_file;
575 struct dax_dev *dax_dev = filp->private_data; 399 struct dev_dax *dev_dax = filp->private_data;
576 400
577 dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, 401 dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__,
578 current->comm, (vmf->flags & FAULT_FLAG_WRITE) 402 current->comm, (vmf->flags & FAULT_FLAG_WRITE)
579 ? "write" : "read", 403 ? "write" : "read",
580 vmf->vma->vm_start, vmf->vma->vm_end); 404 vmf->vma->vm_start, vmf->vma->vm_end, pe_size);
581 405
582 id = srcu_read_lock(&dax_srcu); 406 id = dax_read_lock();
583 switch (pe_size) { 407 switch (pe_size) {
584 case PE_SIZE_PTE: 408 case PE_SIZE_PTE:
585 rc = __dax_dev_pte_fault(dax_dev, vmf); 409 rc = __dev_dax_pte_fault(dev_dax, vmf);
586 break; 410 break;
587 case PE_SIZE_PMD: 411 case PE_SIZE_PMD:
588 rc = __dax_dev_pmd_fault(dax_dev, vmf); 412 rc = __dev_dax_pmd_fault(dev_dax, vmf);
589 break; 413 break;
590 case PE_SIZE_PUD: 414 case PE_SIZE_PUD:
591 rc = __dax_dev_pud_fault(dax_dev, vmf); 415 rc = __dev_dax_pud_fault(dev_dax, vmf);
592 break; 416 break;
593 default: 417 default:
594 return VM_FAULT_FALLBACK; 418 rc = VM_FAULT_SIGBUS;
595 } 419 }
596 srcu_read_unlock(&dax_srcu, id); 420 dax_read_unlock(id);
597 421
598 return rc; 422 return rc;
599} 423}
600 424
601static int dax_dev_fault(struct vm_fault *vmf) 425static int dev_dax_fault(struct vm_fault *vmf)
602{ 426{
603 return dax_dev_huge_fault(vmf, PE_SIZE_PTE); 427 return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
604} 428}
605 429
606static const struct vm_operations_struct dax_dev_vm_ops = { 430static const struct vm_operations_struct dax_vm_ops = {
607 .fault = dax_dev_fault, 431 .fault = dev_dax_fault,
608 .huge_fault = dax_dev_huge_fault, 432 .huge_fault = dev_dax_huge_fault,
609}; 433};
610 434
611static int dax_mmap(struct file *filp, struct vm_area_struct *vma) 435static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
612{ 436{
613 struct dax_dev *dax_dev = filp->private_data; 437 struct dev_dax *dev_dax = filp->private_data;
614 int rc; 438 int rc, id;
615 439
616 dev_dbg(&dax_dev->dev, "%s\n", __func__); 440 dev_dbg(&dev_dax->dev, "%s\n", __func__);
617 441
618 rc = check_vma(dax_dev, vma, __func__); 442 /*
443 * We lock to check dax_dev liveness and will re-check at
444 * fault time.
445 */
446 id = dax_read_lock();
447 rc = check_vma(dev_dax, vma, __func__);
448 dax_read_unlock(id);
619 if (rc) 449 if (rc)
620 return rc; 450 return rc;
621 451
622 vma->vm_ops = &dax_dev_vm_ops; 452 vma->vm_ops = &dax_vm_ops;
623 vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; 453 vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
624 return 0; 454 return 0;
625} 455}
@@ -630,13 +460,13 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
630 unsigned long flags) 460 unsigned long flags)
631{ 461{
632 unsigned long off, off_end, off_align, len_align, addr_align, align; 462 unsigned long off, off_end, off_align, len_align, addr_align, align;
633 struct dax_dev *dax_dev = filp ? filp->private_data : NULL; 463 struct dev_dax *dev_dax = filp ? filp->private_data : NULL;
634 struct dax_region *dax_region; 464 struct dax_region *dax_region;
635 465
636 if (!dax_dev || addr) 466 if (!dev_dax || addr)
637 goto out; 467 goto out;
638 468
639 dax_region = dax_dev->region; 469 dax_region = dev_dax->region;
640 align = dax_region->align; 470 align = dax_region->align;
641 off = pgoff << PAGE_SHIFT; 471 off = pgoff << PAGE_SHIFT;
642 off_end = off + len; 472 off_end = off + len;
@@ -661,14 +491,15 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
661 491
662static int dax_open(struct inode *inode, struct file *filp) 492static int dax_open(struct inode *inode, struct file *filp)
663{ 493{
664 struct dax_dev *dax_dev; 494 struct dax_device *dax_dev = inode_dax(inode);
495 struct inode *__dax_inode = dax_inode(dax_dev);
496 struct dev_dax *dev_dax = dax_get_private(dax_dev);
665 497
666 dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev); 498 dev_dbg(&dev_dax->dev, "%s\n", __func__);
667 dev_dbg(&dax_dev->dev, "%s\n", __func__); 499 inode->i_mapping = __dax_inode->i_mapping;
668 inode->i_mapping = dax_dev->inode->i_mapping; 500 inode->i_mapping->host = __dax_inode;
669 inode->i_mapping->host = dax_dev->inode;
670 filp->f_mapping = inode->i_mapping; 501 filp->f_mapping = inode->i_mapping;
671 filp->private_data = dax_dev; 502 filp->private_data = dev_dax;
672 inode->i_flags = S_DAX; 503 inode->i_flags = S_DAX;
673 504
674 return 0; 505 return 0;
@@ -676,9 +507,9 @@ static int dax_open(struct inode *inode, struct file *filp)
676 507
677static int dax_release(struct inode *inode, struct file *filp) 508static int dax_release(struct inode *inode, struct file *filp)
678{ 509{
679 struct dax_dev *dax_dev = filp->private_data; 510 struct dev_dax *dev_dax = filp->private_data;
680 511
681 dev_dbg(&dax_dev->dev, "%s\n", __func__); 512 dev_dbg(&dev_dax->dev, "%s\n", __func__);
682 return 0; 513 return 0;
683} 514}
684 515
@@ -691,55 +522,54 @@ static const struct file_operations dax_fops = {
691 .mmap = dax_mmap, 522 .mmap = dax_mmap,
692}; 523};
693 524
694static void dax_dev_release(struct device *dev) 525static void dev_dax_release(struct device *dev)
695{ 526{
696 struct dax_dev *dax_dev = to_dax_dev(dev); 527 struct dev_dax *dev_dax = to_dev_dax(dev);
697 struct dax_region *dax_region = dax_dev->region; 528 struct dax_region *dax_region = dev_dax->region;
529 struct dax_device *dax_dev = dev_dax->dax_dev;
698 530
699 ida_simple_remove(&dax_region->ida, dax_dev->id); 531 ida_simple_remove(&dax_region->ida, dev_dax->id);
700 ida_simple_remove(&dax_minor_ida, MINOR(dev->devt));
701 dax_region_put(dax_region); 532 dax_region_put(dax_region);
702 iput(dax_dev->inode); 533 put_dax(dax_dev);
703 kfree(dax_dev); 534 kfree(dev_dax);
704} 535}
705 536
706static void kill_dax_dev(struct dax_dev *dax_dev) 537static void kill_dev_dax(struct dev_dax *dev_dax)
707{ 538{
708 /* 539 struct dax_device *dax_dev = dev_dax->dax_dev;
709 * Note, rcu is not protecting the liveness of dax_dev, rcu is 540 struct inode *inode = dax_inode(dax_dev);
710 * ensuring that any fault handlers that might have seen 541
711 * dax_dev->alive == true, have completed. Any fault handlers 542 kill_dax(dax_dev);
712 * that start after synchronize_srcu() has started will abort 543 unmap_mapping_range(inode->i_mapping, 0, 0, 1);
713 * upon seeing dax_dev->alive == false.
714 */
715 dax_dev->alive = false;
716 synchronize_srcu(&dax_srcu);
717 unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1);
718} 544}
719 545
720static void unregister_dax_dev(void *dev) 546static void unregister_dev_dax(void *dev)
721{ 547{
722 struct dax_dev *dax_dev = to_dax_dev(dev); 548 struct dev_dax *dev_dax = to_dev_dax(dev);
549 struct dax_device *dax_dev = dev_dax->dax_dev;
550 struct inode *inode = dax_inode(dax_dev);
551 struct cdev *cdev = inode->i_cdev;
723 552
724 dev_dbg(dev, "%s\n", __func__); 553 dev_dbg(dev, "%s\n", __func__);
725 554
726 kill_dax_dev(dax_dev); 555 kill_dev_dax(dev_dax);
727 cdev_device_del(&dax_dev->cdev, dev); 556 cdev_device_del(cdev, dev);
728 put_device(dev); 557 put_device(dev);
729} 558}
730 559
731struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, 560struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
732 struct resource *res, int count) 561 struct resource *res, int count)
733{ 562{
734 struct device *parent = dax_region->dev; 563 struct device *parent = dax_region->dev;
735 struct dax_dev *dax_dev; 564 struct dax_device *dax_dev;
736 int rc = 0, minor, i; 565 struct dev_dax *dev_dax;
566 struct inode *inode;
737 struct device *dev; 567 struct device *dev;
738 struct cdev *cdev; 568 struct cdev *cdev;
739 dev_t dev_t; 569 int rc = 0, i;
740 570
741 dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); 571 dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL);
742 if (!dax_dev) 572 if (!dev_dax)
743 return ERR_PTR(-ENOMEM); 573 return ERR_PTR(-ENOMEM);
744 574
745 for (i = 0; i < count; i++) { 575 for (i = 0; i < count; i++) {
@@ -749,110 +579,79 @@ struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region,
749 rc = -EINVAL; 579 rc = -EINVAL;
750 break; 580 break;
751 } 581 }
752 dax_dev->res[i].start = res[i].start; 582 dev_dax->res[i].start = res[i].start;
753 dax_dev->res[i].end = res[i].end; 583 dev_dax->res[i].end = res[i].end;
754 } 584 }
755 585
756 if (i < count) 586 if (i < count)
757 goto err_id; 587 goto err_id;
758 588
759 dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); 589 dev_dax->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
760 if (dax_dev->id < 0) { 590 if (dev_dax->id < 0) {
761 rc = dax_dev->id; 591 rc = dev_dax->id;
762 goto err_id; 592 goto err_id;
763 } 593 }
764 594
765 minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); 595 /*
766 if (minor < 0) { 596 * No 'host' or dax_operations since there is no access to this
767 rc = minor; 597 * device outside of mmap of the resulting character device.
768 goto err_minor; 598 */
769 } 599 dax_dev = alloc_dax(dev_dax, NULL, NULL);
770 600 if (!dax_dev)
771 dev_t = MKDEV(MAJOR(dax_devt), minor); 601 goto err_dax;
772 dev = &dax_dev->dev;
773 dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t);
774 if (!dax_dev->inode) {
775 rc = -ENOMEM;
776 goto err_inode;
777 }
778 602
779 /* from here on we're committed to teardown via dax_dev_release() */ 603 /* from here on we're committed to teardown via dev_dax_release() */
604 dev = &dev_dax->dev;
780 device_initialize(dev); 605 device_initialize(dev);
781 606
782 cdev = &dax_dev->cdev; 607 inode = dax_inode(dax_dev);
608 cdev = inode->i_cdev;
783 cdev_init(cdev, &dax_fops); 609 cdev_init(cdev, &dax_fops);
784 cdev->owner = parent->driver->owner; 610 cdev->owner = parent->driver->owner;
785 611
786 dax_dev->num_resources = count; 612 dev_dax->num_resources = count;
787 dax_dev->alive = true; 613 dev_dax->dax_dev = dax_dev;
788 dax_dev->region = dax_region; 614 dev_dax->region = dax_region;
789 kref_get(&dax_region->kref); 615 kref_get(&dax_region->kref);
790 616
791 dev->devt = dev_t; 617 dev->devt = inode->i_rdev;
792 dev->class = dax_class; 618 dev->class = dax_class;
793 dev->parent = parent; 619 dev->parent = parent;
794 dev->groups = dax_attribute_groups; 620 dev->groups = dax_attribute_groups;
795 dev->release = dax_dev_release; 621 dev->release = dev_dax_release;
796 dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id); 622 dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id);
797 623
798 rc = cdev_device_add(cdev, dev); 624 rc = cdev_device_add(cdev, dev);
799 if (rc) { 625 if (rc) {
800 kill_dax_dev(dax_dev); 626 kill_dev_dax(dev_dax);
801 put_device(dev); 627 put_device(dev);
802 return ERR_PTR(rc); 628 return ERR_PTR(rc);
803 } 629 }
804 630
805 rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); 631 rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev);
806 if (rc) 632 if (rc)
807 return ERR_PTR(rc); 633 return ERR_PTR(rc);
808 634
809 return dax_dev; 635 return dev_dax;
810 636
811 err_inode: 637 err_dax:
812 ida_simple_remove(&dax_minor_ida, minor); 638 ida_simple_remove(&dax_region->ida, dev_dax->id);
813 err_minor:
814 ida_simple_remove(&dax_region->ida, dax_dev->id);
815 err_id: 639 err_id:
816 kfree(dax_dev); 640 kfree(dev_dax);
817 641
818 return ERR_PTR(rc); 642 return ERR_PTR(rc);
819} 643}
820EXPORT_SYMBOL_GPL(devm_create_dax_dev); 644EXPORT_SYMBOL_GPL(devm_create_dev_dax);
821 645
822static int __init dax_init(void) 646static int __init dax_init(void)
823{ 647{
824 int rc;
825
826 rc = dax_inode_init();
827 if (rc)
828 return rc;
829
830 nr_dax = max(nr_dax, 256);
831 rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax");
832 if (rc)
833 goto err_chrdev;
834
835 dax_class = class_create(THIS_MODULE, "dax"); 648 dax_class = class_create(THIS_MODULE, "dax");
836 if (IS_ERR(dax_class)) { 649 return PTR_ERR_OR_ZERO(dax_class);
837 rc = PTR_ERR(dax_class);
838 goto err_class;
839 }
840
841 return 0;
842
843 err_class:
844 unregister_chrdev_region(dax_devt, nr_dax);
845 err_chrdev:
846 dax_inode_exit();
847 return rc;
848} 650}
849 651
850static void __exit dax_exit(void) 652static void __exit dax_exit(void)
851{ 653{
852 class_destroy(dax_class); 654 class_destroy(dax_class);
853 unregister_chrdev_region(dax_devt, nr_dax);
854 ida_destroy(&dax_minor_ida);
855 dax_inode_exit();
856} 655}
857 656
858MODULE_AUTHOR("Intel Corporation"); 657MODULE_AUTHOR("Intel Corporation");
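
The device.c hunks above swap the driver-private SRCU and per-device 'alive' flag for the dax core's dax_read_lock()/dax_read_unlock() pair, with dax_alive() re-checked inside the critical section. A minimal sketch of the pattern the fault paths now follow; handle_fault() is a hypothetical stand-in for the pte/pmd/pud handlers and is not part of the patch:

    #include <linux/dax.h>
    #include <linux/mm.h>

    /* sketch, not part of the patch: liveness check under the dax srcu */
    static int my_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
    {
            int rc, id;

            id = dax_read_lock();           /* returns the srcu index to pass back */
            if (!dax_alive(dev_dax->dax_dev))
                    rc = VM_FAULT_SIGBUS;   /* kill_dax() has already run */
            else
                    rc = handle_fault(dev_dax, vmf); /* hypothetical helper */
            dax_read_unlock(id);

            return rc;
    }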
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index cb0d742fa23f..9f2a0b4fd801 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -16,7 +16,7 @@
16#include <linux/pfn_t.h> 16#include <linux/pfn_t.h>
17#include "../nvdimm/pfn.h" 17#include "../nvdimm/pfn.h"
18#include "../nvdimm/nd.h" 18#include "../nvdimm/nd.h"
19#include "dax.h" 19#include "device-dax.h"
20 20
21struct dax_pmem { 21struct dax_pmem {
22 struct device *dev; 22 struct device *dev;
@@ -61,8 +61,8 @@ static int dax_pmem_probe(struct device *dev)
61 int rc; 61 int rc;
62 void *addr; 62 void *addr;
63 struct resource res; 63 struct resource res;
64 struct dax_dev *dax_dev;
65 struct nd_pfn_sb *pfn_sb; 64 struct nd_pfn_sb *pfn_sb;
65 struct dev_dax *dev_dax;
66 struct dax_pmem *dax_pmem; 66 struct dax_pmem *dax_pmem;
67 struct nd_region *nd_region; 67 struct nd_region *nd_region;
68 struct nd_namespace_io *nsio; 68 struct nd_namespace_io *nsio;
@@ -130,12 +130,12 @@ static int dax_pmem_probe(struct device *dev)
130 return -ENOMEM; 130 return -ENOMEM;
131 131
132 /* TODO: support for subdividing a dax region... */ 132 /* TODO: support for subdividing a dax region... */
133 dax_dev = devm_create_dax_dev(dax_region, &res, 1); 133 dev_dax = devm_create_dev_dax(dax_region, &res, 1);
134 134
135 /* child dax_dev instances now own the lifetime of the dax_region */ 135 /* child dev_dax instances now own the lifetime of the dax_region */
136 dax_region_put(dax_region); 136 dax_region_put(dax_region);
137 137
138 return PTR_ERR_OR_ZERO(dax_dev); 138 return PTR_ERR_OR_ZERO(dev_dax);
139} 139}
140 140
141static struct nd_device_driver dax_pmem_driver = { 141static struct nd_device_driver dax_pmem_driver = {
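
dax_pmem_probe() above also illustrates the region handoff convention: the provider allocates a dax_region, creates the child dev_dax, then immediately drops its own region reference since the child now pins it. A hedged sketch of that sequence (my_probe() is hypothetical; 'addr' would normally come from devm_memremap_pages(), elided here):

    #include "device-dax.h"

    static int my_probe(struct device *dev, struct resource *res, int region_id,
                    void *addr)
    {
            struct dax_region *dax_region;
            struct dev_dax *dev_dax;

            dax_region = alloc_dax_region(dev, region_id, res, PAGE_SIZE,
                            addr, PFN_DEV | PFN_MAP);
            if (!dax_region)
                    return -ENOMEM;

            dev_dax = devm_create_dev_dax(dax_region, res, 1);

            /* child dev_dax instances own the region lifetime; drop ours */
            dax_region_put(dax_region);

            return PTR_ERR_OR_ZERO(dev_dax);
    }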
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
new file mode 100644
index 000000000000..465dcd7317d5
--- /dev/null
+++ b/drivers/dax/super.c
@@ -0,0 +1,425 @@
1/*
2 * Copyright(c) 2017 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#include <linux/pagemap.h>
14#include <linux/module.h>
15#include <linux/mount.h>
16#include <linux/magic.h>
17#include <linux/cdev.h>
18#include <linux/hash.h>
19#include <linux/slab.h>
20#include <linux/dax.h>
21#include <linux/fs.h>
22
23static int nr_dax = CONFIG_NR_DEV_DAX;
24module_param(nr_dax, int, S_IRUGO);
25MODULE_PARM_DESC(nr_dax, "max number of dax device instances");
26
27static dev_t dax_devt;
28DEFINE_STATIC_SRCU(dax_srcu);
29static struct vfsmount *dax_mnt;
30static DEFINE_IDA(dax_minor_ida);
31static struct kmem_cache *dax_cache __read_mostly;
32static struct super_block *dax_superblock __read_mostly;
33
34#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head))
35static struct hlist_head dax_host_list[DAX_HASH_SIZE];
36static DEFINE_SPINLOCK(dax_host_lock);
37
38int dax_read_lock(void)
39{
40 return srcu_read_lock(&dax_srcu);
41}
42EXPORT_SYMBOL_GPL(dax_read_lock);
43
44void dax_read_unlock(int id)
45{
46 srcu_read_unlock(&dax_srcu, id);
47}
48EXPORT_SYMBOL_GPL(dax_read_unlock);
49
50/**
51 * struct dax_device - anchor object for dax services
52 * @inode: core vfs
53 * @cdev: optional character interface for "device dax"
54 * @host: optional name for lookups where the device path is not available
55 * @private: dax driver private data
56 * @alive: !alive + rcu grace period == no new operations / mappings
57 */
58struct dax_device {
59 struct hlist_node list;
60 struct inode inode;
61 struct cdev cdev;
62 const char *host;
63 void *private;
64 bool alive;
65 const struct dax_operations *ops;
66};
67
68/**
69 * dax_direct_access() - translate a device pgoff to an absolute pfn
70 * @dax_dev: a dax_device instance representing the logical memory range
71 * @pgoff: offset in pages from the start of the device to translate
72 * @nr_pages: number of consecutive pages caller can handle relative to @pfn
73 * @kaddr: output parameter that returns a virtual address mapping of pfn
74 * @pfn: output parameter that returns an absolute pfn translation of @pgoff
75 *
76 * Return: negative errno if an error occurs, otherwise the number of
77 * pages accessible at the device-relative @pgoff.
78 */
79long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
80 void **kaddr, pfn_t *pfn)
81{
82 long avail;
83
84 /*
85 * The device driver is allowed to sleep, in order to make the
86 * memory directly accessible.
87 */
88 might_sleep();
89
90 if (!dax_dev)
91 return -EOPNOTSUPP;
92
93 if (!dax_alive(dax_dev))
94 return -ENXIO;
95
96 if (nr_pages < 0)
97 return nr_pages;
98
99 avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages,
100 kaddr, pfn);
101 if (!avail)
102 return -ERANGE;
103 return min(avail, nr_pages);
104}
105EXPORT_SYMBOL_GPL(dax_direct_access);
106
107bool dax_alive(struct dax_device *dax_dev)
108{
109 lockdep_assert_held(&dax_srcu);
110 return dax_dev->alive;
111}
112EXPORT_SYMBOL_GPL(dax_alive);
113
114static int dax_host_hash(const char *host)
115{
116 return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE;
117}
118
119/*
120 * Note, rcu is not protecting the liveness of dax_dev; rcu is ensuring
121 * that any fault handlers or operations that might have seen
122 * dax_alive() have completed. Any operations that start after
123 * synchronize_srcu() has run will abort upon seeing !dax_alive().
124 */
125void kill_dax(struct dax_device *dax_dev)
126{
127 if (!dax_dev)
128 return;
129
130 dax_dev->alive = false;
131
132 synchronize_srcu(&dax_srcu);
133
134 spin_lock(&dax_host_lock);
135 hlist_del_init(&dax_dev->list);
136 spin_unlock(&dax_host_lock);
137
138 dax_dev->private = NULL;
139}
140EXPORT_SYMBOL_GPL(kill_dax);
141
142static struct inode *dax_alloc_inode(struct super_block *sb)
143{
144 struct dax_device *dax_dev;
145
146 dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
147 return &dax_dev->inode;
148}
149
150static struct dax_device *to_dax_dev(struct inode *inode)
151{
152 return container_of(inode, struct dax_device, inode);
153}
154
155static void dax_i_callback(struct rcu_head *head)
156{
157 struct inode *inode = container_of(head, struct inode, i_rcu);
158 struct dax_device *dax_dev = to_dax_dev(inode);
159
160 kfree(dax_dev->host);
161 dax_dev->host = NULL;
162 ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev));
163 kmem_cache_free(dax_cache, dax_dev);
164}
165
166static void dax_destroy_inode(struct inode *inode)
167{
168 struct dax_device *dax_dev = to_dax_dev(inode);
169
170 WARN_ONCE(dax_dev->alive,
171 "kill_dax() must be called before final iput()\n");
172 call_rcu(&inode->i_rcu, dax_i_callback);
173}
174
175static const struct super_operations dax_sops = {
176 .statfs = simple_statfs,
177 .alloc_inode = dax_alloc_inode,
178 .destroy_inode = dax_destroy_inode,
179 .drop_inode = generic_delete_inode,
180};
181
182static struct dentry *dax_mount(struct file_system_type *fs_type,
183 int flags, const char *dev_name, void *data)
184{
185 return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
186}
187
188static struct file_system_type dax_fs_type = {
189 .name = "dax",
190 .mount = dax_mount,
191 .kill_sb = kill_anon_super,
192};
193
194static int dax_test(struct inode *inode, void *data)
195{
196 dev_t devt = *(dev_t *) data;
197
198 return inode->i_rdev == devt;
199}
200
201static int dax_set(struct inode *inode, void *data)
202{
203 dev_t devt = *(dev_t *) data;
204
205 inode->i_rdev = devt;
206 return 0;
207}
208
209static struct dax_device *dax_dev_get(dev_t devt)
210{
211 struct dax_device *dax_dev;
212 struct inode *inode;
213
214 inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
215 dax_test, dax_set, &devt);
216
217 if (!inode)
218 return NULL;
219
220 dax_dev = to_dax_dev(inode);
221 if (inode->i_state & I_NEW) {
222 dax_dev->alive = true;
223 inode->i_cdev = &dax_dev->cdev;
224 inode->i_mode = S_IFCHR;
225 inode->i_flags = S_DAX;
226 mapping_set_gfp_mask(&inode->i_data, GFP_USER);
227 unlock_new_inode(inode);
228 }
229
230 return dax_dev;
231}
232
233static void dax_add_host(struct dax_device *dax_dev, const char *host)
234{
235 int hash;
236
237 /*
238 * Unconditionally init dax_dev since it's coming from a
239 * non-zeroed slab cache
240 */
241 INIT_HLIST_NODE(&dax_dev->list);
242 dax_dev->host = host;
243 if (!host)
244 return;
245
246 hash = dax_host_hash(host);
247 spin_lock(&dax_host_lock);
248 hlist_add_head(&dax_dev->list, &dax_host_list[hash]);
249 spin_unlock(&dax_host_lock);
250}
251
252struct dax_device *alloc_dax(void *private, const char *__host,
253 const struct dax_operations *ops)
254{
255 struct dax_device *dax_dev;
256 const char *host;
257 dev_t devt;
258 int minor;
259
260 host = kstrdup(__host, GFP_KERNEL);
261 if (__host && !host)
262 return NULL;
263
264 minor = ida_simple_get(&dax_minor_ida, 0, nr_dax, GFP_KERNEL);
265 if (minor < 0)
266 goto err_minor;
267
268 devt = MKDEV(MAJOR(dax_devt), minor);
269 dax_dev = dax_dev_get(devt);
270 if (!dax_dev)
271 goto err_dev;
272
273 dax_add_host(dax_dev, host);
274 dax_dev->ops = ops;
275 dax_dev->private = private;
276 return dax_dev;
277
278 err_dev:
279 ida_simple_remove(&dax_minor_ida, minor);
280 err_minor:
281 kfree(host);
282 return NULL;
283}
284EXPORT_SYMBOL_GPL(alloc_dax);
285
286void put_dax(struct dax_device *dax_dev)
287{
288 if (!dax_dev)
289 return;
290 iput(&dax_dev->inode);
291}
292EXPORT_SYMBOL_GPL(put_dax);
293
294/**
295 * dax_get_by_host() - temporary lookup mechanism for filesystem-dax
296 * @host: alternate name for the device registered by a dax driver
297 */
298struct dax_device *dax_get_by_host(const char *host)
299{
300 struct dax_device *dax_dev, *found = NULL;
301 int hash, id;
302
303 if (!host)
304 return NULL;
305
306 hash = dax_host_hash(host);
307
308 id = dax_read_lock();
309 spin_lock(&dax_host_lock);
310 hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) {
311 if (!dax_alive(dax_dev)
312 || strcmp(host, dax_dev->host) != 0)
313 continue;
314
315 if (igrab(&dax_dev->inode))
316 found = dax_dev;
317 break;
318 }
319 spin_unlock(&dax_host_lock);
320 dax_read_unlock(id);
321
322 return found;
323}
324EXPORT_SYMBOL_GPL(dax_get_by_host);
325
326/**
327 * inode_dax() - convert a public inode into its dax_device
328 * @inode: An inode with i_cdev pointing to a dax_device
329 *
330 * Note this is not equivalent to to_dax_dev(), which is for private
331 * internal use where we know the inode filesystem type == dax_fs_type.
332 */
333struct dax_device *inode_dax(struct inode *inode)
334{
335 struct cdev *cdev = inode->i_cdev;
336
337 return container_of(cdev, struct dax_device, cdev);
338}
339EXPORT_SYMBOL_GPL(inode_dax);
340
341struct inode *dax_inode(struct dax_device *dax_dev)
342{
343 return &dax_dev->inode;
344}
345EXPORT_SYMBOL_GPL(dax_inode);
346
347void *dax_get_private(struct dax_device *dax_dev)
348{
349 return dax_dev->private;
350}
351EXPORT_SYMBOL_GPL(dax_get_private);
352
353static void init_once(void *_dax_dev)
354{
355 struct dax_device *dax_dev = _dax_dev;
356 struct inode *inode = &dax_dev->inode;
357
358 inode_init_once(inode);
359}
360
361static int __dax_fs_init(void)
362{
363 int rc;
364
365 dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0,
366 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
367 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
368 init_once);
369 if (!dax_cache)
370 return -ENOMEM;
371
372 rc = register_filesystem(&dax_fs_type);
373 if (rc)
374 goto err_register_fs;
375
376 dax_mnt = kern_mount(&dax_fs_type);
377 if (IS_ERR(dax_mnt)) {
378 rc = PTR_ERR(dax_mnt);
379 goto err_mount;
380 }
381 dax_superblock = dax_mnt->mnt_sb;
382
383 return 0;
384
385 err_mount:
386 unregister_filesystem(&dax_fs_type);
387 err_register_fs:
388 kmem_cache_destroy(dax_cache);
389
390 return rc;
391}
392
393static void __dax_fs_exit(void)
394{
395 kern_unmount(dax_mnt);
396 unregister_filesystem(&dax_fs_type);
397 kmem_cache_destroy(dax_cache);
398}
399
400static int __init dax_fs_init(void)
401{
402 int rc;
403
404 rc = __dax_fs_init();
405 if (rc)
406 return rc;
407
408 nr_dax = max(nr_dax, 256);
409 rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax");
410 if (rc)
411 __dax_fs_exit();
412 return rc;
413}
414
415static void __exit dax_fs_exit(void)
416{
417 unregister_chrdev_region(dax_devt, nr_dax);
418 ida_destroy(&dax_minor_ida);
419 __dax_fs_exit();
420}
421
422MODULE_AUTHOR("Intel Corporation");
423MODULE_LICENSE("GPL v2");
424subsys_initcall(dax_fs_init);
425module_exit(dax_fs_exit);
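
super.c defines the provider-facing lifecycle: alloc_dax() hands out a dax_device backed by an inode on the "dax" pseudo filesystem, kill_dax() revokes it (clear 'alive', then an srcu grace period), and put_dax() drops the final inode reference. A sketch of the expected pairing, assuming a hypothetical driver struct my_disk with a my_dax_direct_access() implementation:

    static const struct dax_operations my_dax_ops = {
            .direct_access = my_dax_direct_access,  /* hypothetical */
    };

    static int my_attach(struct my_disk *d)
    {
            /* the 'host' string keys later dax_get_by_host() lookups */
            d->dax_dev = alloc_dax(d, d->disk_name, &my_dax_ops);
            if (!d->dax_dev)
                    return -ENOMEM;
            return 0;
    }

    static void my_detach(struct my_disk *d)
    {
            kill_dax(d->dax_dev);   /* no new operations after the grace period */
            put_dax(d->dax_dev);    /* final iput() of the dax inode */
            d->dax_dev = NULL;
    }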
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 7468a22f9d10..349ff8813401 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -200,6 +200,7 @@ config BLK_DEV_DM_BUILTIN
200config BLK_DEV_DM 200config BLK_DEV_DM
201 tristate "Device mapper support" 201 tristate "Device mapper support"
202 select BLK_DEV_DM_BUILTIN 202 select BLK_DEV_DM_BUILTIN
203 select DAX
203 ---help--- 204 ---help---
204 Device-mapper is a low level volume manager. It works by allowing 205 Device-mapper is a low level volume manager. It works by allowing
205 people to specify mappings for ranges of logical sectors. Various 206 people to specify mappings for ranges of logical sectors. Various
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 97db4d11c05a..52ca8d059e82 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -58,6 +58,7 @@ struct mapped_device {
58 struct target_type *immutable_target_type; 58 struct target_type *immutable_target_type;
59 59
60 struct gendisk *disk; 60 struct gendisk *disk;
61 struct dax_device *dax_dev;
61 char name[16]; 62 char name[16];
62 63
63 void *interface_ptr; 64 void *interface_ptr;
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index a5120961632a..7d42a9d9f406 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -9,6 +9,7 @@
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/bio.h> 11#include <linux/bio.h>
12#include <linux/dax.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13#include <linux/device-mapper.h> 14#include <linux/device-mapper.h>
14 15
@@ -142,22 +143,20 @@ static int linear_iterate_devices(struct dm_target *ti,
142 return fn(ti, lc->dev, lc->start, ti->len, data); 143 return fn(ti, lc->dev, lc->start, ti->len, data);
143} 144}
144 145
145static long linear_direct_access(struct dm_target *ti, sector_t sector, 146static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
146 void **kaddr, pfn_t *pfn, long size) 147 long nr_pages, void **kaddr, pfn_t *pfn)
147{ 148{
149 long ret;
148 struct linear_c *lc = ti->private; 150 struct linear_c *lc = ti->private;
149 struct block_device *bdev = lc->dev->bdev; 151 struct block_device *bdev = lc->dev->bdev;
150 struct blk_dax_ctl dax = { 152 struct dax_device *dax_dev = lc->dev->dax_dev;
151 .sector = linear_map_sector(ti, sector), 153 sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
152 .size = size, 154
153 }; 155 dev_sector = linear_map_sector(ti, sector);
154 long ret; 156 ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff);
155 157 if (ret)
156 ret = bdev_direct_access(bdev, &dax); 158 return ret;
157 *kaddr = dax.addr; 159 return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
158 *pfn = dax.pfn;
159
160 return ret;
161} 160}
162 161
163static struct target_type linear_target = { 162static struct target_type linear_target = {
@@ -171,7 +170,7 @@ static struct target_type linear_target = {
171 .status = linear_status, 170 .status = linear_status,
172 .prepare_ioctl = linear_prepare_ioctl, 171 .prepare_ioctl = linear_prepare_ioctl,
173 .iterate_devices = linear_iterate_devices, 172 .iterate_devices = linear_iterate_devices,
174 .direct_access = linear_direct_access, 173 .direct_access = linear_dax_direct_access,
175}; 174};
176 175
177int __init dm_linear_init(void) 176int __init dm_linear_init(void)
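
linear_dax_direct_access() above is the consumer side of the new API: map the target-relative sector, convert it to a page offset in the component device with bdev_dax_pgoff(), then let the dax core resolve the mapping. Since dax_direct_access() may return fewer pages than requested, byte-granular consumers clamp and loop; a sketch under that assumption (copy_out() and its bounds handling are illustrative, not dm code):

    #include <linux/dax.h>
    #include <linux/pfn_t.h>
    #include <linux/mm.h>

    static long copy_out(struct dax_device *dax_dev, pgoff_t pgoff,
                    void *buf, size_t bytes)
    {
            long avail;
            void *kaddr;
            pfn_t pfn;
            int id;

            id = dax_read_lock();
            avail = dax_direct_access(dax_dev, pgoff,
                            PHYS_PFN(ALIGN(bytes, PAGE_SIZE)), &kaddr, &pfn);
            if (avail > 0)  /* only avail * PAGE_SIZE bytes are mapped */
                    memcpy(buf, kaddr,
                                    min_t(size_t, bytes, avail * PAGE_SIZE));
            dax_read_unlock(id);

            return avail < 0 ? avail : 0;
    }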
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c65feeada864..e152d9817c81 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -2302,8 +2302,8 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
2302 return do_origin(o->dev, bio); 2302 return do_origin(o->dev, bio);
2303} 2303}
2304 2304
2305static long origin_direct_access(struct dm_target *ti, sector_t sector, 2305static long origin_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
2306 void **kaddr, pfn_t *pfn, long size) 2306 long nr_pages, void **kaddr, pfn_t *pfn)
2307{ 2307{
2308 DMWARN("device does not support dax."); 2308 DMWARN("device does not support dax.");
2309 return -EIO; 2309 return -EIO;
@@ -2368,7 +2368,7 @@ static struct target_type origin_target = {
2368 .postsuspend = origin_postsuspend, 2368 .postsuspend = origin_postsuspend,
2369 .status = origin_status, 2369 .status = origin_status,
2370 .iterate_devices = origin_iterate_devices, 2370 .iterate_devices = origin_iterate_devices,
2371 .direct_access = origin_direct_access, 2371 .direct_access = origin_dax_direct_access,
2372}; 2372};
2373 2373
2374static struct target_type snapshot_target = { 2374static struct target_type snapshot_target = {
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 4b50ae115c6d..75152482f3ad 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -11,6 +11,7 @@
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/blkdev.h> 12#include <linux/blkdev.h>
13#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/dax.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/log2.h> 16#include <linux/log2.h>
16 17
@@ -310,27 +311,25 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
310 return DM_MAPIO_REMAPPED; 311 return DM_MAPIO_REMAPPED;
311} 312}
312 313
313static long stripe_direct_access(struct dm_target *ti, sector_t sector, 314static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
314 void **kaddr, pfn_t *pfn, long size) 315 long nr_pages, void **kaddr, pfn_t *pfn)
315{ 316{
317 sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
316 struct stripe_c *sc = ti->private; 318 struct stripe_c *sc = ti->private;
317 uint32_t stripe; 319 struct dax_device *dax_dev;
318 struct block_device *bdev; 320 struct block_device *bdev;
319 struct blk_dax_ctl dax = { 321 uint32_t stripe;
320 .size = size,
321 };
322 long ret; 322 long ret;
323 323
324 stripe_map_sector(sc, sector, &stripe, &dax.sector); 324 stripe_map_sector(sc, sector, &stripe, &dev_sector);
325 325 dev_sector += sc->stripe[stripe].physical_start;
326 dax.sector += sc->stripe[stripe].physical_start; 326 dax_dev = sc->stripe[stripe].dev->dax_dev;
327 bdev = sc->stripe[stripe].dev->bdev; 327 bdev = sc->stripe[stripe].dev->bdev;
328 328
329 ret = bdev_direct_access(bdev, &dax); 329 ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff);
330 *kaddr = dax.addr; 330 if (ret)
331 *pfn = dax.pfn; 331 return ret;
332 332 return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
333 return ret;
334} 333}
335 334
336/* 335/*
@@ -451,7 +450,7 @@ static struct target_type stripe_target = {
451 .status = stripe_status, 450 .status = stripe_status,
452 .iterate_devices = stripe_iterate_devices, 451 .iterate_devices = stripe_iterate_devices,
453 .io_hints = stripe_io_hints, 452 .io_hints = stripe_io_hints,
454 .direct_access = stripe_direct_access, 453 .direct_access = stripe_dax_direct_access,
455}; 454};
456 455
457int __init dm_stripe_init(void) 456int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 6264ff00dcf0..b242b750542f 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -142,8 +142,8 @@ static void io_err_release_clone_rq(struct request *clone)
142{ 142{
143} 143}
144 144
145static long io_err_direct_access(struct dm_target *ti, sector_t sector, 145static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
146 void **kaddr, pfn_t *pfn, long size) 146 long nr_pages, void **kaddr, pfn_t *pfn)
147{ 147{
148 return -EIO; 148 return -EIO;
149} 149}
@@ -157,7 +157,7 @@ static struct target_type error_target = {
157 .map = io_err_map, 157 .map = io_err_map,
158 .clone_and_map_rq = io_err_clone_and_map_rq, 158 .clone_and_map_rq = io_err_clone_and_map_rq,
159 .release_clone_rq = io_err_release_clone_rq, 159 .release_clone_rq = io_err_release_clone_rq,
160 .direct_access = io_err_direct_access, 160 .direct_access = io_err_dax_direct_access,
161}; 161};
162 162
163int __init dm_target_init(void) 163int __init dm_target_init(void)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 268edf402bbb..6ef9500226c0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -16,6 +16,7 @@
16#include <linux/blkpg.h> 16#include <linux/blkpg.h>
17#include <linux/bio.h> 17#include <linux/bio.h>
18#include <linux/mempool.h> 18#include <linux/mempool.h>
19#include <linux/dax.h>
19#include <linux/slab.h> 20#include <linux/slab.h>
20#include <linux/idr.h> 21#include <linux/idr.h>
21#include <linux/hdreg.h> 22#include <linux/hdreg.h>
@@ -629,6 +630,7 @@ static int open_table_device(struct table_device *td, dev_t dev,
629 } 630 }
630 631
631 td->dm_dev.bdev = bdev; 632 td->dm_dev.bdev = bdev;
633 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
632 return 0; 634 return 0;
633} 635}
634 636
@@ -642,7 +644,9 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
642 644
643 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); 645 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
644 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); 646 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
647 put_dax(td->dm_dev.dax_dev);
645 td->dm_dev.bdev = NULL; 648 td->dm_dev.bdev = NULL;
649 td->dm_dev.dax_dev = NULL;
646} 650}
647 651
648static struct table_device *find_table_device(struct list_head *l, dev_t dev, 652static struct table_device *find_table_device(struct list_head *l, dev_t dev,
@@ -920,31 +924,49 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
920} 924}
921EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 925EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
922 926
923static long dm_blk_direct_access(struct block_device *bdev, sector_t sector, 927static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
924 void **kaddr, pfn_t *pfn, long size) 928 sector_t sector, int *srcu_idx)
925{ 929{
926 struct mapped_device *md = bdev->bd_disk->private_data;
927 struct dm_table *map; 930 struct dm_table *map;
928 struct dm_target *ti; 931 struct dm_target *ti;
929 int srcu_idx;
930 long len, ret = -EIO;
931 932
932 map = dm_get_live_table(md, &srcu_idx); 933 map = dm_get_live_table(md, srcu_idx);
933 if (!map) 934 if (!map)
934 goto out; 935 return NULL;
935 936
936 ti = dm_table_find_target(map, sector); 937 ti = dm_table_find_target(map, sector);
937 if (!dm_target_is_valid(ti)) 938 if (!dm_target_is_valid(ti))
938 goto out; 939 return NULL;
940
941 return ti;
942}
943
944static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
945 long nr_pages, void **kaddr, pfn_t *pfn)
946{
947 struct mapped_device *md = dax_get_private(dax_dev);
948 sector_t sector = pgoff * PAGE_SECTORS;
949 struct dm_target *ti;
950 long len, ret = -EIO;
951 int srcu_idx;
939 952
940 len = max_io_len(sector, ti) << SECTOR_SHIFT; 953 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
941 size = min(len, size);
942 954
955 if (!ti)
956 goto out;
957 if (!ti->type->direct_access)
958 goto out;
959 len = max_io_len(sector, ti) / PAGE_SECTORS;
960 if (len < 1)
961 goto out;
962 nr_pages = min(len, nr_pages);
943 if (ti->type->direct_access) 963 if (ti->type->direct_access)
944 ret = ti->type->direct_access(ti, sector, kaddr, pfn, size); 964 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
945out: 965
966 out:
946 dm_put_live_table(md, srcu_idx); 967 dm_put_live_table(md, srcu_idx);
947 return min(ret, size); 968
969 return ret;
948} 970}
949 971
950/* 972/*
@@ -1471,6 +1493,7 @@ static int next_free_minor(int *minor)
1471} 1493}
1472 1494
1473static const struct block_device_operations dm_blk_dops; 1495static const struct block_device_operations dm_blk_dops;
1496static const struct dax_operations dm_dax_ops;
1474 1497
1475static void dm_wq_work(struct work_struct *work); 1498static void dm_wq_work(struct work_struct *work);
1476 1499
@@ -1517,6 +1540,12 @@ static void cleanup_mapped_device(struct mapped_device *md)
1517 if (md->bs) 1540 if (md->bs)
1518 bioset_free(md->bs); 1541 bioset_free(md->bs);
1519 1542
1543 if (md->dax_dev) {
1544 kill_dax(md->dax_dev);
1545 put_dax(md->dax_dev);
1546 md->dax_dev = NULL;
1547 }
1548
1520 if (md->disk) { 1549 if (md->disk) {
1521 spin_lock(&_minor_lock); 1550 spin_lock(&_minor_lock);
1522 md->disk->private_data = NULL; 1551 md->disk->private_data = NULL;
@@ -1544,6 +1573,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
1544static struct mapped_device *alloc_dev(int minor) 1573static struct mapped_device *alloc_dev(int minor)
1545{ 1574{
1546 int r, numa_node_id = dm_get_numa_node(); 1575 int r, numa_node_id = dm_get_numa_node();
1576 struct dax_device *dax_dev;
1547 struct mapped_device *md; 1577 struct mapped_device *md;
1548 void *old_md; 1578 void *old_md;
1549 1579
@@ -1608,6 +1638,12 @@ static struct mapped_device *alloc_dev(int minor)
1608 md->disk->queue = md->queue; 1638 md->disk->queue = md->queue;
1609 md->disk->private_data = md; 1639 md->disk->private_data = md;
1610 sprintf(md->disk->disk_name, "dm-%d", minor); 1640 sprintf(md->disk->disk_name, "dm-%d", minor);
1641
1642 dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
1643 if (!dax_dev)
1644 goto bad;
1645 md->dax_dev = dax_dev;
1646
1611 add_disk(md->disk); 1647 add_disk(md->disk);
1612 format_dev_t(md->name, MKDEV(_major, minor)); 1648 format_dev_t(md->name, MKDEV(_major, minor));
1613 1649
@@ -2816,12 +2852,15 @@ static const struct block_device_operations dm_blk_dops = {
2816 .open = dm_blk_open, 2852 .open = dm_blk_open,
2817 .release = dm_blk_close, 2853 .release = dm_blk_close,
2818 .ioctl = dm_blk_ioctl, 2854 .ioctl = dm_blk_ioctl,
2819 .direct_access = dm_blk_direct_access,
2820 .getgeo = dm_blk_getgeo, 2855 .getgeo = dm_blk_getgeo,
2821 .pr_ops = &dm_pr_ops, 2856 .pr_ops = &dm_pr_ops,
2822 .owner = THIS_MODULE 2857 .owner = THIS_MODULE
2823}; 2858};
2824 2859
2860static const struct dax_operations dm_dax_ops = {
2861 .direct_access = dm_dax_direct_access,
2862};
2863
2825/* 2864/*
2826 * module hooks 2865 * module hooks
2827 */ 2866 */
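
The dm.c changes tie the stack together: alloc_dax(md, md->disk->disk_name, &dm_dax_ops) publishes the mapped device under its disk name, and open_table_device() resolves each component's dax_device by that same key. dax_get_by_host() takes an inode reference on success and returns NULL for non-dax disks, so the release is unconditional; a sketch of that pairing:

    struct dax_device *dax_dev;

    /* 'host' must match what the component driver passed to alloc_dax() */
    dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
    if (dax_dev) {
            /* ... issue dax_direct_access() calls under dax_read_lock() ... */
    }
    put_dax(dax_dev);       /* put_dax(NULL) is a no-op */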
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index 59e750183b7f..5bdd499b5f4f 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -20,6 +20,7 @@ if LIBNVDIMM
20config BLK_DEV_PMEM 20config BLK_DEV_PMEM
21 tristate "PMEM: Persistent memory block device support" 21 tristate "PMEM: Persistent memory block device support"
22 default LIBNVDIMM 22 default LIBNVDIMM
23 select DAX
23 select ND_BTT if BTT 24 select ND_BTT if BTT
24 select ND_PFN if NVDIMM_PFN 25 select ND_PFN if NVDIMM_PFN
25 help 26 help
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index 97dd2925ed6e..4b76af2b8715 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -314,7 +314,7 @@ int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns)
314 if (rc < 0) { 314 if (rc < 0) {
315 struct nd_btt *nd_btt = to_nd_btt(btt_dev); 315 struct nd_btt *nd_btt = to_nd_btt(btt_dev);
316 316
317 __nd_detach_ndns(btt_dev, &nd_btt->ndns); 317 nd_detach_ndns(btt_dev, &nd_btt->ndns);
318 put_device(btt_dev); 318 put_device(btt_dev);
319 } 319 }
320 320
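
The btt_devs.c change above moves the error path to the locked wrapper: by libnvdimm convention the double-underscore variant asserts that the bus reconfig_mutex is already held (see the claim.c hunks below), while nd_detach_ndns() acquires it itself. A sketch of the two call styles:

    /* caller does not hold the bus lock: use the wrapper */
    nd_detach_ndns(btt_dev, &nd_btt->ndns);

    /* caller is already inside a bus-locked section: use the __ variant */
    nvdimm_bus_lock(&ndns->dev);
    __nd_detach_ndns(dev, &nd_btt->ndns);
    nvdimm_bus_unlock(&ndns->dev);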
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 351bac8f6503..e9361bffe5ee 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -27,6 +27,7 @@
27#include <linux/nd.h> 27#include <linux/nd.h>
28#include "nd-core.h" 28#include "nd-core.h"
29#include "nd.h" 29#include "nd.h"
30#include "pfn.h"
30 31
31int nvdimm_major; 32int nvdimm_major;
32static int nvdimm_bus_major; 33static int nvdimm_bus_major;
@@ -171,6 +172,57 @@ void nvdimm_region_notify(struct nd_region *nd_region, enum nvdimm_event event)
171} 172}
172EXPORT_SYMBOL_GPL(nvdimm_region_notify); 173EXPORT_SYMBOL_GPL(nvdimm_region_notify);
173 174
175struct clear_badblocks_context {
176 resource_size_t phys, cleared;
177};
178
179static int nvdimm_clear_badblocks_region(struct device *dev, void *data)
180{
181 struct clear_badblocks_context *ctx = data;
182 struct nd_region *nd_region;
183 resource_size_t ndr_end;
184 sector_t sector;
185
186 /* make sure device is a region */
187 if (!is_nd_pmem(dev))
188 return 0;
189
190 nd_region = to_nd_region(dev);
191 ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1;
192
193 /* make sure we are in the region */
194 if (ctx->phys < nd_region->ndr_start
195 || (ctx->phys + ctx->cleared) > ndr_end)
196 return 0;
197
198 sector = (ctx->phys - nd_region->ndr_start) / 512;
199 badblocks_clear(&nd_region->bb, sector, ctx->cleared / 512);
200
201 return 0;
202}
203
204static void nvdimm_clear_badblocks_regions(struct nvdimm_bus *nvdimm_bus,
205 phys_addr_t phys, u64 cleared)
206{
207 struct clear_badblocks_context ctx = {
208 .phys = phys,
209 .cleared = cleared,
210 };
211
212 device_for_each_child(&nvdimm_bus->dev, &ctx,
213 nvdimm_clear_badblocks_region);
214}
215
216static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus,
217 phys_addr_t phys, u64 cleared)
218{
219 if (cleared > 0)
220 nvdimm_forget_poison(nvdimm_bus, phys, cleared);
221
222 if (cleared > 0 && cleared / 512)
223 nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared);
224}
225
174long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, 226long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
175 unsigned int len) 227 unsigned int len)
176{ 228{
@@ -218,7 +270,8 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
218 if (cmd_rc < 0) 270 if (cmd_rc < 0)
219 return cmd_rc; 271 return cmd_rc;
220 272
221 nvdimm_clear_from_poison_list(nvdimm_bus, phys, len); 273 nvdimm_account_cleared_poison(nvdimm_bus, phys, clear_err.cleared);
274
222 return clear_err.cleared; 275 return clear_err.cleared;
223} 276}
224EXPORT_SYMBOL_GPL(nvdimm_clear_poison); 277EXPORT_SYMBOL_GPL(nvdimm_clear_poison);
@@ -286,6 +339,7 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
286 init_waitqueue_head(&nvdimm_bus->probe_wait); 339 init_waitqueue_head(&nvdimm_bus->probe_wait);
287 nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); 340 nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
288 mutex_init(&nvdimm_bus->reconfig_mutex); 341 mutex_init(&nvdimm_bus->reconfig_mutex);
342 spin_lock_init(&nvdimm_bus->poison_lock);
289 if (nvdimm_bus->id < 0) { 343 if (nvdimm_bus->id < 0) {
290 kfree(nvdimm_bus); 344 kfree(nvdimm_bus);
291 return NULL; 345 return NULL;
@@ -354,9 +408,9 @@ static int nd_bus_remove(struct device *dev)
354 nd_synchronize(); 408 nd_synchronize();
355 device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); 409 device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
356 410
357 nvdimm_bus_lock(&nvdimm_bus->dev); 411 spin_lock(&nvdimm_bus->poison_lock);
358 free_poison_list(&nvdimm_bus->poison_list); 412 free_poison_list(&nvdimm_bus->poison_list);
359 nvdimm_bus_unlock(&nvdimm_bus->dev); 413 spin_unlock(&nvdimm_bus->poison_lock);
360 414
361 nvdimm_bus_destroy_ndctl(nvdimm_bus); 415 nvdimm_bus_destroy_ndctl(nvdimm_bus);
362 416
@@ -769,16 +823,55 @@ void wait_nvdimm_bus_probe_idle(struct device *dev)
769 } while (true); 823 } while (true);
770} 824}
771 825
772static int pmem_active(struct device *dev, void *data) 826static int nd_pmem_forget_poison_check(struct device *dev, void *data)
773{ 827{
774 if (is_nd_pmem(dev) && dev->driver) 828 struct nd_cmd_clear_error *clear_err =
829 (struct nd_cmd_clear_error *)data;
830 struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL;
831 struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL;
832 struct nd_dax *nd_dax = is_nd_dax(dev) ? to_nd_dax(dev) : NULL;
833 struct nd_namespace_common *ndns = NULL;
834 struct nd_namespace_io *nsio;
835 resource_size_t offset = 0, end_trunc = 0, start, end, pstart, pend;
836
837 if (nd_dax || !dev->driver)
838 return 0;
839
840 start = clear_err->address;
841 end = clear_err->address + clear_err->cleared - 1;
842
843 if (nd_btt || nd_pfn || nd_dax) {
844 if (nd_btt)
845 ndns = nd_btt->ndns;
846 else if (nd_pfn)
847 ndns = nd_pfn->ndns;
848 else if (nd_dax)
849 ndns = nd_dax->nd_pfn.ndns;
850
851 if (!ndns)
852 return 0;
853 } else
854 ndns = to_ndns(dev);
855
856 nsio = to_nd_namespace_io(&ndns->dev);
857 pstart = nsio->res.start + offset;
858 pend = nsio->res.end - end_trunc;
859
860 if ((pstart >= start) && (pend <= end))
775 return -EBUSY; 861 return -EBUSY;
862
776 return 0; 863 return 0;
864
865}
866
867static int nd_ns_forget_poison_check(struct device *dev, void *data)
868{
869 return device_for_each_child(dev, data, nd_pmem_forget_poison_check);
777} 870}
778 871
779/* set_config requires an idle interleave set */ 872/* set_config requires an idle interleave set */
780static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus, 873static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus,
781 struct nvdimm *nvdimm, unsigned int cmd) 874 struct nvdimm *nvdimm, unsigned int cmd, void *data)
782{ 875{
783 struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; 876 struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
784 877
@@ -792,8 +885,8 @@ static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus,
792 885
793 /* require clear error to go through the pmem driver */ 886 /* require clear error to go through the pmem driver */
794 if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR) 887 if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR)
795 return device_for_each_child(&nvdimm_bus->dev, NULL, 888 return device_for_each_child(&nvdimm_bus->dev, data,
796 pmem_active); 889 nd_ns_forget_poison_check);
797 890
798 if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA) 891 if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA)
799 return 0; 892 return 0;
@@ -820,7 +913,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
820 const char *cmd_name, *dimm_name; 913 const char *cmd_name, *dimm_name;
821 unsigned long cmd_mask; 914 unsigned long cmd_mask;
822 void *buf; 915 void *buf;
823 int rc, i; 916 int rc, i, cmd_rc;
824 917
825 if (nvdimm) { 918 if (nvdimm) {
826 desc = nd_cmd_dimm_desc(cmd); 919 desc = nd_cmd_dimm_desc(cmd);
@@ -927,13 +1020,20 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
927 } 1020 }
928 1021
929 nvdimm_bus_lock(&nvdimm_bus->dev); 1022 nvdimm_bus_lock(&nvdimm_bus->dev);
930 rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd); 1023 rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd, buf);
931 if (rc) 1024 if (rc)
932 goto out_unlock; 1025 goto out_unlock;
933 1026
934 rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, NULL); 1027 rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, &cmd_rc);
935 if (rc < 0) 1028 if (rc < 0)
936 goto out_unlock; 1029 goto out_unlock;
1030
1031 if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR && cmd_rc >= 0) {
1032 struct nd_cmd_clear_error *clear_err = buf;
1033
1034 nvdimm_account_cleared_poison(nvdimm_bus, clear_err->address,
1035 clear_err->cleared);
1036 }
937 nvdimm_bus_unlock(&nvdimm_bus->dev); 1037 nvdimm_bus_unlock(&nvdimm_bus->dev);
938 1038
939 if (copy_to_user(p, buf, buf_len)) 1039 if (copy_to_user(p, buf, buf_len))
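
[Editor's note] nd_cmd_clear_to_send() now receives the command payload so the pmem-driver gate can check the exact range, and once the DSM succeeds the ioctl path feeds the cleared range back into the bus's poison accounting. The body of nvdimm_account_cleared_poison() is defined elsewhere in bus.c and not shown in these hunks; a plausible minimal sketch of its shape, assuming it trims the in-memory poison list and then refreshes overlapping region badblocks (nvdimm_clear_badblocks_regions is this sketch's assumption):

static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus,
		phys_addr_t phys, u64 cleared)
{
	if (cleared > 0)
		nvdimm_forget_poison(nvdimm_bus, phys, cleared);

	/* badblocks granularity is 512-byte sectors, so only whole
	 * sectors can be dropped from the region-level view */
	if (cleared / 512)
		nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared);
}
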
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index ca6d572c48fc..93d128da1c92 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -21,8 +21,13 @@
21void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns) 21void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns)
22{ 22{
23 struct nd_namespace_common *ndns = *_ndns; 23 struct nd_namespace_common *ndns = *_ndns;
24 struct nvdimm_bus *nvdimm_bus;
24 25
25 lockdep_assert_held(&ndns->dev.mutex); 26 if (!ndns)
27 return;
28
29 nvdimm_bus = walk_to_nvdimm_bus(&ndns->dev);
30 lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
26 dev_WARN_ONCE(dev, ndns->claim != dev, "%s: invalid claim\n", __func__); 31 dev_WARN_ONCE(dev, ndns->claim != dev, "%s: invalid claim\n", __func__);
27 ndns->claim = NULL; 32 ndns->claim = NULL;
28 *_ndns = NULL; 33 *_ndns = NULL;
@@ -37,18 +42,20 @@ void nd_detach_ndns(struct device *dev,
37 if (!ndns) 42 if (!ndns)
38 return; 43 return;
39 get_device(&ndns->dev); 44 get_device(&ndns->dev);
40 device_lock(&ndns->dev); 45 nvdimm_bus_lock(&ndns->dev);
41 __nd_detach_ndns(dev, _ndns); 46 __nd_detach_ndns(dev, _ndns);
42 device_unlock(&ndns->dev); 47 nvdimm_bus_unlock(&ndns->dev);
43 put_device(&ndns->dev); 48 put_device(&ndns->dev);
44} 49}
45 50
46bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, 51bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
47 struct nd_namespace_common **_ndns) 52 struct nd_namespace_common **_ndns)
48{ 53{
54 struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&attach->dev);
55
49 if (attach->claim) 56 if (attach->claim)
50 return false; 57 return false;
51 lockdep_assert_held(&attach->dev.mutex); 58 lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
52 dev_WARN_ONCE(dev, *_ndns, "%s: invalid claim\n", __func__); 59 dev_WARN_ONCE(dev, *_ndns, "%s: invalid claim\n", __func__);
53 attach->claim = dev; 60 attach->claim = dev;
54 *_ndns = attach; 61 *_ndns = attach;
@@ -61,9 +68,9 @@ bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
61{ 68{
62 bool claimed; 69 bool claimed;
63 70
64 device_lock(&attach->dev); 71 nvdimm_bus_lock(&attach->dev);
65 claimed = __nd_attach_ndns(dev, attach, _ndns); 72 claimed = __nd_attach_ndns(dev, attach, _ndns);
66 device_unlock(&attach->dev); 73 nvdimm_bus_unlock(&attach->dev);
67 return claimed; 74 return claimed;
68} 75}
69 76
@@ -114,7 +121,7 @@ static void nd_detach_and_reset(struct device *dev,
114 struct nd_namespace_common **_ndns) 121 struct nd_namespace_common **_ndns)
115{ 122{
116 /* detach the namespace and destroy / reset the device */ 123 /* detach the namespace and destroy / reset the device */
117 nd_detach_ndns(dev, _ndns); 124 __nd_detach_ndns(dev, _ndns);
118 if (is_idle(dev, *_ndns)) { 125 if (is_idle(dev, *_ndns)) {
119 nd_device_unregister(dev, ND_ASYNC); 126 nd_device_unregister(dev, ND_ASYNC);
120 } else if (is_nd_btt(dev)) { 127 } else if (is_nd_btt(dev)) {
@@ -184,7 +191,7 @@ ssize_t nd_namespace_store(struct device *dev,
184 } 191 }
185 192
186 WARN_ON_ONCE(!is_nvdimm_bus_locked(dev)); 193 WARN_ON_ONCE(!is_nvdimm_bus_locked(dev));
187 if (!nd_attach_ndns(dev, ndns, _ndns)) { 194 if (!__nd_attach_ndns(dev, ndns, _ndns)) {
188 dev_dbg(dev, "%s already claimed\n", 195 dev_dbg(dev, "%s already claimed\n",
189 dev_name(&ndns->dev)); 196 dev_name(&ndns->dev));
190 len = -EBUSY; 197 len = -EBUSY;
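
[Editor's note] The underscore convention enforced in this file: __nd_attach_ndns()/__nd_detach_ndns() assert via lockdep that the caller already holds the bus reconfig lock, while the un-prefixed wrappers take it themselves. nd_namespace_store() runs under nvdimm_bus_lock(), so it must call the __ variant or it would deadlock on its own lock. A self-contained sketch of the idiom (names here are illustrative, not from the driver):

#include <linux/lockdep.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(reconfig_mutex);

/* double-underscore variant: caller must already hold the lock */
static void __claim_op(void)
{
	lockdep_assert_held(&reconfig_mutex);
	/* ... mutate the claim state ... */
}

/* plain variant: self-locking wrapper for callers outside the lock */
static void claim_op(void)
{
	mutex_lock(&reconfig_mutex);
	__claim_op();
	mutex_unlock(&reconfig_mutex);
}
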
@@ -239,22 +246,24 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
239 if (rw == READ) { 246 if (rw == READ) {
240 if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) 247 if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align)))
241 return -EIO; 248 return -EIO;
242 return memcpy_from_pmem(buf, nsio->addr + offset, size); 249 return memcpy_mcsafe(buf, nsio->addr + offset, size);
243 } 250 }
244 251
245 if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) { 252 if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
246 /* 253 /*
247 * FIXME: nsio_rw_bytes() may be called from atomic 254 * FIXME: nsio_rw_bytes() may be called from atomic
248 * context in the btt case and nvdimm_clear_poison() 255 * context in the btt case and the ACPI DSM path for
249 * takes a sleeping lock. Until the locking can be 256 * clearing the error takes sleeping locks and allocates
250 * reworked this capability requires that the namespace 257 * memory. An explicit error clearing path, and support
251 * is not claimed by btt. 258 * for tracking badblocks in BTT metadata is needed to
259 * work around this collision.
252 */ 260 */
253 if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512) 261 if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)
254 && (!ndns->claim || !is_nd_btt(ndns->claim))) { 262 && (!ndns->claim || !is_nd_btt(ndns->claim))) {
255 long cleared; 263 long cleared;
256 264
257 cleared = nvdimm_clear_poison(&ndns->dev, offset, size); 265 cleared = nvdimm_clear_poison(&ndns->dev,
266 nsio->res.start + offset, size);
258 if (cleared < size) 267 if (cleared < size)
259 rc = -EIO; 268 rc = -EIO;
260 if (cleared > 0 && cleared / 512) { 269 if (cleared > 0 && cleared / 512) {
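
[Editor's note] The memcpy_from_pmem() call becomes memcpy_mcsafe(): a machine-check-recoverable copy that reports consumed poison as an error instead of taking the machine down. The caller pattern, as used above and again in pmem.c below (a sketch; pmem_safe_read is a hypothetical name, and the zero-on-success return convention is assumed from this kernel's x86 implementation):

static int pmem_safe_read(void *dst, const void *pmem_addr, size_t len)
{
	/* a machine check mid-copy surfaces as an error here rather
	 * than a panic, so the caller can report a normal I/O error */
	if (memcpy_mcsafe(dst, pmem_addr, len))
		return -EIO;
	return 0;
}
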
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 9303cfeb8bee..2dee908e4bae 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -518,6 +518,15 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region,
518} 518}
519EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); 519EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
520 520
521static void append_poison_entry(struct nvdimm_bus *nvdimm_bus,
522 struct nd_poison *pl, u64 addr, u64 length)
523{
524 lockdep_assert_held(&nvdimm_bus->poison_lock);
525 pl->start = addr;
526 pl->length = length;
527 list_add_tail(&pl->list, &nvdimm_bus->poison_list);
528}
529
521static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length, 530static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length,
522 gfp_t flags) 531 gfp_t flags)
523{ 532{
@@ -527,19 +536,24 @@ static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length,
527 if (!pl) 536 if (!pl)
528 return -ENOMEM; 537 return -ENOMEM;
529 538
530 pl->start = addr; 539 append_poison_entry(nvdimm_bus, pl, addr, length);
531 pl->length = length;
532 list_add_tail(&pl->list, &nvdimm_bus->poison_list);
533
534 return 0; 540 return 0;
535} 541}
536 542
537static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) 543static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
538{ 544{
539 struct nd_poison *pl; 545 struct nd_poison *pl, *pl_new;
540 546
541 if (list_empty(&nvdimm_bus->poison_list)) 547 spin_unlock(&nvdimm_bus->poison_lock);
542 return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); 548 pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL);
549 spin_lock(&nvdimm_bus->poison_lock);
550
551 if (list_empty(&nvdimm_bus->poison_list)) {
552 if (!pl_new)
553 return -ENOMEM;
554 append_poison_entry(nvdimm_bus, pl_new, addr, length);
555 return 0;
556 }
543 557
544 /* 558 /*
545 * There is a chance this is a duplicate, check for those first. 559 * There is a chance this is a duplicate, check for those first.
@@ -551,6 +565,7 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
551 /* If length has changed, update this list entry */ 565 /* If length has changed, update this list entry */
552 if (pl->length != length) 566 if (pl->length != length)
553 pl->length = length; 567 pl->length = length;
568 kfree(pl_new);
554 return 0; 569 return 0;
555 } 570 }
556 571
@@ -559,29 +574,33 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
559 * as any overlapping ranges will get resolved when the list is consumed 574 * as any overlapping ranges will get resolved when the list is consumed
560 * and converted to badblocks 575 * and converted to badblocks
561 */ 576 */
562 return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); 577 if (!pl_new)
578 return -ENOMEM;
579 append_poison_entry(nvdimm_bus, pl_new, addr, length);
580
581 return 0;
563} 582}
564 583
565int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) 584int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
566{ 585{
567 int rc; 586 int rc;
568 587
569 nvdimm_bus_lock(&nvdimm_bus->dev); 588 spin_lock(&nvdimm_bus->poison_lock);
570 rc = bus_add_poison(nvdimm_bus, addr, length); 589 rc = bus_add_poison(nvdimm_bus, addr, length);
571 nvdimm_bus_unlock(&nvdimm_bus->dev); 590 spin_unlock(&nvdimm_bus->poison_lock);
572 591
573 return rc; 592 return rc;
574} 593}
575EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); 594EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
576 595
577void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, 596void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
578 phys_addr_t start, unsigned int len) 597 unsigned int len)
579{ 598{
580 struct list_head *poison_list = &nvdimm_bus->poison_list; 599 struct list_head *poison_list = &nvdimm_bus->poison_list;
581 u64 clr_end = start + len - 1; 600 u64 clr_end = start + len - 1;
582 struct nd_poison *pl, *next; 601 struct nd_poison *pl, *next;
583 602
584 nvdimm_bus_lock(&nvdimm_bus->dev); 603 spin_lock(&nvdimm_bus->poison_lock);
585 WARN_ON_ONCE(list_empty(poison_list)); 604 WARN_ON_ONCE(list_empty(poison_list));
586 605
587 /* 606 /*
@@ -628,15 +647,15 @@ void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus,
628 u64 new_len = pl_end - new_start + 1; 647 u64 new_len = pl_end - new_start + 1;
629 648
630 /* Add new entry covering the right half */ 649 /* Add new entry covering the right half */
631 add_poison(nvdimm_bus, new_start, new_len, GFP_NOIO); 650 add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT);
632 /* Adjust this entry to cover the left half */ 651 /* Adjust this entry to cover the left half */
633 pl->length = start - pl->start; 652 pl->length = start - pl->start;
634 continue; 653 continue;
635 } 654 }
636 } 655 }
637 nvdimm_bus_unlock(&nvdimm_bus->dev); 656 spin_unlock(&nvdimm_bus->poison_lock);
638} 657}
639EXPORT_SYMBOL_GPL(nvdimm_clear_from_poison_list); 658EXPORT_SYMBOL_GPL(nvdimm_forget_poison);
640 659
641#ifdef CONFIG_BLK_DEV_INTEGRITY 660#ifdef CONFIG_BLK_DEV_INTEGRITY
642int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) 661int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
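
[Editor's note] bus_add_poison() now does the classic allocate-outside-the-spinlock dance: GFP_KERNEL may sleep and a spinlock holder may not, so the lock is dropped for the allocation and the list is re-validated once it is retaken, since an entry may have appeared (or turned out to be a duplicate) in the window. A self-contained illustration of the pattern, using this sketch's own types and names, entered with the lock held exactly like bus_add_poison():

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct range_entry {
	struct list_head list;
	u64 start, len;
};

/* called with *lock held */
static int add_range(spinlock_t *lock, struct list_head *head,
		u64 start, u64 len)
{
	struct range_entry *pos, *new;

	spin_unlock(lock);		/* drop: kzalloc may sleep */
	new = kzalloc(sizeof(*new), GFP_KERNEL);
	spin_lock(lock);		/* retake, then re-validate */

	list_for_each_entry(pos, head, list) {
		if (pos->start == start) {
			pos->len = len;	/* raced: entry already present */
			kfree(new);
			return 0;
		}
	}

	if (!new)
		return -ENOMEM;
	new->start = start;
	new->len = len;
	list_add_tail(&new->list, head);
	return 0;
}
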
diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c
index 45fa82cae87c..c1b6556aea6e 100644
--- a/drivers/nvdimm/dax_devs.c
+++ b/drivers/nvdimm/dax_devs.c
@@ -124,7 +124,7 @@ int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns)
124 dev_dbg(dev, "%s: dax: %s\n", __func__, 124 dev_dbg(dev, "%s: dax: %s\n", __func__,
125 rc == 0 ? dev_name(dax_dev) : "<none>"); 125 rc == 0 ? dev_name(dax_dev) : "<none>");
126 if (rc < 0) { 126 if (rc < 0) {
127 __nd_detach_ndns(dax_dev, &nd_pfn->ndns); 127 nd_detach_ndns(dax_dev, &nd_pfn->ndns);
128 put_device(dax_dev); 128 put_device(dax_dev);
129 } else 129 } else
130 __nd_device_register(dax_dev); 130 __nd_device_register(dax_dev);
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
index ee0b412827bf..e0f0e3ce1a32 100644
--- a/drivers/nvdimm/dimm.c
+++ b/drivers/nvdimm/dimm.c
@@ -49,6 +49,8 @@ static int nvdimm_probe(struct device *dev)
49 kref_init(&ndd->kref); 49 kref_init(&ndd->kref);
50 50
51 rc = nvdimm_init_nsarea(ndd); 51 rc = nvdimm_init_nsarea(ndd);
52 if (rc == -EACCES)
53 nvdimm_set_locked(dev);
52 if (rc) 54 if (rc)
53 goto err; 55 goto err;
54 56
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 8b721321be5b..fac1e9fbd11d 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -34,7 +34,7 @@ int nvdimm_check_config_data(struct device *dev)
34 34
35 if (!nvdimm->cmd_mask || 35 if (!nvdimm->cmd_mask ||
36 !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) { 36 !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) {
37 if (nvdimm->flags & NDD_ALIASING) 37 if (test_bit(NDD_ALIASING, &nvdimm->flags))
38 return -ENXIO; 38 return -ENXIO;
39 else 39 else
40 return -ENOTTY; 40 return -ENOTTY;
@@ -67,6 +67,7 @@ int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd)
67 struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); 67 struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
68 struct nvdimm_bus_descriptor *nd_desc; 68 struct nvdimm_bus_descriptor *nd_desc;
69 int rc = validate_dimm(ndd); 69 int rc = validate_dimm(ndd);
70 int cmd_rc = 0;
70 71
71 if (rc) 72 if (rc)
72 return rc; 73 return rc;
@@ -76,8 +77,11 @@ int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd)
76 77
77 memset(cmd, 0, sizeof(*cmd)); 78 memset(cmd, 0, sizeof(*cmd));
78 nd_desc = nvdimm_bus->nd_desc; 79 nd_desc = nvdimm_bus->nd_desc;
79 return nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), 80 rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
80 ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd), NULL); 81 ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd), &cmd_rc);
82 if (rc < 0)
83 return rc;
84 return cmd_rc;
81} 85}
82 86
83int nvdimm_init_config_data(struct nvdimm_drvdata *ndd) 87int nvdimm_init_config_data(struct nvdimm_drvdata *ndd)
@@ -188,7 +192,14 @@ void nvdimm_set_aliasing(struct device *dev)
188{ 192{
189 struct nvdimm *nvdimm = to_nvdimm(dev); 193 struct nvdimm *nvdimm = to_nvdimm(dev);
190 194
191 nvdimm->flags |= NDD_ALIASING; 195 set_bit(NDD_ALIASING, &nvdimm->flags);
196}
197
198void nvdimm_set_locked(struct device *dev)
199{
200 struct nvdimm *nvdimm = to_nvdimm(dev);
201
202 set_bit(NDD_LOCKED, &nvdimm->flags);
192} 203}
193 204
194static void nvdimm_release(struct device *dev) 205static void nvdimm_release(struct device *dev)
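
[Editor's note] nvdimm_init_nsarea() now returns two-level status: rc reports whether the command reached the device at all, while cmd_rc carries the firmware's verdict translated to an errno. That split is what lets nvdimm_probe() in dimm.c above recognize -EACCES and mark the DIMM locked. The calling convention as a generic sketch (run_cmd is a hypothetical wrapper, not in the tree):

static int run_cmd(struct nvdimm_bus_descriptor *nd_desc,
		struct nvdimm *nvdimm, unsigned int cmd,
		void *buf, unsigned int buf_len)
{
	int cmd_rc = 0;
	int rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, &cmd_rc);

	if (rc < 0)		/* transport / plumbing failure */
		return rc;
	return cmd_rc;		/* firmware status, e.g. -EACCES when locked */
}
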
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 1b481a5fb966..2f9dfbd2dbec 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -2236,14 +2236,21 @@ static int init_active_labels(struct nd_region *nd_region)
2236 int count, j; 2236 int count, j;
2237 2237
2238 /* 2238 /*
2239 * If the dimm is disabled then prevent the region from 2239 * If the dimm is disabled then we may need to prevent
2240 * being activated if it aliases DPA. 2240 * the region from being activated.
2241 */ 2241 */
2242 if (!ndd) { 2242 if (!ndd) {
2243 if ((nvdimm->flags & NDD_ALIASING) == 0) 2243 if (test_bit(NDD_LOCKED, &nvdimm->flags))
2244 /* fail, label data may be unreadable */;
2245 else if (test_bit(NDD_ALIASING, &nvdimm->flags))
2246 /* fail, labels needed to disambiguate dpa */;
2247 else
2244 return 0; 2248 return 0;
2245 dev_dbg(&nd_region->dev, "%s: is disabled, failing probe\n", 2249
2246 dev_name(&nd_mapping->nvdimm->dev)); 2250 dev_err(&nd_region->dev, "%s: is %s, failing probe\n",
2251 dev_name(&nd_mapping->nvdimm->dev),
2252 test_bit(NDD_LOCKED, &nvdimm->flags)
2253 ? "locked" : "disabled");
2247 return -ENXIO; 2254 return -ENXIO;
2248 } 2255 }
2249 nd_mapping->ndd = ndd; 2256 nd_mapping->ndd = ndd;
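
[Editor's note] The empty-bodied if/else ladder above is terse; an equivalent, more explicit rendering of the same decision (an illustrative rewrite only, not what the patch applies):

if (!ndd) {
	bool locked = test_bit(NDD_LOCKED, &nvdimm->flags);
	bool aliasing = test_bit(NDD_ALIASING, &nvdimm->flags);

	/* a disabled DIMM only blocks the region when its labels
	 * matter: unreadable because the DIMM is locked, or needed
	 * to disambiguate aliased DPA */
	if (!locked && !aliasing)
		return 0;

	dev_err(&nd_region->dev, "%s: is %s, failing probe\n",
			dev_name(&nvdimm->dev),
			locked ? "locked" : "disabled");
	return -ENXIO;
}
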
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 8623e57c2ce3..4c4bd209e725 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -32,6 +32,7 @@ struct nvdimm_bus {
32 struct list_head poison_list; 32 struct list_head poison_list;
33 struct list_head mapping_list; 33 struct list_head mapping_list;
34 struct mutex reconfig_mutex; 34 struct mutex reconfig_mutex;
35 spinlock_t poison_lock;
35}; 36};
36 37
37struct nvdimm { 38struct nvdimm {
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 2a99c83aa19f..77d032192bf7 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -154,6 +154,7 @@ struct nd_region {
154 u64 ndr_start; 154 u64 ndr_start;
155 int id, num_lanes, ro, numa_node; 155 int id, num_lanes, ro, numa_node;
156 void *provider_data; 156 void *provider_data;
157 struct badblocks bb;
157 struct nd_interleave_set *nd_set; 158 struct nd_interleave_set *nd_set;
158 struct nd_percpu_lane __percpu *lane; 159 struct nd_percpu_lane __percpu *lane;
159 struct nd_mapping mapping[0]; 160 struct nd_mapping mapping[0];
@@ -239,6 +240,7 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
239long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, 240long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
240 unsigned int len); 241 unsigned int len);
241void nvdimm_set_aliasing(struct device *dev); 242void nvdimm_set_aliasing(struct device *dev);
243void nvdimm_set_locked(struct device *dev);
242struct nd_btt *to_nd_btt(struct device *dev); 244struct nd_btt *to_nd_btt(struct device *dev);
243 245
244struct nd_gen_sb { 246struct nd_gen_sb {
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 6c033c9a2f06..335c8175410b 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -484,7 +484,7 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns)
484 dev_dbg(dev, "%s: pfn: %s\n", __func__, 484 dev_dbg(dev, "%s: pfn: %s\n", __func__,
485 rc == 0 ? dev_name(pfn_dev) : "<none>"); 485 rc == 0 ? dev_name(pfn_dev) : "<none>");
486 if (rc < 0) { 486 if (rc < 0) {
487 __nd_detach_ndns(pfn_dev, &nd_pfn->ndns); 487 nd_detach_ndns(pfn_dev, &nd_pfn->ndns);
488 put_device(pfn_dev); 488 put_device(pfn_dev);
489 } else 489 } else
490 __nd_device_register(pfn_dev); 490 __nd_device_register(pfn_dev);
@@ -538,7 +538,8 @@ static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
538 nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); 538 nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
539 altmap = NULL; 539 altmap = NULL;
540 } else if (nd_pfn->mode == PFN_MODE_PMEM) { 540 } else if (nd_pfn->mode == PFN_MODE_PMEM) {
541 nd_pfn->npfns = (resource_size(res) - offset) / PAGE_SIZE; 541 nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res)
542 - offset) / PAGE_SIZE);
542 if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns) 543 if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
543 dev_info(&nd_pfn->dev, 544 dev_info(&nd_pfn->dev,
544 "number of pfns truncated from %lld to %ld\n", 545 "number of pfns truncated from %lld to %ld\n",
@@ -625,7 +626,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
625 */ 626 */
626 start += start_pad; 627 start += start_pad;
627 size = resource_size(&nsio->res); 628 size = resource_size(&nsio->res);
628 npfns = (size - start_pad - end_trunc - SZ_8K) / SZ_4K; 629 npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K)
630 / PAGE_SIZE);
629 if (nd_pfn->mode == PFN_MODE_PMEM) { 631 if (nd_pfn->mode == PFN_MODE_PMEM) {
630 /* 632 /*
631 * vmemmap_populate_hugepages() allocates the memmap array in 633 * vmemmap_populate_hugepages() allocates the memmap array in
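
[Editor's note] PFN_SECTION_ALIGN_UP() rounds the pfn count up to a whole memory-hotplug section so the reserved memmap covers everything the architecture will actually map. Worked numbers, assuming x86-64 defaults (128M sections, 4K pages, hence 32768 pages per section):

#include <linux/kernel.h>

/* illustrative arithmetic only; constants assume x86-64 defaults */
static unsigned long section_align_up(unsigned long npfns)
{
	const unsigned long pages_per_section = 1UL << (27 - 12); /* 32768 */

	return roundup(npfns, pages_per_section);
}

/* section_align_up(100000) == 131072: a namespace exposing 100000
 * pages still gets struct pages reserved for four full sections */
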
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index fbc640bf06b0..c544d466ea51 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -29,6 +29,7 @@
29#include <linux/pfn_t.h> 29#include <linux/pfn_t.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/pmem.h> 31#include <linux/pmem.h>
32#include <linux/dax.h>
32#include <linux/nd.h> 33#include <linux/nd.h>
33#include "pmem.h" 34#include "pmem.h"
34#include "pfn.h" 35#include "pfn.h"
@@ -89,7 +90,7 @@ static int read_pmem(struct page *page, unsigned int off,
89 int rc; 90 int rc;
90 void *mem = kmap_atomic(page); 91 void *mem = kmap_atomic(page);
91 92
92 rc = memcpy_from_pmem(mem + off, pmem_addr, len); 93 rc = memcpy_mcsafe(mem + off, pmem_addr, len);
93 kunmap_atomic(mem); 94 kunmap_atomic(mem);
94 if (rc) 95 if (rc)
95 return -EIO; 96 return -EIO;
@@ -200,13 +201,13 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
200} 201}
201 202
202/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ 203/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
203__weak long pmem_direct_access(struct block_device *bdev, sector_t sector, 204__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
204 void **kaddr, pfn_t *pfn, long size) 205 long nr_pages, void **kaddr, pfn_t *pfn)
205{ 206{
206 struct pmem_device *pmem = bdev->bd_queue->queuedata; 207 resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
207 resource_size_t offset = sector * 512 + pmem->data_offset;
208 208
209 if (unlikely(is_bad_pmem(&pmem->bb, sector, size))) 209 if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
210 PFN_PHYS(nr_pages))))
210 return -EIO; 211 return -EIO;
211 *kaddr = pmem->virt_addr + offset; 212 *kaddr = pmem->virt_addr + offset;
212 *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); 213 *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
@@ -216,17 +217,28 @@ __weak long pmem_direct_access(struct block_device *bdev, sector_t sector,
216 * requested range. 217 * requested range.
217 */ 218 */
218 if (unlikely(pmem->bb.count)) 219 if (unlikely(pmem->bb.count))
219 return size; 220 return nr_pages;
220 return pmem->size - pmem->pfn_pad - offset; 221 return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
221} 222}
222 223
223static const struct block_device_operations pmem_fops = { 224static const struct block_device_operations pmem_fops = {
224 .owner = THIS_MODULE, 225 .owner = THIS_MODULE,
225 .rw_page = pmem_rw_page, 226 .rw_page = pmem_rw_page,
226 .direct_access = pmem_direct_access,
227 .revalidate_disk = nvdimm_revalidate_disk, 227 .revalidate_disk = nvdimm_revalidate_disk,
228}; 228};
229 229
230static long pmem_dax_direct_access(struct dax_device *dax_dev,
231 pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
232{
233 struct pmem_device *pmem = dax_get_private(dax_dev);
234
235 return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
236}
237
238static const struct dax_operations pmem_dax_ops = {
239 .direct_access = pmem_dax_direct_access,
240};
241
230static void pmem_release_queue(void *q) 242static void pmem_release_queue(void *q)
231{ 243{
232 blk_cleanup_queue(q); 244 blk_cleanup_queue(q);
@@ -237,10 +249,14 @@ static void pmem_freeze_queue(void *q)
237 blk_freeze_queue_start(q); 249 blk_freeze_queue_start(q);
238} 250}
239 251
240static void pmem_release_disk(void *disk) 252static void pmem_release_disk(void *__pmem)
241{ 253{
242 del_gendisk(disk); 254 struct pmem_device *pmem = __pmem;
243 put_disk(disk); 255
256 kill_dax(pmem->dax_dev);
257 put_dax(pmem->dax_dev);
258 del_gendisk(pmem->disk);
259 put_disk(pmem->disk);
244} 260}
245 261
246static int pmem_attach_disk(struct device *dev, 262static int pmem_attach_disk(struct device *dev,
@@ -251,6 +267,7 @@ static int pmem_attach_disk(struct device *dev,
251 struct vmem_altmap __altmap, *altmap = NULL; 267 struct vmem_altmap __altmap, *altmap = NULL;
252 struct resource *res = &nsio->res; 268 struct resource *res = &nsio->res;
253 struct nd_pfn *nd_pfn = NULL; 269 struct nd_pfn *nd_pfn = NULL;
270 struct dax_device *dax_dev;
254 int nid = dev_to_node(dev); 271 int nid = dev_to_node(dev);
255 struct nd_pfn_sb *pfn_sb; 272 struct nd_pfn_sb *pfn_sb;
256 struct pmem_device *pmem; 273 struct pmem_device *pmem;
@@ -334,6 +351,7 @@ static int pmem_attach_disk(struct device *dev,
334 disk = alloc_disk_node(0, nid); 351 disk = alloc_disk_node(0, nid);
335 if (!disk) 352 if (!disk)
336 return -ENOMEM; 353 return -ENOMEM;
354 pmem->disk = disk;
337 355
338 disk->fops = &pmem_fops; 356 disk->fops = &pmem_fops;
339 disk->queue = q; 357 disk->queue = q;
@@ -345,9 +363,16 @@ static int pmem_attach_disk(struct device *dev,
345 return -ENOMEM; 363 return -ENOMEM;
346 nvdimm_badblocks_populate(nd_region, &pmem->bb, res); 364 nvdimm_badblocks_populate(nd_region, &pmem->bb, res);
347 disk->bb = &pmem->bb; 365 disk->bb = &pmem->bb;
348 device_add_disk(dev, disk);
349 366
350 if (devm_add_action_or_reset(dev, pmem_release_disk, disk)) 367 dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
368 if (!dax_dev) {
369 put_disk(disk);
370 return -ENOMEM;
371 }
372 pmem->dax_dev = dax_dev;
373
374 device_add_disk(dev, disk);
375 if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
351 return -ENOMEM; 376 return -ENOMEM;
352 377
353 revalidate_disk(disk); 378 revalidate_disk(disk);
@@ -397,12 +422,12 @@ static void nd_pmem_shutdown(struct device *dev)
397 422
398static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) 423static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
399{ 424{
400 struct pmem_device *pmem = dev_get_drvdata(dev); 425 struct nd_region *nd_region;
401 struct nd_region *nd_region = to_region(pmem);
402 resource_size_t offset = 0, end_trunc = 0; 426 resource_size_t offset = 0, end_trunc = 0;
403 struct nd_namespace_common *ndns; 427 struct nd_namespace_common *ndns;
404 struct nd_namespace_io *nsio; 428 struct nd_namespace_io *nsio;
405 struct resource res; 429 struct resource res;
430 struct badblocks *bb;
406 431
407 if (event != NVDIMM_REVALIDATE_POISON) 432 if (event != NVDIMM_REVALIDATE_POISON)
408 return; 433 return;
@@ -411,20 +436,33 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
411 struct nd_btt *nd_btt = to_nd_btt(dev); 436 struct nd_btt *nd_btt = to_nd_btt(dev);
412 437
413 ndns = nd_btt->ndns; 438 ndns = nd_btt->ndns;
414 } else if (is_nd_pfn(dev)) { 439 nd_region = to_nd_region(ndns->dev.parent);
415 struct nd_pfn *nd_pfn = to_nd_pfn(dev); 440 nsio = to_nd_namespace_io(&ndns->dev);
416 struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; 441 bb = &nsio->bb;
442 } else {
443 struct pmem_device *pmem = dev_get_drvdata(dev);
417 444
418 ndns = nd_pfn->ndns; 445 nd_region = to_region(pmem);
419 offset = pmem->data_offset + __le32_to_cpu(pfn_sb->start_pad); 446 bb = &pmem->bb;
420 end_trunc = __le32_to_cpu(pfn_sb->end_trunc); 447
421 } else 448 if (is_nd_pfn(dev)) {
422 ndns = to_ndns(dev); 449 struct nd_pfn *nd_pfn = to_nd_pfn(dev);
450 struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
451
452 ndns = nd_pfn->ndns;
453 offset = pmem->data_offset +
454 __le32_to_cpu(pfn_sb->start_pad);
455 end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
456 } else {
457 ndns = to_ndns(dev);
458 }
459
460 nsio = to_nd_namespace_io(&ndns->dev);
461 }
423 462
424 nsio = to_nd_namespace_io(&ndns->dev);
425 res.start = nsio->res.start + offset; 463 res.start = nsio->res.start + offset;
426 res.end = nsio->res.end - end_trunc; 464 res.end = nsio->res.end - end_trunc;
427 nvdimm_badblocks_populate(nd_region, &pmem->bb, &res); 465 nvdimm_badblocks_populate(nd_region, bb, &res);
428} 466}
429 467
430MODULE_ALIAS("pmem"); 468MODULE_ALIAS("pmem");
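
[Editor's note] The pmem conversion above is the template every DAX-capable driver in this series follows: ->direct_access() moves to pgoff/nr_pages units, hangs off a dax_operations table, and a dax_device is registered next to the gendisk, then torn down with kill_dax()/put_dax() before the disk goes away. Condensed to its skeleton (the foo_* names are placeholders, error handling trimmed):

#include <linux/dax.h>
#include <linux/pfn_t.h>

struct foo_device {			/* placeholder driver state */
	void *virt_addr;
	phys_addr_t phys_addr;
	u64 pfn_flags;
	size_t size;
	struct dax_device *dax_dev;
};

static long foo_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
		long nr_pages, void **kaddr, pfn_t *pfn)
{
	struct foo_device *foo = dax_get_private(dax_dev);

	*kaddr = foo->virt_addr + PFN_PHYS(pgoff);
	*pfn = phys_to_pfn_t(foo->phys_addr + PFN_PHYS(pgoff),
			foo->pfn_flags);
	/* contract: return the number of pages valid from pgoff */
	return PHYS_PFN(foo->size) - pgoff;
}

static const struct dax_operations foo_dax_ops = {
	.direct_access = foo_dax_direct_access,
};

/* at attach, once the gendisk exists:
 *	foo->dax_dev = alloc_dax(foo, disk->disk_name, &foo_dax_ops);
 */
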
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index b4ee4f71b4a1..7f4dbd72a90a 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -5,8 +5,6 @@
5#include <linux/pfn_t.h> 5#include <linux/pfn_t.h>
6#include <linux/fs.h> 6#include <linux/fs.h>
7 7
8long pmem_direct_access(struct block_device *bdev, sector_t sector,
9 void **kaddr, pfn_t *pfn, long size);
10/* this definition is in its own header for tools/testing/nvdimm to consume */ 8/* this definition is in its own header for tools/testing/nvdimm to consume */
11struct pmem_device { 9struct pmem_device {
12 /* One contiguous memory region per device */ 10 /* One contiguous memory region per device */
@@ -20,5 +18,10 @@ struct pmem_device {
20 /* trim size when namespace capacity has been section aligned */ 18 /* trim size when namespace capacity has been section aligned */
21 u32 pfn_pad; 19 u32 pfn_pad;
22 struct badblocks bb; 20 struct badblocks bb;
21 struct dax_device *dax_dev;
22 struct gendisk *disk;
23}; 23};
24
25long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
26 long nr_pages, void **kaddr, pfn_t *pfn);
24#endif /* __NVDIMM_PMEM_H__ */ 27#endif /* __NVDIMM_PMEM_H__ */
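
[Editor's note] The __weak annotation on __pmem_direct_access() in pmem.c above is what the "strong declaration" comment refers to: tools/testing/nvdimm builds a non-weak definition with the same signature, the linker prefers it, and the test harness interposes without any #ifdef in the driver. The mechanism in miniature (generic names, two translation units; simulate_media is a hypothetical test hook):

/* driver.c: default, overridable definition */
__weak long do_access(unsigned long pgoff)
{
	return 0;			/* production behavior */
}

/* test.c: same signature, no __weak -- the linker picks this
 * "strong" definition over the weak one when both are linked */
long do_access(unsigned long pgoff)
{
	return simulate_media(pgoff);	/* hypothetical test hook */
}
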
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 8f241772ec0b..869a886c292e 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/device.h> 15#include <linux/device.h>
16#include <linux/nd.h> 16#include <linux/nd.h>
17#include "nd-core.h"
17#include "nd.h" 18#include "nd.h"
18 19
19static int nd_region_probe(struct device *dev) 20static int nd_region_probe(struct device *dev)
@@ -52,6 +53,17 @@ static int nd_region_probe(struct device *dev)
52 if (rc && err && rc == err) 53 if (rc && err && rc == err)
53 return -ENODEV; 54 return -ENODEV;
54 55
56 if (is_nd_pmem(&nd_region->dev)) {
57 struct resource ndr_res;
58
59 if (devm_init_badblocks(dev, &nd_region->bb))
60 return -ENODEV;
61 ndr_res.start = nd_region->ndr_start;
62 ndr_res.end = nd_region->ndr_start + nd_region->ndr_size - 1;
63 nvdimm_badblocks_populate(nd_region,
64 &nd_region->bb, &ndr_res);
65 }
66
55 nd_region->btt_seed = nd_btt_create(nd_region); 67 nd_region->btt_seed = nd_btt_create(nd_region);
56 nd_region->pfn_seed = nd_pfn_create(nd_region); 68 nd_region->pfn_seed = nd_pfn_create(nd_region);
57 nd_region->dax_seed = nd_dax_create(nd_region); 69 nd_region->dax_seed = nd_dax_create(nd_region);
@@ -104,6 +116,18 @@ static int child_notify(struct device *dev, void *data)
104 116
105static void nd_region_notify(struct device *dev, enum nvdimm_event event) 117static void nd_region_notify(struct device *dev, enum nvdimm_event event)
106{ 118{
119 if (event == NVDIMM_REVALIDATE_POISON) {
120 struct nd_region *nd_region = to_nd_region(dev);
121 struct resource res;
122
123 if (is_nd_pmem(&nd_region->dev)) {
124 res.start = nd_region->ndr_start;
125 res.end = nd_region->ndr_start +
126 nd_region->ndr_size - 1;
127 nvdimm_badblocks_populate(nd_region,
128 &nd_region->bb, &res);
129 }
130 }
107 device_for_each_child(dev, &event, child_notify); 131 device_for_each_child(dev, &event, child_notify);
108} 132}
109 133
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index b7cb5066d961..b550edf2571f 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -222,7 +222,7 @@ int nd_region_to_nstype(struct nd_region *nd_region)
222 struct nd_mapping *nd_mapping = &nd_region->mapping[i]; 222 struct nd_mapping *nd_mapping = &nd_region->mapping[i];
223 struct nvdimm *nvdimm = nd_mapping->nvdimm; 223 struct nvdimm *nvdimm = nd_mapping->nvdimm;
224 224
225 if (nvdimm->flags & NDD_ALIASING) 225 if (test_bit(NDD_ALIASING, &nvdimm->flags))
226 alias++; 226 alias++;
227 } 227 }
228 if (alias) 228 if (alias)
@@ -255,6 +255,35 @@ static ssize_t size_show(struct device *dev,
255} 255}
256static DEVICE_ATTR_RO(size); 256static DEVICE_ATTR_RO(size);
257 257
258static ssize_t deep_flush_show(struct device *dev,
259 struct device_attribute *attr, char *buf)
260{
261 struct nd_region *nd_region = to_nd_region(dev);
262
263 /*
264 * NOTE: in the nvdimm_has_flush() error case this attribute is
265 * not visible.
266 */
267 return sprintf(buf, "%d\n", nvdimm_has_flush(nd_region));
268}
269
270static ssize_t deep_flush_store(struct device *dev, struct device_attribute *attr,
271 const char *buf, size_t len)
272{
273 bool flush;
274 int rc = strtobool(buf, &flush);
275 struct nd_region *nd_region = to_nd_region(dev);
276
277 if (rc)
278 return rc;
279 if (!flush)
280 return -EINVAL;
281 nvdimm_flush(nd_region);
282
283 return len;
284}
285static DEVICE_ATTR_RW(deep_flush);
286
258static ssize_t mappings_show(struct device *dev, 287static ssize_t mappings_show(struct device *dev,
259 struct device_attribute *attr, char *buf) 288 struct device_attribute *attr, char *buf)
260{ 289{
@@ -448,6 +477,25 @@ static ssize_t read_only_store(struct device *dev,
448} 477}
449static DEVICE_ATTR_RW(read_only); 478static DEVICE_ATTR_RW(read_only);
450 479
480static ssize_t region_badblocks_show(struct device *dev,
481 struct device_attribute *attr, char *buf)
482{
483 struct nd_region *nd_region = to_nd_region(dev);
484
485 return badblocks_show(&nd_region->bb, buf, 0);
486}
487
488static DEVICE_ATTR(badblocks, 0444, region_badblocks_show, NULL);
489
490static ssize_t resource_show(struct device *dev,
491 struct device_attribute *attr, char *buf)
492{
493 struct nd_region *nd_region = to_nd_region(dev);
494
495 return sprintf(buf, "%#llx\n", nd_region->ndr_start);
496}
497static DEVICE_ATTR_RO(resource);
498
451static struct attribute *nd_region_attributes[] = { 499static struct attribute *nd_region_attributes[] = {
452 &dev_attr_size.attr, 500 &dev_attr_size.attr,
453 &dev_attr_nstype.attr, 501 &dev_attr_nstype.attr,
@@ -455,11 +503,14 @@ static struct attribute *nd_region_attributes[] = {
455 &dev_attr_btt_seed.attr, 503 &dev_attr_btt_seed.attr,
456 &dev_attr_pfn_seed.attr, 504 &dev_attr_pfn_seed.attr,
457 &dev_attr_dax_seed.attr, 505 &dev_attr_dax_seed.attr,
506 &dev_attr_deep_flush.attr,
458 &dev_attr_read_only.attr, 507 &dev_attr_read_only.attr,
459 &dev_attr_set_cookie.attr, 508 &dev_attr_set_cookie.attr,
460 &dev_attr_available_size.attr, 509 &dev_attr_available_size.attr,
461 &dev_attr_namespace_seed.attr, 510 &dev_attr_namespace_seed.attr,
462 &dev_attr_init_namespaces.attr, 511 &dev_attr_init_namespaces.attr,
512 &dev_attr_badblocks.attr,
513 &dev_attr_resource.attr,
463 NULL, 514 NULL,
464}; 515};
465 516
@@ -476,6 +527,23 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
476 if (!is_nd_pmem(dev) && a == &dev_attr_dax_seed.attr) 527 if (!is_nd_pmem(dev) && a == &dev_attr_dax_seed.attr)
477 return 0; 528 return 0;
478 529
530 if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr)
531 return 0;
532
533 if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr)
534 return 0;
535
536 if (a == &dev_attr_deep_flush.attr) {
537 int has_flush = nvdimm_has_flush(nd_region);
538
539 if (has_flush == 1)
540 return a->mode;
541 else if (has_flush == 0)
542 return 0444;
543 else
544 return 0;
545 }
546
479 if (a != &dev_attr_set_cookie.attr 547 if (a != &dev_attr_set_cookie.attr
480 && a != &dev_attr_available_size.attr) 548 && a != &dev_attr_available_size.attr)
481 return a->mode; 549 return a->mode;
@@ -813,7 +881,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
813 return NULL; 881 return NULL;
814 } 882 }
815 883
816 if (nvdimm->flags & NDD_UNARMED) 884 if (test_bit(NDD_UNARMED, &nvdimm->flags))
817 ro = 1; 885 ro = 1;
818 } 886 }
819 887
@@ -968,17 +1036,20 @@ EXPORT_SYMBOL_GPL(nvdimm_flush);
968 */ 1036 */
969int nvdimm_has_flush(struct nd_region *nd_region) 1037int nvdimm_has_flush(struct nd_region *nd_region)
970{ 1038{
971 struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev);
972 int i; 1039 int i;
973 1040
974 /* no nvdimm == flushing capability unknown */ 1041 /* no nvdimm == flushing capability unknown */
975 if (nd_region->ndr_mappings == 0) 1042 if (nd_region->ndr_mappings == 0)
976 return -ENXIO; 1043 return -ENXIO;
977 1044
978 for (i = 0; i < nd_region->ndr_mappings; i++) 1045 for (i = 0; i < nd_region->ndr_mappings; i++) {
979 /* flush hints present, flushing required */ 1046 struct nd_mapping *nd_mapping = &nd_region->mapping[i];
980 if (ndrd_get_flush_wpq(ndrd, i, 0)) 1047 struct nvdimm *nvdimm = nd_mapping->nvdimm;
1048
1049 /* flush hints present / available */
1050 if (nvdimm->num_flush)
981 return 1; 1051 return 1;
1052 }
982 1053
983 /* 1054 /*
984 * The platform defines dimm devices without hints, assume 1055 * The platform defines dimm devices without hints, assume
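
[Editor's note] region_visible() gives deep_flush three states: read-write when flush hints exist (has_flush == 1), read-only 0444 when the platform flushes on its own, and hidden when the capability is unknown, so the attribute's presence and mode are themselves informative to userspace. A hedged userspace example of requesting a deep flush (the sysfs path shape is an assumption of this sketch; the region number varies per system):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* illustrative path; "region0" is system-specific */
	int fd = open("/sys/bus/nd/devices/region0/deep_flush", O_WRONLY);

	if (fd < 0) {
		perror("open");		/* absent or read-only attribute */
		return 1;
	}
	/* per deep_flush_store() above, only "1" is accepted */
	if (write(fd, "1", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}
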
diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index 4a3b62326183..0acb8c2f9475 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -14,6 +14,7 @@ config BLK_DEV_XPRAM
14 14
15config DCSSBLK 15config DCSSBLK
16 def_tristate m 16 def_tristate m
17 select DAX
17 prompt "DCSSBLK support" 18 prompt "DCSSBLK support"
18 depends on S390 && BLOCK 19 depends on S390 && BLOCK
19 help 20 help
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 415d10a67b7a..36e5280af3e4 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -18,6 +18,7 @@
18#include <linux/interrupt.h> 18#include <linux/interrupt.h>
19#include <linux/platform_device.h> 19#include <linux/platform_device.h>
20#include <linux/pfn_t.h> 20#include <linux/pfn_t.h>
21#include <linux/dax.h>
21#include <asm/extmem.h> 22#include <asm/extmem.h>
22#include <asm/io.h> 23#include <asm/io.h>
23 24
@@ -30,8 +31,8 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode);
30static void dcssblk_release(struct gendisk *disk, fmode_t mode); 31static void dcssblk_release(struct gendisk *disk, fmode_t mode);
31static blk_qc_t dcssblk_make_request(struct request_queue *q, 32static blk_qc_t dcssblk_make_request(struct request_queue *q,
32 struct bio *bio); 33 struct bio *bio);
33static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, 34static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
34 void **kaddr, pfn_t *pfn, long size); 35 long nr_pages, void **kaddr, pfn_t *pfn);
35 36
36static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; 37static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
37 38
@@ -40,7 +41,10 @@ static const struct block_device_operations dcssblk_devops = {
40 .owner = THIS_MODULE, 41 .owner = THIS_MODULE,
41 .open = dcssblk_open, 42 .open = dcssblk_open,
42 .release = dcssblk_release, 43 .release = dcssblk_release,
43 .direct_access = dcssblk_direct_access, 44};
45
46static const struct dax_operations dcssblk_dax_ops = {
47 .direct_access = dcssblk_dax_direct_access,
44}; 48};
45 49
46struct dcssblk_dev_info { 50struct dcssblk_dev_info {
@@ -57,6 +61,7 @@ struct dcssblk_dev_info {
57 struct request_queue *dcssblk_queue; 61 struct request_queue *dcssblk_queue;
58 int num_of_segments; 62 int num_of_segments;
59 struct list_head seg_list; 63 struct list_head seg_list;
64 struct dax_device *dax_dev;
60}; 65};
61 66
62struct segment_info { 67struct segment_info {
@@ -389,6 +394,8 @@ removeseg:
389 } 394 }
390 list_del(&dev_info->lh); 395 list_del(&dev_info->lh);
391 396
397 kill_dax(dev_info->dax_dev);
398 put_dax(dev_info->dax_dev);
392 del_gendisk(dev_info->gd); 399 del_gendisk(dev_info->gd);
393 blk_cleanup_queue(dev_info->dcssblk_queue); 400 blk_cleanup_queue(dev_info->dcssblk_queue);
394 dev_info->gd->queue = NULL; 401 dev_info->gd->queue = NULL;
@@ -654,6 +661,13 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
654 if (rc) 661 if (rc)
655 goto put_dev; 662 goto put_dev;
656 663
664 dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
665 &dcssblk_dax_ops);
666 if (!dev_info->dax_dev) {
667 rc = -ENOMEM;
668 goto put_dev;
669 }
670
657 get_device(&dev_info->dev); 671 get_device(&dev_info->dev);
658 device_add_disk(&dev_info->dev, dev_info->gd); 672 device_add_disk(&dev_info->dev, dev_info->gd);
659 673
@@ -752,6 +766,8 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch
752 } 766 }
753 767
754 list_del(&dev_info->lh); 768 list_del(&dev_info->lh);
769 kill_dax(dev_info->dax_dev);
770 put_dax(dev_info->dax_dev);
755 del_gendisk(dev_info->gd); 771 del_gendisk(dev_info->gd);
756 blk_cleanup_queue(dev_info->dcssblk_queue); 772 blk_cleanup_queue(dev_info->dcssblk_queue);
757 dev_info->gd->queue = NULL; 773 dev_info->gd->queue = NULL;
@@ -883,21 +899,26 @@ fail:
883} 899}
884 900
885static long 901static long
886dcssblk_direct_access (struct block_device *bdev, sector_t secnum, 902__dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff,
887 void **kaddr, pfn_t *pfn, long size) 903 long nr_pages, void **kaddr, pfn_t *pfn)
888{ 904{
889 struct dcssblk_dev_info *dev_info; 905 resource_size_t offset = pgoff * PAGE_SIZE;
890 unsigned long offset, dev_sz; 906 unsigned long dev_sz;
891 907
892 dev_info = bdev->bd_disk->private_data;
893 if (!dev_info)
894 return -ENODEV;
895 dev_sz = dev_info->end - dev_info->start + 1; 908 dev_sz = dev_info->end - dev_info->start + 1;
896 offset = secnum * 512;
897 *kaddr = (void *) dev_info->start + offset; 909 *kaddr = (void *) dev_info->start + offset;
898 *pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), PFN_DEV); 910 *pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), PFN_DEV);
899 911
900 return dev_sz - offset; 912 return (dev_sz - offset) / PAGE_SIZE;
913}
914
915static long
916dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
917 long nr_pages, void **kaddr, pfn_t *pfn)
918{
919 struct dcssblk_dev_info *dev_info = dax_get_private(dax_dev);
920
921 return __dcssblk_direct_access(dev_info, pgoff, nr_pages, kaddr, pfn);
901} 922}
902 923
903static void 924static void
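
[Editor's note] As with pmem, the dcssblk helper now works in pages end to end: pgoff * PAGE_SIZE is the byte offset into the segment, and the return value counts the pages mappable from there. The arithmetic in isolation (illustrative numbers, PAGE_SIZE assumed 4K):

static long pages_available(unsigned long dev_sz, unsigned long pgoff)
{
	unsigned long offset = pgoff * 4096;

	return (dev_sz - offset) / 4096;
}

/* pages_available(16UL << 20, 10) == 4086: a 16M segment queried at
 * page 10 still has 4086 directly mappable pages */
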
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0d435c794d76..2a305c1a2d88 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -18,6 +18,7 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/blkpg.h> 19#include <linux/blkpg.h>
20#include <linux/magic.h> 20#include <linux/magic.h>
21#include <linux/dax.h>
21#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
22#include <linux/swap.h> 23#include <linux/swap.h>
23#include <linux/pagevec.h> 24#include <linux/pagevec.h>
@@ -716,50 +717,18 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
716} 717}
717EXPORT_SYMBOL_GPL(bdev_write_page); 718EXPORT_SYMBOL_GPL(bdev_write_page);
718 719
719/** 720int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
720 * bdev_direct_access() - Get the address for directly-accessibly memory 721 pgoff_t *pgoff)
721 * @bdev: The device containing the memory
722 * @dax: control and output parameters for ->direct_access
723 *
724 * If a block device is made up of directly addressable memory, this function
725 * will tell the caller the PFN and the address of the memory. The address
726 * may be directly dereferenced within the kernel without the need to call
727 * ioremap(), kmap() or similar. The PFN is suitable for inserting into
728 * page tables.
729 *
730 * Return: negative errno if an error occurs, otherwise the number of bytes
731 * accessible at this address.
732 */
733long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
734{ 722{
735 sector_t sector = dax->sector; 723 phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;
736 long avail, size = dax->size;
737 const struct block_device_operations *ops = bdev->bd_disk->fops;
738 724
739 /* 725 if (pgoff)
740 * The device driver is allowed to sleep, in order to make the 726 *pgoff = PHYS_PFN(phys_off);
741 * memory directly accessible. 727 if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
742 */
743 might_sleep();
744
745 if (size < 0)
746 return size;
747 if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access)
748 return -EOPNOTSUPP;
749 if ((sector + DIV_ROUND_UP(size, 512)) >
750 part_nr_sects_read(bdev->bd_part))
751 return -ERANGE;
752 sector += get_start_sect(bdev);
753 if (sector % (PAGE_SIZE / 512))
754 return -EINVAL; 728 return -EINVAL;
755 avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size); 729 return 0;
756 if (!avail)
757 return -ERANGE;
758 if (avail > 0 && avail & ~PAGE_MASK)
759 return -ENXIO;
760 return min(avail, size);
761} 730}
762EXPORT_SYMBOL_GPL(bdev_direct_access); 731EXPORT_SYMBOL(bdev_dax_pgoff);
763 732
764/** 733/**
765 * bdev_dax_supported() - Check if the device supports dax for filesystem 734 * bdev_dax_supported() - Check if the device supports dax for filesystem
@@ -773,62 +742,46 @@ EXPORT_SYMBOL_GPL(bdev_direct_access);
773 */ 742 */
774int bdev_dax_supported(struct super_block *sb, int blocksize) 743int bdev_dax_supported(struct super_block *sb, int blocksize)
775{ 744{
776 struct blk_dax_ctl dax = { 745 struct block_device *bdev = sb->s_bdev;
777 .sector = 0, 746 struct dax_device *dax_dev;
778 .size = PAGE_SIZE, 747 pgoff_t pgoff;
779 }; 748 int err, id;
780 int err; 749 void *kaddr;
750 pfn_t pfn;
751 long len;
781 752
782 if (blocksize != PAGE_SIZE) { 753 if (blocksize != PAGE_SIZE) {
783 vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); 754 vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
784 return -EINVAL; 755 return -EINVAL;
785 } 756 }
786 757
787 err = bdev_direct_access(sb->s_bdev, &dax); 758 err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
788 if (err < 0) { 759 if (err) {
789 switch (err) { 760 vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax");
790 case -EOPNOTSUPP:
791 vfs_msg(sb, KERN_ERR,
792 "error: device does not support dax");
793 break;
794 case -EINVAL:
795 vfs_msg(sb, KERN_ERR,
796 "error: unaligned partition for dax");
797 break;
798 default:
799 vfs_msg(sb, KERN_ERR,
800 "error: dax access failed (%d)", err);
801 }
802 return err; 761 return err;
803 } 762 }
804 763
805 return 0; 764 dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
806} 765 if (!dax_dev) {
807EXPORT_SYMBOL_GPL(bdev_dax_supported); 766 vfs_msg(sb, KERN_ERR, "error: device does not support dax");
808 767 return -EOPNOTSUPP;
809/** 768 }
810 * bdev_dax_capable() - Return if the raw device is capable for dax
811 * @bdev: The device for raw block device access
812 */
813bool bdev_dax_capable(struct block_device *bdev)
814{
815 struct blk_dax_ctl dax = {
816 .size = PAGE_SIZE,
817 };
818 769
819 if (!IS_ENABLED(CONFIG_FS_DAX)) 770 id = dax_read_lock();
820 return false; 771 len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
772 dax_read_unlock(id);
821 773
822 dax.sector = 0; 774 put_dax(dax_dev);
823 if (bdev_direct_access(bdev, &dax) < 0)
824 return false;
825 775
826 dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512); 776 if (len < 1) {
827 if (bdev_direct_access(bdev, &dax) < 0) 777 vfs_msg(sb, KERN_ERR,
828 return false; 778 "error: dax access failed (%ld)", len);
779 return len < 0 ? len : -EIO;
780 }
829 781
830 return true; 782 return 0;
831} 783}
784EXPORT_SYMBOL_GPL(bdev_dax_supported);
832 785
833/* 786/*
834 * pseudo-fs 787 * pseudo-fs
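
[Editor's note] bdev_dax_pgoff() is the partition-to-whole-disk translation every DAX path now funnels through: fold in the partition start sector, insist the result is page aligned, and hand back a page index into the disk-wide dax_device. Worked numbers (illustrative, PAGE_SIZE assumed 4096):

static long dax_pgoff_example(void)
{
	unsigned long part_start = 2048;	/* partition begins 1M in */
	unsigned long sector = 16;		/* partition-relative */
	unsigned long long phys_off = (part_start + sector) * 512;
						/* == 1056768 bytes */
	if (phys_off % 4096)	/* e.g. a partition at sector 34 fails */
		return -EINVAL;
	return phys_off >> 12;	/* pgoff == 258 */
}
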
diff --git a/fs/dax.c b/fs/dax.c
index 6433650be833..43bbd6d1037d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -55,32 +55,6 @@ static int __init init_dax_wait_table(void)
55} 55}
56fs_initcall(init_dax_wait_table); 56fs_initcall(init_dax_wait_table);
57 57
58static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
59{
60 struct request_queue *q = bdev->bd_queue;
61 long rc = -EIO;
62
63 dax->addr = ERR_PTR(-EIO);
64 if (blk_queue_enter(q, true) != 0)
65 return rc;
66
67 rc = bdev_direct_access(bdev, dax);
68 if (rc < 0) {
69 dax->addr = ERR_PTR(rc);
70 blk_queue_exit(q);
71 return rc;
72 }
73 return rc;
74}
75
76static void dax_unmap_atomic(struct block_device *bdev,
77 const struct blk_dax_ctl *dax)
78{
79 if (IS_ERR(dax->addr))
80 return;
81 blk_queue_exit(bdev->bd_queue);
82}
83
84static int dax_is_pmd_entry(void *entry) 58static int dax_is_pmd_entry(void *entry)
85{ 59{
86 return (unsigned long)entry & RADIX_DAX_PMD; 60 return (unsigned long)entry & RADIX_DAX_PMD;
@@ -101,26 +75,6 @@ static int dax_is_empty_entry(void *entry)
101 return (unsigned long)entry & RADIX_DAX_EMPTY; 75 return (unsigned long)entry & RADIX_DAX_EMPTY;
102} 76}
103 77
104struct page *read_dax_sector(struct block_device *bdev, sector_t n)
105{
106 struct page *page = alloc_pages(GFP_KERNEL, 0);
107 struct blk_dax_ctl dax = {
108 .size = PAGE_SIZE,
109 .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
110 };
111 long rc;
112
113 if (!page)
114 return ERR_PTR(-ENOMEM);
115
116 rc = dax_map_atomic(bdev, &dax);
117 if (rc < 0)
118 return ERR_PTR(rc);
119 memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
120 dax_unmap_atomic(bdev, &dax);
121 return page;
122}
123
124/* 78/*
125 * DAX radix tree locking 79 * DAX radix tree locking
126 */ 80 */
@@ -582,21 +536,30 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
582 return ret; 536 return ret;
583} 537}
584 538
585static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, 539static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
586 struct page *to, unsigned long vaddr) 540 sector_t sector, size_t size, struct page *to,
541 unsigned long vaddr)
587{ 542{
588 struct blk_dax_ctl dax = { 543 void *vto, *kaddr;
589 .sector = sector, 544 pgoff_t pgoff;
590 .size = size, 545 pfn_t pfn;
591 }; 546 long rc;
592 void *vto; 547 int id;
593 548
594 if (dax_map_atomic(bdev, &dax) < 0) 549 rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
595 return PTR_ERR(dax.addr); 550 if (rc)
551 return rc;
552
553 id = dax_read_lock();
554 rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
555 if (rc < 0) {
556 dax_read_unlock(id);
557 return rc;
558 }
596 vto = kmap_atomic(to); 559 vto = kmap_atomic(to);
597 copy_user_page(vto, (void __force *)dax.addr, vaddr, to); 560 copy_user_page(vto, (void __force *)kaddr, vaddr, to);
598 kunmap_atomic(vto); 561 kunmap_atomic(vto);
599 dax_unmap_atomic(bdev, &dax); 562 dax_read_unlock(id);
600 return 0; 563 return 0;
601} 564}
602 565
@@ -764,12 +727,16 @@ unlock_pte:
764} 727}
765 728
766static int dax_writeback_one(struct block_device *bdev, 729static int dax_writeback_one(struct block_device *bdev,
767 struct address_space *mapping, pgoff_t index, void *entry) 730 struct dax_device *dax_dev, struct address_space *mapping,
731 pgoff_t index, void *entry)
768{ 732{
769 struct radix_tree_root *page_tree = &mapping->page_tree; 733 struct radix_tree_root *page_tree = &mapping->page_tree;
770 struct blk_dax_ctl dax; 734 void *entry2, **slot, *kaddr;
771 void *entry2, **slot; 735 long ret = 0, id;
772 int ret = 0; 736 sector_t sector;
737 pgoff_t pgoff;
738 size_t size;
739 pfn_t pfn;
773 740
774 /* 741 /*
775 * A page got tagged dirty in DAX mapping? Something is seriously 742 * A page got tagged dirty in DAX mapping? Something is seriously
@@ -818,26 +785,29 @@ static int dax_writeback_one(struct block_device *bdev,
818 * 'entry'. This allows us to flush for PMD_SIZE and not have to 785 * 'entry'. This allows us to flush for PMD_SIZE and not have to
819 * worry about partial PMD writebacks. 786 * worry about partial PMD writebacks.
820 */ 787 */
821 dax.sector = dax_radix_sector(entry); 788 sector = dax_radix_sector(entry);
822 dax.size = PAGE_SIZE << dax_radix_order(entry); 789 size = PAGE_SIZE << dax_radix_order(entry);
790
791 id = dax_read_lock();
792 ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
793 if (ret)
794 goto dax_unlock;
823 795
824 /* 796 /*
825 * We cannot hold tree_lock while calling dax_map_atomic() because it 797 * dax_direct_access() may sleep, so cannot hold tree_lock over
826 * eventually calls cond_resched(). 798 * its invocation.
827 */ 799 */
828 ret = dax_map_atomic(bdev, &dax); 800 ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
829 if (ret < 0) { 801 if (ret < 0)
830 put_locked_mapping_entry(mapping, index, entry); 802 goto dax_unlock;
831 return ret;
832 }
833 803
834 if (WARN_ON_ONCE(ret < dax.size)) { 804 if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
835 ret = -EIO; 805 ret = -EIO;
836 goto unmap; 806 goto dax_unlock;
837 } 807 }
838 808
839 dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); 809 dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
840 wb_cache_pmem(dax.addr, dax.size); 810 wb_cache_pmem(kaddr, size);
841 /* 811 /*
842 * After we have flushed the cache, we can clear the dirty tag. There 812 * After we have flushed the cache, we can clear the dirty tag. There
843 * cannot be new dirty data in the pfn after the flush has completed as 813 * cannot be new dirty data in the pfn after the flush has completed as
@@ -847,8 +817,8 @@ static int dax_writeback_one(struct block_device *bdev,
847 spin_lock_irq(&mapping->tree_lock); 817 spin_lock_irq(&mapping->tree_lock);
848 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); 818 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
849 spin_unlock_irq(&mapping->tree_lock); 819 spin_unlock_irq(&mapping->tree_lock);
850 unmap: 820 dax_unlock:
851 dax_unmap_atomic(bdev, &dax); 821 dax_read_unlock(id);
852 put_locked_mapping_entry(mapping, index, entry); 822 put_locked_mapping_entry(mapping, index, entry);
853 return ret; 823 return ret;
854 824
@@ -869,6 +839,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
869 struct inode *inode = mapping->host; 839 struct inode *inode = mapping->host;
870 pgoff_t start_index, end_index; 840 pgoff_t start_index, end_index;
871 pgoff_t indices[PAGEVEC_SIZE]; 841 pgoff_t indices[PAGEVEC_SIZE];
842 struct dax_device *dax_dev;
872 struct pagevec pvec; 843 struct pagevec pvec;
873 bool done = false; 844 bool done = false;
874 int i, ret = 0; 845 int i, ret = 0;
@@ -879,6 +850,10 @@ int dax_writeback_mapping_range(struct address_space *mapping,
879 if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) 850 if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
880 return 0; 851 return 0;
881 852
853 dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
854 if (!dax_dev)
855 return -EIO;
856
882 start_index = wbc->range_start >> PAGE_SHIFT; 857 start_index = wbc->range_start >> PAGE_SHIFT;
883 end_index = wbc->range_end >> PAGE_SHIFT; 858 end_index = wbc->range_end >> PAGE_SHIFT;
884 859
@@ -899,38 +874,49 @@ int dax_writeback_mapping_range(struct address_space *mapping,
899 break; 874 break;
900 } 875 }
901 876
902 ret = dax_writeback_one(bdev, mapping, indices[i], 877 ret = dax_writeback_one(bdev, dax_dev, mapping,
903 pvec.pages[i]); 878 indices[i], pvec.pages[i]);
904 if (ret < 0) 879 if (ret < 0) {
880 put_dax(dax_dev);
905 return ret; 881 return ret;
882 }
906 } 883 }
907 } 884 }
885 put_dax(dax_dev);
908 return 0; 886 return 0;
909} 887}
910EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); 888EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
911 889
912static int dax_insert_mapping(struct address_space *mapping, 890static int dax_insert_mapping(struct address_space *mapping,
913 struct block_device *bdev, sector_t sector, size_t size, 891 struct block_device *bdev, struct dax_device *dax_dev,
914 void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) 892 sector_t sector, size_t size, void **entryp,
893 struct vm_area_struct *vma, struct vm_fault *vmf)
915{ 894{
916 unsigned long vaddr = vmf->address; 895 unsigned long vaddr = vmf->address;
917 struct blk_dax_ctl dax = {
918 .sector = sector,
919 .size = size,
920 };
921 void *ret;
922 void *entry = *entryp; 896 void *entry = *entryp;
897 void *ret, *kaddr;
898 pgoff_t pgoff;
899 int id, rc;
900 pfn_t pfn;
923 901
924 if (dax_map_atomic(bdev, &dax) < 0) 902 rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
925 return PTR_ERR(dax.addr); 903 if (rc)
926 dax_unmap_atomic(bdev, &dax); 904 return rc;
927 905
928 ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0); 906 id = dax_read_lock();
907 rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
908 if (rc < 0) {
909 dax_read_unlock(id);
910 return rc;
911 }
912 dax_read_unlock(id);
913
914 ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
929 if (IS_ERR(ret)) 915 if (IS_ERR(ret))
930 return PTR_ERR(ret); 916 return PTR_ERR(ret);
931 *entryp = ret; 917 *entryp = ret;
932 918
933 return vm_insert_mixed(vma, vaddr, dax.pfn); 919 return vm_insert_mixed(vma, vaddr, pfn);
934} 920}
935 921
936/** 922/**
@@ -979,24 +965,34 @@ static bool dax_range_is_aligned(struct block_device *bdev,
979 return true; 965 return true;
980} 966}
981 967
982int __dax_zero_page_range(struct block_device *bdev, sector_t sector, 968int __dax_zero_page_range(struct block_device *bdev,
983 unsigned int offset, unsigned int length) 969 struct dax_device *dax_dev, sector_t sector,
970 unsigned int offset, unsigned int size)
984{ 971{
985 struct blk_dax_ctl dax = { 972 if (dax_range_is_aligned(bdev, offset, size)) {
986 .sector = sector, 973 sector_t start_sector = sector + (offset >> 9);
987 .size = PAGE_SIZE,
988 };
989
990 if (dax_range_is_aligned(bdev, offset, length)) {
991 sector_t start_sector = dax.sector + (offset >> 9);
992 974
993 return blkdev_issue_zeroout(bdev, start_sector, 975 return blkdev_issue_zeroout(bdev, start_sector,
994 length >> 9, GFP_NOFS, 0); 976 size >> 9, GFP_NOFS, 0);
995 } else { 977 } else {
996 if (dax_map_atomic(bdev, &dax) < 0) 978 pgoff_t pgoff;
997 return PTR_ERR(dax.addr); 979 long rc, id;
998 clear_pmem(dax.addr + offset, length); 980 void *kaddr;
999 dax_unmap_atomic(bdev, &dax); 981 pfn_t pfn;
982
983 rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
984 if (rc)
985 return rc;
986
987 id = dax_read_lock();
988 rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr,
989 &pfn);
990 if (rc < 0) {
991 dax_read_unlock(id);
992 return rc;
993 }
994 clear_pmem(kaddr + offset, size);
995 dax_read_unlock(id);
1000 } 996 }
1001 return 0; 997 return 0;
1002} 998}
@@ -1011,9 +1007,12 @@ static loff_t
1011dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 1007dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1012 struct iomap *iomap) 1008 struct iomap *iomap)
1013{ 1009{
1010 struct block_device *bdev = iomap->bdev;
1011 struct dax_device *dax_dev = iomap->dax_dev;
1014 struct iov_iter *iter = data; 1012 struct iov_iter *iter = data;
1015 loff_t end = pos + length, done = 0; 1013 loff_t end = pos + length, done = 0;
1016 ssize_t ret = 0; 1014 ssize_t ret = 0;
1015 int id;
1017 1016
1018 if (iov_iter_rw(iter) == READ) { 1017 if (iov_iter_rw(iter) == READ) {
1019 end = min(end, i_size_read(inode)); 1018 end = min(end, i_size_read(inode));
@@ -1038,34 +1037,42 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1038 (end - 1) >> PAGE_SHIFT); 1037 (end - 1) >> PAGE_SHIFT);
1039 } 1038 }
1040 1039
1040 id = dax_read_lock();
1041 while (pos < end) { 1041 while (pos < end) {
1042 unsigned offset = pos & (PAGE_SIZE - 1); 1042 unsigned offset = pos & (PAGE_SIZE - 1);
1043 struct blk_dax_ctl dax = { 0 }; 1043 const size_t size = ALIGN(length + offset, PAGE_SIZE);
1044 const sector_t sector = dax_iomap_sector(iomap, pos);
1044 ssize_t map_len; 1045 ssize_t map_len;
1046 pgoff_t pgoff;
1047 void *kaddr;
1048 pfn_t pfn;
1045 1049
1046 if (fatal_signal_pending(current)) { 1050 if (fatal_signal_pending(current)) {
1047 ret = -EINTR; 1051 ret = -EINTR;
1048 break; 1052 break;
1049 } 1053 }
1050 1054
1051 dax.sector = dax_iomap_sector(iomap, pos); 1055 ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
1052 dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; 1056 if (ret)
1053 map_len = dax_map_atomic(iomap->bdev, &dax); 1057 break;
1058
1059 map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
1060 &kaddr, &pfn);
1054 if (map_len < 0) { 1061 if (map_len < 0) {
1055 ret = map_len; 1062 ret = map_len;
1056 break; 1063 break;
1057 } 1064 }
1058 1065
1059 dax.addr += offset; 1066 map_len = PFN_PHYS(map_len);
1067 kaddr += offset;
1060 map_len -= offset; 1068 map_len -= offset;
1061 if (map_len > end - pos) 1069 if (map_len > end - pos)
1062 map_len = end - pos; 1070 map_len = end - pos;
1063 1071
1064 if (iov_iter_rw(iter) == WRITE) 1072 if (iov_iter_rw(iter) == WRITE)
1065 map_len = copy_from_iter_pmem(dax.addr, map_len, iter); 1073 map_len = copy_from_iter_pmem(kaddr, map_len, iter);
1066 else 1074 else
1067 map_len = copy_to_iter(dax.addr, map_len, iter); 1075 map_len = copy_to_iter(kaddr, map_len, iter);
1068 dax_unmap_atomic(iomap->bdev, &dax);
1069 if (map_len <= 0) { 1076 if (map_len <= 0) {
1070 ret = map_len ? map_len : -EFAULT; 1077 ret = map_len ? map_len : -EFAULT;
1071 break; 1078 break;
@@ -1075,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1075 length -= map_len; 1082 length -= map_len;
1076 done += map_len; 1083 done += map_len;
1077 } 1084 }
1085 dax_read_unlock(id);
1078 1086
1079 return done ? done : ret; 1087 return done ? done : ret;
1080} 1088}
@@ -1181,8 +1189,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
1181 clear_user_highpage(vmf->cow_page, vaddr); 1189 clear_user_highpage(vmf->cow_page, vaddr);
1182 break; 1190 break;
1183 case IOMAP_MAPPED: 1191 case IOMAP_MAPPED:
1184 error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE, 1192 error = copy_user_dax(iomap.bdev, iomap.dax_dev,
1185 vmf->cow_page, vaddr); 1193 sector, PAGE_SIZE, vmf->cow_page, vaddr);
1186 break; 1194 break;
1187 default: 1195 default:
1188 WARN_ON_ONCE(1); 1196 WARN_ON_ONCE(1);
@@ -1207,8 +1215,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
1207 mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); 1215 mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
1208 major = VM_FAULT_MAJOR; 1216 major = VM_FAULT_MAJOR;
1209 } 1217 }
1210 error = dax_insert_mapping(mapping, iomap.bdev, sector, 1218 error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
1211 PAGE_SIZE, &entry, vmf->vma, vmf); 1219 sector, PAGE_SIZE, &entry, vmf->vma, vmf);
1212 /* -EBUSY is fine, somebody else faulted on the same PTE */ 1220 /* -EBUSY is fine, somebody else faulted on the same PTE */
1213 if (error == -EBUSY) 1221 if (error == -EBUSY)
1214 error = 0; 1222 error = 0;
@@ -1258,41 +1266,48 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
1258 loff_t pos, void **entryp) 1266 loff_t pos, void **entryp)
1259{ 1267{
1260 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1268 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1269 const sector_t sector = dax_iomap_sector(iomap, pos);
1270 struct dax_device *dax_dev = iomap->dax_dev;
1261 struct block_device *bdev = iomap->bdev; 1271 struct block_device *bdev = iomap->bdev;
1262 struct inode *inode = mapping->host; 1272 struct inode *inode = mapping->host;
1263 struct blk_dax_ctl dax = { 1273 const size_t size = PMD_SIZE;
1264 .sector = dax_iomap_sector(iomap, pos), 1274 void *ret = NULL, *kaddr;
1265 .size = PMD_SIZE, 1275 long length = 0;
1266 }; 1276 pgoff_t pgoff;
1267 long length = dax_map_atomic(bdev, &dax); 1277 pfn_t pfn;
1268 void *ret = NULL; 1278 int id;
1269 1279
1270 if (length < 0) /* dax_map_atomic() failed */ 1280 if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
1271 goto fallback; 1281 goto fallback;
1272 if (length < PMD_SIZE)
1273 goto unmap_fallback;
1274 if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
1275 goto unmap_fallback;
1276 if (!pfn_t_devmap(dax.pfn))
1277 goto unmap_fallback;
1278
1279 dax_unmap_atomic(bdev, &dax);
1280 1282
1281 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, 1283 id = dax_read_lock();
1284 length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
1285 if (length < 0)
1286 goto unlock_fallback;
1287 length = PFN_PHYS(length);
1288
1289 if (length < size)
1290 goto unlock_fallback;
1291 if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
1292 goto unlock_fallback;
1293 if (!pfn_t_devmap(pfn))
1294 goto unlock_fallback;
1295 dax_read_unlock(id);
1296
1297 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
1282 RADIX_DAX_PMD); 1298 RADIX_DAX_PMD);
1283 if (IS_ERR(ret)) 1299 if (IS_ERR(ret))
1284 goto fallback; 1300 goto fallback;
1285 *entryp = ret; 1301 *entryp = ret;
1286 1302
1287 trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret); 1303 trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
1288 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, 1304 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
1289 dax.pfn, vmf->flags & FAULT_FLAG_WRITE); 1305 pfn, vmf->flags & FAULT_FLAG_WRITE);
1290 1306
1291 unmap_fallback: 1307unlock_fallback:
1292 dax_unmap_atomic(bdev, &dax); 1308 dax_read_unlock(id);
1293fallback: 1309fallback:
1294 trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, 1310 trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
1295 dax.pfn, ret);
1296 return VM_FAULT_FALLBACK; 1311 return VM_FAULT_FALLBACK;
1297} 1312}
1298 1313
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 3a38c1b84e3c..26d77f9f8c12 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -799,6 +799,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
799static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 799static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
800 unsigned flags, struct iomap *iomap) 800 unsigned flags, struct iomap *iomap)
801{ 801{
802 struct block_device *bdev;
802 unsigned int blkbits = inode->i_blkbits; 803 unsigned int blkbits = inode->i_blkbits;
803 unsigned long first_block = offset >> blkbits; 804 unsigned long first_block = offset >> blkbits;
804 unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; 805 unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
@@ -812,8 +813,13 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
812 return ret; 813 return ret;
813 814
814 iomap->flags = 0; 815 iomap->flags = 0;
815 iomap->bdev = inode->i_sb->s_bdev; 816 bdev = inode->i_sb->s_bdev;
817 iomap->bdev = bdev;
816 iomap->offset = (u64)first_block << blkbits; 818 iomap->offset = (u64)first_block << blkbits;
819 if (blk_queue_dax(bdev->bd_queue))
820 iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
821 else
822 iomap->dax_dev = NULL;
817 823
818 if (ret == 0) { 824 if (ret == 0) {
819 iomap->type = IOMAP_HOLE; 825 iomap->type = IOMAP_HOLE;
@@ -835,6 +841,7 @@ static int
835ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, 841ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
836 ssize_t written, unsigned flags, struct iomap *iomap) 842 ssize_t written, unsigned flags, struct iomap *iomap)
837{ 843{
844 put_dax(iomap->dax_dev);
838 if (iomap->type == IOMAP_MAPPED && 845 if (iomap->type == IOMAP_MAPPED &&
839 written < length && 846 written < length &&
840 (flags & IOMAP_WRITE)) 847 (flags & IOMAP_WRITE))
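
ext2 here, and ext4 and xfs below, grow the same two hooks: ->iomap_begin() takes a dax_device reference by disk name whenever the request queue advertises DAX, and ->iomap_end() drops it unconditionally (put_dax() must tolerate a NULL dax_dev, since iomap_end calls it without checking). A sketch of the shared shape as a hypothetical helper:

	/* Hypothetical consolidation of the per-filesystem boilerplate */
	static void example_iomap_set_dax_dev(struct iomap *iomap,
			struct block_device *bdev)
	{
		iomap->bdev = bdev;
		if (blk_queue_dax(bdev->bd_queue))
			iomap->dax_dev =
				dax_get_by_host(bdev->bd_disk->disk_name);
		else
			iomap->dax_dev = NULL;
	}

	/* ->iomap_end() then balances it with put_dax(iomap->dax_dev); */
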
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 10b574ab354b..f0729b0705c7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3305,6 +3305,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3305static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 3305static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
3306 unsigned flags, struct iomap *iomap) 3306 unsigned flags, struct iomap *iomap)
3307{ 3307{
3308 struct block_device *bdev;
3308 unsigned int blkbits = inode->i_blkbits; 3309 unsigned int blkbits = inode->i_blkbits;
3309 unsigned long first_block = offset >> blkbits; 3310 unsigned long first_block = offset >> blkbits;
3310 unsigned long last_block = (offset + length - 1) >> blkbits; 3311 unsigned long last_block = (offset + length - 1) >> blkbits;
@@ -3373,7 +3374,12 @@ retry:
3373 } 3374 }
3374 3375
3375 iomap->flags = 0; 3376 iomap->flags = 0;
3376 iomap->bdev = inode->i_sb->s_bdev; 3377 bdev = inode->i_sb->s_bdev;
3378 iomap->bdev = bdev;
3379 if (blk_queue_dax(bdev->bd_queue))
3380 iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
3381 else
3382 iomap->dax_dev = NULL;
3377 iomap->offset = first_block << blkbits; 3383 iomap->offset = first_block << blkbits;
3378 3384
3379 if (ret == 0) { 3385 if (ret == 0) {
@@ -3406,6 +3412,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
3406 int blkbits = inode->i_blkbits; 3412 int blkbits = inode->i_blkbits;
3407 bool truncate = false; 3413 bool truncate = false;
3408 3414
3415 put_dax(iomap->dax_dev);
3409 if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) 3416 if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
3410 return 0; 3417 return 0;
3411 3418
diff --git a/fs/iomap.c b/fs/iomap.c
index 1c25ae30500e..4add7d4ad006 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -360,7 +360,8 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
360 sector_t sector = iomap->blkno + 360 sector_t sector = iomap->blkno +
361 (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); 361 (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
362 362
363 return __dax_zero_page_range(iomap->bdev, sector, offset, bytes); 363 return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
364 offset, bytes);
364} 365}
365 366
366static loff_t 367static loff_t
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 288ee5b840d7..4b47403f8089 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -976,6 +976,7 @@ xfs_file_iomap_begin(
976 int nimaps = 1, error = 0; 976 int nimaps = 1, error = 0;
977 bool shared = false, trimmed = false; 977 bool shared = false, trimmed = false;
978 unsigned lockmode; 978 unsigned lockmode;
979 struct block_device *bdev;
979 980
980 if (XFS_FORCED_SHUTDOWN(mp)) 981 if (XFS_FORCED_SHUTDOWN(mp))
981 return -EIO; 982 return -EIO;
@@ -1063,6 +1064,14 @@ xfs_file_iomap_begin(
1063 } 1064 }
1064 1065
1065 xfs_bmbt_to_iomap(ip, iomap, &imap); 1066 xfs_bmbt_to_iomap(ip, iomap, &imap);
1067
1068 /* optionally associate a dax device with the iomap bdev */
1069 bdev = iomap->bdev;
1070 if (blk_queue_dax(bdev->bd_queue))
1071 iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
1072 else
1073 iomap->dax_dev = NULL;
1074
1066 if (shared) 1075 if (shared)
1067 iomap->flags |= IOMAP_F_SHARED; 1076 iomap->flags |= IOMAP_F_SHARED;
1068 return 0; 1077 return 0;
@@ -1140,6 +1149,7 @@ xfs_file_iomap_end(
1140 unsigned flags, 1149 unsigned flags,
1141 struct iomap *iomap) 1150 struct iomap *iomap)
1142{ 1151{
1152 put_dax(iomap->dax_dev);
1143 if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) 1153 if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
1144 return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, 1154 return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
1145 length, written, iomap); 1155 length, written, iomap);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 83d28623645f..5493a66dc710 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1923,28 +1923,12 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
1923 1923
1924#endif /* CONFIG_BLK_DEV_INTEGRITY */ 1924#endif /* CONFIG_BLK_DEV_INTEGRITY */
1925 1925
1926/**
1927 * struct blk_dax_ctl - control and output parameters for ->direct_access
1928 * @sector: (input) offset relative to a block_device
1929 * @addr: (output) kernel virtual address for @sector populated by driver
1930 * @pfn: (output) page frame number for @addr populated by driver
1931 * @size: (input) number of bytes requested
1932 */
1933struct blk_dax_ctl {
1934 sector_t sector;
1935 void *addr;
1936 long size;
1937 pfn_t pfn;
1938};
1939
1940struct block_device_operations { 1926struct block_device_operations {
1941 int (*open) (struct block_device *, fmode_t); 1927 int (*open) (struct block_device *, fmode_t);
1942 void (*release) (struct gendisk *, fmode_t); 1928 void (*release) (struct gendisk *, fmode_t);
1943 int (*rw_page)(struct block_device *, sector_t, struct page *, bool); 1929 int (*rw_page)(struct block_device *, sector_t, struct page *, bool);
1944 int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1930 int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
1945 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1931 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
1946 long (*direct_access)(struct block_device *, sector_t, void **, pfn_t *,
1947 long);
1948 unsigned int (*check_events) (struct gendisk *disk, 1932 unsigned int (*check_events) (struct gendisk *disk,
1949 unsigned int clearing); 1933 unsigned int clearing);
1950 /* ->media_changed() is DEPRECATED, use ->check_events() instead */ 1934 /* ->media_changed() is DEPRECATED, use ->check_events() instead */
@@ -1963,9 +1947,8 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
1963extern int bdev_read_page(struct block_device *, sector_t, struct page *); 1947extern int bdev_read_page(struct block_device *, sector_t, struct page *);
1964extern int bdev_write_page(struct block_device *, sector_t, struct page *, 1948extern int bdev_write_page(struct block_device *, sector_t, struct page *,
1965 struct writeback_control *); 1949 struct writeback_control *);
1966extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
1967extern int bdev_dax_supported(struct super_block *, int); 1950extern int bdev_dax_supported(struct super_block *, int);
1968extern bool bdev_dax_capable(struct block_device *); 1951int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
1969#else /* CONFIG_BLOCK */ 1952#else /* CONFIG_BLOCK */
1970 1953
1971struct block_device; 1954struct block_device;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index d8a3dc042e1c..d3158e74a59e 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -7,6 +7,28 @@
7#include <asm/pgtable.h> 7#include <asm/pgtable.h>
8 8
9struct iomap_ops; 9struct iomap_ops;
10struct dax_device;
11struct dax_operations {
12 /*
13 * direct_access: translate a device-relative
14 * logical-page-offset into an absolute physical pfn. Return the
15 * number of pages available for DAX at that pfn.
16 */
17 long (*direct_access)(struct dax_device *, pgoff_t, long,
18 void **, pfn_t *);
19};
20
21int dax_read_lock(void);
22void dax_read_unlock(int id);
23struct dax_device *dax_get_by_host(const char *host);
24struct dax_device *alloc_dax(void *private, const char *host,
25 const struct dax_operations *ops);
26void put_dax(struct dax_device *dax_dev);
27bool dax_alive(struct dax_device *dax_dev);
28void kill_dax(struct dax_device *dax_dev);
29void *dax_get_private(struct dax_device *dax_dev);
30long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
31 void **kaddr, pfn_t *pfn);
10 32
11/* 33/*
12 * We use lowest available bit in exceptional entry for locking, one bit for 34 * We use lowest available bit in exceptional entry for locking, one bit for
@@ -48,17 +70,13 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
48 pgoff_t index, void *entry, bool wake_all); 70 pgoff_t index, void *entry, bool wake_all);
49 71
50#ifdef CONFIG_FS_DAX 72#ifdef CONFIG_FS_DAX
51struct page *read_dax_sector(struct block_device *bdev, sector_t n); 73int __dax_zero_page_range(struct block_device *bdev,
52int __dax_zero_page_range(struct block_device *bdev, sector_t sector, 74 struct dax_device *dax_dev, sector_t sector,
53 unsigned int offset, unsigned int length); 75 unsigned int offset, unsigned int length);
54#else 76#else
55static inline struct page *read_dax_sector(struct block_device *bdev,
56 sector_t n)
57{
58 return ERR_PTR(-ENXIO);
59}
60static inline int __dax_zero_page_range(struct block_device *bdev, 77static inline int __dax_zero_page_range(struct block_device *bdev,
61 sector_t sector, unsigned int offset, unsigned int length) 78 struct dax_device *dax_dev, sector_t sector,
79 unsigned int offset, unsigned int length)
62{ 80{
63 return -ENXIO; 81 return -ENXIO;
64} 82}
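
include/linux/dax.h now defines the provider side of the interface: a driver wraps its state in a dax_device via alloc_dax(), implements dax_operations.direct_access(), and tears down with kill_dax() plus put_dax(). A minimal provider sketch for a linearly-mapped device (the foo_* names and the PFN_DEV | PFN_MAP flags are illustrative, modeled on the pmem-style conversions elsewhere in this patch):

	struct foo_device {			/* illustrative driver state */
		void *virt_addr;
		phys_addr_t phys_addr;
		size_t size;
	};

	static long foo_dax_direct_access(struct dax_device *dax_dev,
			pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
	{
		struct foo_device *foo = dax_get_private(dax_dev);

		*kaddr = foo->virt_addr + PFN_PHYS(pgoff);
		*pfn = phys_to_pfn_t(foo->phys_addr + PFN_PHYS(pgoff),
				PFN_DEV | PFN_MAP);
		/* pages, not bytes: how much remains valid at this offset */
		return PHYS_PFN(foo->size) - pgoff;
	}

	static const struct dax_operations foo_dax_ops = {
		.direct_access = foo_dax_direct_access,
	};

	/* register: foo->dax_dev = alloc_dax(foo, disk_name, &foo_dax_ops);
	 * teardown: kill_dax(foo->dax_dev); put_dax(foo->dax_dev); */
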
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 78ad0624cdae..f4c639c0c362 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -130,13 +130,15 @@ typedef int (*dm_busy_fn) (struct dm_target *ti);
130 * < 0 : error 130 * < 0 : error
131 * >= 0 : the number of bytes accessible at the address 131 * >= 0 : the number of bytes accessible at the address
132 */ 132 */
133typedef long (*dm_direct_access_fn) (struct dm_target *ti, sector_t sector, 133typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
134 void **kaddr, pfn_t *pfn, long size); 134 long nr_pages, void **kaddr, pfn_t *pfn);
135#define PAGE_SECTORS (PAGE_SIZE / 512)
135 136
136void dm_error(const char *message); 137void dm_error(const char *message);
137 138
138struct dm_dev { 139struct dm_dev {
139 struct block_device *bdev; 140 struct block_device *bdev;
141 struct dax_device *dax_dev;
140 fmode_t mode; 142 fmode_t mode;
141 char name[16]; 143 char name[16];
142}; 144};
@@ -178,7 +180,7 @@ struct target_type {
178 dm_busy_fn busy; 180 dm_busy_fn busy;
179 dm_iterate_devices_fn iterate_devices; 181 dm_iterate_devices_fn iterate_devices;
180 dm_io_hints_fn io_hints; 182 dm_io_hints_fn io_hints;
181 dm_direct_access_fn direct_access; 183 dm_dax_direct_access_fn direct_access;
182 184
183 /* For internal device-mapper use. */ 185 /* For internal device-mapper use. */
184 struct list_head list; 186 struct list_head list;
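
The device-mapper typedef now takes (pgoff, nr_pages) and returns a page count (the unchanged comment above it still says "bytes"); the dax_device pointer cached in dm_dev lets a target forward the call to its backing device. A sketch in the style of the linear target's conversion (my_target and its start field are illustrative):

	static long example_dm_dax_direct_access(struct dm_target *ti,
			pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
	{
		struct my_target *mt = ti->private;	/* illustrative */
		struct block_device *bdev = mt->dev->bdev;
		struct dax_device *dax_dev = mt->dev->dax_dev;
		sector_t sector;
		int ret;

		/* remap the target-relative offset onto the backing device */
		sector = mt->start + dm_target_offset(ti, pgoff * PAGE_SECTORS);
		ret = bdev_dax_pgoff(bdev, sector, nr_pages * PAGE_SIZE, &pgoff);
		if (ret)
			return ret;
		return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
	}
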
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 7291810067eb..f753e788da31 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -41,6 +41,7 @@ struct iomap {
41 u16 type; /* type of mapping */ 41 u16 type; /* type of mapping */
42 u16 flags; /* flags for mapping */ 42 u16 flags; /* flags for mapping */
43 struct block_device *bdev; /* block device for I/O */ 43 struct block_device *bdev; /* block device for I/O */
44 struct dax_device *dax_dev; /* dax_dev for dax operations */
44}; 45};
45 46
46/* 47/*
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 77e7af32543f..6c807017128d 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -20,9 +20,11 @@
20 20
21enum { 21enum {
22 /* when a dimm supports both PMEM and BLK access a label is required */ 22 /* when a dimm supports both PMEM and BLK access a label is required */
23 NDD_ALIASING = 1 << 0, 23 NDD_ALIASING = 0,
24 /* unarmed memory devices may not persist writes */ 24 /* unarmed memory devices may not persist writes */
25 NDD_UNARMED = 1 << 1, 25 NDD_UNARMED = 1,
26 /* locked memory devices should not be accessed */
27 NDD_LOCKED = 2,
26 28
27 /* need to set a limit somewhere, but yes, this is likely overkill */ 29 /* need to set a limit somewhere, but yes, this is likely overkill */
28 ND_IOCTL_MAX_BUFLEN = SZ_4M, 30 ND_IOCTL_MAX_BUFLEN = SZ_4M,
@@ -120,7 +122,7 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(
120} 122}
121 123
122int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); 124int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
123void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, 125void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
124 phys_addr_t start, unsigned int len); 126 phys_addr_t start, unsigned int len);
125struct nvdimm_bus *nvdimm_bus_register(struct device *parent, 127struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
126 struct nvdimm_bus_descriptor *nfit_desc); 128 struct nvdimm_bus_descriptor *nfit_desc);
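
The NDD_* values change from pre-shifted masks (1 << 0, 1 << 1) to plain bit numbers so the nvdimm flags word can go through the bitops API, and NDD_LOCKED is added for DIMMs whose config area cannot be accessed. A sketch of the intended usage (flags stands in for an nvdimm's flags word):

	unsigned long flags = 0;	/* stands in for an nvdimm's flags */

	set_bit(NDD_LOCKED, &flags);
	if (test_bit(NDD_ALIASING, &flags))
		;	/* a label is required for this DIMM */
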
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index e856c2cb0fe8..71ecf3d46aac 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -31,12 +31,6 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
31 BUG(); 31 BUG();
32} 32}
33 33
34static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
35{
36 BUG();
37 return -EFAULT;
38}
39
40static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, 34static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
41 struct iov_iter *i) 35 struct iov_iter *i)
42{ 36{
@@ -65,23 +59,6 @@ static inline bool arch_has_pmem_api(void)
65 return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API); 59 return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API);
66} 60}
67 61
68/*
69 * memcpy_from_pmem - read from persistent memory with error handling
70 * @dst: destination buffer
71 * @src: source buffer
72 * @size: transfer length
73 *
74 * Returns 0 on success negative error code on failure.
75 */
76static inline int memcpy_from_pmem(void *dst, void const *src, size_t size)
77{
78 if (arch_has_pmem_api())
79 return arch_memcpy_from_pmem(dst, src, size);
80 else
81 memcpy(dst, src, size);
82 return 0;
83}
84
85/** 62/**
86 * memcpy_to_pmem - copy data to persistent memory 63 * memcpy_to_pmem - copy data to persistent memory
87 * @dst: destination buffer for the copy 64 * @dst: destination buffer for the copy
diff --git a/include/linux/string.h b/include/linux/string.h
index c4011b28f3d8..537918f8a98e 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -114,6 +114,14 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
114#ifndef __HAVE_ARCH_MEMCHR 114#ifndef __HAVE_ARCH_MEMCHR
115extern void * memchr(const void *,int,__kernel_size_t); 115extern void * memchr(const void *,int,__kernel_size_t);
116#endif 116#endif
117#ifndef __HAVE_ARCH_MEMCPY_MCSAFE
118static inline __must_check int memcpy_mcsafe(void *dst, const void *src,
119 size_t cnt)
120{
121 memcpy(dst, src, cnt);
122 return 0;
123}
124#endif
117void *memchr_inv(const void *s, int c, size_t n); 125void *memchr_inv(const void *s, int c, size_t n);
118char *strreplace(char *s, char old, char new); 126char *strreplace(char *s, char old, char new);
119 127
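
The memcpy_mcsafe() fallback gives common code one spelling for "copy from pmem, reporting machine checks as errors": architectures with a real implementation (x86 keeps its own behind __HAVE_ARCH_MEMCPY_MCSAFE) can fail the copy, everyone else degrades to a plain memcpy() that always reports success. The consumer pattern, roughly as a pmem read path would use it:

	/* read from a pmem mapping; non-zero means poison was consumed */
	if (memcpy_mcsafe(dst, pmem_addr, len) != 0)
		return -EIO;
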
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index ede5c6a62164..7ad3863cb88b 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -169,6 +169,7 @@ enum {
169enum { 169enum {
170 ND_ARS_VOLATILE = 1, 170 ND_ARS_VOLATILE = 1,
171 ND_ARS_PERSISTENT = 2, 171 ND_ARS_PERSISTENT = 2,
172 ND_CONFIG_LOCKED = 1,
172}; 173};
173 174
174static inline const char *nvdimm_bus_cmd_name(unsigned cmd) 175static inline const char *nvdimm_bus_cmd_name(unsigned cmd)
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index 405212be044a..d870520da68b 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -28,7 +28,10 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o
28obj-$(CONFIG_ND_BLK) += nd_blk.o 28obj-$(CONFIG_ND_BLK) += nd_blk.o
29obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o 29obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
30obj-$(CONFIG_ACPI_NFIT) += nfit.o 30obj-$(CONFIG_ACPI_NFIT) += nfit.o
31obj-$(CONFIG_DEV_DAX) += dax.o 31ifeq ($(CONFIG_DAX),m)
32obj-$(CONFIG_DAX) += dax.o
33endif
34obj-$(CONFIG_DEV_DAX) += device_dax.o
32obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o 35obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
33 36
34nfit-y := $(ACPI_SRC)/core.o 37nfit-y := $(ACPI_SRC)/core.o
@@ -48,9 +51,13 @@ nd_blk-y += config_check.o
48nd_e820-y := $(NVDIMM_SRC)/e820.o 51nd_e820-y := $(NVDIMM_SRC)/e820.o
49nd_e820-y += config_check.o 52nd_e820-y += config_check.o
50 53
51dax-y := $(DAX_SRC)/dax.o 54dax-y := $(DAX_SRC)/super.o
52dax-y += config_check.o 55dax-y += config_check.o
53 56
57device_dax-y := $(DAX_SRC)/device.o
58device_dax-y += dax-dev.o
59device_dax-y += config_check.o
60
54dax_pmem-y := $(DAX_SRC)/pmem.o 61dax_pmem-y := $(DAX_SRC)/pmem.o
55dax_pmem-y += config_check.o 62dax_pmem-y += config_check.o
56 63
diff --git a/tools/testing/nvdimm/dax-dev.c b/tools/testing/nvdimm/dax-dev.c
new file mode 100644
index 000000000000..36ee3d8797c3
--- /dev/null
+++ b/tools/testing/nvdimm/dax-dev.c
@@ -0,0 +1,49 @@
1/*
2 * Copyright (c) 2016, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13#include "test/nfit_test.h"
14#include <linux/mm.h>
15#include "../../../drivers/dax/dax-private.h"
16
17phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
18 unsigned long size)
19{
20 struct resource *res;
21 phys_addr_t addr;
22 int i;
23
24 for (i = 0; i < dev_dax->num_resources; i++) {
25 res = &dev_dax->res[i];
26 addr = pgoff * PAGE_SIZE + res->start;
27 if (addr >= res->start && addr <= res->end)
28 break;
29 pgoff -= PHYS_PFN(resource_size(res));
30 }
31
32 if (i < dev_dax->num_resources) {
33 res = &dev_dax->res[i];
34 if (addr + size - 1 <= res->end) {
35 if (get_nfit_res(addr)) {
36 struct page *page;
37
38 if (dev_dax->region->align > PAGE_SIZE)
39 return -1;
40
41 page = vmalloc_to_page((void *)addr);
42 return PFN_PHYS(page_to_pfn(page));
43 } else
44 return addr;
45 }
46 }
47
48 return -1;
49}
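
dax_pgoff_to_phys() walks the device-dax resources, translating a page offset into a physical address and returning -1 when the offset (or offset + size) falls outside every resource; this nfit_test override additionally routes vmalloc-backed test ranges through vmalloc_to_page(). A sketch of the caller side in a direct_access implementation (the pfn_flags handling here is illustrative):

	phys_addr_t phys;

	phys = dax_pgoff_to_phys(dev_dax, pgoff, nr_pages * PAGE_SIZE);
	if (phys == (phys_addr_t) -1)
		return -EFAULT;
	*pfn = phys_to_pfn_t(phys, dev_dax->region->pfn_flags);
	return nr_pages;
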
diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c
index c9b8c48f85fc..b53596ad601b 100644
--- a/tools/testing/nvdimm/pmem-dax.c
+++ b/tools/testing/nvdimm/pmem-dax.c
@@ -15,13 +15,13 @@
15#include <pmem.h> 15#include <pmem.h>
16#include <nd.h> 16#include <nd.h>
17 17
18long pmem_direct_access(struct block_device *bdev, sector_t sector, 18long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
19 void **kaddr, pfn_t *pfn, long size) 19 long nr_pages, void **kaddr, pfn_t *pfn)
20{ 20{
21 struct pmem_device *pmem = bdev->bd_queue->queuedata; 21 resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
22 resource_size_t offset = sector * 512 + pmem->data_offset;
23 22
24 if (unlikely(is_bad_pmem(&pmem->bb, sector, size))) 23 if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
24 PFN_PHYS(nr_pages))))
25 return -EIO; 25 return -EIO;
26 26
27 /* 27 /*
@@ -34,11 +34,10 @@ long pmem_direct_access(struct block_device *bdev, sector_t sector,
34 *kaddr = pmem->virt_addr + offset; 34 *kaddr = pmem->virt_addr + offset;
35 page = vmalloc_to_page(pmem->virt_addr + offset); 35 page = vmalloc_to_page(pmem->virt_addr + offset);
36 *pfn = page_to_pfn_t(page); 36 *pfn = page_to_pfn_t(page);
37 dev_dbg_ratelimited(disk_to_dev(bdev->bd_disk)->parent, 37 pr_debug_ratelimited("%s: pmem: %p pgoff: %#lx pfn: %#lx\n",
38 "%s: sector: %#llx pfn: %#lx\n", __func__, 38 __func__, pmem, pgoff, page_to_pfn(page));
39 (unsigned long long) sector, page_to_pfn(page));
40 39
41 return PAGE_SIZE; 40 return 1;
42 } 41 }
43 42
44 *kaddr = pmem->virt_addr + offset; 43 *kaddr = pmem->virt_addr + offset;
@@ -49,6 +48,6 @@ long pmem_direct_access(struct block_device *bdev, sector_t sector,
49 * requested range. 48 * requested range.
50 */ 49 */
51 if (unlikely(pmem->bb.count)) 50 if (unlikely(pmem->bb.count))
52 return size; 51 return nr_pages;
53 return pmem->size - pmem->pfn_pad - offset; 52 return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
54} 53}
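
Note the unit change running through this file: the single-page badblocks carve-out now returns 1 page where it returned PAGE_SIZE bytes, and the tail returns PHYS_PFN() of the remaining byte count. Callers that still think in bytes scale the result back, as dax_iomap_actor() does above:

	long avail = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
			&kaddr, &pfn);

	if (avail > 0)
		avail = PFN_PHYS(avail);	/* pages back to bytes */
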
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 798f17655433..c2187178fb13 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -132,6 +132,7 @@ static u32 handle[] = {
132 [3] = NFIT_DIMM_HANDLE(0, 0, 1, 0, 1), 132 [3] = NFIT_DIMM_HANDLE(0, 0, 1, 0, 1),
133 [4] = NFIT_DIMM_HANDLE(0, 1, 0, 0, 0), 133 [4] = NFIT_DIMM_HANDLE(0, 1, 0, 0, 0),
134 [5] = NFIT_DIMM_HANDLE(1, 0, 0, 0, 0), 134 [5] = NFIT_DIMM_HANDLE(1, 0, 0, 0, 0),
135 [6] = NFIT_DIMM_HANDLE(1, 0, 0, 0, 1),
135}; 136};
136 137
137static unsigned long dimm_fail_cmd_flags[NUM_DCR]; 138static unsigned long dimm_fail_cmd_flags[NUM_DCR];
@@ -728,8 +729,8 @@ static int nfit_test0_alloc(struct nfit_test *t)
728static int nfit_test1_alloc(struct nfit_test *t) 729static int nfit_test1_alloc(struct nfit_test *t)
729{ 730{
730 size_t nfit_size = sizeof(struct acpi_nfit_system_address) * 2 731 size_t nfit_size = sizeof(struct acpi_nfit_system_address) * 2
731 + sizeof(struct acpi_nfit_memory_map) 732 + sizeof(struct acpi_nfit_memory_map) * 2
732 + offsetof(struct acpi_nfit_control_region, window_size); 733 + offsetof(struct acpi_nfit_control_region, window_size) * 2;
733 int i; 734 int i;
734 735
735 t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma); 736 t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma);
@@ -906,6 +907,7 @@ static void nfit_test0_setup(struct nfit_test *t)
906 memdev->address = 0; 907 memdev->address = 0;
907 memdev->interleave_index = 0; 908 memdev->interleave_index = 0;
908 memdev->interleave_ways = 2; 909 memdev->interleave_ways = 2;
910 memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED;
909 911
910 /* mem-region2 (spa1, dimm0) */ 912 /* mem-region2 (spa1, dimm0) */
911 memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 2; 913 memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 2;
@@ -921,6 +923,7 @@ static void nfit_test0_setup(struct nfit_test *t)
921 memdev->address = SPA0_SIZE/2; 923 memdev->address = SPA0_SIZE/2;
922 memdev->interleave_index = 0; 924 memdev->interleave_index = 0;
923 memdev->interleave_ways = 4; 925 memdev->interleave_ways = 4;
926 memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED;
924 927
925 /* mem-region3 (spa1, dimm1) */ 928 /* mem-region3 (spa1, dimm1) */
926 memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 3; 929 memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 3;
@@ -951,6 +954,7 @@ static void nfit_test0_setup(struct nfit_test *t)
951 memdev->address = SPA0_SIZE/2; 954 memdev->address = SPA0_SIZE/2;
952 memdev->interleave_index = 0; 955 memdev->interleave_index = 0;
953 memdev->interleave_ways = 4; 956 memdev->interleave_ways = 4;
957 memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED;
954 958
955 /* mem-region5 (spa1, dimm3) */ 959 /* mem-region5 (spa1, dimm3) */
956 memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 5; 960 memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 5;
@@ -1086,6 +1090,7 @@ static void nfit_test0_setup(struct nfit_test *t)
1086 memdev->address = 0; 1090 memdev->address = 0;
1087 memdev->interleave_index = 0; 1091 memdev->interleave_index = 0;
1088 memdev->interleave_ways = 1; 1092 memdev->interleave_ways = 1;
1093 memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED;
1089 1094
1090 offset = offset + sizeof(struct acpi_nfit_memory_map) * 14; 1095 offset = offset + sizeof(struct acpi_nfit_memory_map) * 14;
1091 /* dcr-descriptor0: blk */ 1096 /* dcr-descriptor0: blk */
@@ -1384,6 +1389,7 @@ static void nfit_test0_setup(struct nfit_test *t)
1384 memdev->address = 0; 1389 memdev->address = 0;
1385 memdev->interleave_index = 0; 1390 memdev->interleave_index = 0;
1386 memdev->interleave_ways = 1; 1391 memdev->interleave_ways = 1;
1392 memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED;
1387 1393
1388 /* mem-region16 (spa/bdw4, dimm4) */ 1394 /* mem-region16 (spa/bdw4, dimm4) */
1389 memdev = nfit_buf + offset + 1395 memdev = nfit_buf + offset +
@@ -1486,6 +1492,34 @@ static void nfit_test1_setup(struct nfit_test *t)
1486 dcr->code = NFIT_FIC_BYTE; 1492 dcr->code = NFIT_FIC_BYTE;
1487 dcr->windows = 0; 1493 dcr->windows = 0;
1488 1494
1495 offset += dcr->header.length;
1496 memdev = nfit_buf + offset;
1497 memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
1498 memdev->header.length = sizeof(*memdev);
1499 memdev->device_handle = handle[6];
1500 memdev->physical_id = 0;
1501 memdev->region_id = 0;
1502 memdev->range_index = 0;
1503 memdev->region_index = 0+2;
1504 memdev->region_size = SPA2_SIZE;
1505 memdev->region_offset = 0;
1506 memdev->address = 0;
1507 memdev->interleave_index = 0;
1508 memdev->interleave_ways = 1;
1509 memdev->flags = ACPI_NFIT_MEM_MAP_FAILED;
1510
1511 /* dcr-descriptor1 */
1512 offset += sizeof(*memdev);
1513 dcr = nfit_buf + offset;
1514 dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
1515 dcr->header.length = offsetof(struct acpi_nfit_control_region,
1516 window_size);
1517 dcr->region_index = 0+2;
1518 dcr_common_init(dcr);
1519 dcr->serial_number = ~handle[6];
1520 dcr->code = NFIT_FIC_BYTE;
1521 dcr->windows = 0;
1522
1489 post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA2_SIZE); 1523 post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA2_SIZE);
1490 1524
1491 acpi_desc = &t->acpi_desc; 1525 acpi_desc = &t->acpi_desc;
@@ -1817,6 +1851,10 @@ static int nfit_test_probe(struct platform_device *pdev)
1817 if (rc) 1851 if (rc)
1818 return rc; 1852 return rc;
1819 1853
1854 rc = devm_add_action_or_reset(&pdev->dev, acpi_nfit_shutdown, acpi_desc);
1855 if (rc)
1856 return rc;
1857
1820 if (nfit_test->setup != nfit_test0_setup) 1858 if (nfit_test->setup != nfit_test0_setup)
1821 return 0; 1859 return 0;
1822 1860
@@ -1907,7 +1945,7 @@ static __init int nfit_test_init(void)
1907 case 1: 1945 case 1:
1908 nfit_test->num_pm = 1; 1946 nfit_test->num_pm = 1;
1909 nfit_test->dcr_idx = NUM_DCR; 1947 nfit_test->dcr_idx = NUM_DCR;
1910 nfit_test->num_dcr = 1; 1948 nfit_test->num_dcr = 2;
1911 nfit_test->alloc = nfit_test1_alloc; 1949 nfit_test->alloc = nfit_test1_alloc;
1912 nfit_test->setup = nfit_test1_setup; 1950 nfit_test->setup = nfit_test1_setup;
1913 break; 1951 break;
@@ -1924,6 +1962,7 @@ static __init int nfit_test_init(void)
1924 put_device(&pdev->dev); 1962 put_device(&pdev->dev);
1925 goto err_register; 1963 goto err_register;
1926 } 1964 }
1965 get_device(&pdev->dev);
1927 1966
1928 rc = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); 1967 rc = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
1929 if (rc) 1968 if (rc)
@@ -1942,6 +1981,10 @@ static __init int nfit_test_init(void)
1942 if (instances[i]) 1981 if (instances[i])
1943 platform_device_unregister(&instances[i]->pdev); 1982 platform_device_unregister(&instances[i]->pdev);
1944 nfit_test_teardown(); 1983 nfit_test_teardown();
1984 for (i = 0; i < NUM_NFITS; i++)
1985 if (instances[i])
1986 put_device(&instances[i]->pdev.dev);
1987
1945 return rc; 1988 return rc;
1946} 1989}
1947 1990
@@ -1949,10 +1992,13 @@ static __exit void nfit_test_exit(void)
1949{ 1992{
1950 int i; 1993 int i;
1951 1994
1952 platform_driver_unregister(&nfit_test_driver);
1953 for (i = 0; i < NUM_NFITS; i++) 1995 for (i = 0; i < NUM_NFITS; i++)
1954 platform_device_unregister(&instances[i]->pdev); 1996 platform_device_unregister(&instances[i]->pdev);
1997 platform_driver_unregister(&nfit_test_driver);
1955 nfit_test_teardown(); 1998 nfit_test_teardown();
1999
2000 for (i = 0; i < NUM_NFITS; i++)
2001 put_device(&instances[i]->pdev.dev);
1956 class_destroy(nfit_test_dimm); 2002 class_destroy(nfit_test_dimm);
1957} 2003}
1958 2004
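
The nfit_test lifetime fixes follow the driver-model refcount rules: platform_device_unregister() consumes the reference taken at registration, so init now pins each instance with get_device() and both the error path and module exit drop that pin only after unregistering; the exit path also unregisters the devices before the driver so that ->remove() still runs with the driver bound. The pattern in miniature (a sketch, not code from the patch):

	get_device(&pdev->dev);			/* pin taken at init */
	/* ... device in use ... */
	platform_device_unregister(pdev);	/* drops the registration ref */
	put_device(&pdev->dev);			/* drop our pin; pdev may free */
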