 Documentation/filesystems/dax.txt |  32
 arch/powerpc/sysdev/axonram.c     |   2
 block/ioctl.c                     |   1
 drivers/block/brd.c               |   2
 drivers/nvdimm/pmem.c             |  10
 drivers/s390/block/dcssblk.c      |   4
 fs/block_dev.c                    | 114
 fs/dax.c                          | 257
 fs/ext2/file.c                    |   4
 fs/ext2/inode.c                   |  12
 fs/ext2/super.c                   |  11
 fs/ext4/file.c                    |   4
 fs/ext4/super.c                   |  11
 fs/xfs/xfs_bmap_util.c            |  15
 fs/xfs/xfs_file.c                 |   7
 fs/xfs/xfs_super.c                |  12
 include/linux/blkdev.h            |  15
 include/linux/dax.h               |  25
 include/linux/fs.h                |   1
 19 files changed, 246 insertions(+), 293 deletions(-)
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 7bde64014a89..ce4587d257d2 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -79,6 +79,38 @@ These filesystems may be used for inspiration:
 - ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt
 
 
+Handling Media Errors
+---------------------
+
+The libnvdimm subsystem stores a record of known media error locations for
+each pmem block device (in gendisk->badblocks). If we fault at such a
+location, or at one with a latent error not yet discovered, the application
+can expect to receive a SIGBUS. Libnvdimm also allows these errors to be
+cleared, by simply writing the affected sectors (through the pmem driver,
+and if the underlying NVDIMM supports the clear_poison DSM defined by ACPI).
+
+Since DAX IO normally doesn't go through the driver/bio path, applications
+or sysadmins can restore the lost data from a prior backup or from built-in
+redundancy in the following ways:
+
+1. Delete the affected file and restore from a backup (sysadmin route):
+   This will free the filesystem blocks that were being used by the file,
+   and the next time they're allocated they will be zeroed first. The
+   zeroing happens through the driver and clears the bad sectors.
+
+2. Truncate or hole-punch the part of the file that has a bad block (at
+   least an entire aligned sector has to be hole-punched, but not
+   necessarily an entire filesystem block).
+
+These are the two basic paths that allow DAX filesystems to continue
+operating in the presence of media errors. More robust error recovery
+mechanisms can be built on top of this in the future, for example involving
+redundancy/mirroring provided at the block layer through DM, or additionally
+at the filesystem level. These would have to rely on the above two tenets:
+that error clearing can happen either by sending an IO through the driver,
+or by zeroing (also through the driver).
+
+
 Shortcomings
 ------------
 
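Recovery route 2 above maps to a plain fallocate(2) hole-punch from userspace. The sketch below is illustrative only (the file path and offset are made up); it punches out one page-sized range known to contain a bad sector, so that the next write allocates fresh, driver-zeroed blocks:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* hypothetical file and offset; in practice the SIGBUS
             * siginfo (si_addr) tells you which page went bad */
            int fd = open("/mnt/pmem/data", O_RDWR);
            off_t bad_off = 0x200000;

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* at least one whole aligned sector must be punched;
             * FALLOC_FL_KEEP_SIZE leaves the file size untouched */
            if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                          bad_off, 4096) < 0) {
                    perror("fallocate");
                    return 1;
            }
            /* rewriting this range now allocates zeroed, error-free blocks */
            close(fd);
            return 0;
    }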
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index 0d112b94d91d..ff75d70f7285 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -143,7 +143,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
  */
 static long
 axon_ram_direct_access(struct block_device *device, sector_t sector,
-                       void __pmem **kaddr, pfn_t *pfn)
+                       void __pmem **kaddr, pfn_t *pfn, long size)
 {
         struct axon_ram_bank *bank = device->bd_disk->private_data;
         loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
diff --git a/block/ioctl.c b/block/ioctl.c
index 698c7933d582..ed2397f8de9d 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -4,7 +4,6 @@
 #include <linux/gfp.h>
 #include <linux/blkpg.h>
 #include <linux/hdreg.h>
-#include <linux/badblocks.h>
 #include <linux/backing-dev.h>
 #include <linux/fs.h>
 #include <linux/blktrace_api.h>
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 51a071e32221..c04bd9bc39fd 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -381,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 static long brd_direct_access(struct block_device *bdev, sector_t sector,
-                        void __pmem **kaddr, pfn_t *pfn)
+                        void __pmem **kaddr, pfn_t *pfn, long size)
 {
         struct brd_device *brd = bdev->bd_disk->private_data;
         struct page *page;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 042baec56931..608fc4464574 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -164,14 +164,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 }
 
 static long pmem_direct_access(struct block_device *bdev, sector_t sector,
-                        void __pmem **kaddr, pfn_t *pfn)
+                        void __pmem **kaddr, pfn_t *pfn, long size)
 {
         struct pmem_device *pmem = bdev->bd_queue->queuedata;
         resource_size_t offset = sector * 512 + pmem->data_offset;
 
+        if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
+                return -EIO;
         *kaddr = pmem->virt_addr + offset;
         *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
 
+        /*
+         * If badblocks are present, limit known good range to the
+         * requested range.
+         */
+        if (unlikely(pmem->bb.count))
+                return size;
         return pmem->size - pmem->pfn_pad - offset;
 }
 
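With the extra long size argument, ->direct_access() can now both fail fast on known-bad ranges (-EIO) and bound the "known good" length it reports when any badblocks exist. A hedged sketch of the consumer side, using only the bdev_direct_access()/blk_dax_ctl interface touched by this series (not a complete kernel unit; bdev is assumed to be already resolved by the caller):

    struct blk_dax_ctl dax = {
            .sector = 0,            /* page-aligned sector in the bdev */
            .size   = PAGE_SIZE,    /* how much the caller will touch */
    };
    long avail = bdev_direct_access(bdev, &dax);

    if (avail < 0)
            return avail;           /* -EIO now signals a known-bad range */
    /* dax.addr and dax.pfn are valid for at most 'avail' bytes, which the
     * driver may clamp to dax.size when badblocks are present */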
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index b83908670a9a..bed53c46dd90 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -31,7 +31,7 @@ static void dcssblk_release(struct gendisk *disk, fmode_t mode);
 static blk_qc_t dcssblk_make_request(struct request_queue *q,
                                                 struct bio *bio);
 static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
-                         void __pmem **kaddr, pfn_t *pfn);
+                         void __pmem **kaddr, pfn_t *pfn, long size);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
@@ -884,7 +884,7 @@ fail:
 
 static long
 dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
-                        void __pmem **kaddr, pfn_t *pfn)
+                        void __pmem **kaddr, pfn_t *pfn, long size)
 {
         struct dcssblk_dev_info *dev_info;
         unsigned long offset, dev_sz;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1089dbf25925..71ccab1d22c6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -51,6 +51,18 @@ struct block_device *I_BDEV(struct inode *inode)
 }
 EXPORT_SYMBOL(I_BDEV);
 
+void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+{
+        struct va_format vaf;
+        va_list args;
+
+        va_start(args, fmt);
+        vaf.fmt = fmt;
+        vaf.va = &args;
+        printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
+        va_end(args);
+}
+
 static void bdev_write_inode(struct block_device *bdev)
 {
         struct inode *inode = bdev->bd_inode;
@@ -489,7 +501,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
         sector += get_start_sect(bdev);
         if (sector % (PAGE_SIZE / 512))
                 return -EINVAL;
-        avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
+        avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
         if (!avail)
                 return -ERANGE;
         if (avail > 0 && avail & ~PAGE_MASK)
@@ -498,6 +510,75 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
 }
 EXPORT_SYMBOL_GPL(bdev_direct_access);
 
+/**
+ * bdev_dax_supported() - Check if the device supports dax for filesystem
+ * @sb: The superblock of the device
+ * @blocksize: The block size of the device
+ *
+ * This is a library function for filesystems to check if the block device
+ * can be mounted with dax option.
+ *
+ * Return: negative errno if unsupported, 0 if supported.
+ */
+int bdev_dax_supported(struct super_block *sb, int blocksize)
+{
+        struct blk_dax_ctl dax = {
+                .sector = 0,
+                .size = PAGE_SIZE,
+        };
+        int err;
+
+        if (blocksize != PAGE_SIZE) {
+                vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
+                return -EINVAL;
+        }
+
+        err = bdev_direct_access(sb->s_bdev, &dax);
+        if (err < 0) {
+                switch (err) {
+                case -EOPNOTSUPP:
+                        vfs_msg(sb, KERN_ERR,
+                                "error: device does not support dax");
+                        break;
+                case -EINVAL:
+                        vfs_msg(sb, KERN_ERR,
+                                "error: unaligned partition for dax");
+                        break;
+                default:
+                        vfs_msg(sb, KERN_ERR,
+                                "error: dax access failed (%d)", err);
+                }
+                return err;
+        }
+
+        return 0;
+}
+EXPORT_SYMBOL_GPL(bdev_dax_supported);
+
+/**
+ * bdev_dax_capable() - Return if the raw device is capable for dax
+ * @bdev: The device for raw block device access
+ */
+bool bdev_dax_capable(struct block_device *bdev)
+{
+        struct blk_dax_ctl dax = {
+                .size = PAGE_SIZE,
+        };
+
+        if (!IS_ENABLED(CONFIG_FS_DAX))
+                return false;
+
+        dax.sector = 0;
+        if (bdev_direct_access(bdev, &dax) < 0)
+                return false;
+
+        dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
+        if (bdev_direct_access(bdev, &dax) < 0)
+                return false;
+
+        return true;
+}
+
 /*
  * pseudo-fs
  */
@@ -1160,33 +1241,6 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
-static bool blkdev_dax_capable(struct block_device *bdev)
-{
-        struct gendisk *disk = bdev->bd_disk;
-
-        if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX))
-                return false;
-
-        /*
-         * If the partition is not aligned on a page boundary, we can't
-         * do dax I/O to it.
-         */
-        if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
-                        || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
-                return false;
-
-        /*
-         * If the device has known bad blocks, force all I/O through the
-         * driver / page cache.
-         *
-         * TODO: support finer grained dax error handling
-         */
-        if (disk->bb && disk->bb->count)
-                return false;
-
-        return true;
-}
-
 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 /*
@@ -1266,7 +1320,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 
                 if (!ret) {
                         bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
-                        if (!blkdev_dax_capable(bdev))
+                        if (!bdev_dax_capable(bdev))
                                 bdev->bd_inode->i_flags &= ~S_DAX;
                 }
 
@@ -1303,7 +1357,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                 goto out_clear;
                         }
                         bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
-                        if (!blkdev_dax_capable(bdev))
+                        if (!bdev_dax_capable(bdev))
                                 bdev->bd_inode->i_flags &= ~S_DAX;
                 }
         } else {
diff --git a/fs/dax.c b/fs/dax.c
index 7d9df93b3a14..5a282260d27e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -87,50 +87,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
         return page;
 }
 
-/*
- * dax_clear_sectors() is called from within transaction context from XFS,
- * and hence this means the stack from this point must follow GFP_NOFS
- * semantics for all operations.
- */
-int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
-{
-        struct blk_dax_ctl dax = {
-                .sector = _sector,
-                .size = _size,
-        };
-
-        might_sleep();
-        do {
-                long count, sz;
-
-                count = dax_map_atomic(bdev, &dax);
-                if (count < 0)
-                        return count;
-                sz = min_t(long, count, SZ_128K);
-                clear_pmem(dax.addr, sz);
-                dax.size -= sz;
-                dax.sector += sz / 512;
-                dax_unmap_atomic(bdev, &dax);
-                cond_resched();
-        } while (dax.size);
-
-        wmb_pmem();
-        return 0;
-}
-EXPORT_SYMBOL_GPL(dax_clear_sectors);
-
-/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
-static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
-                loff_t pos, loff_t end)
-{
-        loff_t final = end - pos + first; /* The final byte of the buffer */
-
-        if (first > 0)
-                clear_pmem(addr, first);
-        if (final < size)
-                clear_pmem(addr + final, size - final);
-}
-
 static bool buffer_written(struct buffer_head *bh)
 {
         return buffer_mapped(bh) && !buffer_unwritten(bh);
@@ -169,6 +125,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
         struct blk_dax_ctl dax = {
                 .addr = (void __pmem *) ERR_PTR(-EIO),
         };
+        unsigned blkbits = inode->i_blkbits;
+        sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
+                                                                >> blkbits;
 
         if (rw == READ)
                 end = min(end, i_size_read(inode));
@@ -176,7 +135,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
         while (pos < end) {
                 size_t len;
                 if (pos == max) {
-                        unsigned blkbits = inode->i_blkbits;
                         long page = pos >> PAGE_SHIFT;
                         sector_t block = page << (PAGE_SHIFT - blkbits);
                         unsigned first = pos - (block << blkbits);
@@ -192,6 +150,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                         bh->b_size = 1 << blkbits;
                         bh_max = pos - first + bh->b_size;
                         bdev = bh->b_bdev;
+                        /*
+                         * We allow uninitialized buffers for writes
+                         * beyond EOF as those cannot race with faults
+                         */
+                        WARN_ON_ONCE(
+                                (buffer_new(bh) && block < file_blks) ||
+                                (rw == WRITE && buffer_unwritten(bh)));
                 } else {
                         unsigned done = bh->b_size -
                                         (bh_max - (pos - first));
@@ -211,11 +176,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                                 rc = map_len;
                                 break;
                         }
-                        if (buffer_unwritten(bh) || buffer_new(bh)) {
-                                dax_new_buf(dax.addr, map_len, first,
-                                                pos, end);
-                                need_wmb = true;
-                        }
                         dax.addr += first;
                         size = map_len - first;
                 }
@@ -276,15 +236,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
         memset(&bh, 0, sizeof(bh));
         bh.b_bdev = inode->i_sb->s_bdev;
 
-        if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
-                struct address_space *mapping = inode->i_mapping;
+        if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
                 inode_lock(inode);
-                retval = filemap_write_and_wait_range(mapping, pos, end - 1);
-                if (retval) {
-                        inode_unlock(inode);
-                        goto out;
-                }
-        }
 
         /* Protects against truncate */
         if (!(flags & DIO_SKIP_DIO_COUNT))
@@ -305,7 +258,6 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 
         if (!(flags & DIO_SKIP_DIO_COUNT))
                 inode_dio_end(inode);
- out:
         return retval;
 }
 EXPORT_SYMBOL_GPL(dax_do_io);
@@ -321,20 +273,11 @@ EXPORT_SYMBOL_GPL(dax_do_io);
 static int dax_load_hole(struct address_space *mapping, struct page *page,
                                                         struct vm_fault *vmf)
 {
-        unsigned long size;
-        struct inode *inode = mapping->host;
         if (!page)
                 page = find_or_create_page(mapping, vmf->pgoff,
                                                 GFP_KERNEL | __GFP_ZERO);
         if (!page)
                 return VM_FAULT_OOM;
-        /* Recheck i_size under page lock to avoid truncate race */
-        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-        if (vmf->pgoff >= size) {
-                unlock_page(page);
-                put_page(page);
-                return VM_FAULT_SIGBUS;
-        }
 
         vmf->page = page;
         return VM_FAULT_LOCKED;
@@ -565,33 +508,14 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                 .sector = to_sector(bh, inode),
                 .size = bh->b_size,
         };
-        pgoff_t size;
         int error;
 
         i_mmap_lock_read(mapping);
 
-        /*
-         * Check truncate didn't happen while we were allocating a block.
-         * If it did, this block may or may not be still allocated to the
-         * file. We can't tell the filesystem to free it because we can't
-         * take i_mutex here. In the worst case, the file still has blocks
-         * allocated past the end of the file.
-         */
-        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-        if (unlikely(vmf->pgoff >= size)) {
-                error = -EIO;
-                goto out;
-        }
-
         if (dax_map_atomic(bdev, &dax) < 0) {
                 error = PTR_ERR(dax.addr);
                 goto out;
         }
-
-        if (buffer_unwritten(bh) || buffer_new(bh)) {
-                clear_pmem(dax.addr, PAGE_SIZE);
-                wmb_pmem();
-        }
         dax_unmap_atomic(bdev, &dax);
 
         error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
@@ -612,19 +536,13 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
  * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
  * @get_block: The filesystem method used to translate file offsets to blocks
- * @complete_unwritten: The filesystem method used to convert unwritten blocks
- *        to written so the data written to them is exposed. This is required for
- *        required by write faults for filesystems that will return unwritten
- *        extent mappings from @get_block, but it is optional for reads as
- *        dax_insert_mapping() will always zero unwritten blocks. If the fs does
- *        not support unwritten extents, the it should pass NULL.
  *
  * When a page fault occurs, filesystems may call this helper in their
  * fault handler for DAX files. __dax_fault() assumes the caller has done all
  * the necessary locking for the page fault to proceed successfully.
  */
 int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-                        get_block_t get_block, dax_iodone_t complete_unwritten)
+                        get_block_t get_block)
 {
         struct file *file = vma->vm_file;
         struct address_space *mapping = file->f_mapping;
@@ -659,15 +577,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                         put_page(page);
                         goto repeat;
                 }
-                size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-                if (unlikely(vmf->pgoff >= size)) {
-                        /*
-                         * We have a struct page covering a hole in the file
-                         * from a read fault and we've raced with a truncate
-                         */
-                        error = -EIO;
-                        goto unlock_page;
-                }
         }
 
         error = get_block(inode, block, &bh, 0);
@@ -700,17 +609,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                 if (error)
                         goto unlock_page;
                 vmf->page = page;
-                if (!page) {
+                if (!page)
                         i_mmap_lock_read(mapping);
-                        /* Check we didn't race with truncate */
-                        size = (i_size_read(inode) + PAGE_SIZE - 1) >>
-                                                                PAGE_SHIFT;
-                        if (vmf->pgoff >= size) {
-                                i_mmap_unlock_read(mapping);
-                                error = -EIO;
-                                goto out;
-                        }
-                }
                 return VM_FAULT_LOCKED;
         }
 
@@ -727,23 +627,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                 page = NULL;
         }
 
-        /*
-         * If we successfully insert the new mapping over an unwritten extent,
-         * we need to ensure we convert the unwritten extent. If there is an
-         * error inserting the mapping, the filesystem needs to leave it as
-         * unwritten to prevent exposure of the stale underlying data to
-         * userspace, but we still need to call the completion function so
-         * the private resources on the mapping buffer can be released. We
-         * indicate what the callback should do via the uptodate variable, same
-         * as for normal BH based IO completions.
-         */
+        /* Filesystem should not return unwritten buffers to us! */
+        WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
         error = dax_insert_mapping(inode, &bh, vma, vmf);
-        if (buffer_unwritten(&bh)) {
-                if (complete_unwritten)
-                        complete_unwritten(&bh, !error);
-                else
-                        WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
-        }
 
  out:
         if (error == -ENOMEM)
@@ -772,7 +658,7 @@ EXPORT_SYMBOL(__dax_fault);
  * fault handler for DAX files.
  */
 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-                get_block_t get_block, dax_iodone_t complete_unwritten)
+                get_block_t get_block)
 {
         int result;
         struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -781,7 +667,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                 sb_start_pagefault(sb);
                 file_update_time(vma->vm_file);
         }
-        result = __dax_fault(vma, vmf, get_block, complete_unwritten);
+        result = __dax_fault(vma, vmf, get_block);
         if (vmf->flags & FAULT_FLAG_WRITE)
                 sb_end_pagefault(sb);
 
@@ -815,8 +701,7 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address,
 #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
 
 int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-                pmd_t *pmd, unsigned int flags, get_block_t get_block,
-                dax_iodone_t complete_unwritten)
+                pmd_t *pmd, unsigned int flags, get_block_t get_block)
 {
         struct file *file = vma->vm_file;
         struct address_space *mapping = file->f_mapping;
@@ -875,6 +760,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                 if (get_block(inode, block, &bh, 1) != 0)
                         return VM_FAULT_SIGBUS;
                 alloc = true;
+                WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
         }
 
         bdev = bh.b_bdev;
@@ -902,23 +788,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 
         i_mmap_lock_read(mapping);
 
-        /*
-         * If a truncate happened while we were allocating blocks, we may
-         * leave blocks allocated to the file that are beyond EOF. We can't
-         * take i_mutex here, so just leave them hanging; they'll be freed
-         * when the file is deleted.
-         */
-        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-        if (pgoff >= size) {
-                result = VM_FAULT_SIGBUS;
-                goto out;
-        }
-        if ((pgoff | PG_PMD_COLOUR) >= size) {
-                dax_pmd_dbg(&bh, address,
-                                "offset + huge page size > file size");
-                goto fallback;
-        }
-
         if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
                 spinlock_t *ptl;
                 pmd_t entry;
@@ -954,8 +823,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                 long length = dax_map_atomic(bdev, &dax);
 
                 if (length < 0) {
-                        result = VM_FAULT_SIGBUS;
-                        goto out;
+                        dax_pmd_dbg(&bh, address, "dax-error fallback");
+                        goto fallback;
                 }
                 if (length < PMD_SIZE) {
                         dax_pmd_dbg(&bh, address, "dax-length too small");
@@ -973,14 +842,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                         dax_pmd_dbg(&bh, address, "pfn not in memmap");
                         goto fallback;
                 }
-
-                if (buffer_unwritten(&bh) || buffer_new(&bh)) {
-                        clear_pmem(dax.addr, PMD_SIZE);
-                        wmb_pmem();
-                        count_vm_event(PGMAJFAULT);
-                        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-                        result |= VM_FAULT_MAJOR;
-                }
                 dax_unmap_atomic(bdev, &dax);
 
                 /*
@@ -1020,9 +881,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
  out:
         i_mmap_unlock_read(mapping);
 
-        if (buffer_unwritten(&bh))
-                complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
-
         return result;
 
  fallback:
@@ -1042,8 +900,7 @@ EXPORT_SYMBOL_GPL(__dax_pmd_fault);
  * pmd_fault handler for DAX files.
  */
 int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-                pmd_t *pmd, unsigned int flags, get_block_t get_block,
-                dax_iodone_t complete_unwritten)
+                pmd_t *pmd, unsigned int flags, get_block_t get_block)
 {
         int result;
         struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -1052,8 +909,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                 sb_start_pagefault(sb);
                 file_update_time(vma->vm_file);
         }
-        result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
-                                complete_unwritten);
+        result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
         if (flags & FAULT_FLAG_WRITE)
                 sb_end_pagefault(sb);
 
@@ -1091,6 +947,43 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
 
+static bool dax_range_is_aligned(struct block_device *bdev,
+                                 unsigned int offset, unsigned int length)
+{
+        unsigned short sector_size = bdev_logical_block_size(bdev);
+
+        if (!IS_ALIGNED(offset, sector_size))
+                return false;
+        if (!IS_ALIGNED(length, sector_size))
+                return false;
+
+        return true;
+}
+
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+                unsigned int offset, unsigned int length)
+{
+        struct blk_dax_ctl dax = {
+                .sector = sector,
+                .size = PAGE_SIZE,
+        };
+
+        if (dax_range_is_aligned(bdev, offset, length)) {
+                sector_t start_sector = dax.sector + (offset >> 9);
+
+                return blkdev_issue_zeroout(bdev, start_sector,
+                                length >> 9, GFP_NOFS, true);
+        } else {
+                if (dax_map_atomic(bdev, &dax) < 0)
+                        return PTR_ERR(dax.addr);
+                clear_pmem(dax.addr + offset, length);
+                wmb_pmem();
+                dax_unmap_atomic(bdev, &dax);
+        }
+        return 0;
+}
+EXPORT_SYMBOL_GPL(__dax_zero_page_range);
+
 /**
  * dax_zero_page_range - zero a range within a page of a DAX file
  * @inode: The file being truncated
@@ -1102,12 +995,6 @@ EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
  * page in a DAX file. This is intended for hole-punch operations. If
  * you are truncating a file, the helper function dax_truncate_page() may be
  * more convenient.
- *
- * We work in terms of PAGE_SIZE here for commonality with
- * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
- * took care of disposing of the unnecessary blocks. Even if the filesystem
- * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
- * since the file might be mmapped.
  */
 int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
                         get_block_t get_block)
@@ -1126,23 +1013,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
         bh.b_bdev = inode->i_sb->s_bdev;
         bh.b_size = PAGE_SIZE;
         err = get_block(inode, index, &bh, 0);
-        if (err < 0)
+        if (err < 0 || !buffer_written(&bh))
                 return err;
-        if (buffer_written(&bh)) {
-                struct block_device *bdev = bh.b_bdev;
-                struct blk_dax_ctl dax = {
-                        .sector = to_sector(&bh, inode),
-                        .size = PAGE_SIZE,
-                };
-
-                if (dax_map_atomic(bdev, &dax) < 0)
-                        return PTR_ERR(dax.addr);
-                clear_pmem(dax.addr + offset, length);
-                wmb_pmem();
-                dax_unmap_atomic(bdev, &dax);
-        }
 
-        return 0;
+        return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
+                        offset, length);
 }
 EXPORT_SYMBOL_GPL(dax_zero_page_range);
 
@@ -1154,12 +1029,6 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range);
  *
  * Similar to block_truncate_page(), this function can be called by a
  * filesystem when it is truncating a DAX file to handle the partial page.
- *
- * We work in terms of PAGE_SIZE here for commonality with
- * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
- * took care of disposing of the unnecessary blocks. Even if the filesystem
- * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
- * since the file might be mmapped.
  */
 int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 {
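__dax_zero_page_range() above encodes the documentation's two error-clearing tenets: sector-aligned zeroing is sent through the driver (blkdev_issue_zeroout(), which can clear poison), while sub-sector zeroing must fall back to CPU stores via clear_pmem(), which cannot. An illustration of the two paths, assuming a 512-byte logical block size:

    /* aligned: whole sectors, pushed through the driver, clears errors */
    err = __dax_zero_page_range(bdev, sector, 0, PAGE_SIZE);

    /* unaligned: 200 bytes at offset 100 cannot be expressed as whole
     * sectors, so clear_pmem() zeroes them through the direct mapping,
     * leaving any latent poison in that sector in place */
    err = __dax_zero_page_range(bdev, sector, 100, 200);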
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index c1400b109805..868c02317b05 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
         }
         down_read(&ei->dax_sem);
 
-        ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+        ret = __dax_fault(vma, vmf, ext2_get_block);
 
         up_read(&ei->dax_sem);
         if (vmf->flags & FAULT_FLAG_WRITE)
@@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
         }
         down_read(&ei->dax_sem);
 
-        ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+        ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
 
         up_read(&ei->dax_sem);
         if (flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index b675610391b8..fcbe58641e40 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -26,6 +26,7 @@
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
 #include <linux/dax.h>
+#include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
@@ -737,19 +738,18 @@ static int ext2_get_blocks(struct inode *inode,
                  * so that it's not found by another thread before it's
                  * initialised
                  */
-                err = dax_clear_sectors(inode->i_sb->s_bdev,
-                                le32_to_cpu(chain[depth-1].key) <<
-                                (inode->i_blkbits - 9),
-                                1 << inode->i_blkbits);
+                err = sb_issue_zeroout(inode->i_sb,
+                                le32_to_cpu(chain[depth-1].key), count,
+                                GFP_NOFS);
                 if (err) {
                         mutex_unlock(&ei->truncate_mutex);
                         goto cleanup;
                 }
-        }
+        } else
+                set_buffer_new(bh_result);
 
         ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
         mutex_unlock(&ei->truncate_mutex);
-        set_buffer_new(bh_result);
 got_it:
         map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
         if (count > blocks_to_boundary)
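sb_issue_zeroout() hides the block-to-sector arithmetic that the old dax_clear_sectors() call spelled out by hand. To a close approximation (this is the stock helper from include/linux/blkdev.h, quoted from memory), it expands to:

    static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
                    sector_t nr_blocks, gfp_t gfp_mask)
    {
            /* one fs block = 2^(s_blocksize_bits - 9) 512-byte sectors */
            return blkdev_issue_zeroout(sb->s_bdev,
                                        block << (sb->s_blocksize_bits - 9),
                                        nr_blocks << (sb->s_blocksize_bits - 9),
                                        gfp_mask, true);
    }

Note that the new call also zeroes all count blocks of the allocation rather than the single block the old code cleared.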
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index b78caf25f746..1d9379568aa8 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -922,16 +922,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
         blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
 
         if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
-                if (blocksize != PAGE_SIZE) {
-                        ext2_msg(sb, KERN_ERR,
-                                "error: unsupported blocksize for dax");
+                err = bdev_dax_supported(sb, blocksize);
+                if (err)
                         goto failed_mount;
-                }
-                if (!sb->s_bdev->bd_disk->fops->direct_access) {
-                        ext2_msg(sb, KERN_ERR,
-                                "error: device does not support dax");
-                        goto failed_mount;
-                }
         }
 
         /* If the blocksize doesn't match, re-read the thing.. */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d478110c32a6..df44c877892a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -202,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
         if (IS_ERR(handle))
                 result = VM_FAULT_SIGBUS;
         else
-                result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL);
+                result = __dax_fault(vma, vmf, ext4_dax_get_block);
 
         if (write) {
                 if (!IS_ERR(handle))
@@ -238,7 +238,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
                 result = VM_FAULT_SIGBUS;
         else
                 result = __dax_pmd_fault(vma, addr, pmd, flags,
-                                         ext4_dax_get_block, NULL);
+                                         ext4_dax_get_block);
 
         if (write) {
                 if (!IS_ERR(handle))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 20c5d52253b4..3822a5aedc61 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3417,16 +3417,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         }
 
         if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
-                if (blocksize != PAGE_SIZE) {
-                        ext4_msg(sb, KERN_ERR,
-                                "error: unsupported blocksize for dax");
-                        goto failed_mount;
-                }
-                if (!sb->s_bdev->bd_disk->fops->direct_access) {
-                        ext4_msg(sb, KERN_ERR,
-                                "error: device does not support dax");
+                err = bdev_dax_supported(sb, blocksize);
+                if (err)
                         goto failed_mount;
-                }
         }
 
         if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 613ea2d7ac19..586bb64e674b 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -72,18 +72,11 @@ xfs_zero_extent(
         struct xfs_mount        *mp = ip->i_mount;
         xfs_daddr_t             sector = xfs_fsb_to_db(ip, start_fsb);
         sector_t                block = XFS_BB_TO_FSBT(mp, sector);
-        ssize_t                 size = XFS_FSB_TO_B(mp, count_fsb);
-
-        if (IS_DAX(VFS_I(ip)))
-                return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
-                                sector, size);
-
-        /*
-         * let the block layer decide on the fastest method of
-         * implementing the zeroing.
-         */
-        return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
 
+        return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
+                block << (mp->m_super->s_blocksize_bits - 9),
+                count_fsb << (mp->m_super->s_blocksize_bits - 9),
+                GFP_NOFS, true);
 }
 
 /*
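The shifts convert filesystem blocks to 512-byte sectors inline. A quick worked example with made-up numbers, assuming 4 KiB blocks (s_blocksize_bits == 12, so a shift of 3):

    /* zero 16 FSBs starting at FSB 100 */
    blkdev_issue_zeroout(bdev, 100 << 3, 16 << 3, GFP_NOFS, true);
    /* == blkdev_issue_zeroout(bdev, 800, 128, GFP_NOFS, true) */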
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 44af22897c8b..47fc63295422 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1551,7 +1551,7 @@ xfs_filemap_page_mkwrite(
         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
         if (IS_DAX(inode)) {
-                ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+                ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
         } else {
                 ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
                 ret = block_page_mkwrite_return(ret);
@@ -1585,7 +1585,7 @@ xfs_filemap_fault(
                  * changes to xfs_get_blocks_direct() to map unwritten extent
                  * ioend for conversion on read-only mappings.
                  */
-                ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+                ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
         } else
                 ret = filemap_fault(vma, vmf);
         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1622,8 +1622,7 @@ xfs_filemap_pmd_fault(
         }
 
         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-        ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
-                              NULL);
+        ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
         if (flags & FAULT_FLAG_WRITE)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 416421d7ff10..11ea5d51db56 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1555,14 +1555,12 @@ xfs_fs_fill_super(
 
         if (mp->m_flags & XFS_MOUNT_DAX) {
                 xfs_warn(mp,
                 "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
-                if (sb->s_blocksize != PAGE_SIZE) {
-                        xfs_alert(mp,
-                "Filesystem block size invalid for DAX Turning DAX off.");
-                        mp->m_flags &= ~XFS_MOUNT_DAX;
-                } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+
+                error = bdev_dax_supported(sb, sb->s_blocksize);
+                if (error) {
                         xfs_alert(mp,
-                "Block device does not support DAX Turning DAX off.");
+                "DAX unsupported by block device. Turning off DAX.");
                         mp->m_flags &= ~XFS_MOUNT_DAX;
                 }
         }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1fd8fdff2f81..3d9cf326574f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -768,6 +768,17 @@ static inline void rq_flush_dcache_pages(struct request *rq)
 }
 #endif
 
+#ifdef CONFIG_PRINTK
+#define vfs_msg(sb, level, fmt, ...)                            \
+        __vfs_msg(sb, level, fmt, ##__VA_ARGS__)
+#else
+#define vfs_msg(sb, level, fmt, ...)                            \
+do {                                                            \
+        no_printk(fmt, ##__VA_ARGS__);                          \
+        __vfs_msg(sb, "", " ");                                 \
+} while (0)
+#endif
+
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 extern blk_qc_t generic_make_request(struct bio *bio);
@@ -1660,7 +1671,7 @@ struct block_device_operations {
         int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
         int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
         long (*direct_access)(struct block_device *, sector_t, void __pmem **,
-                        pfn_t *);
+                        pfn_t *, long);
         unsigned int (*check_events) (struct gendisk *disk,
                                       unsigned int clearing);
         /* ->media_changed() is DEPRECATED, use ->check_events() instead */
@@ -1680,6 +1691,8 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
                                                 struct writeback_control *);
 extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
+extern int bdev_dax_supported(struct super_block *, int);
+extern bool bdev_dax_capable(struct block_device *);
 #else /* CONFIG_BLOCK */
 
 struct block_device;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 982a6c4a62f3..7743e51f826c 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -7,41 +7,44 @@
 
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
                 get_block_t, dio_iodone_t, int flags);
-int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
-                dax_iodone_t);
-int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
-                dax_iodone_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 
 #ifdef CONFIG_FS_DAX
 struct page *read_dax_sector(struct block_device *bdev, sector_t n);
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+                unsigned int offset, unsigned int length);
 #else
 static inline struct page *read_dax_sector(struct block_device *bdev,
                 sector_t n)
 {
         return ERR_PTR(-ENXIO);
 }
+static inline int __dax_zero_page_range(struct block_device *bdev,
+                sector_t sector, unsigned int offset, unsigned int length)
+{
+        return -ENXIO;
+}
 #endif
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
-                unsigned int flags, get_block_t, dax_iodone_t);
+                unsigned int flags, get_block_t);
 int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
-                unsigned int flags, get_block_t, dax_iodone_t);
+                unsigned int flags, get_block_t);
 #else
 static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-                pmd_t *pmd, unsigned int flags, get_block_t gb,
-                dax_iodone_t di)
+                pmd_t *pmd, unsigned int flags, get_block_t gb)
 {
         return VM_FAULT_FALLBACK;
 }
 #define __dax_pmd_fault dax_pmd_fault
 #endif
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb, iod)          dax_fault(vma, vmf, gb, iod)
-#define __dax_mkwrite(vma, vmf, gb, iod)        __dax_fault(vma, vmf, gb, iod)
+#define dax_mkwrite(vma, vmf, gb)       dax_fault(vma, vmf, gb)
+#define __dax_mkwrite(vma, vmf, gb)     __dax_fault(vma, vmf, gb)
 
 static inline bool vma_is_dax(struct vm_area_struct *vma)
 {
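With dax_iodone_t gone, wiring up a DAX fault handler is a plain three-argument call. A minimal, hypothetical handler under the new signatures (the myfs_* names are invented, and locking is elided; see ext2_dax_fault in this series for a complete example with dax_sem):

    static int myfs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
    {
            /* the filesystem's get_block_t must no longer hand unwritten
             * or new buffers to DAX; zeroing happens at allocation instead */
            return dax_fault(vma, vmf, myfs_get_block);
    }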
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5f61431d8673..9ace7f745bcd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -74,7 +74,6 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                         struct buffer_head *bh_result, int create);
 typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
                         ssize_t bytes, void *private);
-typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
 
 #define MAY_EXEC                0x00000001
 #define MAY_WRITE               0x00000002