aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2016-05-26 22:34:26 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2016-05-26 22:34:26 -0400
commit: 315227f6da389f3a560f27f7777080857278e1b4 (patch)
tree: 11306e1e8d8b66044ab48901b90141b5362c12e3 /fs
parent: a10c38a4f385f5d7c173a263ff6bb2d36021b3bb (diff)
parent: 40543f62cbdce42633e3fe10923099feee272e1f (diff)
Merge tag 'dax-misc-for-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull misc DAX updates from Vishal Verma:
 "DAX error handling for 4.7

   - Until now, dax has been disabled if media errors were found on any
     device. This enables the use of DAX in the presence of these errors
     by making all sector-aligned zeroing go through the driver.

   - The driver (already) has the ability to clear errors on writes that
     are sent through the block layer using 'DSMs' defined in ACPI 6.1.

  Other misc changes:

   - When mounting DAX filesystems, check to make sure the partition is
     page aligned. This is a requirement for DAX, and previously, we
     allowed such unaligned mounts to succeed, but subsequent
     reads/writes would fail.

   - Misc/cleanup fixes from Jan that remove unused code from DAX
     related to zeroing, writeback, and some size checks"

* tag 'dax-misc-for-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  dax: fix a comment in dax_zero_page_range and dax_truncate_page
  dax: for truncate/hole-punch, do zeroing through the driver if possible
  dax: export a low-level __dax_zero_page_range helper
  dax: use sb_issue_zerout instead of calling dax_clear_sectors
  dax: enable dax in the presence of known media errors (badblocks)
  dax: fallback from pmd to pte on error
  block: Update blkdev_dax_capable() for consistency
  xfs: Add alignment check for DAX mount
  ext2: Add alignment check for DAX mount
  ext4: Add alignment check for DAX mount
  block: Add bdev_dax_supported() for dax mount checks
  block: Add vfs_msg() interface
  dax: Remove redundant inode size checks
  dax: Remove pointless writeback from dax_do_io()
  dax: Remove zeroing from dax_io()
  dax: Remove dead zeroing code from fault handlers
  ext2: Avoid DAX zeroing to corrupt data
  ext2: Fix block zeroing in ext2_get_blocks() for DAX
  dax: Remove complete_unwritten argument
  DAX: move RADIX_DAX_ definitions to dax.c
Diffstat (limited to 'fs')
-rw-r--r-- fs/block_dev.c 114
-rw-r--r-- fs/dax.c 257
-rw-r--r-- fs/ext2/file.c 4
-rw-r--r-- fs/ext2/inode.c 12
-rw-r--r-- fs/ext2/super.c 11
-rw-r--r-- fs/ext4/file.c 4
-rw-r--r-- fs/ext4/super.c 11
-rw-r--r-- fs/xfs/xfs_bmap_util.c 15
-rw-r--r-- fs/xfs/xfs_file.c 7
-rw-r--r-- fs/xfs/xfs_super.c 12
10 files changed, 173 insertions, 274 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1089dbf25925..71ccab1d22c6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -51,6 +51,18 @@ struct block_device *I_BDEV(struct inode *inode)
51} 51}
52EXPORT_SYMBOL(I_BDEV); 52EXPORT_SYMBOL(I_BDEV);
53 53
54void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
55{
56 struct va_format vaf;
57 va_list args;
58
59 va_start(args, fmt);
60 vaf.fmt = fmt;
61 vaf.va = &args;
62 printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
63 va_end(args);
64}
65
54static void bdev_write_inode(struct block_device *bdev) 66static void bdev_write_inode(struct block_device *bdev)
55{ 67{
56 struct inode *inode = bdev->bd_inode; 68 struct inode *inode = bdev->bd_inode;
@@ -489,7 +501,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
489 sector += get_start_sect(bdev); 501 sector += get_start_sect(bdev);
490 if (sector % (PAGE_SIZE / 512)) 502 if (sector % (PAGE_SIZE / 512))
491 return -EINVAL; 503 return -EINVAL;
492 avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn); 504 avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
493 if (!avail) 505 if (!avail)
494 return -ERANGE; 506 return -ERANGE;
495 if (avail > 0 && avail & ~PAGE_MASK) 507 if (avail > 0 && avail & ~PAGE_MASK)
@@ -498,6 +510,75 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
498} 510}
499EXPORT_SYMBOL_GPL(bdev_direct_access); 511EXPORT_SYMBOL_GPL(bdev_direct_access);
500 512
513/**
514 * bdev_dax_supported() - Check if the device supports dax for filesystem
515 * @sb: The superblock of the device
516 * @blocksize: The block size of the device
517 *
518 * This is a library function for filesystems to check if the block device
519 * can be mounted with dax option.
520 *
521 * Return: negative errno if unsupported, 0 if supported.
522 */
523int bdev_dax_supported(struct super_block *sb, int blocksize)
524{
525 struct blk_dax_ctl dax = {
526 .sector = 0,
527 .size = PAGE_SIZE,
528 };
529 int err;
530
531 if (blocksize != PAGE_SIZE) {
532 vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
533 return -EINVAL;
534 }
535
536 err = bdev_direct_access(sb->s_bdev, &dax);
537 if (err < 0) {
538 switch (err) {
539 case -EOPNOTSUPP:
540 vfs_msg(sb, KERN_ERR,
541 "error: device does not support dax");
542 break;
543 case -EINVAL:
544 vfs_msg(sb, KERN_ERR,
545 "error: unaligned partition for dax");
546 break;
547 default:
548 vfs_msg(sb, KERN_ERR,
549 "error: dax access failed (%d)", err);
550 }
551 return err;
552 }
553
554 return 0;
555}
556EXPORT_SYMBOL_GPL(bdev_dax_supported);
557
558/**
559 * bdev_dax_capable() - Return if the raw device is capable for dax
560 * @bdev: The device for raw block device access
561 */
562bool bdev_dax_capable(struct block_device *bdev)
563{
564 struct blk_dax_ctl dax = {
565 .size = PAGE_SIZE,
566 };
567
568 if (!IS_ENABLED(CONFIG_FS_DAX))
569 return false;
570
571 dax.sector = 0;
572 if (bdev_direct_access(bdev, &dax) < 0)
573 return false;
574
575 dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
576 if (bdev_direct_access(bdev, &dax) < 0)
577 return false;
578
579 return true;
580}
581
501/* 582/*
502 * pseudo-fs 583 * pseudo-fs
503 */ 584 */
@@ -1160,33 +1241,6 @@ void bd_set_size(struct block_device *bdev, loff_t size)
1160} 1241}
1161EXPORT_SYMBOL(bd_set_size); 1242EXPORT_SYMBOL(bd_set_size);
1162 1243
1163static bool blkdev_dax_capable(struct block_device *bdev)
1164{
1165 struct gendisk *disk = bdev->bd_disk;
1166
1167 if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX))
1168 return false;
1169
1170 /*
1171 * If the partition is not aligned on a page boundary, we can't
1172 * do dax I/O to it.
1173 */
1174 if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
1175 || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
1176 return false;
1177
1178 /*
1179 * If the device has known bad blocks, force all I/O through the
1180 * driver / page cache.
1181 *
1182 * TODO: support finer grained dax error handling
1183 */
1184 if (disk->bb && disk->bb->count)
1185 return false;
1186
1187 return true;
1188}
1189
1190static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); 1244static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
1191 1245
1192/* 1246/*
@@ -1266,7 +1320,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1266 1320
1267 if (!ret) { 1321 if (!ret) {
1268 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1322 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1269 if (!blkdev_dax_capable(bdev)) 1323 if (!bdev_dax_capable(bdev))
1270 bdev->bd_inode->i_flags &= ~S_DAX; 1324 bdev->bd_inode->i_flags &= ~S_DAX;
1271 } 1325 }
1272 1326
@@ -1303,7 +1357,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1303 goto out_clear; 1357 goto out_clear;
1304 } 1358 }
1305 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1359 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1306 if (!blkdev_dax_capable(bdev)) 1360 if (!bdev_dax_capable(bdev))
1307 bdev->bd_inode->i_flags &= ~S_DAX; 1361 bdev->bd_inode->i_flags &= ~S_DAX;
1308 } 1362 }
1309 } else { 1363 } else {
diff --git a/fs/dax.c b/fs/dax.c
index 7d9df93b3a14..5a282260d27e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -87,50 +87,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
87 return page; 87 return page;
88} 88}
89 89
90/*
91 * dax_clear_sectors() is called from within transaction context from XFS,
92 * and hence this means the stack from this point must follow GFP_NOFS
93 * semantics for all operations.
94 */
95int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
96{
97 struct blk_dax_ctl dax = {
98 .sector = _sector,
99 .size = _size,
100 };
101
102 might_sleep();
103 do {
104 long count, sz;
105
106 count = dax_map_atomic(bdev, &dax);
107 if (count < 0)
108 return count;
109 sz = min_t(long, count, SZ_128K);
110 clear_pmem(dax.addr, sz);
111 dax.size -= sz;
112 dax.sector += sz / 512;
113 dax_unmap_atomic(bdev, &dax);
114 cond_resched();
115 } while (dax.size);
116
117 wmb_pmem();
118 return 0;
119}
120EXPORT_SYMBOL_GPL(dax_clear_sectors);
121
122/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
123static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
124 loff_t pos, loff_t end)
125{
126 loff_t final = end - pos + first; /* The final byte of the buffer */
127
128 if (first > 0)
129 clear_pmem(addr, first);
130 if (final < size)
131 clear_pmem(addr + final, size - final);
132}
133
134static bool buffer_written(struct buffer_head *bh) 90static bool buffer_written(struct buffer_head *bh)
135{ 91{
136 return buffer_mapped(bh) && !buffer_unwritten(bh); 92 return buffer_mapped(bh) && !buffer_unwritten(bh);
@@ -169,6 +125,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
169 struct blk_dax_ctl dax = { 125 struct blk_dax_ctl dax = {
170 .addr = (void __pmem *) ERR_PTR(-EIO), 126 .addr = (void __pmem *) ERR_PTR(-EIO),
171 }; 127 };
128 unsigned blkbits = inode->i_blkbits;
129 sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
130 >> blkbits;
172 131
173 if (rw == READ) 132 if (rw == READ)
174 end = min(end, i_size_read(inode)); 133 end = min(end, i_size_read(inode));
@@ -176,7 +135,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
176 while (pos < end) { 135 while (pos < end) {
177 size_t len; 136 size_t len;
178 if (pos == max) { 137 if (pos == max) {
179 unsigned blkbits = inode->i_blkbits;
180 long page = pos >> PAGE_SHIFT; 138 long page = pos >> PAGE_SHIFT;
181 sector_t block = page << (PAGE_SHIFT - blkbits); 139 sector_t block = page << (PAGE_SHIFT - blkbits);
182 unsigned first = pos - (block << blkbits); 140 unsigned first = pos - (block << blkbits);
@@ -192,6 +150,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
192 bh->b_size = 1 << blkbits; 150 bh->b_size = 1 << blkbits;
193 bh_max = pos - first + bh->b_size; 151 bh_max = pos - first + bh->b_size;
194 bdev = bh->b_bdev; 152 bdev = bh->b_bdev;
153 /*
154 * We allow uninitialized buffers for writes
155 * beyond EOF as those cannot race with faults
156 */
157 WARN_ON_ONCE(
158 (buffer_new(bh) && block < file_blks) ||
159 (rw == WRITE && buffer_unwritten(bh)));
195 } else { 160 } else {
196 unsigned done = bh->b_size - 161 unsigned done = bh->b_size -
197 (bh_max - (pos - first)); 162 (bh_max - (pos - first));
@@ -211,11 +176,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
211 rc = map_len; 176 rc = map_len;
212 break; 177 break;
213 } 178 }
214 if (buffer_unwritten(bh) || buffer_new(bh)) {
215 dax_new_buf(dax.addr, map_len, first,
216 pos, end);
217 need_wmb = true;
218 }
219 dax.addr += first; 179 dax.addr += first;
220 size = map_len - first; 180 size = map_len - first;
221 } 181 }
@@ -276,15 +236,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
276 memset(&bh, 0, sizeof(bh)); 236 memset(&bh, 0, sizeof(bh));
277 bh.b_bdev = inode->i_sb->s_bdev; 237 bh.b_bdev = inode->i_sb->s_bdev;
278 238
279 if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { 239 if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
280 struct address_space *mapping = inode->i_mapping;
281 inode_lock(inode); 240 inode_lock(inode);
282 retval = filemap_write_and_wait_range(mapping, pos, end - 1);
283 if (retval) {
284 inode_unlock(inode);
285 goto out;
286 }
287 }
288 241
289 /* Protects against truncate */ 242 /* Protects against truncate */
290 if (!(flags & DIO_SKIP_DIO_COUNT)) 243 if (!(flags & DIO_SKIP_DIO_COUNT))
@@ -305,7 +258,6 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
305 258
306 if (!(flags & DIO_SKIP_DIO_COUNT)) 259 if (!(flags & DIO_SKIP_DIO_COUNT))
307 inode_dio_end(inode); 260 inode_dio_end(inode);
308 out:
309 return retval; 261 return retval;
310} 262}
311EXPORT_SYMBOL_GPL(dax_do_io); 263EXPORT_SYMBOL_GPL(dax_do_io);
@@ -321,20 +273,11 @@ EXPORT_SYMBOL_GPL(dax_do_io);
321static int dax_load_hole(struct address_space *mapping, struct page *page, 273static int dax_load_hole(struct address_space *mapping, struct page *page,
322 struct vm_fault *vmf) 274 struct vm_fault *vmf)
323{ 275{
324 unsigned long size;
325 struct inode *inode = mapping->host;
326 if (!page) 276 if (!page)
327 page = find_or_create_page(mapping, vmf->pgoff, 277 page = find_or_create_page(mapping, vmf->pgoff,
328 GFP_KERNEL | __GFP_ZERO); 278 GFP_KERNEL | __GFP_ZERO);
329 if (!page) 279 if (!page)
330 return VM_FAULT_OOM; 280 return VM_FAULT_OOM;
331 /* Recheck i_size under page lock to avoid truncate race */
332 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
333 if (vmf->pgoff >= size) {
334 unlock_page(page);
335 put_page(page);
336 return VM_FAULT_SIGBUS;
337 }
338 281
339 vmf->page = page; 282 vmf->page = page;
340 return VM_FAULT_LOCKED; 283 return VM_FAULT_LOCKED;
@@ -565,33 +508,14 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
565 .sector = to_sector(bh, inode), 508 .sector = to_sector(bh, inode),
566 .size = bh->b_size, 509 .size = bh->b_size,
567 }; 510 };
568 pgoff_t size;
569 int error; 511 int error;
570 512
571 i_mmap_lock_read(mapping); 513 i_mmap_lock_read(mapping);
572 514
573 /*
574 * Check truncate didn't happen while we were allocating a block.
575 * If it did, this block may or may not be still allocated to the
576 * file. We can't tell the filesystem to free it because we can't
577 * take i_mutex here. In the worst case, the file still has blocks
578 * allocated past the end of the file.
579 */
580 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
581 if (unlikely(vmf->pgoff >= size)) {
582 error = -EIO;
583 goto out;
584 }
585
586 if (dax_map_atomic(bdev, &dax) < 0) { 515 if (dax_map_atomic(bdev, &dax) < 0) {
587 error = PTR_ERR(dax.addr); 516 error = PTR_ERR(dax.addr);
588 goto out; 517 goto out;
589 } 518 }
590
591 if (buffer_unwritten(bh) || buffer_new(bh)) {
592 clear_pmem(dax.addr, PAGE_SIZE);
593 wmb_pmem();
594 }
595 dax_unmap_atomic(bdev, &dax); 519 dax_unmap_atomic(bdev, &dax);
596 520
597 error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, 521 error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
@@ -612,19 +536,13 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
612 * @vma: The virtual memory area where the fault occurred 536 * @vma: The virtual memory area where the fault occurred
613 * @vmf: The description of the fault 537 * @vmf: The description of the fault
614 * @get_block: The filesystem method used to translate file offsets to blocks 538 * @get_block: The filesystem method used to translate file offsets to blocks
615 * @complete_unwritten: The filesystem method used to convert unwritten blocks
616 * to written so the data written to them is exposed. This is required for
617 * required by write faults for filesystems that will return unwritten
618 * extent mappings from @get_block, but it is optional for reads as
619 * dax_insert_mapping() will always zero unwritten blocks. If the fs does
620 * not support unwritten extents, the it should pass NULL.
621 * 539 *
622 * When a page fault occurs, filesystems may call this helper in their 540 * When a page fault occurs, filesystems may call this helper in their
623 * fault handler for DAX files. __dax_fault() assumes the caller has done all 541 * fault handler for DAX files. __dax_fault() assumes the caller has done all
624 * the necessary locking for the page fault to proceed successfully. 542 * the necessary locking for the page fault to proceed successfully.
625 */ 543 */
626int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 544int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
627 get_block_t get_block, dax_iodone_t complete_unwritten) 545 get_block_t get_block)
628{ 546{
629 struct file *file = vma->vm_file; 547 struct file *file = vma->vm_file;
630 struct address_space *mapping = file->f_mapping; 548 struct address_space *mapping = file->f_mapping;
@@ -659,15 +577,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
659 put_page(page); 577 put_page(page);
660 goto repeat; 578 goto repeat;
661 } 579 }
662 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
663 if (unlikely(vmf->pgoff >= size)) {
664 /*
665 * We have a struct page covering a hole in the file
666 * from a read fault and we've raced with a truncate
667 */
668 error = -EIO;
669 goto unlock_page;
670 }
671 } 580 }
672 581
673 error = get_block(inode, block, &bh, 0); 582 error = get_block(inode, block, &bh, 0);
@@ -700,17 +609,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
700 if (error) 609 if (error)
701 goto unlock_page; 610 goto unlock_page;
702 vmf->page = page; 611 vmf->page = page;
703 if (!page) { 612 if (!page)
704 i_mmap_lock_read(mapping); 613 i_mmap_lock_read(mapping);
705 /* Check we didn't race with truncate */
706 size = (i_size_read(inode) + PAGE_SIZE - 1) >>
707 PAGE_SHIFT;
708 if (vmf->pgoff >= size) {
709 i_mmap_unlock_read(mapping);
710 error = -EIO;
711 goto out;
712 }
713 }
714 return VM_FAULT_LOCKED; 614 return VM_FAULT_LOCKED;
715 } 615 }
716 616
@@ -727,23 +627,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
727 page = NULL; 627 page = NULL;
728 } 628 }
729 629
730 /* 630 /* Filesystem should not return unwritten buffers to us! */
731 * If we successfully insert the new mapping over an unwritten extent, 631 WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
732 * we need to ensure we convert the unwritten extent. If there is an
733 * error inserting the mapping, the filesystem needs to leave it as
734 * unwritten to prevent exposure of the stale underlying data to
735 * userspace, but we still need to call the completion function so
736 * the private resources on the mapping buffer can be released. We
737 * indicate what the callback should do via the uptodate variable, same
738 * as for normal BH based IO completions.
739 */
740 error = dax_insert_mapping(inode, &bh, vma, vmf); 632 error = dax_insert_mapping(inode, &bh, vma, vmf);
741 if (buffer_unwritten(&bh)) {
742 if (complete_unwritten)
743 complete_unwritten(&bh, !error);
744 else
745 WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
746 }
747 633
748 out: 634 out:
749 if (error == -ENOMEM) 635 if (error == -ENOMEM)
@@ -772,7 +658,7 @@ EXPORT_SYMBOL(__dax_fault);
772 * fault handler for DAX files. 658 * fault handler for DAX files.
773 */ 659 */
774int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 660int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
775 get_block_t get_block, dax_iodone_t complete_unwritten) 661 get_block_t get_block)
776{ 662{
777 int result; 663 int result;
778 struct super_block *sb = file_inode(vma->vm_file)->i_sb; 664 struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -781,7 +667,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
781 sb_start_pagefault(sb); 667 sb_start_pagefault(sb);
782 file_update_time(vma->vm_file); 668 file_update_time(vma->vm_file);
783 } 669 }
784 result = __dax_fault(vma, vmf, get_block, complete_unwritten); 670 result = __dax_fault(vma, vmf, get_block);
785 if (vmf->flags & FAULT_FLAG_WRITE) 671 if (vmf->flags & FAULT_FLAG_WRITE)
786 sb_end_pagefault(sb); 672 sb_end_pagefault(sb);
787 673
@@ -815,8 +701,7 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address,
815#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") 701#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
816 702
817int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, 703int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
818 pmd_t *pmd, unsigned int flags, get_block_t get_block, 704 pmd_t *pmd, unsigned int flags, get_block_t get_block)
819 dax_iodone_t complete_unwritten)
820{ 705{
821 struct file *file = vma->vm_file; 706 struct file *file = vma->vm_file;
822 struct address_space *mapping = file->f_mapping; 707 struct address_space *mapping = file->f_mapping;
@@ -875,6 +760,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
875 if (get_block(inode, block, &bh, 1) != 0) 760 if (get_block(inode, block, &bh, 1) != 0)
876 return VM_FAULT_SIGBUS; 761 return VM_FAULT_SIGBUS;
877 alloc = true; 762 alloc = true;
763 WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
878 } 764 }
879 765
880 bdev = bh.b_bdev; 766 bdev = bh.b_bdev;
@@ -902,23 +788,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
902 788
903 i_mmap_lock_read(mapping); 789 i_mmap_lock_read(mapping);
904 790
905 /*
906 * If a truncate happened while we were allocating blocks, we may
907 * leave blocks allocated to the file that are beyond EOF. We can't
908 * take i_mutex here, so just leave them hanging; they'll be freed
909 * when the file is deleted.
910 */
911 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
912 if (pgoff >= size) {
913 result = VM_FAULT_SIGBUS;
914 goto out;
915 }
916 if ((pgoff | PG_PMD_COLOUR) >= size) {
917 dax_pmd_dbg(&bh, address,
918 "offset + huge page size > file size");
919 goto fallback;
920 }
921
922 if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { 791 if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
923 spinlock_t *ptl; 792 spinlock_t *ptl;
924 pmd_t entry; 793 pmd_t entry;
@@ -954,8 +823,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
954 long length = dax_map_atomic(bdev, &dax); 823 long length = dax_map_atomic(bdev, &dax);
955 824
956 if (length < 0) { 825 if (length < 0) {
957 result = VM_FAULT_SIGBUS; 826 dax_pmd_dbg(&bh, address, "dax-error fallback");
958 goto out; 827 goto fallback;
959 } 828 }
960 if (length < PMD_SIZE) { 829 if (length < PMD_SIZE) {
961 dax_pmd_dbg(&bh, address, "dax-length too small"); 830 dax_pmd_dbg(&bh, address, "dax-length too small");
@@ -973,14 +842,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
973 dax_pmd_dbg(&bh, address, "pfn not in memmap"); 842 dax_pmd_dbg(&bh, address, "pfn not in memmap");
974 goto fallback; 843 goto fallback;
975 } 844 }
976
977 if (buffer_unwritten(&bh) || buffer_new(&bh)) {
978 clear_pmem(dax.addr, PMD_SIZE);
979 wmb_pmem();
980 count_vm_event(PGMAJFAULT);
981 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
982 result |= VM_FAULT_MAJOR;
983 }
984 dax_unmap_atomic(bdev, &dax); 845 dax_unmap_atomic(bdev, &dax);
985 846
986 /* 847 /*
@@ -1020,9 +881,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
1020 out: 881 out:
1021 i_mmap_unlock_read(mapping); 882 i_mmap_unlock_read(mapping);
1022 883
1023 if (buffer_unwritten(&bh))
1024 complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
1025
1026 return result; 884 return result;
1027 885
1028 fallback: 886 fallback:
@@ -1042,8 +900,7 @@ EXPORT_SYMBOL_GPL(__dax_pmd_fault);
1042 * pmd_fault handler for DAX files. 900 * pmd_fault handler for DAX files.
1043 */ 901 */
1044int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, 902int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
1045 pmd_t *pmd, unsigned int flags, get_block_t get_block, 903 pmd_t *pmd, unsigned int flags, get_block_t get_block)
1046 dax_iodone_t complete_unwritten)
1047{ 904{
1048 int result; 905 int result;
1049 struct super_block *sb = file_inode(vma->vm_file)->i_sb; 906 struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -1052,8 +909,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
1052 sb_start_pagefault(sb); 909 sb_start_pagefault(sb);
1053 file_update_time(vma->vm_file); 910 file_update_time(vma->vm_file);
1054 } 911 }
1055 result = __dax_pmd_fault(vma, address, pmd, flags, get_block, 912 result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
1056 complete_unwritten);
1057 if (flags & FAULT_FLAG_WRITE) 913 if (flags & FAULT_FLAG_WRITE)
1058 sb_end_pagefault(sb); 914 sb_end_pagefault(sb);
1059 915
@@ -1091,6 +947,43 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1091} 947}
1092EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); 948EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
1093 949
950static bool dax_range_is_aligned(struct block_device *bdev,
951 unsigned int offset, unsigned int length)
952{
953 unsigned short sector_size = bdev_logical_block_size(bdev);
954
955 if (!IS_ALIGNED(offset, sector_size))
956 return false;
957 if (!IS_ALIGNED(length, sector_size))
958 return false;
959
960 return true;
961}
962
963int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
964 unsigned int offset, unsigned int length)
965{
966 struct blk_dax_ctl dax = {
967 .sector = sector,
968 .size = PAGE_SIZE,
969 };
970
971 if (dax_range_is_aligned(bdev, offset, length)) {
972 sector_t start_sector = dax.sector + (offset >> 9);
973
974 return blkdev_issue_zeroout(bdev, start_sector,
975 length >> 9, GFP_NOFS, true);
976 } else {
977 if (dax_map_atomic(bdev, &dax) < 0)
978 return PTR_ERR(dax.addr);
979 clear_pmem(dax.addr + offset, length);
980 wmb_pmem();
981 dax_unmap_atomic(bdev, &dax);
982 }
983 return 0;
984}
985EXPORT_SYMBOL_GPL(__dax_zero_page_range);
986
1094/** 987/**
1095 * dax_zero_page_range - zero a range within a page of a DAX file 988 * dax_zero_page_range - zero a range within a page of a DAX file
1096 * @inode: The file being truncated 989 * @inode: The file being truncated
@@ -1102,12 +995,6 @@ EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
1102 * page in a DAX file. This is intended for hole-punch operations. If 995 * page in a DAX file. This is intended for hole-punch operations. If
1103 * you are truncating a file, the helper function dax_truncate_page() may be 996 * you are truncating a file, the helper function dax_truncate_page() may be
1104 * more convenient. 997 * more convenient.
1105 *
1106 * We work in terms of PAGE_SIZE here for commonality with
1107 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
1108 * took care of disposing of the unnecessary blocks. Even if the filesystem
1109 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
1110 * since the file might be mmapped.
1111 */ 998 */
1112int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, 999int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
1113 get_block_t get_block) 1000 get_block_t get_block)
@@ -1126,23 +1013,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
1126 bh.b_bdev = inode->i_sb->s_bdev; 1013 bh.b_bdev = inode->i_sb->s_bdev;
1127 bh.b_size = PAGE_SIZE; 1014 bh.b_size = PAGE_SIZE;
1128 err = get_block(inode, index, &bh, 0); 1015 err = get_block(inode, index, &bh, 0);
1129 if (err < 0) 1016 if (err < 0 || !buffer_written(&bh))
1130 return err; 1017 return err;
1131 if (buffer_written(&bh)) {
1132 struct block_device *bdev = bh.b_bdev;
1133 struct blk_dax_ctl dax = {
1134 .sector = to_sector(&bh, inode),
1135 .size = PAGE_SIZE,
1136 };
1137
1138 if (dax_map_atomic(bdev, &dax) < 0)
1139 return PTR_ERR(dax.addr);
1140 clear_pmem(dax.addr + offset, length);
1141 wmb_pmem();
1142 dax_unmap_atomic(bdev, &dax);
1143 }
1144 1018
1145 return 0; 1019 return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
1020 offset, length);
1146} 1021}
1147EXPORT_SYMBOL_GPL(dax_zero_page_range); 1022EXPORT_SYMBOL_GPL(dax_zero_page_range);
1148 1023
@@ -1154,12 +1029,6 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range);
1154 * 1029 *
1155 * Similar to block_truncate_page(), this function can be called by a 1030 * Similar to block_truncate_page(), this function can be called by a
1156 * filesystem when it is truncating a DAX file to handle the partial page. 1031 * filesystem when it is truncating a DAX file to handle the partial page.
1157 *
1158 * We work in terms of PAGE_SIZE here for commonality with
1159 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
1160 * took care of disposing of the unnecessary blocks. Even if the filesystem
1161 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
1162 * since the file might be mmapped.
1163 */ 1032 */
1164int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) 1033int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
1165{ 1034{
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index c1400b109805..868c02317b05 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
51 } 51 }
52 down_read(&ei->dax_sem); 52 down_read(&ei->dax_sem);
53 53
54 ret = __dax_fault(vma, vmf, ext2_get_block, NULL); 54 ret = __dax_fault(vma, vmf, ext2_get_block);
55 55
56 up_read(&ei->dax_sem); 56 up_read(&ei->dax_sem);
57 if (vmf->flags & FAULT_FLAG_WRITE) 57 if (vmf->flags & FAULT_FLAG_WRITE)
@@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
72 } 72 }
73 down_read(&ei->dax_sem); 73 down_read(&ei->dax_sem);
74 74
75 ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL); 75 ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
76 76
77 up_read(&ei->dax_sem); 77 up_read(&ei->dax_sem);
78 if (flags & FAULT_FLAG_WRITE) 78 if (flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index b675610391b8..fcbe58641e40 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -26,6 +26,7 @@
26#include <linux/highuid.h> 26#include <linux/highuid.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/dax.h> 28#include <linux/dax.h>
29#include <linux/blkdev.h>
29#include <linux/quotaops.h> 30#include <linux/quotaops.h>
30#include <linux/writeback.h> 31#include <linux/writeback.h>
31#include <linux/buffer_head.h> 32#include <linux/buffer_head.h>
@@ -737,19 +738,18 @@ static int ext2_get_blocks(struct inode *inode,
737 * so that it's not found by another thread before it's 738 * so that it's not found by another thread before it's
738 * initialised 739 * initialised
739 */ 740 */
740 err = dax_clear_sectors(inode->i_sb->s_bdev, 741 err = sb_issue_zeroout(inode->i_sb,
741 le32_to_cpu(chain[depth-1].key) << 742 le32_to_cpu(chain[depth-1].key), count,
742 (inode->i_blkbits - 9), 743 GFP_NOFS);
743 1 << inode->i_blkbits);
744 if (err) { 744 if (err) {
745 mutex_unlock(&ei->truncate_mutex); 745 mutex_unlock(&ei->truncate_mutex);
746 goto cleanup; 746 goto cleanup;
747 } 747 }
748 } 748 } else
749 set_buffer_new(bh_result);
749 750
750 ext2_splice_branch(inode, iblock, partial, indirect_blks, count); 751 ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
751 mutex_unlock(&ei->truncate_mutex); 752 mutex_unlock(&ei->truncate_mutex);
752 set_buffer_new(bh_result);
753got_it: 753got_it:
754 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 754 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
755 if (count > blocks_to_boundary) 755 if (count > blocks_to_boundary)
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index b78caf25f746..1d9379568aa8 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -922,16 +922,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
922 blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); 922 blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
923 923
924 if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { 924 if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
925 if (blocksize != PAGE_SIZE) { 925 err = bdev_dax_supported(sb, blocksize);
926 ext2_msg(sb, KERN_ERR, 926 if (err)
927 "error: unsupported blocksize for dax");
928 goto failed_mount; 927 goto failed_mount;
929 }
930 if (!sb->s_bdev->bd_disk->fops->direct_access) {
931 ext2_msg(sb, KERN_ERR,
932 "error: device does not support dax");
933 goto failed_mount;
934 }
935 } 928 }
936 929
937 /* If the blocksize doesn't match, re-read the thing.. */ 930 /* If the blocksize doesn't match, re-read the thing.. */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d478110c32a6..df44c877892a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -202,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
202 if (IS_ERR(handle)) 202 if (IS_ERR(handle))
203 result = VM_FAULT_SIGBUS; 203 result = VM_FAULT_SIGBUS;
204 else 204 else
205 result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL); 205 result = __dax_fault(vma, vmf, ext4_dax_get_block);
206 206
207 if (write) { 207 if (write) {
208 if (!IS_ERR(handle)) 208 if (!IS_ERR(handle))
@@ -238,7 +238,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
238 result = VM_FAULT_SIGBUS; 238 result = VM_FAULT_SIGBUS;
239 else 239 else
240 result = __dax_pmd_fault(vma, addr, pmd, flags, 240 result = __dax_pmd_fault(vma, addr, pmd, flags,
241 ext4_dax_get_block, NULL); 241 ext4_dax_get_block);
242 242
243 if (write) { 243 if (write) {
244 if (!IS_ERR(handle)) 244 if (!IS_ERR(handle))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 20c5d52253b4..3822a5aedc61 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3417,16 +3417,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3417 } 3417 }
3418 3418
3419 if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { 3419 if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
3420 if (blocksize != PAGE_SIZE) { 3420 err = bdev_dax_supported(sb, blocksize);
3421 ext4_msg(sb, KERN_ERR, 3421 if (err)
3422 "error: unsupported blocksize for dax");
3423 goto failed_mount;
3424 }
3425 if (!sb->s_bdev->bd_disk->fops->direct_access) {
3426 ext4_msg(sb, KERN_ERR,
3427 "error: device does not support dax");
3428 goto failed_mount; 3422 goto failed_mount;
3429 }
3430 } 3423 }
3431 3424
3432 if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { 3425 if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 613ea2d7ac19..586bb64e674b 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -72,18 +72,11 @@ xfs_zero_extent(
72 struct xfs_mount *mp = ip->i_mount; 72 struct xfs_mount *mp = ip->i_mount;
73 xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); 73 xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
74 sector_t block = XFS_BB_TO_FSBT(mp, sector); 74 sector_t block = XFS_BB_TO_FSBT(mp, sector);
75 ssize_t size = XFS_FSB_TO_B(mp, count_fsb);
76
77 if (IS_DAX(VFS_I(ip)))
78 return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
79 sector, size);
80
81 /*
82 * let the block layer decide on the fastest method of
83 * implementing the zeroing.
84 */
85 return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
86 75
76 return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
77 block << (mp->m_super->s_blocksize_bits - 9),
78 count_fsb << (mp->m_super->s_blocksize_bits - 9),
79 GFP_NOFS, true);
87} 80}
88 81
89/* 82/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 44af22897c8b..47fc63295422 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1551,7 +1551,7 @@ xfs_filemap_page_mkwrite(
1551 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1551 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1552 1552
1553 if (IS_DAX(inode)) { 1553 if (IS_DAX(inode)) {
1554 ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL); 1554 ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
1555 } else { 1555 } else {
1556 ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); 1556 ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
1557 ret = block_page_mkwrite_return(ret); 1557 ret = block_page_mkwrite_return(ret);
@@ -1585,7 +1585,7 @@ xfs_filemap_fault(
1585 * changes to xfs_get_blocks_direct() to map unwritten extent 1585 * changes to xfs_get_blocks_direct() to map unwritten extent
1586 * ioend for conversion on read-only mappings. 1586 * ioend for conversion on read-only mappings.
1587 */ 1587 */
1588 ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL); 1588 ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
1589 } else 1589 } else
1590 ret = filemap_fault(vma, vmf); 1590 ret = filemap_fault(vma, vmf);
1591 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1591 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1622,8 +1622,7 @@ xfs_filemap_pmd_fault(
1622 } 1622 }
1623 1623
1624 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1624 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1625 ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault, 1625 ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
1626 NULL);
1627 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1626 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1628 1627
1629 if (flags & FAULT_FLAG_WRITE) 1628 if (flags & FAULT_FLAG_WRITE)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 416421d7ff10..11ea5d51db56 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1555,14 +1555,12 @@ xfs_fs_fill_super(
1555 1555
1556 if (mp->m_flags & XFS_MOUNT_DAX) { 1556 if (mp->m_flags & XFS_MOUNT_DAX) {
1557 xfs_warn(mp, 1557 xfs_warn(mp,
1558 "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); 1558 "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
1559 if (sb->s_blocksize != PAGE_SIZE) { 1559
1560 xfs_alert(mp, 1560 error = bdev_dax_supported(sb, sb->s_blocksize);
1561 "Filesystem block size invalid for DAX Turning DAX off."); 1561 if (error) {
1562 mp->m_flags &= ~XFS_MOUNT_DAX;
1563 } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
1564 xfs_alert(mp, 1562 xfs_alert(mp,
1565 "Block device does not support DAX Turning DAX off."); 1563 "DAX unsupported by block device. Turning off DAX.");
1566 mp->m_flags &= ~XFS_MOUNT_DAX; 1564 mp->m_flags &= ~XFS_MOUNT_DAX;
1567 } 1565 }
1568 } 1566 }