aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Dave Chinner <dchinner@redhat.com> 2015-06-03 19:18:18 -0400
committer Dave Chinner <david@fromorbit.com> 2015-06-03 19:18:18 -0400
commit e842f2903908934187af7232fb5b21da527d1757 (patch)
tree 5fe6e2da05d58519d0f9360cf5116662a766906a
parent ec56b1f1fdc69599963574ce94cc5693d535dd64 (diff)
dax: don't abuse get_block mapping for endio callbacks
dax_fault() currently relies on the get_block callback to attach an io completion callback to the mapping buffer head so that it can run unwritten extent conversion after zeroing allocated blocks. Instead of this hack, pass the conversion callback directly into dax_fault(), similar to the get_block callback. When the filesystem allocates unwritten extents, it will set the buffer_unwritten() flag, and hence the dax_fault code can call the completion function in the contexts where it is necessary without overloading the mapping buffer head. Note: The changes to ext4 to use this interface are suspect at best. In fact, the way ext4 did this end_io assignment in the first place looks suspect because it only set a completion callback when there wasn't already some other write() call taking place on the same inode. The ext4 end_io code looks rather intricate and fragile with all its reference counting and passing to different contexts for modification via inode private pointers that aren't protected by locks... Signed-off-by: Dave Chinner <dchinner@redhat.com> Acked-by: Jan Kara <jack@suse.cz> Signed-off-by: Dave Chinner <david@fromorbit.com>
-rw-r--r-- fs/dax.c 21
-rw-r--r-- fs/ext2/file.c 4
-rw-r--r-- fs/ext4/file.c 16
-rw-r--r-- fs/ext4/inode.c 21
-rw-r--r-- include/linux/fs.h 6
5 files changed, 42 insertions, 26 deletions
diff --git a/fs/dax.c b/fs/dax.c
index 6f65f00e58ec..4bb5b7cd5dfd 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -309,14 +309,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
309 out: 309 out:
310 i_mmap_unlock_read(mapping); 310 i_mmap_unlock_read(mapping);
311 311
312 if (bh->b_end_io)
313 bh->b_end_io(bh, 1);
314
315 return error; 312 return error;
316} 313}
317 314
318static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 315static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
319 get_block_t get_block) 316 get_block_t get_block, dax_iodone_t complete_unwritten)
320{ 317{
321 struct file *file = vma->vm_file; 318 struct file *file = vma->vm_file;
322 struct address_space *mapping = file->f_mapping; 319 struct address_space *mapping = file->f_mapping;
@@ -417,7 +414,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
417 page_cache_release(page); 414 page_cache_release(page);
418 } 415 }
419 416
417 /*
418 * If we successfully insert the new mapping over an unwritten extent,
419 * we need to ensure we convert the unwritten extent. If there is an
420 * error inserting the mapping, the filesystem needs to leave it as
421 * unwritten to prevent exposure of the stale underlying data to
422 * userspace, but we still need to call the completion function so
423 * the private resources on the mapping buffer can be released. We
424 * indicate what the callback should do via the uptodate variable, same
425 * as for normal BH based IO completions.
426 */
420 error = dax_insert_mapping(inode, &bh, vma, vmf); 427 error = dax_insert_mapping(inode, &bh, vma, vmf);
428 if (buffer_unwritten(&bh))
429 complete_unwritten(&bh, !error);
421 430
422 out: 431 out:
423 if (error == -ENOMEM) 432 if (error == -ENOMEM)
@@ -445,7 +454,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
445 * fault handler for DAX files. 454 * fault handler for DAX files.
446 */ 455 */
447int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 456int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
448 get_block_t get_block) 457 get_block_t get_block, dax_iodone_t complete_unwritten)
449{ 458{
450 int result; 459 int result;
451 struct super_block *sb = file_inode(vma->vm_file)->i_sb; 460 struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -454,7 +463,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
454 sb_start_pagefault(sb); 463 sb_start_pagefault(sb);
455 file_update_time(vma->vm_file); 464 file_update_time(vma->vm_file);
456 } 465 }
457 result = do_dax_fault(vma, vmf, get_block); 466 result = do_dax_fault(vma, vmf, get_block, complete_unwritten);
458 if (vmf->flags & FAULT_FLAG_WRITE) 467 if (vmf->flags & FAULT_FLAG_WRITE)
459 sb_end_pagefault(sb); 468 sb_end_pagefault(sb);
460 469
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 3a0a6c6406d0..3b57c9f83c9b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -28,12 +28,12 @@
28#ifdef CONFIG_FS_DAX 28#ifdef CONFIG_FS_DAX
29static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 29static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
30{ 30{
31 return dax_fault(vma, vmf, ext2_get_block); 31 return dax_fault(vma, vmf, ext2_get_block, NULL);
32} 32}
33 33
34static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 34static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
35{ 35{
36 return dax_mkwrite(vma, vmf, ext2_get_block); 36 return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
37} 37}
38 38
39static const struct vm_operations_struct ext2_dax_vm_ops = { 39static const struct vm_operations_struct ext2_dax_vm_ops = {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0613c256c344..f713cfcc43a2 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -192,15 +192,27 @@ out:
192} 192}
193 193
194#ifdef CONFIG_FS_DAX 194#ifdef CONFIG_FS_DAX
195static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
196{
197 struct inode *inode = bh->b_assoc_map->host;
198 /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
199 loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
200 int err;
201 if (!uptodate)
202 return;
203 WARN_ON(!buffer_unwritten(bh));
204 err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
205}
206
195static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 207static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
196{ 208{
197 return dax_fault(vma, vmf, ext4_get_block); 209 return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
198 /* Is this the right get_block? */ 210 /* Is this the right get_block? */
199} 211}
200 212
201static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 213static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
202{ 214{
203 return dax_mkwrite(vma, vmf, ext4_get_block); 215 return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
204} 216}
205 217
206static const struct vm_operations_struct ext4_dax_vm_ops = { 218static const struct vm_operations_struct ext4_dax_vm_ops = {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 55b187c3bac1..7c38ed3494cb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -656,18 +656,6 @@ has_zeroout:
656 return retval; 656 return retval;
657} 657}
658 658
659static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
660{
661 struct inode *inode = bh->b_assoc_map->host;
662 /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
663 loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
664 int err;
665 if (!uptodate)
666 return;
667 WARN_ON(!buffer_unwritten(bh));
668 err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
669}
670
671/* Maximum number of blocks we map for direct IO at once. */ 659/* Maximum number of blocks we map for direct IO at once. */
672#define DIO_MAX_BLOCKS 4096 660#define DIO_MAX_BLOCKS 4096
673 661
@@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
705 693
706 map_bh(bh, inode->i_sb, map.m_pblk); 694 map_bh(bh, inode->i_sb, map.m_pblk);
707 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 695 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
708 if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { 696 if (IS_DAX(inode) && buffer_unwritten(bh)) {
697 /*
698 * dgc: I suspect unwritten conversion on ext4+DAX is
699 * fundamentally broken here when there are concurrent
700 * read/write in progress on this inode.
701 */
702 WARN_ON_ONCE(io_end);
709 bh->b_assoc_map = inode->i_mapping; 703 bh->b_assoc_map = inode->i_mapping;
710 bh->b_private = (void *)(unsigned long)iblock; 704 bh->b_private = (void *)(unsigned long)iblock;
711 bh->b_end_io = ext4_end_io_unwritten;
712 } 705 }
713 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) 706 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
714 set_buffer_defer_completion(bh); 707 set_buffer_defer_completion(bh);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 35ec87e490b1..c9b4cca9e08d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
70 struct buffer_head *bh_result, int create); 70 struct buffer_head *bh_result, int create);
71typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, 71typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
72 ssize_t bytes, void *private); 72 ssize_t bytes, void *private);
73typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
73 74
74#define MAY_EXEC 0x00000001 75#define MAY_EXEC 0x00000001
75#define MAY_WRITE 0x00000002 76#define MAY_WRITE 0x00000002
@@ -2627,9 +2628,10 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
2627int dax_clear_blocks(struct inode *, sector_t block, long size); 2628int dax_clear_blocks(struct inode *, sector_t block, long size);
2628int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); 2629int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
2629int dax_truncate_page(struct inode *, loff_t from, get_block_t); 2630int dax_truncate_page(struct inode *, loff_t from, get_block_t);
2630int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); 2631int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
2632 dax_iodone_t);
2631int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); 2633int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
2632#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) 2634#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
2633 2635
2634#ifdef CONFIG_BLOCK 2636#ifdef CONFIG_BLOCK
2635typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, 2637typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,