aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJeff Layton <jlayton@redhat.com>2017-07-06 07:02:25 -0400
committerJeff Layton <jlayton@redhat.com>2017-07-06 07:02:25 -0400
commit5660e13d2fd6af1903d4b0b98020af95ca2d638a (patch)
tree10944f111ba11bf1d3b194300018f3a45e9fd9e8
parent84cbadadc6eafc4798513773a2c8fce37dcd2fb8 (diff)
fs: new infrastructure for writeback error handling and reporting
Most filesystems currently use mapping_set_error and filemap_check_errors for setting and reporting/clearing writeback errors at the mapping level. filemap_check_errors is indirectly called from most of the filemap_fdatawait_* functions and from filemap_write_and_wait*. These functions are called from all sorts of contexts to wait on writeback to finish -- e.g. mostly in fsync, but also in truncate calls, getattr, etc. The non-fsync callers are problematic. We should be reporting writeback errors during fsync, but many places spread over the tree clear out errors before they can be properly reported, or report errors at nonsensical times. If I get -EIO on a stat() call, there is no reason for me to assume that it is because some previous writeback failed. The fact that it also clears out the error such that a subsequent fsync returns 0 is a bug, and a nasty one since that's potentially silent data corruption. This patch adds a small bit of new infrastructure for setting and reporting errors during address_space writeback. While the above was my original impetus for adding this, I think it's also the case that current fsync semantics are just problematic for userland. Most applications that call fsync do so to ensure that the data they wrote has hit the backing store. In the case where there are multiple writers to the file at the same time, this is really hard to determine. The first one to call fsync will see any stored error, and the rest get back 0. The processes with open fds may not be associated with one another in any way. They could even be in different containers, so ensuring coordination between all fsync callers is not really an option. One way to remedy this would be to track what file descriptor was used to dirty the file, but that's rather cumbersome and would likely be slow. However, there is a simpler way to improve the semantics here without incurring too much overhead. 
This set adds an errseq_t to struct address_space, and a corresponding one is added to struct file. Writeback errors are recorded in the mapping's errseq_t, and the one in struct file is used as the "since" value. This changes the semantics of the Linux fsync implementation such that applications can now use it to determine whether there were any writeback errors since fsync(fd) was last called (or since the file was opened in the case of fsync having never been called). Note that those writeback errors may have occurred when writing data that was dirtied via an entirely different fd, but that's the case now with the current mapping_set_error/filemap_check_errors infrastructure. This will at least prevent you from getting a false report of success. The new behavior is still consistent with the POSIX spec, and is more reliable for application developers. This patch just adds some basic infrastructure for doing this, and ensures that the f_wb_err "cursor" is properly set when a file is opened. Later patches will change the existing code to use this new infrastructure for reporting errors at fsync time. Signed-off-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Jan Kara <jack@suse.cz>
-rw-r--r--drivers/dax/device.c1
-rw-r--r--fs/block_dev.c1
-rw-r--r--fs/file_table.c1
-rw-r--r--fs/open.c3
-rw-r--r--include/linux/fs.h60
-rw-r--r--include/trace/events/filemap.h57
-rw-r--r--mm/filemap.c84
7 files changed, 206 insertions, 1 deletions
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 006e657dfcb9..12943d19bfc4 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -499,6 +499,7 @@ static int dax_open(struct inode *inode, struct file *filp)
499 inode->i_mapping = __dax_inode->i_mapping; 499 inode->i_mapping = __dax_inode->i_mapping;
500 inode->i_mapping->host = __dax_inode; 500 inode->i_mapping->host = __dax_inode;
501 filp->f_mapping = inode->i_mapping; 501 filp->f_mapping = inode->i_mapping;
502 filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
502 filp->private_data = dev_dax; 503 filp->private_data = dev_dax;
503 inode->i_flags = S_DAX; 504 inode->i_flags = S_DAX;
504 505
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 519599dddd36..4d62fe771587 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1743,6 +1743,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1743 return -ENOMEM; 1743 return -ENOMEM;
1744 1744
1745 filp->f_mapping = bdev->bd_inode->i_mapping; 1745 filp->f_mapping = bdev->bd_inode->i_mapping;
1746 filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
1746 1747
1747 return blkdev_get(bdev, filp->f_mode, filp); 1748 return blkdev_get(bdev, filp->f_mode, filp);
1748} 1749}
diff --git a/fs/file_table.c b/fs/file_table.c
index 954d510b765a..72e861a35a7f 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -168,6 +168,7 @@ struct file *alloc_file(const struct path *path, fmode_t mode,
168 file->f_path = *path; 168 file->f_path = *path;
169 file->f_inode = path->dentry->d_inode; 169 file->f_inode = path->dentry->d_inode;
170 file->f_mapping = path->dentry->d_inode->i_mapping; 170 file->f_mapping = path->dentry->d_inode->i_mapping;
171 file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
171 if ((mode & FMODE_READ) && 172 if ((mode & FMODE_READ) &&
172 likely(fop->read || fop->read_iter)) 173 likely(fop->read || fop->read_iter))
173 mode |= FMODE_CAN_READ; 174 mode |= FMODE_CAN_READ;
diff --git a/fs/open.c b/fs/open.c
index cd0c5be8d012..280d4a963791 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -707,6 +707,9 @@ static int do_dentry_open(struct file *f,
707 f->f_inode = inode; 707 f->f_inode = inode;
708 f->f_mapping = inode->i_mapping; 708 f->f_mapping = inode->i_mapping;
709 709
710 /* Ensure that we skip any errors that predate opening of the file */
711 f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
712
710 if (unlikely(f->f_flags & O_PATH)) { 713 if (unlikely(f->f_flags & O_PATH)) {
711 f->f_mode = FMODE_PATH; 714 f->f_mode = FMODE_PATH;
712 f->f_op = &empty_fops; 715 f->f_op = &empty_fops;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8ac8df1b3550..78b5c2901712 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -30,7 +30,7 @@
30#include <linux/percpu-rwsem.h> 30#include <linux/percpu-rwsem.h>
31#include <linux/workqueue.h> 31#include <linux/workqueue.h>
32#include <linux/delayed_call.h> 32#include <linux/delayed_call.h>
33 33#include <linux/errseq.h>
34#include <asm/byteorder.h> 34#include <asm/byteorder.h>
35#include <uapi/linux/fs.h> 35#include <uapi/linux/fs.h>
36 36
@@ -392,6 +392,7 @@ struct address_space {
392 gfp_t gfp_mask; /* implicit gfp mask for allocations */ 392 gfp_t gfp_mask; /* implicit gfp mask for allocations */
393 struct list_head private_list; /* ditto */ 393 struct list_head private_list; /* ditto */
394 void *private_data; /* ditto */ 394 void *private_data; /* ditto */
395 errseq_t wb_err;
395} __attribute__((aligned(sizeof(long)))); 396} __attribute__((aligned(sizeof(long))));
396 /* 397 /*
397 * On most architectures that alignment is already the case; but 398 * On most architectures that alignment is already the case; but
@@ -868,6 +869,7 @@ struct file {
868 struct list_head f_tfile_llink; 869 struct list_head f_tfile_llink;
869#endif /* #ifdef CONFIG_EPOLL */ 870#endif /* #ifdef CONFIG_EPOLL */
870 struct address_space *f_mapping; 871 struct address_space *f_mapping;
872 errseq_t f_wb_err;
871} __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ 873} __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
872 874
873struct file_handle { 875struct file_handle {
@@ -2526,6 +2528,62 @@ extern int filemap_fdatawrite_range(struct address_space *mapping,
2526 loff_t start, loff_t end); 2528 loff_t start, loff_t end);
2527extern int filemap_check_errors(struct address_space *mapping); 2529extern int filemap_check_errors(struct address_space *mapping);
2528 2530
2531extern void __filemap_set_wb_err(struct address_space *mapping, int err);
2532extern int __must_check file_check_and_advance_wb_err(struct file *file);
2533extern int __must_check file_write_and_wait_range(struct file *file,
2534 loff_t start, loff_t end);
2535
2536/**
2537 * filemap_set_wb_err - set a writeback error on an address_space
2538 * @mapping: mapping in which to set writeback error
2539 * @err: error to be set in mapping (a zero value is ignored)
2540 *
2541 * When writeback fails in some way, we must record that error so that
2542 * userspace can be informed when fsync and the like are called. We endeavor
2543 * to report errors on any file that was open at the time of the error. Some
2544 * internal callers also need to know when writeback errors have occurred.
2545 *
2546 * When a writeback error occurs, most filesystems will want to call
2547 * filemap_set_wb_err to record the error in the mapping so that it will be
2548 * automatically reported whenever fsync is called on the file.
2549 *
2550 * FIXME: mention FS_* flag here?
2551 */
2552static inline void filemap_set_wb_err(struct address_space *mapping, int err)
2553{
2554 /* Fastpath for common case of no error: only a nonzero err makes the out-of-line call */
2555 if (unlikely(err))
2556 __filemap_set_wb_err(mapping, err);
2557}
2558
2559/**
2560 * filemap_check_wb_err - has an error occurred since the mark was sampled?
2561 * @mapping: mapping to check for writeback errors
2562 * @since: previously-sampled errseq_t
2563 *
2564 * Grab the errseq_t value from the mapping, and see if it has changed "since"
2565 * the given value was sampled.
2566 *
2567 * If it has then report the latest error set, otherwise return 0.
2568 */
2569static inline int filemap_check_wb_err(struct address_space *mapping,
2570 errseq_t since)
2571{
2572 return errseq_check(&mapping->wb_err, since);
2573}
2574
2575/**
2576 * filemap_sample_wb_err - sample the current errseq_t to test for later errors
2577 * @mapping: mapping to be sampled
2578 *
2579 * Writeback errors are always reported relative to a particular sample point
2580 * in the past. This function provides those sample points by sampling the mapping's wb_err.
2581 */
2582static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
2583{
2584 return errseq_sample(&mapping->wb_err);
2585}
2586
2529extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, 2587extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
2530 int datasync); 2588 int datasync);
2531extern int vfs_fsync(struct file *file, int datasync); 2589extern int vfs_fsync(struct file *file, int datasync);
diff --git a/include/trace/events/filemap.h b/include/trace/events/filemap.h
index 42febb6bc1d5..ff91325b8123 100644
--- a/include/trace/events/filemap.h
+++ b/include/trace/events/filemap.h
@@ -10,6 +10,7 @@
10#include <linux/memcontrol.h> 10#include <linux/memcontrol.h>
11#include <linux/device.h> 11#include <linux/device.h>
12#include <linux/kdev_t.h> 12#include <linux/kdev_t.h>
13#include <linux/errseq.h>
13 14
14DECLARE_EVENT_CLASS(mm_filemap_op_page_cache, 15DECLARE_EVENT_CLASS(mm_filemap_op_page_cache,
15 16
@@ -52,6 +53,62 @@ DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache,
52 TP_ARGS(page) 53 TP_ARGS(page)
53 ); 54 );
54 55
56TRACE_EVENT(filemap_set_wb_err, /* records the host inode, its device and the errseq_t passed in */
57 TP_PROTO(struct address_space *mapping, errseq_t eseq),
58
59 TP_ARGS(mapping, eseq),
60
61 TP_STRUCT__entry(
62 __field(unsigned long, i_ino)
63 __field(dev_t, s_dev)
64 __field(errseq_t, errseq)
65 ),
66
67 TP_fast_assign(
68 __entry->i_ino = mapping->host->i_ino;
69 __entry->errseq = eseq;
70 if (mapping->host->i_sb) /* prefer the superblock's device number */
71 __entry->s_dev = mapping->host->i_sb->s_dev;
72 else /* no superblock: fall back to the inode's own i_rdev */
73 __entry->s_dev = mapping->host->i_rdev;
74 ),
75
76 TP_printk("dev=%d:%d ino=0x%lx errseq=0x%x",
77 MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
78 __entry->i_ino, __entry->errseq)
79);
80
81TRACE_EVENT(file_check_and_advance_wb_err, /* records the file, its host inode/device, and old vs. current f_wb_err */
82 TP_PROTO(struct file *file, errseq_t old),
83
84 TP_ARGS(file, old),
85
86 TP_STRUCT__entry(
87 __field(struct file *, file); /* NOTE(review): trailing ';' is inconsistent with the other __field() entries */
88 __field(unsigned long, i_ino)
89 __field(dev_t, s_dev)
90 __field(errseq_t, old)
91 __field(errseq_t, new)
92 ),
93
94 TP_fast_assign(
95 __entry->file = file;
96 __entry->i_ino = file->f_mapping->host->i_ino;
97 if (file->f_mapping->host->i_sb) /* prefer the superblock's device number */
98 __entry->s_dev =
99 file->f_mapping->host->i_sb->s_dev;
100 else /* no superblock: fall back to the inode's own i_rdev */
101 __entry->s_dev =
102 file->f_mapping->host->i_rdev;
103 __entry->old = old; /* value supplied by the caller */
104 __entry->new = file->f_wb_err; /* f_wb_err as it stands when the event fires */
105 ),
106
107 TP_printk("file=%p dev=%d:%d ino=0x%lx old=0x%x new=0x%x",
108 __entry->file, MAJOR(__entry->s_dev),
109 MINOR(__entry->s_dev), __entry->i_ino, __entry->old,
110 __entry->new)
111);
55#endif /* _TRACE_FILEMAP_H */ 112#endif /* _TRACE_FILEMAP_H */
56 113
57/* This part must be outside protection */ 114/* This part must be outside protection */
diff --git a/mm/filemap.c b/mm/filemap.c
index eb99b5f23c61..d7a30aefee0d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -553,6 +553,90 @@ int filemap_write_and_wait_range(struct address_space *mapping,
553} 553}
554EXPORT_SYMBOL(filemap_write_and_wait_range); 554EXPORT_SYMBOL(filemap_write_and_wait_range);
555 555
556void __filemap_set_wb_err(struct address_space *mapping, int err) /* out-of-line part of filemap_set_wb_err(): record err in mapping->wb_err and trace it */
557{
558 errseq_t eseq = __errseq_set(&mapping->wb_err, err); /* keep the returned errseq_t so it can be traced */
559
560 trace_filemap_set_wb_err(mapping, eseq);
561}
562EXPORT_SYMBOL(__filemap_set_wb_err);
563
564/**
565 * file_check_and_advance_wb_err - report wb error (if any) that was not previously
566 * reported on this file and advance f_wb_err to current one
567 * @file: struct file on which the error is being reported
568 *
569 * When userland calls fsync (or something like nfsd does the equivalent), we
570 * want to report any writeback errors that occurred since the last fsync (or
571 * since the file was opened if there haven't been any).
572 *
573 * Grab the wb_err from the mapping. If it matches what we have in the file,
574 * then just quickly return 0. The file is all caught up.
575 *
576 * If it doesn't match, then take the mapping value, set the "seen" flag in
577 * it and try to swap it into place. If it works, or another task beat us
578 * to it with the new value, then update the f_wb_err and return the error
579 * portion. The error at this point must be reported via proper channels
580 * (a'la fsync, or NFS COMMIT operation, etc.).
581 *
582 * While we handle mapping->wb_err with atomic operations, the f_wb_err
583 * value is protected by the f_lock since we must ensure that it reflects
584 * the latest value swapped in for this file descriptor.
585 */
586int file_check_and_advance_wb_err(struct file *file)
587{
588 int err = 0;
589 errseq_t old = READ_ONCE(file->f_wb_err);
590 struct address_space *mapping = file->f_mapping;
591
592 /* Locklessly handle the common case where nothing has changed */
593 if (errseq_check(&mapping->wb_err, old)) {
594 /* Something changed, must use slow path */
595 spin_lock(&file->f_lock);
596 old = file->f_wb_err; /* re-read under f_lock; from here on only the tracepoint uses it */
597 err = errseq_check_and_advance(&mapping->wb_err,
598 &file->f_wb_err);
599 trace_file_check_and_advance_wb_err(file, old);
600 spin_unlock(&file->f_lock);
601 }
602 return err; /* still 0 if the lockless check saw no change */
603}
604EXPORT_SYMBOL(file_check_and_advance_wb_err);
605
606/**
607 * file_write_and_wait_range - write out & wait on a file range
608 * @file: file pointing to address_space with pages
609 * @lstart: offset in bytes where the range starts
610 * @lend: offset in bytes where the range ends (inclusive)
611 *
612 * Write out and wait upon file offsets lstart->lend, inclusive.
613 *
614 * Note that @lend is inclusive (describes the last byte to be written) so
615 * that this function can be used to write to the very end-of-file (lend = -1).
616 *
617 * After writing out and waiting on the data, we check and advance the
618 * f_wb_err cursor to the latest value, and return any errors detected there.
619 */
620int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
621{
622 int err = 0, err2;
623 struct address_space *mapping = file->f_mapping;
624
625 if ((!dax_mapping(mapping) && mapping->nrpages) || /* skip write-out when both counts checked here are zero */
626 (dax_mapping(mapping) && mapping->nrexceptional)) {
627 err = __filemap_fdatawrite_range(mapping, lstart, lend,
628 WB_SYNC_ALL);
629 /* See comment of filemap_write_and_wait() */
630 if (err != -EIO)
631 __filemap_fdatawait_range(mapping, lstart, lend);
632 }
633 err2 = file_check_and_advance_wb_err(file); /* always runs, even when nothing was written above */
634 if (!err) /* an error from the write-out step takes precedence over err2 */
635 err = err2;
636 return err;
637}
638EXPORT_SYMBOL(file_write_and_wait_range);
639
556/** 640/**
557 * replace_page_cache_page - replace a pagecache page with a new one 641 * replace_page_cache_page - replace a pagecache page with a new one
558 * @old: page to be replaced 642 * @old: page to be replaced