aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
author: Theodore Ts'o <tytso@mit.edu> 2015-02-02 00:37:00 -0500
committer: Al Viro <viro@zeniv.linux.org.uk> 2015-02-05 02:45:00 -0500
commit: 0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8 (patch)
tree: 660dbb014482092361eab263847fb906b5a9ec22 /include
parent: e36f014edff70fc02b3d3d79cead1d58f289332e (diff)
vfs: add support for a lazytime mount option
Add a new mount option which enables a new "lazytime" mode. This mode causes atime, mtime, and ctime updates to only be made to the in-memory version of the inode. The on-disk times will only get updated (a) when the inode needs to be updated for some non-time-related change, (b) when userspace calls fsync(), syncfs() or sync(), or (c) just before an undeleted inode is evicted from memory. This is OK according to POSIX because there are no guarantees after a crash unless userspace explicitly requests them via an fsync(2) call. For workloads which feature a large number of random writes to a preallocated file, the lazytime mount option significantly reduces writes to the inode table. The repeated 4k writes to a single block will result in undesirable stress on flash devices and SMR disk drives. Even on conventional HDDs, the repeated writes to the inode table block will trigger Adjacent Track Interference (ATI) remediation latencies, which very negatively impact long tail latencies --- which is a very big deal for web serving tiers (for example). Google-Bug-Id: 18297052 Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'include')
-rw-r--r-- include/linux/backing-dev.h 1
-rw-r--r-- include/linux/fs.h 5
-rw-r--r-- include/trace/events/writeback.h 60
-rw-r--r-- include/uapi/linux/fs.h 4
4 files changed, 68 insertions, 2 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5da6012b7a14..4cdf7336f64a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -55,6 +55,7 @@ struct bdi_writeback {
55 struct list_head b_dirty; /* dirty inodes */ 55 struct list_head b_dirty; /* dirty inodes */
56 struct list_head b_io; /* parked for writeback */ 56 struct list_head b_io; /* parked for writeback */
57 struct list_head b_more_io; /* parked for more writeback */ 57 struct list_head b_more_io; /* parked for more writeback */
58 struct list_head b_dirty_time; /* time stamps are dirty */
58 spinlock_t list_lock; /* protects the b_* lists */ 59 spinlock_t list_lock; /* protects the b_* lists */
59}; 60};
60 61
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 42efe13077b6..cd027ce2c705 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1746,8 +1746,12 @@ struct super_operations {
1746#define __I_DIO_WAKEUP 9 1746#define __I_DIO_WAKEUP 9
1747#define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) 1747#define I_DIO_WAKEUP (1 << I_DIO_WAKEUP)
1748#define I_LINKABLE (1 << 10) 1748#define I_LINKABLE (1 << 10)
1749#define I_DIRTY_TIME (1 << 11)
1750#define __I_DIRTY_TIME_EXPIRED 12
1751#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED)
1749 1752
1750#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) 1753#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
1754#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
1751 1755
1752extern void __mark_inode_dirty(struct inode *, int); 1756extern void __mark_inode_dirty(struct inode *, int);
1753static inline void mark_inode_dirty(struct inode *inode) 1757static inline void mark_inode_dirty(struct inode *inode)
@@ -1910,6 +1914,7 @@ extern int current_umask(void);
1910 1914
1911extern void ihold(struct inode * inode); 1915extern void ihold(struct inode * inode);
1912extern void iput(struct inode *); 1916extern void iput(struct inode *);
1917extern int generic_update_time(struct inode *, struct timespec *, int);
1913 1918
1914static inline struct inode *file_inode(const struct file *f) 1919static inline struct inode *file_inode(const struct file *f)
1915{ 1920{
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index cee02d65ab3f..5ecb4c234625 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -18,6 +18,8 @@
18 {I_FREEING, "I_FREEING"}, \ 18 {I_FREEING, "I_FREEING"}, \
19 {I_CLEAR, "I_CLEAR"}, \ 19 {I_CLEAR, "I_CLEAR"}, \
20 {I_SYNC, "I_SYNC"}, \ 20 {I_SYNC, "I_SYNC"}, \
21 {I_DIRTY_TIME, "I_DIRTY_TIME"}, \
22 {I_DIRTY_TIME_EXPIRED, "I_DIRTY_TIME_EXPIRED"}, \
21 {I_REFERENCED, "I_REFERENCED"} \ 23 {I_REFERENCED, "I_REFERENCED"} \
22 ) 24 )
23 25
@@ -68,6 +70,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
68 TP_STRUCT__entry ( 70 TP_STRUCT__entry (
69 __array(char, name, 32) 71 __array(char, name, 32)
70 __field(unsigned long, ino) 72 __field(unsigned long, ino)
73 __field(unsigned long, state)
71 __field(unsigned long, flags) 74 __field(unsigned long, flags)
72 ), 75 ),
73 76
@@ -78,16 +81,25 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
78 strncpy(__entry->name, 81 strncpy(__entry->name,
79 bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); 82 bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
80 __entry->ino = inode->i_ino; 83 __entry->ino = inode->i_ino;
84 __entry->state = inode->i_state;
81 __entry->flags = flags; 85 __entry->flags = flags;
82 ), 86 ),
83 87
84 TP_printk("bdi %s: ino=%lu flags=%s", 88 TP_printk("bdi %s: ino=%lu state=%s flags=%s",
85 __entry->name, 89 __entry->name,
86 __entry->ino, 90 __entry->ino,
91 show_inode_state(__entry->state),
87 show_inode_state(__entry->flags) 92 show_inode_state(__entry->flags)
88 ) 93 )
89); 94);
90 95
96DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty,
97
98 TP_PROTO(struct inode *inode, int flags),
99
100 TP_ARGS(inode, flags)
101);
102
91DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, 103DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,
92 104
93 TP_PROTO(struct inode *inode, int flags), 105 TP_PROTO(struct inode *inode, int flags),
@@ -598,6 +610,52 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
598 TP_ARGS(inode, wbc, nr_to_write) 610 TP_ARGS(inode, wbc, nr_to_write)
599); 611);
600 612
613DECLARE_EVENT_CLASS(writeback_lazytime_template,
614 TP_PROTO(struct inode *inode),
615
616 TP_ARGS(inode),
617
618 TP_STRUCT__entry(
619 __field( dev_t, dev )
620 __field(unsigned long, ino )
621 __field(unsigned long, state )
622 __field( __u16, mode )
623 __field(unsigned long, dirtied_when )
624 ),
625
626 TP_fast_assign(
627 __entry->dev = inode->i_sb->s_dev;
628 __entry->ino = inode->i_ino;
629 __entry->state = inode->i_state;
630 __entry->mode = inode->i_mode;
631 __entry->dirtied_when = inode->dirtied_when;
632 ),
633
634 TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
635 MAJOR(__entry->dev), MINOR(__entry->dev),
636 __entry->ino, __entry->dirtied_when,
637 show_inode_state(__entry->state), __entry->mode)
638);
639
640DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime,
641 TP_PROTO(struct inode *inode),
642
643 TP_ARGS(inode)
644);
645
646DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput,
647 TP_PROTO(struct inode *inode),
648
649 TP_ARGS(inode)
650);
651
652DEFINE_EVENT(writeback_lazytime_template, writeback_dirty_inode_enqueue,
653
654 TP_PROTO(struct inode *inode),
655
656 TP_ARGS(inode)
657);
658
601#endif /* _TRACE_WRITEBACK_H */ 659#endif /* _TRACE_WRITEBACK_H */
602 660
603/* This part must be outside protection */ 661/* This part must be outside protection */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 3735fa0a6784..9b964a5920af 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -90,6 +90,7 @@ struct inodes_stat_t {
90#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ 90#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
91#define MS_I_VERSION (1<<23) /* Update inode I_version field */ 91#define MS_I_VERSION (1<<23) /* Update inode I_version field */
92#define MS_STRICTATIME (1<<24) /* Always perform atime updates */ 92#define MS_STRICTATIME (1<<24) /* Always perform atime updates */
93#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
93 94
94/* These sb flags are internal to the kernel */ 95/* These sb flags are internal to the kernel */
95#define MS_NOSEC (1<<28) 96#define MS_NOSEC (1<<28)
@@ -100,7 +101,8 @@ struct inodes_stat_t {
100/* 101/*
101 * Superblock flags that can be altered by MS_REMOUNT 102 * Superblock flags that can be altered by MS_REMOUNT
102 */ 103 */
103#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION) 104#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\
105 MS_LAZYTIME)
104 106
105/* 107/*
106 * Old magic mount flag and mask 108 * Old magic mount flag and mask