aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Theodore Ts'o <tytso@mit.edu> 2015-02-02 00:37:00 -0500
committer: Al Viro <viro@zeniv.linux.org.uk> 2015-02-05 02:45:00 -0500
commit: 0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8 (patch)
tree: 660dbb014482092361eab263847fb906b5a9ec22
parent: e36f014edff70fc02b3d3d79cead1d58f289332e (diff)
vfs: add support for a lazytime mount option
Add a new mount option which enables a new "lazytime" mode. This mode causes atime, mtime, and ctime updates to only be made to the in-memory version of the inode. The on-disk times will only get updated (a) when the inode needs to be updated for some non-time-related change, (b) when userspace calls fsync(), syncfs() or sync(), or (c) just before an undeleted inode is evicted from memory. This is OK according to POSIX because there are no guarantees after a crash unless userspace explicitly requests them via an fsync(2) call. For workloads which feature a large number of random writes to a preallocated file, the lazytime mount option significantly reduces writes to the inode table. Without it, the repeated 4k writes to a single block will result in undesirable stress on flash devices and SMR disk drives. Even on conventional HDDs, the repeated writes to the inode table block will trigger Adjacent Track Interference (ATI) remediation latencies, which very negatively impact long tail latencies --- which is a very big deal for web serving tiers (for example). Google-Bug-Id: 18297052 Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r--fs/ext4/inode.c6
-rw-r--r--fs/fs-writeback.c62
-rw-r--r--fs/gfs2/file.c4
-rw-r--r--fs/inode.c56
-rw-r--r--fs/jfs/file.c2
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/proc_namespace.c1
-rw-r--r--fs/sync.c8
-rw-r--r--include/linux/backing-dev.h1
-rw-r--r--include/linux/fs.h5
-rw-r--r--include/trace/events/writeback.h60
-rw-r--r--include/uapi/linux/fs.h4
-rw-r--r--mm/backing-dev.c10
13 files changed, 186 insertions, 35 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5653fa42930b..628df5ba44a6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4840,11 +4840,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4840 * If the inode is marked synchronous, we don't honour that here - doing 4840 * If the inode is marked synchronous, we don't honour that here - doing
4841 * so would cause a commit on atime updates, which we don't bother doing. 4841 * so would cause a commit on atime updates, which we don't bother doing.
4842 * We handle synchronous inodes at the highest possible level. 4842 * We handle synchronous inodes at the highest possible level.
4843 *
4844 * If only the I_DIRTY_TIME flag is set, we can skip everything. If
4845 * I_DIRTY_TIME and I_DIRTY_SYNC are set, the only inode fields we need
4846 * to copy into the on-disk inode structure are the timestamp fields.
4843 */ 4847 */
4844void ext4_dirty_inode(struct inode *inode, int flags) 4848void ext4_dirty_inode(struct inode *inode, int flags)
4845{ 4849{
4846 handle_t *handle; 4850 handle_t *handle;
4847 4851
4852 if (flags == I_DIRTY_TIME)
4853 return;
4848 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 4854 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
4849 if (IS_ERR(handle)) 4855 if (IS_ERR(handle))
4850 goto out; 4856 goto out;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2d609a5fbfea..004686191354 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -247,14 +247,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
247 return ret; 247 return ret;
248} 248}
249 249
250#define EXPIRE_DIRTY_ATIME 0x0001
251
250/* 252/*
251 * Move expired (dirtied before work->older_than_this) dirty inodes from 253 * Move expired (dirtied before work->older_than_this) dirty inodes from
252 * @delaying_queue to @dispatch_queue. 254 * @delaying_queue to @dispatch_queue.
253 */ 255 */
254static int move_expired_inodes(struct list_head *delaying_queue, 256static int move_expired_inodes(struct list_head *delaying_queue,
255 struct list_head *dispatch_queue, 257 struct list_head *dispatch_queue,
258 int flags,
256 struct wb_writeback_work *work) 259 struct wb_writeback_work *work)
257{ 260{
261 unsigned long *older_than_this = NULL;
262 unsigned long expire_time;
258 LIST_HEAD(tmp); 263 LIST_HEAD(tmp);
259 struct list_head *pos, *node; 264 struct list_head *pos, *node;
260 struct super_block *sb = NULL; 265 struct super_block *sb = NULL;
@@ -262,13 +267,21 @@ static int move_expired_inodes(struct list_head *delaying_queue,
262 int do_sb_sort = 0; 267 int do_sb_sort = 0;
263 int moved = 0; 268 int moved = 0;
264 269
270 if ((flags & EXPIRE_DIRTY_ATIME) == 0)
271 older_than_this = work->older_than_this;
272 else if ((work->reason == WB_REASON_SYNC) == 0) {
273 expire_time = jiffies - (HZ * 86400);
274 older_than_this = &expire_time;
275 }
265 while (!list_empty(delaying_queue)) { 276 while (!list_empty(delaying_queue)) {
266 inode = wb_inode(delaying_queue->prev); 277 inode = wb_inode(delaying_queue->prev);
267 if (work->older_than_this && 278 if (older_than_this &&
268 inode_dirtied_after(inode, *work->older_than_this)) 279 inode_dirtied_after(inode, *older_than_this))
269 break; 280 break;
270 list_move(&inode->i_wb_list, &tmp); 281 list_move(&inode->i_wb_list, &tmp);
271 moved++; 282 moved++;
283 if (flags & EXPIRE_DIRTY_ATIME)
284 set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
272 if (sb_is_blkdev_sb(inode->i_sb)) 285 if (sb_is_blkdev_sb(inode->i_sb))
273 continue; 286 continue;
274 if (sb && sb != inode->i_sb) 287 if (sb && sb != inode->i_sb)
@@ -309,9 +322,12 @@ out:
309static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) 322static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
310{ 323{
311 int moved; 324 int moved;
325
312 assert_spin_locked(&wb->list_lock); 326 assert_spin_locked(&wb->list_lock);
313 list_splice_init(&wb->b_more_io, &wb->b_io); 327 list_splice_init(&wb->b_more_io, &wb->b_io);
314 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); 328 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
329 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
330 EXPIRE_DIRTY_ATIME, work);
315 trace_writeback_queue_io(wb, work, moved); 331 trace_writeback_queue_io(wb, work, moved);
316} 332}
317 333
@@ -435,6 +451,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
435 * updates after data IO completion. 451 * updates after data IO completion.
436 */ 452 */
437 redirty_tail(inode, wb); 453 redirty_tail(inode, wb);
454 } else if (inode->i_state & I_DIRTY_TIME) {
455 list_move(&inode->i_wb_list, &wb->b_dirty_time);
438 } else { 456 } else {
439 /* The inode is clean. Remove from writeback lists. */ 457 /* The inode is clean. Remove from writeback lists. */
440 list_del_init(&inode->i_wb_list); 458 list_del_init(&inode->i_wb_list);
@@ -481,7 +499,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
481 spin_lock(&inode->i_lock); 499 spin_lock(&inode->i_lock);
482 500
483 dirty = inode->i_state & I_DIRTY; 501 dirty = inode->i_state & I_DIRTY;
484 inode->i_state &= ~I_DIRTY; 502 if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
503 (inode->i_state & I_DIRTY_TIME)) ||
504 (inode->i_state & I_DIRTY_TIME_EXPIRED)) {
505 dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
506 trace_writeback_lazytime(inode);
507 }
508 inode->i_state &= ~dirty;
485 509
486 /* 510 /*
487 * Paired with smp_mb() in __mark_inode_dirty(). This allows 511 * Paired with smp_mb() in __mark_inode_dirty(). This allows
@@ -501,8 +525,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
501 525
502 spin_unlock(&inode->i_lock); 526 spin_unlock(&inode->i_lock);
503 527
528 if (dirty & I_DIRTY_TIME)
529 mark_inode_dirty_sync(inode);
504 /* Don't write the inode if only I_DIRTY_PAGES was set */ 530 /* Don't write the inode if only I_DIRTY_PAGES was set */
505 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 531 if (dirty & ~I_DIRTY_PAGES) {
506 int err = write_inode(inode, wbc); 532 int err = write_inode(inode, wbc);
507 if (ret == 0) 533 if (ret == 0)
508 ret = err; 534 ret = err;
@@ -550,7 +576,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
550 * make sure inode is on some writeback list and leave it there unless 576 * make sure inode is on some writeback list and leave it there unless
551 * we have completely cleaned the inode. 577 * we have completely cleaned the inode.
552 */ 578 */
553 if (!(inode->i_state & I_DIRTY) && 579 if (!(inode->i_state & I_DIRTY_ALL) &&
554 (wbc->sync_mode != WB_SYNC_ALL || 580 (wbc->sync_mode != WB_SYNC_ALL ||
555 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) 581 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
556 goto out; 582 goto out;
@@ -565,7 +591,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
565 * If inode is clean, remove it from writeback lists. Otherwise don't 591 * If inode is clean, remove it from writeback lists. Otherwise don't
566 * touch it. See comment above for explanation. 592 * touch it. See comment above for explanation.
567 */ 593 */
568 if (!(inode->i_state & I_DIRTY)) 594 if (!(inode->i_state & I_DIRTY_ALL))
569 list_del_init(&inode->i_wb_list); 595 list_del_init(&inode->i_wb_list);
570 spin_unlock(&wb->list_lock); 596 spin_unlock(&wb->list_lock);
571 inode_sync_complete(inode); 597 inode_sync_complete(inode);
@@ -707,7 +733,7 @@ static long writeback_sb_inodes(struct super_block *sb,
707 wrote += write_chunk - wbc.nr_to_write; 733 wrote += write_chunk - wbc.nr_to_write;
708 spin_lock(&wb->list_lock); 734 spin_lock(&wb->list_lock);
709 spin_lock(&inode->i_lock); 735 spin_lock(&inode->i_lock);
710 if (!(inode->i_state & I_DIRTY)) 736 if (!(inode->i_state & I_DIRTY_ALL))
711 wrote++; 737 wrote++;
712 requeue_inode(inode, wb, &wbc); 738 requeue_inode(inode, wb, &wbc);
713 inode_sync_complete(inode); 739 inode_sync_complete(inode);
@@ -1145,16 +1171,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1145 * page->mapping->host, so the page-dirtying time is recorded in the internal 1171 * page->mapping->host, so the page-dirtying time is recorded in the internal
1146 * blockdev inode. 1172 * blockdev inode.
1147 */ 1173 */
1174#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
1148void __mark_inode_dirty(struct inode *inode, int flags) 1175void __mark_inode_dirty(struct inode *inode, int flags)
1149{ 1176{
1150 struct super_block *sb = inode->i_sb; 1177 struct super_block *sb = inode->i_sb;
1151 struct backing_dev_info *bdi = NULL; 1178 struct backing_dev_info *bdi = NULL;
1179 int dirtytime;
1180
1181 trace_writeback_mark_inode_dirty(inode, flags);
1152 1182
1153 /* 1183 /*
1154 * Don't do this for I_DIRTY_PAGES - that doesn't actually 1184 * Don't do this for I_DIRTY_PAGES - that doesn't actually
1155 * dirty the inode itself 1185 * dirty the inode itself
1156 */ 1186 */
1157 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 1187 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
1158 trace_writeback_dirty_inode_start(inode, flags); 1188 trace_writeback_dirty_inode_start(inode, flags);
1159 1189
1160 if (sb->s_op->dirty_inode) 1190 if (sb->s_op->dirty_inode)
@@ -1162,6 +1192,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1162 1192
1163 trace_writeback_dirty_inode(inode, flags); 1193 trace_writeback_dirty_inode(inode, flags);
1164 } 1194 }
1195 if (flags & I_DIRTY_INODE)
1196 flags &= ~I_DIRTY_TIME;
1197 dirtytime = flags & I_DIRTY_TIME;
1165 1198
1166 /* 1199 /*
1167 * Paired with smp_mb() in __writeback_single_inode() for the 1200 * Paired with smp_mb() in __writeback_single_inode() for the
@@ -1169,16 +1202,21 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1169 */ 1202 */
1170 smp_mb(); 1203 smp_mb();
1171 1204
1172 if ((inode->i_state & flags) == flags) 1205 if (((inode->i_state & flags) == flags) ||
1206 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
1173 return; 1207 return;
1174 1208
1175 if (unlikely(block_dump)) 1209 if (unlikely(block_dump))
1176 block_dump___mark_inode_dirty(inode); 1210 block_dump___mark_inode_dirty(inode);
1177 1211
1178 spin_lock(&inode->i_lock); 1212 spin_lock(&inode->i_lock);
1213 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
1214 goto out_unlock_inode;
1179 if ((inode->i_state & flags) != flags) { 1215 if ((inode->i_state & flags) != flags) {
1180 const int was_dirty = inode->i_state & I_DIRTY; 1216 const int was_dirty = inode->i_state & I_DIRTY;
1181 1217
1218 if (flags & I_DIRTY_INODE)
1219 inode->i_state &= ~I_DIRTY_TIME;
1182 inode->i_state |= flags; 1220 inode->i_state |= flags;
1183 1221
1184 /* 1222 /*
@@ -1225,8 +1263,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1225 } 1263 }
1226 1264
1227 inode->dirtied_when = jiffies; 1265 inode->dirtied_when = jiffies;
1228 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1266 list_move(&inode->i_wb_list, dirtytime ?
1267 &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
1229 spin_unlock(&bdi->wb.list_lock); 1268 spin_unlock(&bdi->wb.list_lock);
1269 trace_writeback_dirty_inode_enqueue(inode);
1230 1270
1231 if (wakeup_bdi) 1271 if (wakeup_bdi)
1232 bdi_wakeup_thread_delayed(bdi); 1272 bdi_wakeup_thread_delayed(bdi);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6e600abf694a..15c44cf457cc 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -655,7 +655,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
655{ 655{
656 struct address_space *mapping = file->f_mapping; 656 struct address_space *mapping = file->f_mapping;
657 struct inode *inode = mapping->host; 657 struct inode *inode = mapping->host;
658 int sync_state = inode->i_state & I_DIRTY; 658 int sync_state = inode->i_state & I_DIRTY_ALL;
659 struct gfs2_inode *ip = GFS2_I(inode); 659 struct gfs2_inode *ip = GFS2_I(inode);
660 int ret = 0, ret1 = 0; 660 int ret = 0, ret1 = 0;
661 661
@@ -668,7 +668,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
668 if (!gfs2_is_jdata(ip)) 668 if (!gfs2_is_jdata(ip))
669 sync_state &= ~I_DIRTY_PAGES; 669 sync_state &= ~I_DIRTY_PAGES;
670 if (datasync) 670 if (datasync)
671 sync_state &= ~I_DIRTY_SYNC; 671 sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
672 672
673 if (sync_state) { 673 if (sync_state) {
674 ret = sync_inode_metadata(inode, 1); 674 ret = sync_inode_metadata(inode, 1);
diff --git a/fs/inode.c b/fs/inode.c
index aa149e7262ac..4feb85cc125f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,6 +18,7 @@
18#include <linux/buffer_head.h> /* for inode_has_buffers */ 18#include <linux/buffer_head.h> /* for inode_has_buffers */
19#include <linux/ratelimit.h> 19#include <linux/ratelimit.h>
20#include <linux/list_lru.h> 20#include <linux/list_lru.h>
21#include <trace/events/writeback.h>
21#include "internal.h" 22#include "internal.h"
22 23
23/* 24/*
@@ -30,7 +31,7 @@
30 * inode_sb_list_lock protects: 31 * inode_sb_list_lock protects:
31 * sb->s_inodes, inode->i_sb_list 32 * sb->s_inodes, inode->i_sb_list
32 * bdi->wb.list_lock protects: 33 * bdi->wb.list_lock protects:
33 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list 34 * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
34 * inode_hash_lock protects: 35 * inode_hash_lock protects:
35 * inode_hashtable, inode->i_hash 36 * inode_hashtable, inode->i_hash
36 * 37 *
@@ -416,7 +417,8 @@ static void inode_lru_list_add(struct inode *inode)
416 */ 417 */
417void inode_add_lru(struct inode *inode) 418void inode_add_lru(struct inode *inode)
418{ 419{
419 if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && 420 if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
421 I_FREEING | I_WILL_FREE)) &&
420 !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) 422 !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
421 inode_lru_list_add(inode); 423 inode_lru_list_add(inode);
422} 424}
@@ -647,7 +649,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
647 spin_unlock(&inode->i_lock); 649 spin_unlock(&inode->i_lock);
648 continue; 650 continue;
649 } 651 }
650 if (inode->i_state & I_DIRTY && !kill_dirty) { 652 if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
651 spin_unlock(&inode->i_lock); 653 spin_unlock(&inode->i_lock);
652 busy = 1; 654 busy = 1;
653 continue; 655 continue;
@@ -1432,11 +1434,20 @@ static void iput_final(struct inode *inode)
1432 */ 1434 */
1433void iput(struct inode *inode) 1435void iput(struct inode *inode)
1434{ 1436{
1435 if (inode) { 1437 if (!inode)
1436 BUG_ON(inode->i_state & I_CLEAR); 1438 return;
1437 1439 BUG_ON(inode->i_state & I_CLEAR);
1438 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) 1440retry:
1439 iput_final(inode); 1441 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
1442 if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
1443 atomic_inc(&inode->i_count);
1444 inode->i_state &= ~I_DIRTY_TIME;
1445 spin_unlock(&inode->i_lock);
1446 trace_writeback_lazytime_iput(inode);
1447 mark_inode_dirty_sync(inode);
1448 goto retry;
1449 }
1450 iput_final(inode);
1440 } 1451 }
1441} 1452}
1442EXPORT_SYMBOL(iput); 1453EXPORT_SYMBOL(iput);
@@ -1495,14 +1506,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
1495 return 0; 1506 return 0;
1496} 1507}
1497 1508
1498/* 1509int generic_update_time(struct inode *inode, struct timespec *time, int flags)
1499 * This does the actual work of updating an inodes time or version. Must have
1500 * had called mnt_want_write() before calling this.
1501 */
1502static int update_time(struct inode *inode, struct timespec *time, int flags)
1503{ 1510{
1504 if (inode->i_op->update_time) 1511 int iflags = I_DIRTY_TIME;
1505 return inode->i_op->update_time(inode, time, flags);
1506 1512
1507 if (flags & S_ATIME) 1513 if (flags & S_ATIME)
1508 inode->i_atime = *time; 1514 inode->i_atime = *time;
@@ -1512,9 +1518,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
1512 inode->i_ctime = *time; 1518 inode->i_ctime = *time;
1513 if (flags & S_MTIME) 1519 if (flags & S_MTIME)
1514 inode->i_mtime = *time; 1520 inode->i_mtime = *time;
1515 mark_inode_dirty_sync(inode); 1521
1522 if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
1523 iflags |= I_DIRTY_SYNC;
1524 __mark_inode_dirty(inode, iflags);
1516 return 0; 1525 return 0;
1517} 1526}
1527EXPORT_SYMBOL(generic_update_time);
1528
1529/*
1530 * This does the actual work of updating an inode's time or version. The
1531 * caller must have called mnt_want_write() before calling this.
1532 */
1533static int update_time(struct inode *inode, struct timespec *time, int flags)
1534{
1535 int (*update_time)(struct inode *, struct timespec *, int);
1536
1537 update_time = inode->i_op->update_time ? inode->i_op->update_time :
1538 generic_update_time;
1539
1540 return update_time(inode, time, flags);
1541}
1518 1542
1519/** 1543/**
1520 * touch_atime - update the access time 1544 * touch_atime - update the access time
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 33aa0cc1f8b8..10815f8dfd8b 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
39 return rc; 39 return rc;
40 40
41 mutex_lock(&inode->i_mutex); 41 mutex_lock(&inode->i_mutex);
42 if (!(inode->i_state & I_DIRTY) || 42 if (!(inode->i_state & I_DIRTY_ALL) ||
43 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { 43 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
44 /* Make sure committed changes hit the disk */ 44 /* Make sure committed changes hit the disk */
45 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); 45 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
diff --git a/fs/libfs.c b/fs/libfs.c
index 005843ce5dbd..b2ffdb045be4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
948 948
949 mutex_lock(&inode->i_mutex); 949 mutex_lock(&inode->i_mutex);
950 ret = sync_mapping_buffers(inode->i_mapping); 950 ret = sync_mapping_buffers(inode->i_mapping);
951 if (!(inode->i_state & I_DIRTY)) 951 if (!(inode->i_state & I_DIRTY_ALL))
952 goto out; 952 goto out;
953 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 953 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
954 goto out; 954 goto out;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 0f96f71ab32b..8db932da4009 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
44 { MS_SYNCHRONOUS, ",sync" }, 44 { MS_SYNCHRONOUS, ",sync" },
45 { MS_DIRSYNC, ",dirsync" }, 45 { MS_DIRSYNC, ",dirsync" },
46 { MS_MANDLOCK, ",mand" }, 46 { MS_MANDLOCK, ",mand" },
47 { MS_LAZYTIME, ",lazytime" },
47 { 0, NULL } 48 { 0, NULL }
48 }; 49 };
49 const struct proc_fs_info *fs_infop; 50 const struct proc_fs_info *fs_infop;
diff --git a/fs/sync.c b/fs/sync.c
index 01d9f18a70b5..fbc98ee62044 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
177 */ 177 */
178int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) 178int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
179{ 179{
180 struct inode *inode = file->f_mapping->host;
181
180 if (!file->f_op->fsync) 182 if (!file->f_op->fsync)
181 return -EINVAL; 183 return -EINVAL;
184 if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
185 spin_lock(&inode->i_lock);
186 inode->i_state &= ~I_DIRTY_TIME;
187 spin_unlock(&inode->i_lock);
188 mark_inode_dirty_sync(inode);
189 }
182 return file->f_op->fsync(file, start, end, datasync); 190 return file->f_op->fsync(file, start, end, datasync);
183} 191}
184EXPORT_SYMBOL(vfs_fsync_range); 192EXPORT_SYMBOL(vfs_fsync_range);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5da6012b7a14..4cdf7336f64a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -55,6 +55,7 @@ struct bdi_writeback {
55 struct list_head b_dirty; /* dirty inodes */ 55 struct list_head b_dirty; /* dirty inodes */
56 struct list_head b_io; /* parked for writeback */ 56 struct list_head b_io; /* parked for writeback */
57 struct list_head b_more_io; /* parked for more writeback */ 57 struct list_head b_more_io; /* parked for more writeback */
58 struct list_head b_dirty_time; /* time stamps are dirty */
58 spinlock_t list_lock; /* protects the b_* lists */ 59 spinlock_t list_lock; /* protects the b_* lists */
59}; 60};
60 61
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 42efe13077b6..cd027ce2c705 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1746,8 +1746,12 @@ struct super_operations {
1746#define __I_DIO_WAKEUP 9 1746#define __I_DIO_WAKEUP 9
1747#define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) 1747#define I_DIO_WAKEUP (1 << I_DIO_WAKEUP)
1748#define I_LINKABLE (1 << 10) 1748#define I_LINKABLE (1 << 10)
1749#define I_DIRTY_TIME (1 << 11)
1750#define __I_DIRTY_TIME_EXPIRED 12
1751#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED)
1749 1752
1750#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) 1753#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
1754#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
1751 1755
1752extern void __mark_inode_dirty(struct inode *, int); 1756extern void __mark_inode_dirty(struct inode *, int);
1753static inline void mark_inode_dirty(struct inode *inode) 1757static inline void mark_inode_dirty(struct inode *inode)
@@ -1910,6 +1914,7 @@ extern int current_umask(void);
1910 1914
1911extern void ihold(struct inode * inode); 1915extern void ihold(struct inode * inode);
1912extern void iput(struct inode *); 1916extern void iput(struct inode *);
1917extern int generic_update_time(struct inode *, struct timespec *, int);
1913 1918
1914static inline struct inode *file_inode(const struct file *f) 1919static inline struct inode *file_inode(const struct file *f)
1915{ 1920{
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index cee02d65ab3f..5ecb4c234625 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -18,6 +18,8 @@
18 {I_FREEING, "I_FREEING"}, \ 18 {I_FREEING, "I_FREEING"}, \
19 {I_CLEAR, "I_CLEAR"}, \ 19 {I_CLEAR, "I_CLEAR"}, \
20 {I_SYNC, "I_SYNC"}, \ 20 {I_SYNC, "I_SYNC"}, \
21 {I_DIRTY_TIME, "I_DIRTY_TIME"}, \
22 {I_DIRTY_TIME_EXPIRED, "I_DIRTY_TIME_EXPIRED"}, \
21 {I_REFERENCED, "I_REFERENCED"} \ 23 {I_REFERENCED, "I_REFERENCED"} \
22 ) 24 )
23 25
@@ -68,6 +70,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
68 TP_STRUCT__entry ( 70 TP_STRUCT__entry (
69 __array(char, name, 32) 71 __array(char, name, 32)
70 __field(unsigned long, ino) 72 __field(unsigned long, ino)
73 __field(unsigned long, state)
71 __field(unsigned long, flags) 74 __field(unsigned long, flags)
72 ), 75 ),
73 76
@@ -78,16 +81,25 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
78 strncpy(__entry->name, 81 strncpy(__entry->name,
79 bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); 82 bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
80 __entry->ino = inode->i_ino; 83 __entry->ino = inode->i_ino;
84 __entry->state = inode->i_state;
81 __entry->flags = flags; 85 __entry->flags = flags;
82 ), 86 ),
83 87
84 TP_printk("bdi %s: ino=%lu flags=%s", 88 TP_printk("bdi %s: ino=%lu state=%s flags=%s",
85 __entry->name, 89 __entry->name,
86 __entry->ino, 90 __entry->ino,
91 show_inode_state(__entry->state),
87 show_inode_state(__entry->flags) 92 show_inode_state(__entry->flags)
88 ) 93 )
89); 94);
90 95
96DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty,
97
98 TP_PROTO(struct inode *inode, int flags),
99
100 TP_ARGS(inode, flags)
101);
102
91DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, 103DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,
92 104
93 TP_PROTO(struct inode *inode, int flags), 105 TP_PROTO(struct inode *inode, int flags),
@@ -598,6 +610,52 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
598 TP_ARGS(inode, wbc, nr_to_write) 610 TP_ARGS(inode, wbc, nr_to_write)
599); 611);
600 612
613DECLARE_EVENT_CLASS(writeback_lazytime_template,
614 TP_PROTO(struct inode *inode),
615
616 TP_ARGS(inode),
617
618 TP_STRUCT__entry(
619 __field( dev_t, dev )
620 __field(unsigned long, ino )
621 __field(unsigned long, state )
622 __field( __u16, mode )
623 __field(unsigned long, dirtied_when )
624 ),
625
626 TP_fast_assign(
627 __entry->dev = inode->i_sb->s_dev;
628 __entry->ino = inode->i_ino;
629 __entry->state = inode->i_state;
630 __entry->mode = inode->i_mode;
631 __entry->dirtied_when = inode->dirtied_when;
632 ),
633
634 TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
635 MAJOR(__entry->dev), MINOR(__entry->dev),
636 __entry->ino, __entry->dirtied_when,
637 show_inode_state(__entry->state), __entry->mode)
638);
639
640DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime,
641 TP_PROTO(struct inode *inode),
642
643 TP_ARGS(inode)
644);
645
646DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput,
647 TP_PROTO(struct inode *inode),
648
649 TP_ARGS(inode)
650);
651
652DEFINE_EVENT(writeback_lazytime_template, writeback_dirty_inode_enqueue,
653
654 TP_PROTO(struct inode *inode),
655
656 TP_ARGS(inode)
657);
658
601#endif /* _TRACE_WRITEBACK_H */ 659#endif /* _TRACE_WRITEBACK_H */
602 660
603/* This part must be outside protection */ 661/* This part must be outside protection */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 3735fa0a6784..9b964a5920af 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -90,6 +90,7 @@ struct inodes_stat_t {
90#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ 90#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
91#define MS_I_VERSION (1<<23) /* Update inode I_version field */ 91#define MS_I_VERSION (1<<23) /* Update inode I_version field */
92#define MS_STRICTATIME (1<<24) /* Always perform atime updates */ 92#define MS_STRICTATIME (1<<24) /* Always perform atime updates */
93#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
93 94
94/* These sb flags are internal to the kernel */ 95/* These sb flags are internal to the kernel */
95#define MS_NOSEC (1<<28) 96#define MS_NOSEC (1<<28)
@@ -100,7 +101,8 @@ struct inodes_stat_t {
100/* 101/*
101 * Superblock flags that can be altered by MS_REMOUNT 102 * Superblock flags that can be altered by MS_REMOUNT
102 */ 103 */
103#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION) 104#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\
105 MS_LAZYTIME)
104 106
105/* 107/*
106 * Old magic mount flag and mask 108 * Old magic mount flag and mask
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0ae0df55000b..915feea94c66 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -69,10 +69,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
69 unsigned long background_thresh; 69 unsigned long background_thresh;
70 unsigned long dirty_thresh; 70 unsigned long dirty_thresh;
71 unsigned long bdi_thresh; 71 unsigned long bdi_thresh;
72 unsigned long nr_dirty, nr_io, nr_more_io; 72 unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
73 struct inode *inode; 73 struct inode *inode;
74 74
75 nr_dirty = nr_io = nr_more_io = 0; 75 nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
76 spin_lock(&wb->list_lock); 76 spin_lock(&wb->list_lock);
77 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 77 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
78 nr_dirty++; 78 nr_dirty++;
@@ -80,6 +80,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
80 nr_io++; 80 nr_io++;
81 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 81 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
82 nr_more_io++; 82 nr_more_io++;
83 list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
84 if (inode->i_state & I_DIRTY_TIME)
85 nr_dirty_time++;
83 spin_unlock(&wb->list_lock); 86 spin_unlock(&wb->list_lock);
84 87
85 global_dirty_limits(&background_thresh, &dirty_thresh); 88 global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -98,6 +101,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
98 "b_dirty: %10lu\n" 101 "b_dirty: %10lu\n"
99 "b_io: %10lu\n" 102 "b_io: %10lu\n"
100 "b_more_io: %10lu\n" 103 "b_more_io: %10lu\n"
104 "b_dirty_time: %10lu\n"
101 "bdi_list: %10u\n" 105 "bdi_list: %10u\n"
102 "state: %10lx\n", 106 "state: %10lx\n",
103 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 107 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
@@ -111,6 +115,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
111 nr_dirty, 115 nr_dirty,
112 nr_io, 116 nr_io,
113 nr_more_io, 117 nr_more_io,
118 nr_dirty_time,
114 !list_empty(&bdi->bdi_list), bdi->state); 119 !list_empty(&bdi->bdi_list), bdi->state);
115#undef K 120#undef K
116 121
@@ -418,6 +423,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
418 INIT_LIST_HEAD(&wb->b_dirty); 423 INIT_LIST_HEAD(&wb->b_dirty);
419 INIT_LIST_HEAD(&wb->b_io); 424 INIT_LIST_HEAD(&wb->b_io);
420 INIT_LIST_HEAD(&wb->b_more_io); 425 INIT_LIST_HEAD(&wb->b_more_io);
426 INIT_LIST_HEAD(&wb->b_dirty_time);
421 spin_lock_init(&wb->list_lock); 427 spin_lock_init(&wb->list_lock);
422 INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); 428 INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
423} 429}