aboutsummaryrefslogtreecommitdiffstats
path: root/fs/fs-writeback.c
diff options
context:
space:
mode:
author: Dave Chinner <dchinner@redhat.com> 2016-07-26 18:21:50 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2016-07-26 19:19:19 -0400
commit: 6c60d2b5746cf23025ffe71bd7ff9075048fc90c (patch)
tree: 6794888cf362aa86c079ed5697c1ef7a6c117a1a /fs/fs-writeback.c
parent: 7d65b27448a90e08270537234819563e07936f76 (diff)
fs/fs-writeback.c: add a new writeback list for sync
wait_sb_inodes() currently does a walk of all inodes in the filesystem to find dirty ones to wait on during sync. This is highly inefficient and wastes a lot of CPU when there are lots of clean cached inodes that we don't need to wait on.

To avoid this "all inode" walk, we need to track inodes that are currently under writeback that we need to wait for. We do this by adding inodes to a writeback list on the sb when the mapping is first tagged as having pages under writeback. wait_sb_inodes() can then walk this list of "inodes under IO" and wait specifically just for the inodes that the current sync(2) needs to wait for.

Define a couple of helpers to add/remove an inode from the writeback list and call them when the overall mapping is tagged for or cleared from writeback. Update wait_sb_inodes() to walk only the inodes under writeback due to the sync.

With this change, filesystem sync times are significantly reduced for filesystems with largely populated inode caches and otherwise no other work to do. For example, on a 16xcpu 2GHz x86-64 server, 10TB XFS filesystem with a ~10m entry inode cache, sync times are reduced from ~7.3s to less than 0.1s when the filesystem is fully clean.

Link: http://lkml.kernel.org/r/1466594593-6757-2-git-send-email-bfoster@redhat.com
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Holger Hoffstätte <holger.hoffstaette@applied-asynchrony.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r-- fs/fs-writeback.c 106
1 files changed, 81 insertions, 25 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index fe7e83a45eff..1fcce8345da3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -981,6 +981,37 @@ void inode_io_list_del(struct inode *inode)
981} 981}
982 982
983/* 983/*
984 * mark an inode as under writeback on the sb
985 */
986void sb_mark_inode_writeback(struct inode *inode)
987{
988 struct super_block *sb = inode->i_sb;
989 unsigned long flags;
990
991 if (list_empty(&inode->i_wb_list)) {
992 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
993 if (list_empty(&inode->i_wb_list))
994 list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
995 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
996 }
997}
998
999/*
1000 * clear an inode as under writeback on the sb
1001 */
1002void sb_clear_inode_writeback(struct inode *inode)
1003{
1004 struct super_block *sb = inode->i_sb;
1005 unsigned long flags;
1006
1007 if (!list_empty(&inode->i_wb_list)) {
1008 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1009 list_del_init(&inode->i_wb_list);
1010 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1011 }
1012}
1013
1014/*
984 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 1015 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
985 * furthest end of its superblock's dirty-inode list. 1016 * furthest end of its superblock's dirty-inode list.
986 * 1017 *
@@ -2154,7 +2185,7 @@ EXPORT_SYMBOL(__mark_inode_dirty);
2154 */ 2185 */
2155static void wait_sb_inodes(struct super_block *sb) 2186static void wait_sb_inodes(struct super_block *sb)
2156{ 2187{
2157 struct inode *inode, *old_inode = NULL; 2188 LIST_HEAD(sync_list);
2158 2189
2159 /* 2190 /*
2160 * We need to be protected against the filesystem going from 2191 * We need to be protected against the filesystem going from
@@ -2163,38 +2194,60 @@ static void wait_sb_inodes(struct super_block *sb)
2163 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 2194 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2164 2195
2165 mutex_lock(&sb->s_sync_lock); 2196 mutex_lock(&sb->s_sync_lock);
2166 spin_lock(&sb->s_inode_list_lock);
2167 2197
2168 /* 2198 /*
2169 * Data integrity sync. Must wait for all pages under writeback, 2199 * Splice the writeback list onto a temporary list to avoid waiting on
2170 * because there may have been pages dirtied before our sync 2200 * inodes that have started writeback after this point.
2171 * call, but which had writeout started before we write it out. 2201 *
2172 * In which case, the inode may not be on the dirty list, but 2202 * Use rcu_read_lock() to keep the inodes around until we have a
2173 * we still have to wait for that writeout. 2203 * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as
2204 * the local list because inodes can be dropped from either by writeback
2205 * completion.
2206 */
2207 rcu_read_lock();
2208 spin_lock_irq(&sb->s_inode_wblist_lock);
2209 list_splice_init(&sb->s_inodes_wb, &sync_list);
2210
2211 /*
2212 * Data integrity sync. Must wait for all pages under writeback, because
2213 * there may have been pages dirtied before our sync call, but which had
2214 * writeout started before we write it out. In which case, the inode
2215 * may not be on the dirty list, but we still have to wait for that
2216 * writeout.
2174 */ 2217 */
2175 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 2218 while (!list_empty(&sync_list)) {
2219 struct inode *inode = list_first_entry(&sync_list, struct inode,
2220 i_wb_list);
2176 struct address_space *mapping = inode->i_mapping; 2221 struct address_space *mapping = inode->i_mapping;
2177 2222
2223 /*
2224 * Move each inode back to the wb list before we drop the lock
2225 * to preserve consistency between i_wb_list and the mapping
2226 * writeback tag. Writeback completion is responsible to remove
2227 * the inode from either list once the writeback tag is cleared.
2228 */
2229 list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
2230
2231 /*
2232 * The mapping can appear untagged while still on-list since we
2233 * do not have the mapping lock. Skip it here, wb completion
2234 * will remove it.
2235 */
2236 if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2237 continue;
2238
2239 spin_unlock_irq(&sb->s_inode_wblist_lock);
2240
2178 spin_lock(&inode->i_lock); 2241 spin_lock(&inode->i_lock);
2179 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || 2242 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2180 (mapping->nrpages == 0)) {
2181 spin_unlock(&inode->i_lock); 2243 spin_unlock(&inode->i_lock);
2244
2245 spin_lock_irq(&sb->s_inode_wblist_lock);
2182 continue; 2246 continue;
2183 } 2247 }
2184 __iget(inode); 2248 __iget(inode);
2185 spin_unlock(&inode->i_lock); 2249 spin_unlock(&inode->i_lock);
2186 spin_unlock(&sb->s_inode_list_lock); 2250 rcu_read_unlock();
2187
2188 /*
2189 * We hold a reference to 'inode' so it couldn't have been
2190 * removed from s_inodes list while we dropped the
2191 * s_inode_list_lock. We cannot iput the inode now as we can
2192 * be holding the last reference and we cannot iput it under
2193 * s_inode_list_lock. So we keep the reference and iput it
2194 * later.
2195 */
2196 iput(old_inode);
2197 old_inode = inode;
2198 2251
2199 /* 2252 /*
2200 * We keep the error status of individual mapping so that 2253 * We keep the error status of individual mapping so that
@@ -2205,10 +2258,13 @@ static void wait_sb_inodes(struct super_block *sb)
2205 2258
2206 cond_resched(); 2259 cond_resched();
2207 2260
2208 spin_lock(&sb->s_inode_list_lock); 2261 iput(inode);
2262
2263 rcu_read_lock();
2264 spin_lock_irq(&sb->s_inode_wblist_lock);
2209 } 2265 }
2210 spin_unlock(&sb->s_inode_list_lock); 2266 spin_unlock_irq(&sb->s_inode_wblist_lock);
2211 iput(old_inode); 2267 rcu_read_unlock();
2212 mutex_unlock(&sb->s_sync_lock); 2268 mutex_unlock(&sb->s_sync_lock);
2213} 2269}
2214 2270