aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2011-08-07 01:45:50 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2011-08-07 01:53:23 -0400
commit 3ddcd0569cd68f00f3beae9a7959b72918bb91f4 (patch)
tree 3f7c591316560b1c22e2cc0700fbcd29aa3fbd7f
parent 830c0f0edca67403d361fe976a25b17356c11f19 (diff)
vfs: optimize inode cache access patterns
The inode structure layout is largely random, and some of the vfs paths really do care. The path lookup in particular is already quite D$ intensive, and profiles show that accessing the 'inode->i_op->xyz' fields is quite costly.

We already optimized the dcache to not unnecessarily load the d_op structure for members that are often NULL using the DCACHE_OP_xyz bits in dentry->d_flags, and this does something very similar for the inode ops that are used during pathname lookup.

It also re-orders the fields so that the fields accessed by 'stat' are together at the beginning of the inode structure, and roughly in the order accessed.

The effect of this seems to be in the 1-2% range for an empty kernel "make -j" run (which is fairly kernel-intensive, mostly in filename lookup), so it's visible. The numbers are fairly noisy, though, and likely depend a lot on exact microarchitecture. So there's more tuning to be done.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- fs/inode.c 1
-rw-r--r-- fs/namei.c 76
-rw-r--r-- fs/stat.c 4
-rw-r--r-- include/linux/fs.h 59
4 files changed, 106 insertions, 34 deletions
diff --git a/fs/inode.c b/fs/inode.c
index 5aab80dc008c..73920d555c88 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -143,6 +143,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
143 inode->i_op = &empty_iops; 143 inode->i_op = &empty_iops;
144 inode->i_fop = &empty_fops; 144 inode->i_fop = &empty_fops;
145 inode->i_nlink = 1; 145 inode->i_nlink = 1;
146 inode->i_opflags = 0;
146 inode->i_uid = 0; 147 inode->i_uid = 0;
147 inode->i_gid = 0; 148 inode->i_gid = 0;
148 atomic_set(&inode->i_writecount, 0); 149 atomic_set(&inode->i_writecount, 0);
diff --git a/fs/namei.c b/fs/namei.c
index 3d607bd80e09..4a98bf154d88 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -308,6 +308,26 @@ int generic_permission(struct inode *inode, int mask)
308 return -EACCES; 308 return -EACCES;
309} 309}
310 310
311/*
312 * We _really_ want to just do "generic_permission()" without
313 * even looking at the inode->i_op values. So we keep a cache
314 * flag in inode->i_opflags, that says "this has no special
315 * permission function, use the fast case". Returns the result of inode->i_op->permission() when the inode has one, otherwise the result of generic_permission().
316 */
317static inline int do_inode_permission(struct inode *inode, int mask)
318{
319 if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { /* flag not cached yet: must inspect i_op (tested without taking i_lock) */
320 if (likely(inode->i_op->permission)) /* inode supplies its own permission method: always call it, never set the flag */
321 return inode->i_op->permission(inode, mask);
322
323 /* No ->permission method: remember that. This gets set once for the inode lifetime; the update is done under i_lock */
324 spin_lock(&inode->i_lock);
325 inode->i_opflags |= IOP_FASTPERM;
326 spin_unlock(&inode->i_lock);
327 }
328 return generic_permission(inode, mask); /* fast case: flag already set, or just set above */
329}
330
311/** 331/**
312 * inode_permission - check for access rights to a given inode 332 * inode_permission - check for access rights to a given inode
313 * @inode: inode to check permission on 333 * @inode: inode to check permission on
@@ -322,7 +342,7 @@ int inode_permission(struct inode *inode, int mask)
322{ 342{
323 int retval; 343 int retval;
324 344
325 if (mask & MAY_WRITE) { 345 if (unlikely(mask & MAY_WRITE)) {
326 umode_t mode = inode->i_mode; 346 umode_t mode = inode->i_mode;
327 347
328 /* 348 /*
@@ -339,11 +359,7 @@ int inode_permission(struct inode *inode, int mask)
339 return -EACCES; 359 return -EACCES;
340 } 360 }
341 361
342 if (inode->i_op->permission) 362 retval = do_inode_permission(inode, mask);
343 retval = inode->i_op->permission(inode, mask);
344 else
345 retval = generic_permission(inode, mask);
346
347 if (retval) 363 if (retval)
348 return retval; 364 return retval;
349 365
@@ -1245,6 +1261,26 @@ static void terminate_walk(struct nameidata *nd)
1245 } 1261 }
1246} 1262}
1247 1263
1264/*
1265 * Do we need to follow links? We _really_ want to be able
1266 * to do this check without having to look at inode->i_op,
1267 * so we keep a cache of "no, this doesn't need follow_link"
1268 * for the common case. Returns the caller's 'follow' value when the inode has a ->follow_link method, and 0 otherwise.
1269 */
1270static inline int do_follow_link(struct inode *inode, int follow)
1271{
1272 if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { /* "no follow_link" not cached yet: must inspect i_op (tested without taking i_lock) */
1273 if (likely(inode->i_op->follow_link)) /* inode has ->follow_link: the answer is whatever the caller asked for */
1274 return follow;
1275
1276 /* No ->follow_link method: remember that. This gets set once for the inode lifetime; the update is done under i_lock */
1277 spin_lock(&inode->i_lock);
1278 inode->i_opflags |= IOP_NOFOLLOW;
1279 spin_unlock(&inode->i_lock);
1280 }
1281 return 0; /* nothing to follow, regardless of 'follow' */
1282}
1283
1248static inline int walk_component(struct nameidata *nd, struct path *path, 1284static inline int walk_component(struct nameidata *nd, struct path *path,
1249 struct qstr *name, int type, int follow) 1285 struct qstr *name, int type, int follow)
1250{ 1286{
@@ -1267,7 +1303,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
1267 terminate_walk(nd); 1303 terminate_walk(nd);
1268 return -ENOENT; 1304 return -ENOENT;
1269 } 1305 }
1270 if (unlikely(inode->i_op->follow_link) && follow) { 1306 if (do_follow_link(inode, follow)) {
1271 if (nd->flags & LOOKUP_RCU) { 1307 if (nd->flags & LOOKUP_RCU) {
1272 if (unlikely(unlazy_walk(nd, path->dentry))) { 1308 if (unlikely(unlazy_walk(nd, path->dentry))) {
1273 terminate_walk(nd); 1309 terminate_walk(nd);
@@ -1320,6 +1356,26 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
1320} 1356}
1321 1357
1322/* 1358/*
1359 * We really don't want to look at inode->i_op->lookup
1360 * when we don't have to. So we keep a cache bit in
1361 * the inode ->i_opflags field that says "yes, we can
1362 * do lookup on this inode". Returns 1 when the inode has a ->lookup method, 0 when it does not.
1363 */
1364static inline int can_lookup(struct inode *inode)
1365{
1366 if (likely(inode->i_opflags & IOP_LOOKUP)) /* cached "yes": answer without loading i_op (tested without taking i_lock) */
1367 return 1;
1368 if (likely(!inode->i_op->lookup)) /* no ->lookup method: answer "no"; only the positive answer is cached */
1369 return 0;
1370
1371 /* ->lookup exists: cache that. We do this once for the lifetime of the inode; the update is done under i_lock */
1372 spin_lock(&inode->i_lock);
1373 inode->i_opflags |= IOP_LOOKUP;
1374 spin_unlock(&inode->i_lock);
1375 return 1;
1376}
1377
1378/*
1323 * Name resolution. 1379 * Name resolution.
1324 * This is the basic name resolution function, turning a pathname into 1380 * This is the basic name resolution function, turning a pathname into
1325 * the final dentry. We expect 'base' to be positive and a directory. 1381 * the final dentry. We expect 'base' to be positive and a directory.
@@ -1398,10 +1454,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1398 if (err) 1454 if (err)
1399 return err; 1455 return err;
1400 } 1456 }
1457 if (can_lookup(nd->inode))
1458 continue;
1401 err = -ENOTDIR; 1459 err = -ENOTDIR;
1402 if (!nd->inode->i_op->lookup) 1460 break;
1403 break;
1404 continue;
1405 /* here ends the main loop */ 1461 /* here ends the main loop */
1406 1462
1407last_component: 1463last_component:
diff --git a/fs/stat.c b/fs/stat.c
index 961039121cb8..ba5316ffac61 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -27,12 +27,12 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
27 stat->uid = inode->i_uid; 27 stat->uid = inode->i_uid;
28 stat->gid = inode->i_gid; 28 stat->gid = inode->i_gid;
29 stat->rdev = inode->i_rdev; 29 stat->rdev = inode->i_rdev;
30 stat->size = i_size_read(inode);
30 stat->atime = inode->i_atime; 31 stat->atime = inode->i_atime;
31 stat->mtime = inode->i_mtime; 32 stat->mtime = inode->i_mtime;
32 stat->ctime = inode->i_ctime; 33 stat->ctime = inode->i_ctime;
33 stat->size = i_size_read(inode);
34 stat->blocks = inode->i_blocks;
35 stat->blksize = (1 << inode->i_blkbits); 34 stat->blksize = (1 << inode->i_blkbits);
35 stat->blocks = inode->i_blocks;
36} 36}
37 37
38EXPORT_SYMBOL(generic_fillattr); 38EXPORT_SYMBOL(generic_fillattr);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 786b3b1113cf..178cdb4f1d4a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -738,22 +738,54 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
738struct posix_acl; 738struct posix_acl;
739#define ACL_NOT_CACHED ((void *)(-1)) 739#define ACL_NOT_CACHED ((void *)(-1))
740 740
741#define IOP_FASTPERM 0x0001
742#define IOP_LOOKUP 0x0002
743#define IOP_NOFOLLOW 0x0004
744
745/*
746 * Keep mostly read-only and often accessed (especially for
747 * the RCU path lookup and 'stat' data) fields at the beginning
748 * of the 'struct inode'
749 */
741struct inode { 750struct inode {
742 /* RCU path lookup touches following: */
743 umode_t i_mode; 751 umode_t i_mode;
752 unsigned short i_opflags;
744 uid_t i_uid; 753 uid_t i_uid;
745 gid_t i_gid; 754 gid_t i_gid;
755 unsigned int i_flags;
756
757#ifdef CONFIG_FS_POSIX_ACL
758 struct posix_acl *i_acl;
759 struct posix_acl *i_default_acl;
760#endif
761
746 const struct inode_operations *i_op; 762 const struct inode_operations *i_op;
747 struct super_block *i_sb; 763 struct super_block *i_sb;
764 struct address_space *i_mapping;
748 765
749 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
750 unsigned int i_flags;
751 unsigned long i_state;
752#ifdef CONFIG_SECURITY 766#ifdef CONFIG_SECURITY
753 void *i_security; 767 void *i_security;
754#endif 768#endif
755 struct mutex i_mutex;
756 769
770 /* Stat data, not accessed from path walking */
771 unsigned long i_ino;
772 unsigned int i_nlink;
773 dev_t i_rdev;
774 loff_t i_size;
775 struct timespec i_atime;
776 struct timespec i_mtime;
777 struct timespec i_ctime;
778 unsigned int i_blkbits;
779 blkcnt_t i_blocks;
780
781#ifdef __NEED_I_SIZE_ORDERED
782 seqcount_t i_size_seqcount;
783#endif
784
785 /* Misc */
786 unsigned long i_state;
787 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
788 struct mutex i_mutex;
757 789
758 unsigned long dirtied_when; /* jiffies of first dirtying */ 790 unsigned long dirtied_when; /* jiffies of first dirtying */
759 791
@@ -765,25 +797,12 @@ struct inode {
765 struct list_head i_dentry; 797 struct list_head i_dentry;
766 struct rcu_head i_rcu; 798 struct rcu_head i_rcu;
767 }; 799 };
768 unsigned long i_ino;
769 atomic_t i_count; 800 atomic_t i_count;
770 unsigned int i_nlink;
771 dev_t i_rdev;
772 unsigned int i_blkbits;
773 u64 i_version; 801 u64 i_version;
774 loff_t i_size;
775#ifdef __NEED_I_SIZE_ORDERED
776 seqcount_t i_size_seqcount;
777#endif
778 struct timespec i_atime;
779 struct timespec i_mtime;
780 struct timespec i_ctime;
781 blkcnt_t i_blocks;
782 unsigned short i_bytes; 802 unsigned short i_bytes;
783 atomic_t i_dio_count; 803 atomic_t i_dio_count;
784 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ 804 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
785 struct file_lock *i_flock; 805 struct file_lock *i_flock;
786 struct address_space *i_mapping;
787 struct address_space i_data; 806 struct address_space i_data;
788#ifdef CONFIG_QUOTA 807#ifdef CONFIG_QUOTA
789 struct dquot *i_dquot[MAXQUOTAS]; 808 struct dquot *i_dquot[MAXQUOTAS];
@@ -806,10 +825,6 @@ struct inode {
806 atomic_t i_readcount; /* struct files open RO */ 825 atomic_t i_readcount; /* struct files open RO */
807#endif 826#endif
808 atomic_t i_writecount; 827 atomic_t i_writecount;
809#ifdef CONFIG_FS_POSIX_ACL
810 struct posix_acl *i_acl;
811 struct posix_acl *i_default_acl;
812#endif
813 void *i_private; /* fs or device private pointer */ 828 void *i_private; /* fs or device private pointer */
814}; 829};
815 830