From 44396f4b5cb8566f7118aec55eeac99be7ad94cb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 31 May 2011 11:58:49 -0400 Subject: fs: add a DCACHE_NEED_LOOKUP flag for d_flags Btrfs (and I'd venture most other fs's) stores its indexes in nice disk order for readdir, but unfortunately in the case of anything that stats the files in order that readdir spits back (like oh say ls) that means we still have to do the normal lookup of the file, which means looking up our other index and then looking up the inode. What I want is a way to create dummy dentries when we find them in readdir so that when ls or anything else subsequently does a stat(), we already have the location information in the dentry and can go straight to the inode itself. The lookup stuff just assumes that if it finds a dentry it is done, it doesn't perform a lookup. So add a DCACHE_NEED_LOOKUP flag so that the lookup code knows it still needs to run i_op->lookup() on the parent to get the inode for the dentry. I have tested this with btrfs and I went from something that looks like this http://people.redhat.com/jwhiter/ls-noreada.png To this http://people.redhat.com/jwhiter/ls-good.png Thats a savings of 1300 seconds, or 22 minutes. That is a significant savings. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Al Viro --- include/linux/dcache.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 19d90a55541d..5fa5bd33b979 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -216,6 +216,7 @@ struct dentry_operations { #define DCACHE_MOUNTED 0x10000 /* is a mountpoint */ #define DCACHE_NEED_AUTOMOUNT 0x20000 /* handle automount on this dir */ #define DCACHE_MANAGE_TRANSIT 0x40000 /* manage transit from this dirent */ +#define DCACHE_NEED_LOOKUP 0x80000 /* dentry requires i_op->lookup */ #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) @@ -416,6 +417,12 @@ static inline bool d_mountpoint(struct dentry *dentry) return dentry->d_flags & DCACHE_MOUNTED; } +static inline bool d_need_lookup(struct dentry *dentry) +{ + return dentry->d_flags & DCACHE_NEED_LOOKUP; +} + +extern void d_clear_need_lookup(struct dentry *dentry); extern struct dentry *lookup_create(struct nameidata *nd, int is_dir); extern int sysctl_vfs_cache_pressure; -- cgit v1.2.2 From 43e15cdbefea4ce6d68113de98d4f61c0cf45687 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jun 2011 20:16:57 -0400 Subject: new helper: iterate_supers_type() Call the given function for all superblocks of given type. Function gets a superblock (with s_umount locked shared) and (void *) argument supplied by caller of iterator. Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index b5b979247863..a8735e7e1b35 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2432,6 +2432,8 @@ extern struct super_block *get_active_super(struct block_device *bdev); extern struct super_block *user_get_super(dev_t); extern void drop_super(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); +extern void iterate_supers_type(struct file_system_type *, + void (*)(struct super_block *, void *), void *); extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); -- cgit v1.2.2 From 1b5d783c94c328d406e801566f161adcfb018dda Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 19 Jun 2011 12:49:47 -0400 Subject: consolidate BINPRM_FLAGS_ENFORCE_NONDUMP handling new helper: would_dump(bprm, file). Checks if we are allowed to read the file and if we are not - sets ENFORCE_NODUMP. Exported, used in places that previously open-coded the same logics. Signed-off-by: Al Viro --- include/linux/binfmts.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 8845613fd7e3..fd88a3945aa1 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -111,6 +111,7 @@ extern int __must_check remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *, struct pt_regs *); extern int flush_old_exec(struct linux_binprm * bprm); extern void setup_new_exec(struct linux_binprm * bprm); +extern void would_dump(struct linux_binprm *, struct file *); extern int suid_dumpable; #define SUID_DUMP_DISABLE 0 /* No setuid dumping */ -- cgit v1.2.2 From 3bfa784a6539f91a27d7ffdd408efdb638e3bebd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 19 Jun 2011 12:55:10 -0400 Subject: kill file_permission() completely convert the last remaining caller to inode_permission() Signed-off-by: Al Viro --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index a8735e7e1b35..d04e55586a17 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1490,7 +1490,6 @@ extern void dentry_unhash(struct dentry *dentry); /* * VFS file helper functions. */ -extern int file_permission(struct file *, int); extern void inode_init_owner(struct inode *inode, const struct inode *dir, mode_t mode); /* -- cgit v1.2.2 From 07b8ce1ee87d291ff564c02cf878fae973317a52 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 10:52:57 -0400 Subject: lockless get_write_access/deny_write_access new helpers: atomic_inc_unless_negative()/atomic_dec_unless_positive() Signed-off-by: Al Viro --- include/linux/atomic.h | 26 ++++++++++++++++++++++++++ include/linux/fs.h | 29 ++++++++++++++++++++++++++--- 2 files changed, 52 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/atomic.h b/include/linux/atomic.h index ee456c79b0e6..bc6615d4132b 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -34,6 +34,32 @@ static inline int atomic_inc_not_zero_hint(atomic_t *v, int hint) } #endif +#ifndef atomic_inc_unless_negative +static inline int atomic_inc_unless_negative(atomic_t *p) +{ + int v, v1; + for (v = 0; v >= 0; v = v1) { + v1 = atomic_cmpxchg(p, v, v + 1); + if (likely(v1 == v)) + return 1; + } + return 0; +} +#endif + +#ifndef atomic_dec_unless_positive +static inline int atomic_dec_unless_positive(atomic_t *p) +{ + int v, v1; + for (v = 0; v <= 0; v = v1) { + v1 = atomic_cmpxchg(p, v, v - 1); + if (likely(v1 == v)) + return 1; + } + return 0; +} +#endif + #ifndef CONFIG_ARCH_HAS_ATOMIC_OR static inline void atomic_or(int i, atomic_t *v) { diff --git a/include/linux/fs.h b/include/linux/fs.h index d04e55586a17..8c84ed930389 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -392,8 +392,8 @@ struct inodes_stat_t { #include #include #include +#include -#include #include struct export_operations; @@ -2195,8 +2195,31 @@ static inline bool execute_ok(struct inode *inode) return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); } -extern int get_write_access(struct inode *); -extern int deny_write_access(struct file *); +/* + * get_write_access() gets write permission for a file. + * put_write_access() releases this write permission. + * This is used for regular files. + * We cannot support write (and maybe mmap read-write shared) accesses and + * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode + * can have the following values: + * 0: no writers, no VM_DENYWRITE mappings + * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist + * > 0: (i_writecount) users are writing to the file. + * + * Normally we operate on that counter with atomic_{inc,dec} and it's safe + * except for the cases where we don't hold i_writecount yet. Then we need to + * use {get,deny}_write_access() - these functions check the sign and refuse + * to do the change if sign is wrong. + */ +static inline int get_write_access(struct inode *inode) +{ + return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY; +} +static inline int deny_write_access(struct file *file) +{ + struct inode *inode = file->f_path.dentry->d_inode; + return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; +} static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); -- cgit v1.2.2 From 178ea73521d64ba41d7aa5488fb9f549c6d4507d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 11:31:30 -0400 Subject: kill check_acl callback of generic_permission() its value depends only on inode and does not change; we might as well store it in ->i_op->check_acl and be done with that. Signed-off-by: Al Viro --- include/linux/fs.h | 3 +-- include/linux/reiserfs_xattr.h | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8c84ed930389..0c15d5e459d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2187,8 +2187,7 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *); extern int inode_permission(struct inode *, int); -extern int generic_permission(struct inode *, int, unsigned int, - int (*check_acl)(struct inode *, int, unsigned int)); +extern int generic_permission(struct inode *, int, unsigned int); static inline bool execute_ok(struct inode *inode) { diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index 6deef5dc95fb..1a3ca8f80200 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -45,6 +45,7 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags); #ifdef CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) +int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags); ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size); int reiserfs_setxattr(struct dentry *dentry, const char *name, @@ -122,6 +123,7 @@ static inline void reiserfs_init_xattr_rwsem(struct inode *inode) #define reiserfs_setxattr NULL #define reiserfs_listxattr NULL #define reiserfs_removexattr NULL +#define reiserfs_check_acl NULL static inline void reiserfs_init_xattr_rwsem(struct inode *inode) { -- cgit v1.2.2 From 1fc0f78ca9f311c6277e2f1b7655bb4d43ceb311 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 18:59:02 -0400 Subject: ->permission() sanitizing: MAY_NOT_BLOCK Duplicate the flags argument into mask bitmap. Signed-off-by: Al Viro --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0c15d5e459d5..60c1fe65bb2d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -63,6 +63,7 @@ struct inodes_stat_t { #define MAY_ACCESS 16 #define MAY_OPEN 32 #define MAY_CHDIR 64 +#define MAY_NOT_BLOCK 128 /* called from RCU mode, don't block */ /* * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond -- cgit v1.2.2 From 7e40145eb111a5192e6d819f764db9d6828d1abb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 19:12:17 -0400 Subject: ->permission() sanitizing: don't pass flags to ->check_acl() not used in the instances anymore. Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- include/linux/generic_acl.h | 2 +- include/linux/reiserfs_xattr.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 60c1fe65bb2d..f218b42718aa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1579,7 +1579,7 @@ struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); void * (*follow_link) (struct dentry *, struct nameidata *); int (*permission) (struct inode *, int, unsigned int); - int (*check_acl)(struct inode *, int, unsigned int); + int (*check_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); void (*put_link) (struct dentry *, struct nameidata *, void *); diff --git a/include/linux/generic_acl.h b/include/linux/generic_acl.h index 0437e377b555..574bea4013b6 100644 --- a/include/linux/generic_acl.h +++ b/include/linux/generic_acl.h @@ -10,6 +10,6 @@ extern const struct xattr_handler generic_acl_default_handler; int generic_acl_init(struct inode *, struct inode *); int generic_acl_chmod(struct inode *); -int generic_check_acl(struct inode *inode, int mask, unsigned int flags); +int generic_check_acl(struct inode *inode, int mask); #endif /* LINUX_GENERIC_ACL_H */ diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index 1a3ca8f80200..424ba6416e7c 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -45,7 +45,7 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags); #ifdef CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) -int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags); +int reiserfs_check_acl(struct inode *inode, int mask); ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size); int reiserfs_setxattr(struct dentry *dentry, const char *name, -- cgit v1.2.2 From 2830ba7f34ebb27c4e5b8b6ef408cd6d74860890 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 19:16:29 -0400 Subject: ->permission() sanitizing: don't pass flags to generic_permission() redundant; all callers get it duplicated in mask & MAY_NOT_BLOCK and none of them removes that bit. Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index f218b42718aa..a1689c13eb77 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2188,7 +2188,7 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *); extern int inode_permission(struct inode *, int); -extern int generic_permission(struct inode *, int, unsigned int); +extern int generic_permission(struct inode *, int); static inline bool execute_ok(struct inode *inode) { -- cgit v1.2.2 From 10556cb21a0d0b24d95f00ea6df16f599a3345b2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 19:28:19 -0400 Subject: ->permission() sanitizing: don't pass flags to ->permission() not used by the instances anymore. Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- include/linux/nfs_fs.h | 2 +- include/linux/reiserfs_xattr.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index a1689c13eb77..1fdefe3995fd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1578,7 +1578,7 @@ struct file_operations { struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); void * (*follow_link) (struct dentry *, struct nameidata *); - int (*permission) (struct inode *, int, unsigned int); + int (*permission) (struct inode *, int); int (*check_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 1b93b9c60e55..86014811a0fb 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -360,7 +360,7 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -extern int nfs_permission(struct inode *, int, unsigned int); +extern int nfs_permission(struct inode *, int); extern int nfs_open(struct inode *, struct file *); extern int nfs_release(struct inode *, struct file *); extern int nfs_attribute_timeout(struct inode *inode); diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index 424ba6416e7c..57958c0e1d38 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -41,7 +41,7 @@ int reiserfs_xattr_init(struct super_block *sb, int mount_flags); int reiserfs_lookup_privroot(struct super_block *sb); int reiserfs_delete_xattrs(struct inode *inode); int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); -int reiserfs_permission(struct inode *inode, int mask, unsigned int flags); +int reiserfs_permission(struct inode *inode, int mask); #ifdef CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) -- cgit v1.2.2 From e74f71eb78a4a8b9eaf1bc65f20f761648e85f76 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 19:38:15 -0400 Subject: ->permission() sanitizing: don't pass flags to ->inode_permission() pass that via mask instead. Signed-off-by: Al Viro --- include/linux/security.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/security.h b/include/linux/security.h index 8ce59ef3e5af..ca02f1716736 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1456,7 +1456,7 @@ struct security_operations { struct inode *new_dir, struct dentry *new_dentry); int (*inode_readlink) (struct dentry *dentry); int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd); - int (*inode_permission) (struct inode *inode, int mask, unsigned flags); + int (*inode_permission) (struct inode *inode, int mask); int (*inode_setattr) (struct dentry *dentry, struct iattr *attr); int (*inode_getattr) (struct vfsmount *mnt, struct dentry *dentry); int (*inode_setxattr) (struct dentry *dentry, const char *name, -- cgit v1.2.2 From eecdd358b467405a084d400d5ec571bbdbfe97a3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 19:48:41 -0400 Subject: ->permission() sanitizing: don't pass flags to exec_permission() pass mask instead; kill security_inode_exec_permission() since we can use security_inode_permission() instead. Signed-off-by: Al Viro --- include/linux/security.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/security.h b/include/linux/security.h index ca02f1716736..ebd2a53a3d07 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1720,7 +1720,6 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, int security_inode_readlink(struct dentry *dentry); int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd); int security_inode_permission(struct inode *inode, int mask); -int security_inode_exec_permission(struct inode *inode, unsigned int flags); int security_inode_setattr(struct dentry *dentry, struct iattr *attr); int security_inode_getattr(struct vfsmount *mnt, struct dentry *dentry); int security_inode_setxattr(struct dentry *dentry, const char *name, @@ -2113,12 +2112,6 @@ static inline int security_inode_permission(struct inode *inode, int mask) return 0; } -static inline int security_inode_exec_permission(struct inode *inode, - unsigned int flags) -{ - return 0; -} - static inline int security_inode_setattr(struct dentry *dentry, struct iattr *attr) { -- cgit v1.2.2 From 729cdb3a1ee03a4363f9c7e66ddd979727e99e1f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 Jun 2011 01:01:22 -0400 Subject: kill IPERM_FLAG_RCU not used anymore Signed-off-by: Al Viro --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1fdefe3995fd..8494aac189f0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1573,8 +1573,6 @@ struct file_operations { loff_t len); }; -#define IPERM_FLAG_RCU 0x0001 - struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); void * (*follow_link) (struct dentry *, struct nameidata *); -- cgit v1.2.2 From 3d4ff43d895c50319af45eb4bf04a4618eccdf76 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 22 Jun 2011 18:40:12 -0400 Subject: nfs_open_context doesn't need struct path either just dentry, please... Signed-off-by: Al Viro --- include/linux/nfs_fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 86014811a0fb..8b579beb6358 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -85,7 +85,7 @@ struct nfs_lock_context { struct nfs4_state; struct nfs_open_context { struct nfs_lock_context lock_context; - struct path path; + struct dentry *dentry; struct rpc_cred *cred; struct nfs4_state *state; fmode_t mode; @@ -372,7 +372,7 @@ extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); extern void put_nfs_open_context(struct nfs_open_context *ctx); extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); -extern struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode); +extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred, fmode_t f_mode); extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx); extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); -- cgit v1.2.2 From 49084c3bb2055c401f3493c13edae14d49128ca0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 25 Jun 2011 21:59:52 -0400 Subject: kill LOOKUP_CONTINUE LOOKUP_PARENT is equivalent to it now Signed-off-by: Al Viro --- include/linux/namei.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index eba45ea10298..3439ab862e1d 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -48,7 +48,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; */ #define LOOKUP_FOLLOW 0x0001 #define LOOKUP_DIRECTORY 0x0002 -#define LOOKUP_CONTINUE 0x0004 #define LOOKUP_PARENT 0x0010 #define LOOKUP_REVAL 0x0020 -- cgit v1.2.2 From dae6ad8f37529963ae7df52baaccf056b38f210e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Jun 2011 11:50:15 -0400 Subject: new helpers: kern_path_create/user_path_create combination of kern_path_parent() and lookup_create(). Does *not* expose struct nameidata to caller. Syscalls converted to that... Signed-off-by: Al Viro --- include/linux/namei.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index 3439ab862e1d..b8cea804d31a 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -74,6 +74,8 @@ extern int user_path_at(int, const char __user *, unsigned, struct path *); extern int kern_path(const char *, unsigned, struct path *); +extern struct dentry *kern_path_create(int, const char *, struct path *, int); +extern struct dentry *user_path_create(int, const char __user *, struct path *, int); extern int kern_path_parent(const char *, struct nameidata *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct nameidata *); -- cgit v1.2.2 From 6657719390cd05be45f4e3b501d8bb46889c0a19 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 28 Jun 2011 15:41:10 -0400 Subject: make sure that nsproxy_cache is initialized early enough Signed-off-by: Al Viro --- include/linux/nsproxy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 50d20aba57d3..cc37a55ad004 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -68,6 +68,7 @@ void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct fs_struct *); +int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) { -- cgit v1.2.2 From ed75e95de574c99575e5f3e1d9ca59ea8c12a9cb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Jun 2011 16:53:43 -0400 Subject: kill lookup_create() folded into the only caller (kern_path_create()) Signed-off-by: Al Viro --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 5fa5bd33b979..3f22d8d6d8a3 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -423,7 +423,6 @@ static inline bool d_need_lookup(struct dentry *dentry) } extern void d_clear_need_lookup(struct dentry *dentry); -extern struct dentry *lookup_create(struct nameidata *nd, int is_dir); extern int sysctl_vfs_cache_pressure; -- cgit v1.2.2 From e0a0124936171af6156b80fe8ac8799f039e767f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Jun 2011 17:00:37 -0400 Subject: switch vfs_path_lookup() to struct path Signed-off-by: Al Viro --- include/linux/namei.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index b8cea804d31a..76fe2c62ae71 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -78,7 +78,7 @@ extern struct dentry *kern_path_create(int, const char *, struct path *, int); extern struct dentry *user_path_create(int, const char __user *, struct path *, int); extern int kern_path_parent(const char *, struct nameidata *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, - const char *, unsigned int, struct nameidata *); + const char *, unsigned int, struct path *); extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, int (*open)(struct inode *, struct file *)); -- cgit v1.2.2 From 0ee5dc676a5f8fadede608c7281dfedb1ae714ea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Jul 2011 15:44:25 -0400 Subject: btrfs: kill magical embedded struct superblock Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8494aac189f0..a0011aef4338 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1835,6 +1835,8 @@ void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); +int get_anon_bdev(dev_t *); +void free_anon_bdev(dev_t); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), -- cgit v1.2.2 From 095760730c1047c69159ce88021a7fa3833502c8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:34 +1000 Subject: vmscan: add shrink_slab tracepoints It is impossible to understand what the shrinkers are actually doing without instrumenting the code, so add a some tracepoints to allow insight to be gained. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- include/trace/events/vmscan.h | 77 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) (limited to 'include') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index b2c33bd955fa..36851f7f13da 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -179,6 +179,83 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re TP_ARGS(nr_reclaimed) ); +TRACE_EVENT(mm_shrink_slab_start, + TP_PROTO(struct shrinker *shr, struct shrink_control *sc, + long nr_objects_to_shrink, unsigned long pgs_scanned, + unsigned long lru_pgs, unsigned long cache_items, + unsigned long long delta, unsigned long total_scan), + + TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs, + cache_items, delta, total_scan), + + TP_STRUCT__entry( + __field(struct shrinker *, shr) + __field(void *, shrink) + __field(long, nr_objects_to_shrink) + __field(gfp_t, gfp_flags) + __field(unsigned long, pgs_scanned) + __field(unsigned long, lru_pgs) + __field(unsigned long, cache_items) + __field(unsigned long long, delta) + __field(unsigned long, total_scan) + ), + + TP_fast_assign( + __entry->shr = shr; + __entry->shrink = shr->shrink; + __entry->nr_objects_to_shrink = nr_objects_to_shrink; + __entry->gfp_flags = sc->gfp_mask; + __entry->pgs_scanned = pgs_scanned; + __entry->lru_pgs = lru_pgs; + __entry->cache_items = cache_items; + __entry->delta = delta; + __entry->total_scan = total_scan; + ), + + TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", + __entry->shrink, + __entry->shr, + __entry->nr_objects_to_shrink, + show_gfp_flags(__entry->gfp_flags), + __entry->pgs_scanned, + __entry->lru_pgs, + __entry->cache_items, + __entry->delta, + __entry->total_scan) +); + +TRACE_EVENT(mm_shrink_slab_end, + TP_PROTO(struct shrinker *shr, int shrinker_retval, + long unused_scan_cnt, long new_scan_cnt), + + TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt), + + TP_STRUCT__entry( + __field(struct shrinker *, shr) + __field(void *, shrink) + __field(long, unused_scan) + __field(long, new_scan) + __field(int, retval) + __field(long, total_scan) + ), + + TP_fast_assign( + __entry->shr = shr; + __entry->shrink = shr->shrink; + __entry->unused_scan = unused_scan_cnt; + __entry->new_scan = new_scan_cnt; + __entry->retval = shrinker_retval; + __entry->total_scan = new_scan_cnt - unused_scan_cnt; + ), + + TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + __entry->shrink, + __entry->shr, + __entry->unused_scan, + __entry->new_scan, + __entry->total_scan, + __entry->retval) +); DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, -- cgit v1.2.2 From e9299f5058595a655c3b207cda9635e28b9197e6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:37 +1000 Subject: vmscan: add customisable shrinker batch size For shrinkers that have their own cond_resched* calls, having shrink_slab break the work down into small batches is not paticularly efficient. Add a custom batchsize field to the struct shrinker so that shrinkers can use a larger batch size if they desire. A value of zero (uninitialised) means "use the default", so behaviour is unchanged by this patch. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9670f71d7be9..9b9777ac726d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1150,6 +1150,7 @@ struct shrink_control { struct shrinker { int (*shrink)(struct shrinker *, struct shrink_control *sc); int seeks; /* seeks to recreate an obj */ + long batch; /* reclaim batch size, 0 = default */ /* These are for internal use */ struct list_head list; -- cgit v1.2.2 From 98b745c647a5a90c3c21ea43cbfad9a47b0dfad7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:39 +1000 Subject: inode: Make unused inode LRU per superblock The inode unused list is currently a global LRU. This does not match the other global filesystem cache - the dentry cache - which uses per-superblock LRU lists. Hence we have related filesystem object types using different LRU reclaimation schemes. To enable a per-superblock filesystem cache shrinker, both of these caches need to have per-sb unused object LRU lists. Hence this patch converts the global inode LRU to per-sb LRUs. The patch only does rudimentary per-sb propotioning in the shrinker infrastructure, as this gets removed when the per-sb shrinker callouts are introduced later on. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index a0011aef4338..9724f0a48742 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1397,6 +1397,10 @@ struct super_block { struct list_head s_dentry_lru; /* unused dentry lru */ int s_nr_dentry_unused; /* # of dentry on lru */ + /* inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */ + struct list_head s_inode_lru; /* unused inode lru */ + int s_nr_inodes_unused; /* # of inodes on lru */ + struct block_device *s_bdev; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; -- cgit v1.2.2 From 09cc9fc7a7c3d872065426d7fb0f0ad6d3eb90fc Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:40 +1000 Subject: inode: move to per-sb LRU locks With the inode LRUs moving to per-sb structures, there is no longer a need for a global inode_lru_lock. The locking can be made more fine-grained by moving to a per-sb LRU lock, isolating the LRU operations of different filesytsems completely from each other. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9724f0a48742..460d2cc21ec6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1397,7 +1397,8 @@ struct super_block { struct list_head s_dentry_lru; /* unused dentry lru */ int s_nr_dentry_unused; /* # of dentry on lru */ - /* inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */ + /* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */ + spinlock_t s_inode_lru_lock ____cacheline_aligned_in_smp; struct list_head s_inode_lru; /* unused inode lru */ int s_nr_inodes_unused; /* # of inodes on lru */ -- cgit v1.2.2 From b0d40c92adafde7c2d81203ce7c1c69275f41140 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:42 +1000 Subject: superblock: introduce per-sb cache shrinker infrastructure With context based shrinkers, we can implement a per-superblock shrinker that shrinks the caches attached to the superblock. We currently have global shrinkers for the inode and dentry caches that split up into per-superblock operations via a coarse proportioning method that does not batch very well. The global shrinkers also have a dependency - dentries pin inodes - so we have to be very careful about how we register the global shrinkers so that the implicit call order is always correct. With a per-sb shrinker callout, we can encode this dependency directly into the per-sb shrinker, hence avoiding the need for strictly ordering shrinker registrations. We also have no need for any proportioning code for the shrinker subsystem already provides this functionality across all shrinkers. Allowing the shrinker to operate on a single superblock at a time means that we do less superblock list traversals and locking and reclaim should batch more effectively. This should result in less CPU overhead for reclaim and potentially faster reclaim of items from each filesystem. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- include/linux/fs.h | 7 +++++++ include/linux/mm.h | 40 +--------------------------------------- include/linux/shrinker.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 39 deletions(-) create mode 100644 include/linux/shrinker.h (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 460d2cc21ec6..d7f35e90b84a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -393,6 +393,7 @@ struct inodes_stat_t { #include #include #include +#include #include #include @@ -1444,8 +1445,14 @@ struct super_block { * Saved pool identifier for cleancache (-1 means none) */ int cleancache_poolid; + + struct shrinker s_shrink; /* per-sb shrinker handle */ }; +/* superblock cache pruning functions */ +extern void prune_icache_sb(struct super_block *sb, int nr_to_scan); +extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan); + extern struct timespec current_fs_time(struct super_block *sb); /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 9b9777ac726d..e3a1a9eec0b1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -15,6 +15,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1121,45 +1122,6 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) } #endif -/* - * This struct is used to pass information from page reclaim to the shrinkers. - * We consolidate the values for easier extention later. - */ -struct shrink_control { - gfp_t gfp_mask; - - /* How many slab objects shrinker() should scan and try to reclaim */ - unsigned long nr_to_scan; -}; - -/* - * A callback you can register to apply pressure to ageable caches. - * - * 'sc' is passed shrink_control which includes a count 'nr_to_scan' - * and a 'gfpmask'. It should look through the least-recently-used - * 'nr_to_scan' entries and attempt to free them up. It should return - * the number of objects which remain in the cache. If it returns -1, it means - * it cannot do any scanning at this time (eg. there is a risk of deadlock). - * - * The 'gfpmask' refers to the allocation we are currently trying to - * fulfil. - * - * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is - * querying the cache size, so a fastpath for that case is appropriate. - */ -struct shrinker { - int (*shrink)(struct shrinker *, struct shrink_control *sc); - int seeks; /* seeks to recreate an obj */ - long batch; /* reclaim batch size, 0 = default */ - - /* These are for internal use */ - struct list_head list; - long nr; /* objs pending delete */ -}; -#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ -extern void register_shrinker(struct shrinker *); -extern void unregister_shrinker(struct shrinker *); - int vma_wants_writenotify(struct vm_area_struct *vma); extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h new file mode 100644 index 000000000000..790651b4e5ba --- /dev/null +++ b/include/linux/shrinker.h @@ -0,0 +1,42 @@ +#ifndef _LINUX_SHRINKER_H +#define _LINUX_SHRINKER_H + +/* + * This struct is used to pass information from page reclaim to the shrinkers. + * We consolidate the values for easier extention later. + */ +struct shrink_control { + gfp_t gfp_mask; + + /* How many slab objects shrinker() should scan and try to reclaim */ + unsigned long nr_to_scan; +}; + +/* + * A callback you can register to apply pressure to ageable caches. + * + * 'sc' is passed shrink_control which includes a count 'nr_to_scan' + * and a 'gfpmask'. It should look through the least-recently-used + * 'nr_to_scan' entries and attempt to free them up. It should return + * the number of objects which remain in the cache. If it returns -1, it means + * it cannot do any scanning at this time (eg. there is a risk of deadlock). + * + * The 'gfpmask' refers to the allocation we are currently trying to + * fulfil. + * + * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is + * querying the cache size, so a fastpath for that case is appropriate. + */ +struct shrinker { + int (*shrink)(struct shrinker *, struct shrink_control *sc); + int seeks; /* seeks to recreate an obj */ + long batch; /* reclaim batch size, 0 = default */ + + /* These are for internal use */ + struct list_head list; + long nr; /* objs pending delete */ +}; +#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ +extern void register_shrinker(struct shrinker *); +extern void unregister_shrinker(struct shrinker *); +#endif -- cgit v1.2.2 From 0e1fdafd93980eac62e778798549ce0f6073905c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:44 +1000 Subject: superblock: add filesystem shrinker operations Now we have a per-superblock shrinker implementation, we can add a filesystem specific callout to it to allow filesystem internal caches to be shrunk by the superblock shrinker. Rather than perpetuate the multipurpose shrinker callback API (i.e. nr_to_scan == 0 meaning "tell me how many objects freeable in the cache), two operations will be added. The first will return the number of objects that are freeable, the second is the actual shrinker call. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index d7f35e90b84a..1393742bba9b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1655,6 +1655,8 @@ struct super_operations { ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); + int (*nr_cached_objects)(struct super_block *); + void (*free_cached_objects)(struct super_block *, int); }; /* -- cgit v1.2.2 From e46ebd27823b27273da780376d54c080250ff1a2 Mon Sep 17 00:00:00 2001 From: Tomasz Stanislawski Date: Tue, 12 Jul 2011 11:27:20 +0200 Subject: anonfd: fix missing declaration The forward declaration of struct file_operations is added to avoid compilation warnings. Signed-off-by: Tomasz Stanislawski Signed-off-by: Al Viro --- include/linux/anon_inodes.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h index 69a21e0ebd33..8013a45242fe 100644 --- a/include/linux/anon_inodes.h +++ b/include/linux/anon_inodes.h @@ -8,6 +8,8 @@ #ifndef _LINUX_ANON_INODES_H #define _LINUX_ANON_INODES_H +struct file_operations; + struct file *anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags); -- cgit v1.2.2 From bd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:43 -0400 Subject: fs: kill i_alloc_sem i_alloc_sem is a rather special rw_semaphore. It's the last one that may be released by a non-owner, and it's write side is always mirrored by real exclusion. It's intended use it to wait for all pending direct I/O requests to finish before starting a truncate. Replace it with a hand-grown construct: - exclusion for truncates is already guaranteed by i_mutex, so it can simply fall way - the reader side is replaced by an i_dio_count member in struct inode that counts the number of pending direct I/O requests. Truncate can't proceed as long as it's non-zero - when i_dio_count reaches non-zero we wake up a pending truncate using wake_up_bit on a new bit in i_flags - new references to i_dio_count can't appear while we are waiting for it to read zero because the direct I/O count always needs i_mutex (or an equivalent like XFS's i_iolock) for starting a new operation. This scheme is much simpler, and saves the space of a spinlock_t and a struct list_head in struct inode (typically 160 bits on a non-debug 64-bit system). Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1393742bba9b..2fe920774abf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -779,7 +779,7 @@ struct inode { struct timespec i_ctime; blkcnt_t i_blocks; unsigned short i_bytes; - struct rw_semaphore i_alloc_sem; + atomic_t i_dio_count; const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct file_lock *i_flock; struct address_space *i_mapping; @@ -1705,6 +1705,10 @@ struct super_operations { * set during data writeback, and cleared with a wakeup * on the bit address once it is done. * + * I_REFERENCED Marks the inode as recently references on the LRU list. + * + * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -1718,6 +1722,8 @@ struct super_operations { #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) #define I_REFERENCED (1 << 8) +#define __I_DIO_WAKEUP 9 +#define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) @@ -1828,7 +1834,6 @@ struct file_system_type { struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key i_mutex_dir_key; - struct lock_class_key i_alloc_sem_key; }; extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, @@ -2404,6 +2409,8 @@ enum { }; void dio_end_io(struct bio *bio, int error); +void inode_dio_wait(struct inode *inode); +void inode_dio_done(struct inode *inode); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, -- cgit v1.2.2 From 11b80f459adaf91a712f95e7734a17655a36bf30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:44 -0400 Subject: rw_semaphore: remove up/down_read_non_owner Now that the last users is gone these can be removed. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/rwsem.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index a8afe9cd000c..77950dfa0a9e 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -124,19 +124,9 @@ extern void downgrade_write(struct rw_semaphore *sem); */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); -/* - * Take/release a lock when not the owner will release it. - * - * [ This API should be avoided as much as possible - the - * proper abstraction for this case is completions. ] - */ -extern void down_read_non_owner(struct rw_semaphore *sem); -extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) # define down_write_nested(sem, subclass) down_write(sem) -# define down_read_non_owner(sem) down_read(sem) -# define up_read_non_owner(sem) up_read(sem) #endif #endif /* _LINUX_RWSEM_H */ -- cgit v1.2.2 From aacfc19c626ebd3daa675652457d71019a1f583f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:47 -0400 Subject: fs: simplify the blockdev_direct_IO prototype Simple filesystems always pass inode->i_sb_bdev as the block device argument, and never need a end_io handler. Let's simply things for them and for my grepping activity by dropping these arguments. The only thing not falling into that scheme is ext4, which passes and end_io handler without needing special flags (yet), but given how messy the direct I/O code there is use of __blockdev_direct_IO in one instead of two out of three cases isn't going to make a large difference anyway. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2fe920774abf..824453be9fee 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2418,12 +2418,11 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio_submit_t submit_io, int flags); static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, - struct inode *inode, struct block_device *bdev, const struct iovec *iov, - loff_t offset, unsigned long nr_segs, get_block_t get_block, - dio_iodone_t end_io) + struct inode *inode, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block) { - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, NULL, + return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, get_block, NULL, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } #endif -- cgit v1.2.2 From f15146380d28b746df3c8b81b392812eb982382a Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 12 Jul 2011 20:48:39 +0200 Subject: fs: seq_file - add event counter to simplify poll() support Moving the event counter into the dynamically allocated 'struc seq_file' allows poll() support without the need to allocate its own tracking structure. All current users are switched over to use the new counter. Requested-by: Andrew Morton akpm@linux-foundation.org Acked-by: NeilBrown Tested-by: Lucas De Marchi lucas.demarchi@profusion.mobi Signed-off-by: Kay Sievers Signed-off-by: Al Viro --- include/linux/mnt_namespace.h | 1 - include/linux/seq_file.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 0b89efc6f215..29304855652d 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -18,7 +18,6 @@ struct proc_mounts { struct seq_file m; /* must be the first element */ struct mnt_namespace *ns; struct path root; - int event; }; struct fs_struct; diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 03c0232b4169..be720cd2038d 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -23,6 +23,7 @@ struct seq_file { u64 version; struct mutex lock; const struct seq_operations *op; + int poll_event; void *private; }; -- cgit v1.2.2 From 982d816581eeeacfe5b2b7c6d47d13a157616eff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 18 Jul 2011 13:21:35 -0400 Subject: fs: add SEEK_HOLE and SEEK_DATA flags This just gets us ready to support the SEEK_HOLE and SEEK_DATA flags. Turns out using fiemap in things like cp cause more problems than it solves, so lets try and give userspace an interface that doesn't suck. We need to match solaris here, and the definitions are *o* If /whence/ is SEEK_HOLE, the offset of the start of the next hole greater than or equal to the supplied offset is returned. The definition of a hole is provided near the end of the DESCRIPTION. *o* If /whence/ is SEEK_DATA, the file pointer is set to the start of the next non-hole file region greater than or equal to the supplied offset. So in the generic case the entire file is data and there is a virtual hole at the end. That means we will just return i_size for SEEK_HOLE and will return the same offset for SEEK_DATA. This is how Solaris does it so we have to do it the same way. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Al Viro --- include/linux/fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 824453be9fee..4a61f98823a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -32,7 +32,9 @@ #define SEEK_SET 0 /* seek relative to beginning of file */ #define SEEK_CUR 1 /* seek relative to current file position */ #define SEEK_END 2 /* seek relative to end of file */ -#define SEEK_MAX SEEK_END +#define SEEK_DATA 3 /* seek to the next data */ +#define SEEK_HOLE 4 /* seek to the next hole */ +#define SEEK_MAX SEEK_HOLE struct fstrim_range { __u64 start; -- cgit v1.2.2 From 02c24a82187d5a628c68edfe71ae60dc135cd178 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 16 Jul 2011 20:44:56 -0400 Subject: fs: push i_mutex and filemap_write_and_wait down into ->fsync() handlers Btrfs needs to be able to control how filemap_write_and_wait_range() is called in fsync to make it less of a painful operation, so push down taking i_mutex and the calling of filemap_write_and_wait() down into the ->fsync() handlers. Some file systems can drop taking the i_mutex altogether it seems, like ext3 and ocfs2. For correctness sake I just pushed everything down in all cases to make sure that we keep the current behavior the same for everybody, and then each individual fs maintainer can make up their mind about what to do from there. Thanks, Acked-by: Jan Kara Signed-off-by: Josef Bacik Signed-off-by: Al Viro --- include/linux/ext3_fs.h | 2 +- include/linux/fb.h | 3 ++- include/linux/fs.h | 9 +++++---- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 5e06acf95d0f..0c473fd79acb 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -877,7 +877,7 @@ extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, extern void ext3_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ -extern int ext3_sync_file(struct file *, int); +extern int ext3_sync_file(struct file *, loff_t, loff_t, int); /* hash.c */ extern int ext3fs_dirhash(const char *name, int len, struct diff --git a/include/linux/fb.h b/include/linux/fb.h index 6a8274877171..1d6836c498dd 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -1043,7 +1043,8 @@ extern void fb_deferred_io_open(struct fb_info *info, struct inode *inode, struct file *file); extern void fb_deferred_io_cleanup(struct fb_info *info); -extern int fb_deferred_io_fsync(struct file *file, int datasync); +extern int fb_deferred_io_fsync(struct file *file, loff_t start, + loff_t end, int datasync); static inline bool fb_be_math(struct fb_info *info) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 4a61f98823a6..9cd2075c4a39 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1572,7 +1572,7 @@ struct file_operations { int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); - int (*fsync) (struct file *, int datasync); + int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); @@ -2360,7 +2360,8 @@ extern int generic_segment_checks(const struct iovec *iov, /* fs/block_dev.c */ extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); -extern int blkdev_fsync(struct file *filp, int datasync); +extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, + int datasync); /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, @@ -2490,7 +2491,7 @@ extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -extern int noop_fsync(struct file *, int); +extern int noop_fsync(struct file *, loff_t, loff_t, int); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); extern int simple_write_begin(struct file *file, struct address_space *mapping, @@ -2515,7 +2516,7 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count, extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); -extern int generic_file_fsync(struct file *, int); +extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); -- cgit v1.2.2 From 295cc522c2b2834b302424e41ebb5846df6d61bd Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Tue, 19 Jul 2011 09:48:39 +0800 Subject: fs:update the NOTE of the file_operations structure Big kernel lock had been removed and setlease now use the lock_flocks() to hold a special spin lock file_lock_lock by Matthew. So just remove the out-of-date NOTE. Signed-off-by: Wanlong Gao Signed-off-by: Al Viro --- include/linux/fs.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9cd2075c4a39..e0569e4ab2d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1552,11 +1552,6 @@ struct block_device_operations; #define HAVE_COMPAT_IOCTL 1 #define HAVE_UNLOCKED_IOCTL 1 -/* - * NOTE: - * all file operations except setlease can be called without - * the big kernel lock held in all filesystems. - */ struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); -- cgit v1.2.2 From ed70afcd6e795e3de98df56f1cd0f898fbf641a7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 21 Jul 2011 13:55:37 -0700 Subject: mm/truncate.c: fix build for CONFIG_BLOCK not enabled Fix build error when CONFIG_BLOCK is not enabled by providing a stub inode_dio_wait() function. mm/truncate.c:612: error: implicit declaration of function 'inode_dio_wait' Signed-off-by: Randy Dunlap Signed-off-by: Al Viro --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index e0569e4ab2d5..b224dc468a23 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2423,6 +2423,10 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, offset, nr_segs, get_block, NULL, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } +#else +static inline void inode_dio_wait(struct inode *inode) +{ +} #endif extern const struct file_operations generic_ro_fops; -- cgit v1.2.2