author		Nick Piggin <npiggin@kernel.dk>	2011-01-07 01:49:19 -0500
committer	Nick Piggin <npiggin@kernel.dk>	2011-01-07 01:50:17 -0500
commit		3e880fb5e4bb6a012035e3edd0586ee2817c2e24
tree		665101c2c559c26e2dff2c7eca2c747fb736b524 /fs
parent		86c8749ede0c59e590de9267066932a26f1ce796
fs: use fast counters for vfs caches
The percpu_counter library generates quite nasty code, so unless you
need to dynamically allocate counters or take a fast approximate
value, a simple per-CPU set of counters is much better.

percpu_counter can never be made to work as well, because it has an
indirection from the pointer to the percpu memory, and it can't use
the direct this_cpu_inc interfaces because it doesn't use static
PER_CPU data, so the code will always be worse. In the fastpath, it
is the difference between this:

	incl %gs:nr_dentry	# nr_dentry

and this:

	movl	percpu_counter_batch(%rip), %edx	# percpu_counter_batch,
	movl	$1, %esi	#,
	movq	$nr_dentry, %rdi	#,
	call	__percpu_counter_add	# (plus I clobber registers)

__percpu_counter_add:
	pushq	%rbp	#
	movq	%rsp, %rbp	#,
	subq	$32, %rsp	#,
	movq	%rbx, -24(%rbp)	#,
	movq	%r12, -16(%rbp)	#,
	movq	%r13, -8(%rbp)	#,
	movq	%rdi, %rbx	# fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
	movq %gs:kernel_stack,%rax	#, pfo_ret__
# 0 "" 2
#NO_APP
	incl	-8124(%rax)	# <variable>.preempt_count
	movq	32(%rdi), %r12	# <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
	add %gs:this_cpu_off, %r12	# this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
	movslq	(%r12),%r13	#* tcp_ptr__, tmp73
	movslq	%edx,%rax	# batch, batch
	addq	%rsi, %r13	# amount, count
	cmpq	%rax, %r13	# batch, count
	jge	.L27	#,
	negl	%edx	# tmp76
	movslq	%edx,%rdx	# tmp76, tmp77
	cmpq	%rdx, %r13	# tmp77, count
	jg	.L28	#,
.L27:
	movq	%rbx, %rdi	# fbc,
	call	_raw_spin_lock	#
	addq	%r13, 8(%rbx)	# count, <variable>.count
	movq	%rbx, %rdi	# fbc,
	movl	$0, (%r12)	#,* tcp_ptr__
	call	_raw_spin_unlock	#
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
	movq %gs:kernel_stack,%rax	#, pfo_ret__
# 0 "" 2
#NO_APP
	decl	-8124(%rax)	# <variable>.preempt_count
	movq	-8136(%rax), %rax	#, D.14625
	testb	$8, %al	#, D.14625
	jne	.L32	#,
.L31:
	movq	-24(%rbp), %rbx	#,
	movq	-16(%rbp), %r12	#,
	movq	-8(%rbp), %r13	#,
	leave
	ret
	.p2align 4,,10
	.p2align 3
.L28:
	movl	%r13d, (%r12)	# count,*
	jmp	.L29	#
.L32:
	call	preempt_schedule	#
	.p2align 4,,6
	jmp	.L31	#
	.size	__percpu_counter_add, .-__percpu_counter_add
	.p2align 4,,15

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
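For reference, a minimal standalone sketch of the pattern the patch
applies. The names here (nr_foo and the helpers) are illustrative,
not part of the patch; the real counters are nr_dentry in fs/dcache.c
and nr_inodes in fs/inode.c:

	#include <linux/percpu.h>

	/* One counter slot per CPU; a writer only touches its own slot. */
	static DEFINE_PER_CPU(unsigned int, nr_foo);	/* hypothetical counter */

	static inline void foo_count_alloc(void)
	{
		/* Compiles to a single incl %gs:nr_foo on x86. */
		this_cpu_inc(nr_foo);
	}

	static inline void foo_count_free(void)
	{
		this_cpu_dec(nr_foo);
	}

	/*
	 * Slow, approximate read side: sum every CPU's slot. The clamp
	 * matters because an object can be freed on a different CPU than
	 * it was allocated on, so individual slots can wrap below zero
	 * and the signed total can transiently come out negative.
	 */
	static int get_nr_foo(void)
	{
		int i;
		int sum = 0;

		for_each_possible_cpu(i)
			sum += per_cpu(nr_foo, i);
		return sum < 0 ? 0 : sum;
	}

The trade-off is an update that is a single instruction versus a read
that must walk all CPUs, which suits these counters: they are bumped
on every dentry/inode alloc and free but only summed on slow paths
such as the /proc handlers.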
Diffstat (limited to 'fs')
-rw-r--r--	fs/dcache.c	19
-rw-r--r--	fs/inode.c	17
2 files changed, 23 insertions(+), 13 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index f62ba90bce91..b2cb2662ca00 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -67,13 +67,22 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_dentry);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+static int get_nr_dentry(void)
+{
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry, i);
+	return sum < 0 ? 0 : sum;
+}
+
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
-	dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
+	dentry_stat.nr_dentry = get_nr_dentry();
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -93,7 +102,7 @@ static void __d_free(struct rcu_head *head)
  */
 static void d_free(struct dentry *dentry)
 {
-	percpu_counter_dec(&nr_dentry);
+	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
 
@@ -981,7 +990,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 	spin_unlock(&dcache_lock);
 
-	percpu_counter_inc(&nr_dentry);
+	this_cpu_inc(nr_dentry);
 
 	return dentry;
 }
@@ -2418,8 +2427,6 @@ static void __init dcache_init(void)
 {
 	int loop;
 
-	percpu_counter_init(&nr_dentry, 0);
-
 	/*
 	 * A constructor could be added for stable state like the lists,
 	 * but it is probably not worth it because of the cache nature
diff --git a/fs/inode.c b/fs/inode.c
index efc43979709f..5a0a898f55d1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -102,13 +102,17 @@ static DECLARE_RWSEM(iprune_sem);
  */
 struct inodes_stat_t inodes_stat;
 
-static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_inodes);
 
 static struct kmem_cache *inode_cachep __read_mostly;
 
-static inline int get_nr_inodes(void)
+static int get_nr_inodes(void)
 {
-	return percpu_counter_sum_positive(&nr_inodes);
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_inodes, i);
+	return sum < 0 ? 0 : sum;
 }
 
 static inline int get_nr_inodes_unused(void)
@@ -118,9 +122,9 @@ static inline int get_nr_inodes_unused(void)
 
 int get_nr_dirty_inodes(void)
 {
+	/* not actually dirty inodes, but a wild approximation */
 	int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
 	return nr_dirty > 0 ? nr_dirty : 0;
-
 }
 
 /*
@@ -222,7 +226,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_fsnotify_mask = 0;
 #endif
 
-	percpu_counter_inc(&nr_inodes);
+	this_cpu_inc(nr_inodes);
 
 	return 0;
 out:
@@ -264,7 +268,7 @@ void __destroy_inode(struct inode *inode)
 	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
 		posix_acl_release(inode->i_default_acl);
 #endif
-	percpu_counter_dec(&nr_inodes);
+	this_cpu_dec(nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
 
@@ -1646,7 +1650,6 @@ void __init inode_init(void)
 			 SLAB_MEM_SPREAD),
 			 init_once);
 	register_shrinker(&icache_shrinker);
-	percpu_counter_init(&nr_inodes, 0);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)