author		Eric Dumazet <eric.dumazet@gmail.com>	2010-10-08 02:37:34 -0400
committer	David S. Miller <davem@davemloft.net>	2010-10-11 16:06:53 -0400
commit		fc66f95c68b6d4535a0ea2ea15d5cf626e310956 (patch)
tree		ac3a7f08ad741a67ff683bf93e5669ddcae95ed7 /net/ipv4
parent		0ed8ddf4045fcfcac36bad753dc4046118c603ec (diff)
net dst: use a percpu_counter to track entries
struct dst_ops tracks the number of allocated dst entries in an atomic_t
field, which is subject to high cache line contention under stress
workloads.

Switch to a percpu_counter to reduce the number of times we need to dirty
a central location. Place it on a separate cache line so updating it does
not dirty the read-only fields around it.
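For reference, the helpers this patch calls below (dst_entries_get_fast(),
dst_entries_get_slow(), dst_entries_init()) were added outside this
diffstat, in include/net/dst_ops.h. A minimal sketch of the read side,
assuming the percpu_counter field is named pcpuc_entries (not shown in
the net/ipv4 hunks here):

/* Sketch only -- the committed helpers live in include/net/dst_ops.h,
 * outside the net/ipv4 diffstat shown on this page.
 */
static inline int dst_entries_get_fast(struct dst_ops *dst)
{
	/* Approximate: reads the shared count without folding in the
	 * per-cpu deltas, so it can lag by up to the percpu batch size
	 * times the number of CPUs. Cheap enough for hot paths.
	 */
	return percpu_counter_read_positive(&dst->pcpuc_entries);
}

static inline int dst_entries_get_slow(struct dst_ops *dst)
{
	int res;

	/* Exact: sums every per-cpu delta under bh protection. Used
	 * only where an accurate value matters (gc decisions, /proc).
	 */
	local_bh_disable();
	res = percpu_counter_sum_positive(&dst->pcpuc_entries);
	local_bh_enable();
	return res;
}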
Stress test: sending 160,000,000 UDP frames, IP route cache disabled,
dual E5540 @ 2.53GHz, 32-bit kernel, FIB_TRIE, SLUB/NUMA.
Before:
real 0m51.179s
user 0m15.329s
sys 10m15.942s
After:
real 0m45.570s
user 0m15.525s
sys 9m56.669s
With a small reordering of struct neighbour fields (the subject of a
following patch) to separate refcnt from the other read-mostly fields:
real 0m41.841s
user 0m15.261s
sys 8m45.949s
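The "separate cache line" point in the changelog is the same layout rule
the struct neighbour reordering applies: keep fields that are written
constantly away from fields that are only read. An illustrative layout
(not the committed struct dst_ops, which lives in include/net/dst_ops.h):

/* Illustrative only, following the changelog's reasoning. */
struct example_ops {
	/* read-mostly fields: stay clean in every CPU's cache */
	unsigned short		family;
	int			gc_thresh;
	struct kmem_cache	*kmem_cachep;

	/* written on every dst alloc/free: isolate it so dirtying it
	 * does not invalidate the read-mostly line above
	 */
	struct percpu_counter	pcpuc_entries ____cacheline_aligned_in_smp;
};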
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')

-rw-r--r--	net/ipv4/route.c	36
-rw-r--r--	net/ipv4/xfrm4_policy.c	4

2 files changed, 24 insertions(+), 16 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3888f6ba0a5c..0755aa4af86c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -159,7 +159,6 @@ static struct dst_ops ipv4_dst_ops = {
 	.link_failure =	ipv4_link_failure,
 	.update_pmtu =	ip_rt_update_pmtu,
 	.local_out =	__ip_local_out,
-	.entries =	ATOMIC_INIT(0),
 };
 
 #define ECN_OR_COST(class) TC_PRIO_##class
@@ -466,7 +465,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 
 	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
-		   atomic_read(&ipv4_dst_ops.entries),
+		   dst_entries_get_slow(&ipv4_dst_ops),
 		   st->in_hit,
 		   st->in_slow_tot,
 		   st->in_slow_mc,
@@ -945,6 +944,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
 	struct rtable *rth, **rthp;
 	unsigned long now = jiffies;
 	int goal;
+	int entries = dst_entries_get_fast(&ipv4_dst_ops);
 
 	/*
 	 * Garbage collection is pretty expensive,
@@ -954,28 +954,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
 	RT_CACHE_STAT_INC(gc_total);
 
 	if (now - last_gc < ip_rt_gc_min_interval &&
-	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
+	    entries < ip_rt_max_size) {
 		RT_CACHE_STAT_INC(gc_ignored);
 		goto out;
 	}
 
+	entries = dst_entries_get_slow(&ipv4_dst_ops);
 	/* Calculate number of entries, which we want to expire now. */
-	goal = atomic_read(&ipv4_dst_ops.entries) -
-		(ip_rt_gc_elasticity << rt_hash_log);
+	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 	if (goal <= 0) {
 		if (equilibrium < ipv4_dst_ops.gc_thresh)
 			equilibrium = ipv4_dst_ops.gc_thresh;
-		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+		goal = entries - equilibrium;
 		if (goal > 0) {
 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+			goal = entries - equilibrium;
 		}
 	} else {
 		/* We are in dangerous area. Try to reduce cache really
 		 * aggressively.
 		 */
 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+		equilibrium = entries - goal;
 	}
 
 	if (now - last_gc >= ip_rt_gc_min_interval)
@@ -1032,14 +1032,16 @@ static int rt_garbage_collect(struct dst_ops *ops)
 		expire >>= 1;
 #if RT_CACHE_DEBUG >= 2
 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
-				atomic_read(&ipv4_dst_ops.entries), goal, i);
+				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
 #endif
 
-		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 			goto out;
 	} while (!in_softirq() && time_before_eq(jiffies, now));
 
-	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
+		goto out;
+	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
 		goto out;
 	if (net_ratelimit())
 		printk(KERN_WARNING "dst cache overflow\n");
@@ -1049,11 +1051,12 @@ static int rt_garbage_collect(struct dst_ops *ops)
 work_done:
 	expire += ip_rt_gc_min_interval;
 	if (expire > ip_rt_gc_timeout ||
-	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
+	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
 		expire = ip_rt_gc_timeout;
 #if RT_CACHE_DEBUG >= 2
 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
-			atomic_read(&ipv4_dst_ops.entries), goal, rover);
+			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
 #endif
 out:	return 0;
 }
@@ -2717,7 +2720,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
 	.destroy =	ipv4_dst_destroy,
 	.check =	ipv4_blackhole_dst_check,
 	.update_pmtu =	ipv4_rt_blackhole_update_pmtu,
-	.entries =	ATOMIC_INIT(0),
 };
 
 
@@ -3287,6 +3289,12 @@ int __init ip_rt_init(void)
 
 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
 
+	if (dst_entries_init(&ipv4_dst_ops) < 0)
+		panic("IP: failed to allocate ipv4_dst_ops counter\n");
+
+	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
+		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
+
 	rt_hash_table = (struct rt_hash_bucket *)
 		alloc_large_system_hash("IP route cache",
 					sizeof(struct rt_hash_bucket),
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index a580349f0b8a..4464f3bff6a7 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -174,7 +174,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
 	struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
 
 	xfrm4_policy_afinfo.garbage_collect(net);
-	return (atomic_read(&ops->entries) > ops->gc_thresh * 2);
+	return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
 }
 
 static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -232,7 +232,6 @@ static struct dst_ops xfrm4_dst_ops = {
 	.ifdown =	xfrm4_dst_ifdown,
 	.local_out =	__ip_local_out,
 	.gc_thresh =	1024,
-	.entries =	ATOMIC_INIT(0),
 };
 
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
@@ -288,6 +287,7 @@ void __init xfrm4_init(int rt_max_size)
 	 * and start cleaning when were 1/2 full
 	 */
 	xfrm4_dst_ops.gc_thresh = rt_max_size/2;
+	dst_entries_init(&xfrm4_dst_ops);
 
 	xfrm4_state_init();
 	xfrm4_policy_init();
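Note the pattern in the rt_garbage_collect() hunks above: every limit check
tries the cheap dst_entries_get_fast() first and pays for the exact
dst_entries_get_slow() sum only when the approximate value looks over the
threshold. Condensed into a hypothetical helper (not part of the patch):

/* Hypothetical helper illustrating the fast-then-slow check. The fast
 * read can be off in either direction, so a value under the limit is
 * trusted as-is and only an apparent overflow is re-checked exactly.
 */
static bool dst_over_limit(struct dst_ops *ops, int limit)
{
	if (dst_entries_get_fast(ops) < limit)
		return false;	/* approximate count already under limit */
	return dst_entries_get_slow(ops) >= limit;
}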