diff options
author | Eric Dumazet <eric.dumazet@gmail.com> | 2010-10-15 01:44:11 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-10-20 06:02:23 -0400 |
commit | 27b75c95f10d249574d9c4cb9dab878107faede8 (patch) | |
tree | 466656d86aaa395951e12b50903e730203c5f86f | |
parent | e6484930d7c73d324bccda7d43d131088da697b9 (diff) |
net: avoid RCU for NOCACHE dst
There is no point in using RCU for a dst that we allocate for a very
short time (used once).
Change dst_release() to take DST_NOCACHE into account, but also change
skb_dst_set_noref() to force a refcount increment for such a dst.
This is a _huge_ gain, because we don't waste memory to store xx thousand
dsts. Instead of queueing them to RCU, we can free them instantly.
CPU caches can stay hot, re-using the same memory blocks to hold temporary
dsts.
Note: remove the unneeded smp_mb__before_atomic_dec() in dst_release(),
since atomic_dec_return() implies a full memory barrier.
Stress test: 160.000.000 UDP frames sent, with the IP route cache disabled
(DDoS).
Before:
real 0m38.091s
user 0m13.189s
sys 7m53.018s
After:
real 0m29.946s
user 0m12.157s
sys 7m40.605s
For reference, with the IP route cache enabled:
real 0m32.030s
user 0m10.521s
sys 8m15.243s
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/skbuff.h | 14 | ||||
-rw-r--r-- | net/core/dst.c | 29 | ||||
-rw-r--r-- | net/ipv4/route.c | 9 |
3 files changed, 33 insertions, 19 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 05a358f1ba11..e6ba898de61c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h | |||
@@ -460,19 +460,7 @@ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) | |||
460 | skb->_skb_refdst = (unsigned long)dst; | 460 | skb->_skb_refdst = (unsigned long)dst; |
461 | } | 461 | } |
462 | 462 | ||
463 | /** | 463 | extern void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst); |
464 | * skb_dst_set_noref - sets skb dst, without a reference | ||
465 | * @skb: buffer | ||
466 | * @dst: dst entry | ||
467 | * | ||
468 | * Sets skb dst, assuming a reference was not taken on dst | ||
469 | * skb_dst_drop() should not dst_release() this dst | ||
470 | */ | ||
471 | static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) | ||
472 | { | ||
473 | WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); | ||
474 | skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; | ||
475 | } | ||
476 | 464 | ||
477 | /** | 465 | /** |
478 | * skb_dst_is_noref - Test if skb dst isnt refcounted | 466 | * skb_dst_is_noref - Test if skb dst isnt refcounted |
diff --git a/net/core/dst.c b/net/core/dst.c index 32e542d7f472..8abe628b79f1 100644 --- a/net/core/dst.c +++ b/net/core/dst.c | |||
@@ -271,13 +271,40 @@ void dst_release(struct dst_entry *dst) | |||
271 | if (dst) { | 271 | if (dst) { |
272 | int newrefcnt; | 272 | int newrefcnt; |
273 | 273 | ||
274 | smp_mb__before_atomic_dec(); | ||
275 | newrefcnt = atomic_dec_return(&dst->__refcnt); | 274 | newrefcnt = atomic_dec_return(&dst->__refcnt); |
276 | WARN_ON(newrefcnt < 0); | 275 | WARN_ON(newrefcnt < 0); |
276 | if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { | ||
277 | dst = dst_destroy(dst); | ||
278 | if (dst) | ||
279 | __dst_free(dst); | ||
280 | } | ||
277 | } | 281 | } |
278 | } | 282 | } |
279 | EXPORT_SYMBOL(dst_release); | 283 | EXPORT_SYMBOL(dst_release); |
280 | 284 | ||
285 | /** | ||
286 | * skb_dst_set_noref - sets skb dst, without a reference | ||
287 | * @skb: buffer | ||
288 | * @dst: dst entry | ||
289 | * | ||
290 | * Sets skb dst, assuming a reference was not taken on dst | ||
291 | * skb_dst_drop() should not dst_release() this dst | ||
292 | */ | ||
293 | void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) | ||
294 | { | ||
295 | WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); | ||
296 | /* If dst not in cache, we must take a reference, because | ||
297 | * dst_release() will destroy dst as soon as its refcount becomes zero | ||
298 | */ | ||
299 | if (unlikely(dst->flags & DST_NOCACHE)) { | ||
300 | dst_hold(dst); | ||
301 | skb_dst_set(skb, dst); | ||
302 | } else { | ||
303 | skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; | ||
304 | } | ||
305 | } | ||
306 | EXPORT_SYMBOL(skb_dst_set_noref); | ||
307 | |||
281 | /* Dirty hack. We did it in 2.2 (in __dst_free), | 308 | /* Dirty hack. We did it in 2.2 (in __dst_free), |
282 | * we have _very_ good reasons not to repeat | 309 | * we have _very_ good reasons not to repeat |
283 | * this mistake in 2.3, but we have no choice | 310 | * this mistake in 2.3, but we have no choice |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ff98983d2a45..d6cb2bfcd8e1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -1105,9 +1105,9 @@ restart: | |||
1105 | * Note that we do rt_free on this new route entry, so that | 1105 | * Note that we do rt_free on this new route entry, so that |
1106 | * once its refcount hits zero, we are still able to reap it | 1106 | * once its refcount hits zero, we are still able to reap it |
1107 | * (Thanks Alexey) | 1107 | * (Thanks Alexey) |
1108 | * Note also the rt_free uses call_rcu. We don't actually | 1108 | * Note: To avoid expensive rcu stuff for this uncached dst, |
1109 | * need rcu protection here, this is just our path to get | 1109 | * we set DST_NOCACHE so that dst_release() can free dst without |
1110 | * on the route gc list. | 1110 | * waiting a grace period. |
1111 | */ | 1111 | */ |
1112 | 1112 | ||
1113 | rt->dst.flags |= DST_NOCACHE; | 1113 | rt->dst.flags |= DST_NOCACHE; |
@@ -1117,12 +1117,11 @@ restart: | |||
1117 | if (net_ratelimit()) | 1117 | if (net_ratelimit()) |
1118 | printk(KERN_WARNING | 1118 | printk(KERN_WARNING |
1119 | "Neighbour table failure & not caching routes.\n"); | 1119 | "Neighbour table failure & not caching routes.\n"); |
1120 | rt_drop(rt); | 1120 | ip_rt_put(rt); |
1121 | return err; | 1121 | return err; |
1122 | } | 1122 | } |
1123 | } | 1123 | } |
1124 | 1124 | ||
1125 | rt_free(rt); | ||
1126 | goto skip_hashing; | 1125 | goto skip_hashing; |
1127 | } | 1126 | } |
1128 | 1127 | ||