diff options
author | Eric Dumazet <eric.dumazet@gmail.com> | 2010-10-05 06:41:36 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-10-05 23:39:38 -0400 |
commit | ebc0ffae5dfb4447e0a431ffe7fe1d467c48bbb9 (patch) | |
tree | 395e50547ffccc6b73e04a44190eb4b4f2d2316b /net/ipv4/route.c | |
parent | c2952c314b4fe61820ba8fd6c949eed636140d52 (diff) |
fib: RCU conversion of fib_lookup()
fib_lookup() is converted to be called in an RCU-protected context, so no
reference is taken and released on a contended cache line (fib_clntref).
fib_table_lookup() and fib_semantic_match() get an additional parameter.
struct fib_info gets an rcu_head field and is freed after an RCU grace
period.
Stress test :
(Sending 160.000.000 UDP frames to the same neighbour,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_HASH) (about the same results for FIB_TRIE)
Before patch :
real 1m31.199s
user 0m13.761s
sys 23m24.780s
After patch:
real 1m5.375s
user 0m14.997s
sys 15m50.115s
Before patch Profile :
13044.00 15.4% __ip_route_output_key vmlinux
8438.00 10.0% dst_destroy vmlinux
5983.00 7.1% fib_semantic_match vmlinux
5410.00 6.4% fib_rules_lookup vmlinux
4803.00 5.7% neigh_lookup vmlinux
4420.00 5.2% _raw_spin_lock vmlinux
3883.00 4.6% rt_set_nexthop vmlinux
3261.00 3.9% _raw_read_lock vmlinux
2794.00 3.3% fib_table_lookup vmlinux
2374.00 2.8% neigh_resolve_output vmlinux
2153.00 2.5% dst_alloc vmlinux
1502.00 1.8% _raw_read_lock_bh vmlinux
1484.00 1.8% kmem_cache_alloc vmlinux
1407.00 1.7% eth_header vmlinux
1406.00 1.7% ipv4_dst_destroy vmlinux
1298.00 1.5% __copy_from_user_ll vmlinux
1174.00 1.4% dev_queue_xmit vmlinux
1000.00 1.2% ip_output vmlinux
After patch Profile :
13712.00 15.8% dst_destroy vmlinux
8548.00 9.9% __ip_route_output_key vmlinux
7017.00 8.1% neigh_lookup vmlinux
4554.00 5.3% fib_semantic_match vmlinux
4067.00 4.7% _raw_read_lock vmlinux
3491.00 4.0% dst_alloc vmlinux
3186.00 3.7% neigh_resolve_output vmlinux
3103.00 3.6% fib_table_lookup vmlinux
2098.00 2.4% _raw_read_lock_bh vmlinux
2081.00 2.4% kmem_cache_alloc vmlinux
2013.00 2.3% _raw_spin_lock vmlinux
1763.00 2.0% __copy_from_user_ll vmlinux
1763.00 2.0% ip_output vmlinux
1761.00 2.0% ipv4_dst_destroy vmlinux
1631.00 1.9% eth_header vmlinux
1440.00 1.7% _raw_read_unlock_bh vmlinux
Reference results, if IP route cache is enabled :
real 0m29.718s
user 0m10.845s
sys 7m37.341s
25213.00 29.5% __ip_route_output_key vmlinux
9011.00 10.5% dst_release vmlinux
4817.00 5.6% ip_push_pending_frames vmlinux
4232.00 5.0% ip_finish_output vmlinux
3940.00 4.6% udp_sendmsg vmlinux
3730.00 4.4% __copy_from_user_ll vmlinux
3716.00 4.4% ip_route_output_flow vmlinux
2451.00 2.9% __xfrm_lookup vmlinux
2221.00 2.6% ip_append_data vmlinux
1718.00 2.0% _raw_spin_lock_bh vmlinux
1655.00 1.9% __alloc_skb vmlinux
1572.00 1.8% sock_wfree vmlinux
1345.00 1.6% kfree vmlinux
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r-- | net/ipv4/route.c | 59 |
1 files changed, 24 insertions, 35 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 04e0df82b88c..7864d0c48968 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -1773,12 +1773,15 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) | |||
1773 | 1773 | ||
1774 | if (rt->fl.iif == 0) | 1774 | if (rt->fl.iif == 0) |
1775 | src = rt->rt_src; | 1775 | src = rt->rt_src; |
1776 | else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) { | 1776 | else { |
1777 | src = FIB_RES_PREFSRC(res); | 1777 | rcu_read_lock(); |
1778 | fib_res_put(&res); | 1778 | if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) |
1779 | } else | 1779 | src = FIB_RES_PREFSRC(res); |
1780 | src = inet_select_addr(rt->dst.dev, rt->rt_gateway, | 1780 | else |
1781 | src = inet_select_addr(rt->dst.dev, rt->rt_gateway, | ||
1781 | RT_SCOPE_UNIVERSE); | 1782 | RT_SCOPE_UNIVERSE); |
1783 | rcu_read_unlock(); | ||
1784 | } | ||
1782 | memcpy(addr, &src, 4); | 1785 | memcpy(addr, &src, 4); |
1783 | } | 1786 | } |
1784 | 1787 | ||
@@ -2081,6 +2084,7 @@ static int ip_mkroute_input(struct sk_buff *skb, | |||
2081 | * Such approach solves two big problems: | 2084 | * Such approach solves two big problems: |
2082 | * 1. Not simplex devices are handled properly. | 2085 | * 1. Not simplex devices are handled properly. |
2083 | * 2. IP spoofing attempts are filtered with 100% of guarantee. | 2086 | * 2. IP spoofing attempts are filtered with 100% of guarantee. |
2087 | * called with rcu_read_lock() | ||
2084 | */ | 2088 | */ |
2085 | 2089 | ||
2086 | static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | 2090 | static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, |
@@ -2102,7 +2106,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2102 | unsigned hash; | 2106 | unsigned hash; |
2103 | __be32 spec_dst; | 2107 | __be32 spec_dst; |
2104 | int err = -EINVAL; | 2108 | int err = -EINVAL; |
2105 | int free_res = 0; | ||
2106 | struct net * net = dev_net(dev); | 2109 | struct net * net = dev_net(dev); |
2107 | 2110 | ||
2108 | /* IP on this device is disabled. */ | 2111 | /* IP on this device is disabled. */ |
@@ -2134,12 +2137,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2134 | /* | 2137 | /* |
2135 | * Now we are ready to route packet. | 2138 | * Now we are ready to route packet. |
2136 | */ | 2139 | */ |
2137 | if ((err = fib_lookup(net, &fl, &res)) != 0) { | 2140 | err = fib_lookup(net, &fl, &res); |
2141 | if (err != 0) { | ||
2138 | if (!IN_DEV_FORWARD(in_dev)) | 2142 | if (!IN_DEV_FORWARD(in_dev)) |
2139 | goto e_hostunreach; | 2143 | goto e_hostunreach; |
2140 | goto no_route; | 2144 | goto no_route; |
2141 | } | 2145 | } |
2142 | free_res = 1; | ||
2143 | 2146 | ||
2144 | RT_CACHE_STAT_INC(in_slow_tot); | 2147 | RT_CACHE_STAT_INC(in_slow_tot); |
2145 | 2148 | ||
@@ -2148,8 +2151,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2148 | 2151 | ||
2149 | if (res.type == RTN_LOCAL) { | 2152 | if (res.type == RTN_LOCAL) { |
2150 | err = fib_validate_source(saddr, daddr, tos, | 2153 | err = fib_validate_source(saddr, daddr, tos, |
2151 | net->loopback_dev->ifindex, | 2154 | net->loopback_dev->ifindex, |
2152 | dev, &spec_dst, &itag, skb->mark); | 2155 | dev, &spec_dst, &itag, skb->mark); |
2153 | if (err < 0) | 2156 | if (err < 0) |
2154 | goto martian_source_keep_err; | 2157 | goto martian_source_keep_err; |
2155 | if (err) | 2158 | if (err) |
@@ -2164,9 +2167,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2164 | goto martian_destination; | 2167 | goto martian_destination; |
2165 | 2168 | ||
2166 | err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); | 2169 | err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); |
2167 | done: | ||
2168 | if (free_res) | ||
2169 | fib_res_put(&res); | ||
2170 | out: return err; | 2170 | out: return err; |
2171 | 2171 | ||
2172 | brd_input: | 2172 | brd_input: |
@@ -2226,7 +2226,7 @@ local_input: | |||
2226 | rth->rt_type = res.type; | 2226 | rth->rt_type = res.type; |
2227 | hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); | 2227 | hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); |
2228 | err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); | 2228 | err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); |
2229 | goto done; | 2229 | goto out; |
2230 | 2230 | ||
2231 | no_route: | 2231 | no_route: |
2232 | RT_CACHE_STAT_INC(in_no_route); | 2232 | RT_CACHE_STAT_INC(in_no_route); |
@@ -2249,21 +2249,21 @@ martian_destination: | |||
2249 | 2249 | ||
2250 | e_hostunreach: | 2250 | e_hostunreach: |
2251 | err = -EHOSTUNREACH; | 2251 | err = -EHOSTUNREACH; |
2252 | goto done; | 2252 | goto out; |
2253 | 2253 | ||
2254 | e_inval: | 2254 | e_inval: |
2255 | err = -EINVAL; | 2255 | err = -EINVAL; |
2256 | goto done; | 2256 | goto out; |
2257 | 2257 | ||
2258 | e_nobufs: | 2258 | e_nobufs: |
2259 | err = -ENOBUFS; | 2259 | err = -ENOBUFS; |
2260 | goto done; | 2260 | goto out; |
2261 | 2261 | ||
2262 | martian_source: | 2262 | martian_source: |
2263 | err = -EINVAL; | 2263 | err = -EINVAL; |
2264 | martian_source_keep_err: | 2264 | martian_source_keep_err: |
2265 | ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); | 2265 | ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); |
2266 | goto done; | 2266 | goto out; |
2267 | } | 2267 | } |
2268 | 2268 | ||
2269 | int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, | 2269 | int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, |
@@ -2349,6 +2349,7 @@ skip_cache: | |||
2349 | } | 2349 | } |
2350 | EXPORT_SYMBOL(ip_route_input_common); | 2350 | EXPORT_SYMBOL(ip_route_input_common); |
2351 | 2351 | ||
2352 | /* called with rcu_read_lock() */ | ||
2352 | static int __mkroute_output(struct rtable **result, | 2353 | static int __mkroute_output(struct rtable **result, |
2353 | struct fib_result *res, | 2354 | struct fib_result *res, |
2354 | const struct flowi *fl, | 2355 | const struct flowi *fl, |
@@ -2373,18 +2374,13 @@ static int __mkroute_output(struct rtable **result, | |||
2373 | if (dev_out->flags & IFF_LOOPBACK) | 2374 | if (dev_out->flags & IFF_LOOPBACK) |
2374 | flags |= RTCF_LOCAL; | 2375 | flags |= RTCF_LOCAL; |
2375 | 2376 | ||
2376 | rcu_read_lock(); | ||
2377 | in_dev = __in_dev_get_rcu(dev_out); | 2377 | in_dev = __in_dev_get_rcu(dev_out); |
2378 | if (!in_dev) { | 2378 | if (!in_dev) |
2379 | rcu_read_unlock(); | ||
2380 | return -EINVAL; | 2379 | return -EINVAL; |
2381 | } | 2380 | |
2382 | if (res->type == RTN_BROADCAST) { | 2381 | if (res->type == RTN_BROADCAST) { |
2383 | flags |= RTCF_BROADCAST | RTCF_LOCAL; | 2382 | flags |= RTCF_BROADCAST | RTCF_LOCAL; |
2384 | if (res->fi) { | 2383 | res->fi = NULL; |
2385 | fib_info_put(res->fi); | ||
2386 | res->fi = NULL; | ||
2387 | } | ||
2388 | } else if (res->type == RTN_MULTICAST) { | 2384 | } else if (res->type == RTN_MULTICAST) { |
2389 | flags |= RTCF_MULTICAST | RTCF_LOCAL; | 2385 | flags |= RTCF_MULTICAST | RTCF_LOCAL; |
2390 | if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, | 2386 | if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, |
@@ -2394,10 +2390,8 @@ static int __mkroute_output(struct rtable **result, | |||
2394 | * default one, but do not gateway in this case. | 2390 | * default one, but do not gateway in this case. |
2395 | * Yes, it is hack. | 2391 | * Yes, it is hack. |
2396 | */ | 2392 | */ |
2397 | if (res->fi && res->prefixlen < 4) { | 2393 | if (res->fi && res->prefixlen < 4) |
2398 | fib_info_put(res->fi); | ||
2399 | res->fi = NULL; | 2394 | res->fi = NULL; |
2400 | } | ||
2401 | } | 2395 | } |
2402 | 2396 | ||
2403 | 2397 | ||
@@ -2467,6 +2461,7 @@ static int __mkroute_output(struct rtable **result, | |||
2467 | return 0; | 2461 | return 0; |
2468 | } | 2462 | } |
2469 | 2463 | ||
2464 | /* called with rcu_read_lock() */ | ||
2470 | static int ip_mkroute_output(struct rtable **rp, | 2465 | static int ip_mkroute_output(struct rtable **rp, |
2471 | struct fib_result *res, | 2466 | struct fib_result *res, |
2472 | const struct flowi *fl, | 2467 | const struct flowi *fl, |
@@ -2509,7 +2504,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, | |||
2509 | struct fib_result res; | 2504 | struct fib_result res; |
2510 | unsigned int flags = 0; | 2505 | unsigned int flags = 0; |
2511 | struct net_device *dev_out = NULL; | 2506 | struct net_device *dev_out = NULL; |
2512 | int free_res = 0; | ||
2513 | int err; | 2507 | int err; |
2514 | 2508 | ||
2515 | 2509 | ||
@@ -2636,15 +2630,12 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, | |||
2636 | err = -ENETUNREACH; | 2630 | err = -ENETUNREACH; |
2637 | goto out; | 2631 | goto out; |
2638 | } | 2632 | } |
2639 | free_res = 1; | ||
2640 | 2633 | ||
2641 | if (res.type == RTN_LOCAL) { | 2634 | if (res.type == RTN_LOCAL) { |
2642 | if (!fl.fl4_src) | 2635 | if (!fl.fl4_src) |
2643 | fl.fl4_src = fl.fl4_dst; | 2636 | fl.fl4_src = fl.fl4_dst; |
2644 | dev_out = net->loopback_dev; | 2637 | dev_out = net->loopback_dev; |
2645 | fl.oif = dev_out->ifindex; | 2638 | fl.oif = dev_out->ifindex; |
2646 | if (res.fi) | ||
2647 | fib_info_put(res.fi); | ||
2648 | res.fi = NULL; | 2639 | res.fi = NULL; |
2649 | flags |= RTCF_LOCAL; | 2640 | flags |= RTCF_LOCAL; |
2650 | goto make_route; | 2641 | goto make_route; |
@@ -2668,8 +2659,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, | |||
2668 | make_route: | 2659 | make_route: |
2669 | err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); | 2660 | err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); |
2670 | 2661 | ||
2671 | if (free_res) | ||
2672 | fib_res_put(&res); | ||
2673 | out: return err; | 2662 | out: return err; |
2674 | } | 2663 | } |
2675 | 2664 | ||