aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
author: Eric Dumazet <eric.dumazet@gmail.com> 2010-10-05 06:41:36 -0400
committer: David S. Miller <davem@davemloft.net> 2010-10-05 23:39:38 -0400
commit: ebc0ffae5dfb4447e0a431ffe7fe1d467c48bbb9 (patch)
tree: 395e50547ffccc6b73e04a44190eb4b4f2d2316b /net
parent: c2952c314b4fe61820ba8fd6c949eed636140d52 (diff)
fib: RCU conversion of fib_lookup()
fib_lookup() converted to be called in RCU protected context, no
reference taken and released on a contended cache line (fib_clntref)

fib_table_lookup() and fib_semantic_match() get an additional
parameter.

struct fib_info gets an rcu_head field, and is freed after an rcu grace
period.

Stress test :
(Sending 160.000.000 UDP frames on same neighbour,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_HASH) (about same results for FIB_TRIE)

Before patch :

real 1m31.199s
user 0m13.761s
sys 23m24.780s

After patch:

real 1m5.375s
user 0m14.997s
sys 15m50.115s

Before patch Profile :

13044.00 15.4% __ip_route_output_key vmlinux
 8438.00 10.0% dst_destroy vmlinux
 5983.00  7.1% fib_semantic_match vmlinux
 5410.00  6.4% fib_rules_lookup vmlinux
 4803.00  5.7% neigh_lookup vmlinux
 4420.00  5.2% _raw_spin_lock vmlinux
 3883.00  4.6% rt_set_nexthop vmlinux
 3261.00  3.9% _raw_read_lock vmlinux
 2794.00  3.3% fib_table_lookup vmlinux
 2374.00  2.8% neigh_resolve_output vmlinux
 2153.00  2.5% dst_alloc vmlinux
 1502.00  1.8% _raw_read_lock_bh vmlinux
 1484.00  1.8% kmem_cache_alloc vmlinux
 1407.00  1.7% eth_header vmlinux
 1406.00  1.7% ipv4_dst_destroy vmlinux
 1298.00  1.5% __copy_from_user_ll vmlinux
 1174.00  1.4% dev_queue_xmit vmlinux
 1000.00  1.2% ip_output vmlinux

After patch Profile :

13712.00 15.8% dst_destroy vmlinux
 8548.00  9.9% __ip_route_output_key vmlinux
 7017.00  8.1% neigh_lookup vmlinux
 4554.00  5.3% fib_semantic_match vmlinux
 4067.00  4.7% _raw_read_lock vmlinux
 3491.00  4.0% dst_alloc vmlinux
 3186.00  3.7% neigh_resolve_output vmlinux
 3103.00  3.6% fib_table_lookup vmlinux
 2098.00  2.4% _raw_read_lock_bh vmlinux
 2081.00  2.4% kmem_cache_alloc vmlinux
 2013.00  2.3% _raw_spin_lock vmlinux
 1763.00  2.0% __copy_from_user_ll vmlinux
 1763.00  2.0% ip_output vmlinux
 1761.00  2.0% ipv4_dst_destroy vmlinux
 1631.00  1.9% eth_header vmlinux
 1440.00  1.7% _raw_read_unlock_bh vmlinux

Reference results, if IP route cache is enabled :

real 0m29.718s
user 0m10.845s
sys 7m37.341s

25213.00 29.5% __ip_route_output_key vmlinux
 9011.00 10.5% dst_release vmlinux
 4817.00  5.6% ip_push_pending_frames vmlinux
 4232.00  5.0% ip_finish_output vmlinux
 3940.00  4.6% udp_sendmsg vmlinux
 3730.00  4.4% __copy_from_user_ll vmlinux
 3716.00  4.4% ip_route_output_flow vmlinux
 2451.00  2.9% __xfrm_lookup vmlinux
 2221.00  2.6% ip_append_data vmlinux
 1718.00  2.0% _raw_spin_lock_bh vmlinux
 1655.00  1.9% __alloc_skb vmlinux
 1572.00  1.8% sock_wfree vmlinux
 1345.00  1.6% kfree vmlinux

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--  net/core/fib_rules.c      3
-rw-r--r--  net/ipv4/fib_frontend.c  27
-rw-r--r--  net/ipv4/fib_hash.c       5
-rw-r--r--  net/ipv4/fib_lookup.h     2
-rw-r--r--  net/ipv4/fib_rules.c      3
-rw-r--r--  net/ipv4/fib_semantics.c 21
-rw-r--r--  net/ipv4/fib_trie.c      10
-rw-r--r--  net/ipv4/route.c         59
8 files changed, 66 insertions, 64 deletions
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index cfb7d25c172d..21698f8c49ee 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -225,7 +225,8 @@ jumped:
225 err = ops->action(rule, fl, flags, arg); 225 err = ops->action(rule, fl, flags, arg);
226 226
227 if (err != -EAGAIN) { 227 if (err != -EAGAIN) {
228 if (likely(atomic_inc_not_zero(&rule->refcnt))) { 228 if ((arg->flags & FIB_LOOKUP_NOREF) ||
229 likely(atomic_inc_not_zero(&rule->refcnt))) {
229 arg->rule = rule; 230 arg->rule = rule;
230 goto out; 231 goto out;
231 } 232 }
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b05c23b05a9f..919f2ad19b49 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -168,8 +168,11 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
168 struct fib_result res = { 0 }; 168 struct fib_result res = { 0 };
169 struct net_device *dev = NULL; 169 struct net_device *dev = NULL;
170 170
171 if (fib_lookup(net, &fl, &res)) 171 rcu_read_lock();
172 if (fib_lookup(net, &fl, &res)) {
173 rcu_read_unlock();
172 return NULL; 174 return NULL;
175 }
173 if (res.type != RTN_LOCAL) 176 if (res.type != RTN_LOCAL)
174 goto out; 177 goto out;
175 dev = FIB_RES_DEV(res); 178 dev = FIB_RES_DEV(res);
@@ -177,7 +180,7 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
177 if (dev && devref) 180 if (dev && devref)
178 dev_hold(dev); 181 dev_hold(dev);
179out: 182out:
180 fib_res_put(&res); 183 rcu_read_unlock();
181 return dev; 184 return dev;
182} 185}
183EXPORT_SYMBOL(__ip_dev_find); 186EXPORT_SYMBOL(__ip_dev_find);
@@ -207,11 +210,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
207 local_table = fib_get_table(net, RT_TABLE_LOCAL); 210 local_table = fib_get_table(net, RT_TABLE_LOCAL);
208 if (local_table) { 211 if (local_table) {
209 ret = RTN_UNICAST; 212 ret = RTN_UNICAST;
210 if (!fib_table_lookup(local_table, &fl, &res)) { 213 rcu_read_lock();
214 if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
211 if (!dev || dev == res.fi->fib_dev) 215 if (!dev || dev == res.fi->fib_dev)
212 ret = res.type; 216 ret = res.type;
213 fib_res_put(&res);
214 } 217 }
218 rcu_read_unlock();
215 } 219 }
216 return ret; 220 return ret;
217} 221}
@@ -235,6 +239,7 @@ EXPORT_SYMBOL(inet_dev_addr_type);
235 * - figure out what "logical" interface this packet arrived 239 * - figure out what "logical" interface this packet arrived
236 * and calculate "specific destination" address. 240 * and calculate "specific destination" address.
237 * - check, that packet arrived from expected physical interface. 241 * - check, that packet arrived from expected physical interface.
242 * called with rcu_read_lock()
238 */ 243 */
239int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, 244int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
240 struct net_device *dev, __be32 *spec_dst, 245 struct net_device *dev, __be32 *spec_dst,
@@ -259,7 +264,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
259 struct net *net; 264 struct net *net;
260 265
261 no_addr = rpf = accept_local = 0; 266 no_addr = rpf = accept_local = 0;
262 rcu_read_lock();
263 in_dev = __in_dev_get_rcu(dev); 267 in_dev = __in_dev_get_rcu(dev);
264 if (in_dev) { 268 if (in_dev) {
265 no_addr = in_dev->ifa_list == NULL; 269 no_addr = in_dev->ifa_list == NULL;
@@ -268,7 +272,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
268 if (mark && !IN_DEV_SRC_VMARK(in_dev)) 272 if (mark && !IN_DEV_SRC_VMARK(in_dev))
269 fl.mark = 0; 273 fl.mark = 0;
270 } 274 }
271 rcu_read_unlock();
272 275
273 if (in_dev == NULL) 276 if (in_dev == NULL)
274 goto e_inval; 277 goto e_inval;
@@ -278,7 +281,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
278 goto last_resort; 281 goto last_resort;
279 if (res.type != RTN_UNICAST) { 282 if (res.type != RTN_UNICAST) {
280 if (res.type != RTN_LOCAL || !accept_local) 283 if (res.type != RTN_LOCAL || !accept_local)
281 goto e_inval_res; 284 goto e_inval;
282 } 285 }
283 *spec_dst = FIB_RES_PREFSRC(res); 286 *spec_dst = FIB_RES_PREFSRC(res);
284 fib_combine_itag(itag, &res); 287 fib_combine_itag(itag, &res);
@@ -299,10 +302,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
299#endif 302#endif
300 if (dev_match) { 303 if (dev_match) {
301 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 304 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
302 fib_res_put(&res);
303 return ret; 305 return ret;
304 } 306 }
305 fib_res_put(&res);
306 if (no_addr) 307 if (no_addr)
307 goto last_resort; 308 goto last_resort;
308 if (rpf == 1) 309 if (rpf == 1)
@@ -315,7 +316,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
315 *spec_dst = FIB_RES_PREFSRC(res); 316 *spec_dst = FIB_RES_PREFSRC(res);
316 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 317 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
317 } 318 }
318 fib_res_put(&res);
319 } 319 }
320 return ret; 320 return ret;
321 321
@@ -326,8 +326,6 @@ last_resort:
326 *itag = 0; 326 *itag = 0;
327 return 0; 327 return 0;
328 328
329e_inval_res:
330 fib_res_put(&res);
331e_inval: 329e_inval:
332 return -EINVAL; 330 return -EINVAL;
333e_rpf: 331e_rpf:
@@ -873,15 +871,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
873 local_bh_disable(); 871 local_bh_disable();
874 872
875 frn->tb_id = tb->tb_id; 873 frn->tb_id = tb->tb_id;
876 frn->err = fib_table_lookup(tb, &fl, &res); 874 rcu_read_lock();
875 frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
877 876
878 if (!frn->err) { 877 if (!frn->err) {
879 frn->prefixlen = res.prefixlen; 878 frn->prefixlen = res.prefixlen;
880 frn->nh_sel = res.nh_sel; 879 frn->nh_sel = res.nh_sel;
881 frn->type = res.type; 880 frn->type = res.type;
882 frn->scope = res.scope; 881 frn->scope = res.scope;
883 fib_res_put(&res);
884 } 882 }
883 rcu_read_unlock();
885 local_bh_enable(); 884 local_bh_enable();
886 } 885 }
887} 886}
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 4ed7e0dea1bc..83cca68e259c 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -244,7 +244,8 @@ fn_new_zone(struct fn_hash *table, int z)
244} 244}
245 245
246int fib_table_lookup(struct fib_table *tb, 246int fib_table_lookup(struct fib_table *tb,
247 const struct flowi *flp, struct fib_result *res) 247 const struct flowi *flp, struct fib_result *res,
248 int fib_flags)
248{ 249{
249 int err; 250 int err;
250 struct fn_zone *fz; 251 struct fn_zone *fz;
@@ -264,7 +265,7 @@ int fib_table_lookup(struct fib_table *tb,
264 265
265 err = fib_semantic_match(&f->fn_alias, 266 err = fib_semantic_match(&f->fn_alias,
266 flp, res, 267 flp, res,
267 fz->fz_order); 268 fz->fz_order, fib_flags);
268 if (err <= 0) 269 if (err <= 0)
269 goto out; 270 goto out;
270 } 271 }
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 637b133973bd..b9c9a9f2aee5 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -22,7 +22,7 @@ struct fib_alias {
22/* Exported by fib_semantics.c */ 22/* Exported by fib_semantics.c */
23extern int fib_semantic_match(struct list_head *head, 23extern int fib_semantic_match(struct list_head *head,
24 const struct flowi *flp, 24 const struct flowi *flp,
25 struct fib_result *res, int prefixlen); 25 struct fib_result *res, int prefixlen, int fib_flags);
26extern void fib_release_info(struct fib_info *); 26extern void fib_release_info(struct fib_info *);
27extern struct fib_info *fib_create_info(struct fib_config *cfg); 27extern struct fib_info *fib_create_info(struct fib_config *cfg);
28extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); 28extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 32300521e32c..7981a24f5c7b 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -57,6 +57,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
57{ 57{
58 struct fib_lookup_arg arg = { 58 struct fib_lookup_arg arg = {
59 .result = res, 59 .result = res,
60 .flags = FIB_LOOKUP_NOREF,
60 }; 61 };
61 int err; 62 int err;
62 63
@@ -94,7 +95,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
94 if (!tbl) 95 if (!tbl)
95 goto errout; 96 goto errout;
96 97
97 err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result); 98 err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags);
98 if (err > 0) 99 if (err > 0)
99 err = -EAGAIN; 100 err = -EAGAIN;
100errout: 101errout:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index ba52f399a898..0f80dfc2f7fb 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -148,6 +148,13 @@ static const struct
148 148
149/* Release a nexthop info record */ 149/* Release a nexthop info record */
150 150
151static void free_fib_info_rcu(struct rcu_head *head)
152{
153 struct fib_info *fi = container_of(head, struct fib_info, rcu);
154
155 kfree(fi);
156}
157
151void free_fib_info(struct fib_info *fi) 158void free_fib_info(struct fib_info *fi)
152{ 159{
153 if (fi->fib_dead == 0) { 160 if (fi->fib_dead == 0) {
@@ -161,7 +168,7 @@ void free_fib_info(struct fib_info *fi)
161 } endfor_nexthops(fi); 168 } endfor_nexthops(fi);
162 fib_info_cnt--; 169 fib_info_cnt--;
163 release_net(fi->fib_net); 170 release_net(fi->fib_net);
164 kfree(fi); 171 call_rcu(&fi->rcu, free_fib_info_rcu);
165} 172}
166 173
167void fib_release_info(struct fib_info *fi) 174void fib_release_info(struct fib_info *fi)
@@ -553,6 +560,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
553 nh->nh_scope = RT_SCOPE_LINK; 560 nh->nh_scope = RT_SCOPE_LINK;
554 return 0; 561 return 0;
555 } 562 }
563 rcu_read_lock();
556 { 564 {
557 struct flowi fl = { 565 struct flowi fl = {
558 .nl_u = { 566 .nl_u = {
@@ -568,8 +576,10 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
568 if (fl.fl4_scope < RT_SCOPE_LINK) 576 if (fl.fl4_scope < RT_SCOPE_LINK)
569 fl.fl4_scope = RT_SCOPE_LINK; 577 fl.fl4_scope = RT_SCOPE_LINK;
570 err = fib_lookup(net, &fl, &res); 578 err = fib_lookup(net, &fl, &res);
571 if (err) 579 if (err) {
580 rcu_read_unlock();
572 return err; 581 return err;
582 }
573 } 583 }
574 err = -EINVAL; 584 err = -EINVAL;
575 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) 585 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
@@ -585,7 +595,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
585 goto out; 595 goto out;
586 err = 0; 596 err = 0;
587out: 597out:
588 fib_res_put(&res); 598 rcu_read_unlock();
589 return err; 599 return err;
590 } else { 600 } else {
591 struct in_device *in_dev; 601 struct in_device *in_dev;
@@ -879,7 +889,7 @@ failure:
879 889
880/* Note! fib_semantic_match intentionally uses RCU list functions. */ 890/* Note! fib_semantic_match intentionally uses RCU list functions. */
881int fib_semantic_match(struct list_head *head, const struct flowi *flp, 891int fib_semantic_match(struct list_head *head, const struct flowi *flp,
882 struct fib_result *res, int prefixlen) 892 struct fib_result *res, int prefixlen, int fib_flags)
883{ 893{
884 struct fib_alias *fa; 894 struct fib_alias *fa;
885 int nh_sel = 0; 895 int nh_sel = 0;
@@ -943,7 +953,8 @@ out_fill_res:
943 res->type = fa->fa_type; 953 res->type = fa->fa_type;
944 res->scope = fa->fa_scope; 954 res->scope = fa->fa_scope;
945 res->fi = fa->fa_info; 955 res->fi = fa->fa_info;
946 atomic_inc(&res->fi->fib_clntref); 956 if (!(fib_flags & FIB_LOOKUP_NOREF))
957 atomic_inc(&res->fi->fib_clntref);
947 return 0; 958 return 0;
948} 959}
949 960
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index a96e5ec211a0..271c89bdf049 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1342,7 +1342,7 @@ err:
1342/* should be called with rcu_read_lock */ 1342/* should be called with rcu_read_lock */
1343static int check_leaf(struct trie *t, struct leaf *l, 1343static int check_leaf(struct trie *t, struct leaf *l,
1344 t_key key, const struct flowi *flp, 1344 t_key key, const struct flowi *flp,
1345 struct fib_result *res) 1345 struct fib_result *res, int fib_flags)
1346{ 1346{
1347 struct leaf_info *li; 1347 struct leaf_info *li;
1348 struct hlist_head *hhead = &l->list; 1348 struct hlist_head *hhead = &l->list;
@@ -1356,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
1356 if (l->key != (key & ntohl(mask))) 1356 if (l->key != (key & ntohl(mask)))
1357 continue; 1357 continue;
1358 1358
1359 err = fib_semantic_match(&li->falh, flp, res, plen); 1359 err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
1360 1360
1361#ifdef CONFIG_IP_FIB_TRIE_STATS 1361#ifdef CONFIG_IP_FIB_TRIE_STATS
1362 if (err <= 0) 1362 if (err <= 0)
@@ -1372,7 +1372,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
1372} 1372}
1373 1373
1374int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, 1374int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1375 struct fib_result *res) 1375 struct fib_result *res, int fib_flags)
1376{ 1376{
1377 struct trie *t = (struct trie *) tb->tb_data; 1377 struct trie *t = (struct trie *) tb->tb_data;
1378 int ret; 1378 int ret;
@@ -1399,7 +1399,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1399 1399
1400 /* Just a leaf? */ 1400 /* Just a leaf? */
1401 if (IS_LEAF(n)) { 1401 if (IS_LEAF(n)) {
1402 ret = check_leaf(t, (struct leaf *)n, key, flp, res); 1402 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
1403 goto found; 1403 goto found;
1404 } 1404 }
1405 1405
@@ -1424,7 +1424,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1424 } 1424 }
1425 1425
1426 if (IS_LEAF(n)) { 1426 if (IS_LEAF(n)) {
1427 ret = check_leaf(t, (struct leaf *)n, key, flp, res); 1427 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
1428 if (ret > 0) 1428 if (ret > 0)
1429 goto backtrace; 1429 goto backtrace;
1430 goto found; 1430 goto found;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 04e0df82b88c..7864d0c48968 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1773,12 +1773,15 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1773 1773
1774 if (rt->fl.iif == 0) 1774 if (rt->fl.iif == 0)
1775 src = rt->rt_src; 1775 src = rt->rt_src;
1776 else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) { 1776 else {
1777 src = FIB_RES_PREFSRC(res); 1777 rcu_read_lock();
1778 fib_res_put(&res); 1778 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1779 } else 1779 src = FIB_RES_PREFSRC(res);
1780 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1780 else
1781 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1781 RT_SCOPE_UNIVERSE); 1782 RT_SCOPE_UNIVERSE);
1783 rcu_read_unlock();
1784 }
1782 memcpy(addr, &src, 4); 1785 memcpy(addr, &src, 4);
1783} 1786}
1784 1787
@@ -2081,6 +2084,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
2081 * Such approach solves two big problems: 2084 * Such approach solves two big problems:
2082 * 1. Not simplex devices are handled properly. 2085 * 1. Not simplex devices are handled properly.
2083 * 2. IP spoofing attempts are filtered with 100% of guarantee. 2086 * 2. IP spoofing attempts are filtered with 100% of guarantee.
2087 * called with rcu_read_lock()
2084 */ 2088 */
2085 2089
2086static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2090static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2102,7 +2106,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2102 unsigned hash; 2106 unsigned hash;
2103 __be32 spec_dst; 2107 __be32 spec_dst;
2104 int err = -EINVAL; 2108 int err = -EINVAL;
2105 int free_res = 0;
2106 struct net * net = dev_net(dev); 2109 struct net * net = dev_net(dev);
2107 2110
2108 /* IP on this device is disabled. */ 2111 /* IP on this device is disabled. */
@@ -2134,12 +2137,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2134 /* 2137 /*
2135 * Now we are ready to route packet. 2138 * Now we are ready to route packet.
2136 */ 2139 */
2137 if ((err = fib_lookup(net, &fl, &res)) != 0) { 2140 err = fib_lookup(net, &fl, &res);
2141 if (err != 0) {
2138 if (!IN_DEV_FORWARD(in_dev)) 2142 if (!IN_DEV_FORWARD(in_dev))
2139 goto e_hostunreach; 2143 goto e_hostunreach;
2140 goto no_route; 2144 goto no_route;
2141 } 2145 }
2142 free_res = 1;
2143 2146
2144 RT_CACHE_STAT_INC(in_slow_tot); 2147 RT_CACHE_STAT_INC(in_slow_tot);
2145 2148
@@ -2148,8 +2151,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2148 2151
2149 if (res.type == RTN_LOCAL) { 2152 if (res.type == RTN_LOCAL) {
2150 err = fib_validate_source(saddr, daddr, tos, 2153 err = fib_validate_source(saddr, daddr, tos,
2151 net->loopback_dev->ifindex, 2154 net->loopback_dev->ifindex,
2152 dev, &spec_dst, &itag, skb->mark); 2155 dev, &spec_dst, &itag, skb->mark);
2153 if (err < 0) 2156 if (err < 0)
2154 goto martian_source_keep_err; 2157 goto martian_source_keep_err;
2155 if (err) 2158 if (err)
@@ -2164,9 +2167,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2164 goto martian_destination; 2167 goto martian_destination;
2165 2168
2166 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); 2169 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2167done:
2168 if (free_res)
2169 fib_res_put(&res);
2170out: return err; 2170out: return err;
2171 2171
2172brd_input: 2172brd_input:
@@ -2226,7 +2226,7 @@ local_input:
2226 rth->rt_type = res.type; 2226 rth->rt_type = res.type;
2227 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); 2227 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2228 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); 2228 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2229 goto done; 2229 goto out;
2230 2230
2231no_route: 2231no_route:
2232 RT_CACHE_STAT_INC(in_no_route); 2232 RT_CACHE_STAT_INC(in_no_route);
@@ -2249,21 +2249,21 @@ martian_destination:
2249 2249
2250e_hostunreach: 2250e_hostunreach:
2251 err = -EHOSTUNREACH; 2251 err = -EHOSTUNREACH;
2252 goto done; 2252 goto out;
2253 2253
2254e_inval: 2254e_inval:
2255 err = -EINVAL; 2255 err = -EINVAL;
2256 goto done; 2256 goto out;
2257 2257
2258e_nobufs: 2258e_nobufs:
2259 err = -ENOBUFS; 2259 err = -ENOBUFS;
2260 goto done; 2260 goto out;
2261 2261
2262martian_source: 2262martian_source:
2263 err = -EINVAL; 2263 err = -EINVAL;
2264martian_source_keep_err: 2264martian_source_keep_err:
2265 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2265 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2266 goto done; 2266 goto out;
2267} 2267}
2268 2268
2269int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2269int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2349,6 +2349,7 @@ skip_cache:
2349} 2349}
2350EXPORT_SYMBOL(ip_route_input_common); 2350EXPORT_SYMBOL(ip_route_input_common);
2351 2351
2352/* called with rcu_read_lock() */
2352static int __mkroute_output(struct rtable **result, 2353static int __mkroute_output(struct rtable **result,
2353 struct fib_result *res, 2354 struct fib_result *res,
2354 const struct flowi *fl, 2355 const struct flowi *fl,
@@ -2373,18 +2374,13 @@ static int __mkroute_output(struct rtable **result,
2373 if (dev_out->flags & IFF_LOOPBACK) 2374 if (dev_out->flags & IFF_LOOPBACK)
2374 flags |= RTCF_LOCAL; 2375 flags |= RTCF_LOCAL;
2375 2376
2376 rcu_read_lock();
2377 in_dev = __in_dev_get_rcu(dev_out); 2377 in_dev = __in_dev_get_rcu(dev_out);
2378 if (!in_dev) { 2378 if (!in_dev)
2379 rcu_read_unlock();
2380 return -EINVAL; 2379 return -EINVAL;
2381 } 2380
2382 if (res->type == RTN_BROADCAST) { 2381 if (res->type == RTN_BROADCAST) {
2383 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2382 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2384 if (res->fi) { 2383 res->fi = NULL;
2385 fib_info_put(res->fi);
2386 res->fi = NULL;
2387 }
2388 } else if (res->type == RTN_MULTICAST) { 2384 } else if (res->type == RTN_MULTICAST) {
2389 flags |= RTCF_MULTICAST | RTCF_LOCAL; 2385 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2390 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 2386 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
@@ -2394,10 +2390,8 @@ static int __mkroute_output(struct rtable **result,
2394 * default one, but do not gateway in this case. 2390 * default one, but do not gateway in this case.
2395 * Yes, it is hack. 2391 * Yes, it is hack.
2396 */ 2392 */
2397 if (res->fi && res->prefixlen < 4) { 2393 if (res->fi && res->prefixlen < 4)
2398 fib_info_put(res->fi);
2399 res->fi = NULL; 2394 res->fi = NULL;
2400 }
2401 } 2395 }
2402 2396
2403 2397
@@ -2467,6 +2461,7 @@ static int __mkroute_output(struct rtable **result,
2467 return 0; 2461 return 0;
2468} 2462}
2469 2463
2464/* called with rcu_read_lock() */
2470static int ip_mkroute_output(struct rtable **rp, 2465static int ip_mkroute_output(struct rtable **rp,
2471 struct fib_result *res, 2466 struct fib_result *res,
2472 const struct flowi *fl, 2467 const struct flowi *fl,
@@ -2509,7 +2504,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2509 struct fib_result res; 2504 struct fib_result res;
2510 unsigned int flags = 0; 2505 unsigned int flags = 0;
2511 struct net_device *dev_out = NULL; 2506 struct net_device *dev_out = NULL;
2512 int free_res = 0;
2513 int err; 2507 int err;
2514 2508
2515 2509
@@ -2636,15 +2630,12 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2636 err = -ENETUNREACH; 2630 err = -ENETUNREACH;
2637 goto out; 2631 goto out;
2638 } 2632 }
2639 free_res = 1;
2640 2633
2641 if (res.type == RTN_LOCAL) { 2634 if (res.type == RTN_LOCAL) {
2642 if (!fl.fl4_src) 2635 if (!fl.fl4_src)
2643 fl.fl4_src = fl.fl4_dst; 2636 fl.fl4_src = fl.fl4_dst;
2644 dev_out = net->loopback_dev; 2637 dev_out = net->loopback_dev;
2645 fl.oif = dev_out->ifindex; 2638 fl.oif = dev_out->ifindex;
2646 if (res.fi)
2647 fib_info_put(res.fi);
2648 res.fi = NULL; 2639 res.fi = NULL;
2649 flags |= RTCF_LOCAL; 2640 flags |= RTCF_LOCAL;
2650 goto make_route; 2641 goto make_route;
@@ -2668,8 +2659,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2668make_route: 2659make_route:
2669 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2660 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2670 2661
2671 if (free_res)
2672 fib_res_put(&res);
2673out: return err; 2662out: return err;
2674} 2663}
2675 2664