aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2012-07-17 07:19:00 -0400
committerDavid S. Miller <davem@davemloft.net>2012-07-17 11:48:50 -0400
commit4895c771c7f006b4b90f9d6b1d2210939ba57b38 (patch)
tree66b132799e33a215b88bf3945965fefa7a0cde24
parent6700c2709c08d74ae2c3c29b84a30da012dbc7f1 (diff)
ipv4: Add FIB nexthop exceptions.
In a regime where we have subnetted route entries, we need a way to persistently store destination-specific learned values such as redirects and PMTU values. This is implemented here via nexthop exceptions. The initial implementation is a 2048 entry hash table with reclaiming starting at chain length 5. A more sophisticated scheme can be devised if that proves necessary. Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/ip_fib.h18
-rw-r--r--net/ipv4/fib_semantics.c23
-rw-r--r--net/ipv4/route.c256
3 files changed, 266 insertions, 31 deletions
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 5697acefeba3..e9ee1ca07087 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -18,6 +18,7 @@
18 18
19#include <net/flow.h> 19#include <net/flow.h>
20#include <linux/seq_file.h> 20#include <linux/seq_file.h>
21#include <linux/rcupdate.h>
21#include <net/fib_rules.h> 22#include <net/fib_rules.h>
22#include <net/inetpeer.h> 23#include <net/inetpeer.h>
23 24
@@ -46,6 +47,22 @@ struct fib_config {
46 47
47struct fib_info; 48struct fib_info;
48 49
/*
 * A per-destination exception attached to a nexthop. The code in this
 * change uses it to remember a redirect-learned gateway (fnhe_gw) and/or
 * a learned path MTU with its expiry (fnhe_pmtu, fnhe_expires) for the
 * destination fnhe_daddr.
 */
50struct fib_nh_exception {
/* Next entry on the same hash chain (RCU-annotated pointer). */
51 struct fib_nh_exception __rcu *fnhe_next;
/* Destination address that lookups compare against. */
52 __be32 fnhe_daddr;
/* Learned path MTU; rt_bind_exception() treats 0 as "nothing recorded". */
53 u32 fnhe_pmtu;
/*
 * Gateway learned from a redirect; 0 is treated as "nothing recorded".
 * NOTE(review): declared u32 but assigned from the __be32 new_gw in
 * __ip_do_redirect() - confirm whether __be32 was intended.
 */
54 u32 fnhe_gw;
/* jiffies value after which fnhe_pmtu is no longer applied to routes. */
55 unsigned long fnhe_expires;
/* jiffies of the last create/lookup; fnhe_oldest() compares these. */
56 unsigned long fnhe_stamp;
57};
58
/*
 * One bucket of a nexthop's exception hash table: the head of a singly
 * linked, RCU-annotated chain of fib_nh_exception entries.
 */
59struct fnhe_hash_bucket {
60 struct fib_nh_exception __rcu *chain;
61};
62
/* Number of buckets allocated for each nexthop's exception table. */
63#define FNHE_HASH_SIZE 2048
/*
 * Chain-length threshold: once a chain holds more than this many entries,
 * find_or_create_fnhe() recycles the oldest entry instead of allocating.
 */
64#define FNHE_RECLAIM_DEPTH 5
65
49struct fib_nh { 66struct fib_nh {
50 struct net_device *nh_dev; 67 struct net_device *nh_dev;
51 struct hlist_node nh_hash; 68 struct hlist_node nh_hash;
@@ -63,6 +80,7 @@ struct fib_nh {
63 __be32 nh_gw; 80 __be32 nh_gw;
64 __be32 nh_saddr; 81 __be32 nh_saddr;
65 int nh_saddr_genid; 82 int nh_saddr_genid;
83 struct fnhe_hash_bucket *nh_exceptions;
66}; 84};
67 85
68/* 86/*
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d71bfbdc0bf4..1e09852df512 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -140,6 +140,27 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
140 }, 140 },
141}; 141};
142 142
/*
 * Free every exception entry on every chain of nh->nh_exceptions, then
 * the bucket array itself. The caller, free_fib_info_rcu(), only invokes
 * this when nh->nh_exceptions is non-NULL.
 *
 * NOTE(review): rcu_dereference() is used here, but no rcu_read_lock() is
 * visible on this teardown path; rcu_dereference_protected() may be the
 * more appropriate accessor - confirm.
 */
143static void free_nh_exceptions(struct fib_nh *nh)
144{
145 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
146 int i;
147
148 for (i = 0; i < FNHE_HASH_SIZE; i++) {
149 struct fib_nh_exception *fnhe;
150
151 fnhe = rcu_dereference(hash[i].chain);
152 while (fnhe) {
153 struct fib_nh_exception *next;
154
/* Fetch the successor before freeing the current entry. */
155 next = rcu_dereference(fnhe->fnhe_next);
156 kfree(fnhe);
157
158 fnhe = next;
159 }
160 }
161 kfree(hash);
162}
163
143/* Release a nexthop info record */ 164/* Release a nexthop info record */
144static void free_fib_info_rcu(struct rcu_head *head) 165static void free_fib_info_rcu(struct rcu_head *head)
145{ 166{
@@ -148,6 +169,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
148 change_nexthops(fi) { 169 change_nexthops(fi) {
149 if (nexthop_nh->nh_dev) 170 if (nexthop_nh->nh_dev)
150 dev_put(nexthop_nh->nh_dev); 171 dev_put(nexthop_nh->nh_dev);
172 if (nexthop_nh->nh_exceptions)
173 free_nh_exceptions(nexthop_nh);
151 } endfor_nexthops(fi); 174 } endfor_nexthops(fi);
152 175
153 release_net(fi->fib_net); 176 release_net(fi->fib_net);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b35d3bfc66cd..a5bd0b4acc61 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1275,14 +1275,130 @@ static void rt_del(unsigned int hash, struct rtable *rt)
1275 spin_unlock_bh(rt_hash_lock_addr(hash)); 1275 spin_unlock_bh(rt_hash_lock_addr(hash));
1276} 1276}
1277 1277
1278static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 1278static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
1279 const struct iphdr *iph,
1280 int oif, u8 tos,
1281 u8 prot, u32 mark, int flow_flags)
1282{
1283 if (sk) {
1284 const struct inet_sock *inet = inet_sk(sk);
1285
1286 oif = sk->sk_bound_dev_if;
1287 mark = sk->sk_mark;
1288 tos = RT_CONN_FLAGS(sk);
1289 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
1290 }
1291 flowi4_init_output(fl4, oif, mark, tos,
1292 RT_SCOPE_UNIVERSE, prot,
1293 flow_flags,
1294 iph->daddr, iph->saddr, 0, 0);
1295}
1296
/*
 * Fill fl4 from a packet: addresses, TOS and protocol come from the skb's
 * IP header, oif from skb->dev->ifindex and mark from skb->mark. When sk
 * is non-NULL, __build_flow_key() replaces oif, mark, tos and prot with
 * values taken from the socket. Flow flags are passed as 0.
 */
1297static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk)
1298{
1299 const struct iphdr *iph = ip_hdr(skb);
1300 int oif = skb->dev->ifindex;
1301 u8 tos = RT_TOS(iph->tos);
1302 u8 prot = iph->protocol;
1303 u32 mark = skb->mark;
1304
1305 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
1306}
1307
/*
 * Fill fl4 purely from socket state (no packet available). The
 * destination is inet->inet_daddr unless the socket's IP options have
 * srr set, in which case opt.faddr is used instead.
 */
1308static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
1309{
1310 const struct inet_sock *inet = inet_sk(sk);
1311 struct ip_options_rcu *inet_opt;
1312 __be32 daddr = inet->inet_daddr;
1313
/* inet_opt is read via rcu_dereference(), so hold the RCU read lock. */
1314 rcu_read_lock();
1315 inet_opt = rcu_dereference(inet->inet_opt);
1316 if (inet_opt && inet_opt->opt.srr)
1317 daddr = inet_opt->opt.faddr;
1318 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
1319 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
1320 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1321 inet_sk_flowi_flags(sk),
1322 daddr, inet->inet_saddr, 0, 0);
1323 rcu_read_unlock();
1324}
1325
/*
 * Build a flow key into fl4: from the packet (plus optional socket) when
 * an skb is supplied, otherwise from the socket alone. With skb == NULL,
 * sk is dereferenced unconditionally by build_sk_flow_key().
 */
1326static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
1327 struct sk_buff *skb)
1328{
1329 if (skb)
1330 build_skb_flow_key(fl4, skb, sk);
1331 else
1332 build_sk_flow_key(fl4, sk);
1333}
1334
/*
 * Taken with spin_lock_bh() around find_or_create_fnhe() and the field
 * updates that follow it in the redirect and PMTU update paths.
 */
1335static DEFINE_SPINLOCK(fnhe_lock);
1336
/*
 * Return the entry with the earliest fnhe_stamp on the chain of the given
 * bucket. The daddr parameter is not used.
 *
 * NOTE(review): the chain head is dereferenced without a NULL check, so
 * this relies on the caller passing a bucket with a non-empty chain.
 */
1337static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
1338{
1339 struct fib_nh_exception *fnhe, *oldest;
1340
1341 oldest = rcu_dereference(hash->chain);
1342 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
1343 fnhe = rcu_dereference(fnhe->fnhe_next)) {
/* time_before() keeps the comparison correct across jiffies wrap. */
1344 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
1345 oldest = fnhe;
1346 }
1347 return oldest;
1348}
1349
/*
 * Look up the exception entry for daddr on nexthop nh. If none exists,
 * either allocate a new entry or, when the chain is already longer than
 * FNHE_RECLAIM_DEPTH, recycle the oldest entry on it. The bucket array
 * nh->nh_exceptions is allocated lazily on first use.
 *
 * Returns the entry with fnhe_stamp set to jiffies, or NULL when a
 * GFP_ATOMIC allocation fails. The callers visible in this change hold
 * fnhe_lock around the call.
 */
1350static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
1351{
1352 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1353 struct fib_nh_exception *fnhe;
1354 int depth;
1355 u32 hval;
1356
1357 if (!hash) {
1358 hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
1359 GFP_ATOMIC);
1360 if (!hash)
1361 return NULL;
1362 }
1363
/*
 * Fold the address bits together and step to the selected bucket.
 * NOTE(review): nothing here reduces hval below FNHE_HASH_SIZE, so
 * hash += hval can point past the end of the bucket array; a mask or
 * modulo appears to be missing.
 */
1364 hval = (__force u32) daddr;
1365 hval ^= (hval >> 11) ^ (hval >> 22);
1366 hash += hval;
1367
/* Walk the chain: an exact match only gets its stamp refreshed. */
1368 depth = 0;
1369 for (fnhe = rcu_dereference(hash->chain); fnhe;
1370 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1371 if (fnhe->fnhe_daddr == daddr)
1372 goto out;
1373 depth++;
1374 }
1375
/*
 * Chain too long: reuse the oldest entry rather than growing it.
 * NOTE(review): hash was already advanced by hval above, so hash + hval
 * applies the bucket offset a second time; fnhe_oldest(hash, ...) looks
 * like the intent. Also, a recycled entry keeps its previous fnhe_pmtu,
 * fnhe_gw and fnhe_expires - only fnhe_daddr is rewritten at out_daddr.
 */
1376 if (depth > FNHE_RECLAIM_DEPTH) {
1377 fnhe = fnhe_oldest(hash + hval, daddr);
1378 goto out_daddr;
1379 }
1380 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
1381 if (!fnhe)
1382 return NULL;
1383
/* Link the new entry at the head, then publish it to RCU readers. */
1384 fnhe->fnhe_next = hash->chain;
1385 rcu_assign_pointer(hash->chain, fnhe);
1386
1387out_daddr:
1388 fnhe->fnhe_daddr = daddr;
1389out:
1390 fnhe->fnhe_stamp = jiffies;
1391 return fnhe;
1392}
1393
1394static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
1279{ 1395{
1280 __be32 new_gw = icmp_hdr(skb)->un.gateway; 1396 __be32 new_gw = icmp_hdr(skb)->un.gateway;
1281 __be32 old_gw = ip_hdr(skb)->saddr; 1397 __be32 old_gw = ip_hdr(skb)->saddr;
1282 struct net_device *dev = skb->dev; 1398 struct net_device *dev = skb->dev;
1283 struct in_device *in_dev; 1399 struct in_device *in_dev;
1400 struct fib_result res;
1284 struct neighbour *n; 1401 struct neighbour *n;
1285 struct rtable *rt;
1286 struct net *net; 1402 struct net *net;
1287 1403
1288 switch (icmp_hdr(skb)->code & 7) { 1404 switch (icmp_hdr(skb)->code & 7) {
@@ -1296,7 +1412,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
1296 return; 1412 return;
1297 } 1413 }
1298 1414
1299 rt = (struct rtable *) dst;
1300 if (rt->rt_gateway != old_gw) 1415 if (rt->rt_gateway != old_gw)
1301 return; 1416 return;
1302 1417
@@ -1320,11 +1435,21 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
1320 goto reject_redirect; 1435 goto reject_redirect;
1321 } 1436 }
1322 1437
1323 n = ipv4_neigh_lookup(dst, NULL, &new_gw); 1438 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1324 if (n) { 1439 if (n) {
1325 if (!(n->nud_state & NUD_VALID)) { 1440 if (!(n->nud_state & NUD_VALID)) {
1326 neigh_event_send(n, NULL); 1441 neigh_event_send(n, NULL);
1327 } else { 1442 } else {
1443 if (fib_lookup(net, fl4, &res) == 0) {
1444 struct fib_nh *nh = &FIB_RES_NH(res);
1445 struct fib_nh_exception *fnhe;
1446
1447 spin_lock_bh(&fnhe_lock);
1448 fnhe = find_or_create_fnhe(nh, fl4->daddr);
1449 if (fnhe)
1450 fnhe->fnhe_gw = new_gw;
1451 spin_unlock_bh(&fnhe_lock);
1452 }
1328 rt->rt_gateway = new_gw; 1453 rt->rt_gateway = new_gw;
1329 rt->rt_flags |= RTCF_REDIRECTED; 1454 rt->rt_flags |= RTCF_REDIRECTED;
1330 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); 1455 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
@@ -1349,6 +1474,17 @@ reject_redirect:
1349 ; 1474 ;
1350} 1475}
1351 1476
/*
 * Redirect handler taking a dst_entry: casts dst to its rtable, builds a
 * flow key from skb/sk via ip_rt_build_flow_key(), and hands both to
 * __ip_do_redirect().
 */
1477static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1478{
1479 struct rtable *rt;
1480 struct flowi4 fl4;
1481
1482 rt = (struct rtable *) dst;
1483
1484 ip_rt_build_flow_key(&fl4, sk, skb);
1485 __ip_do_redirect(rt, skb, &fl4);
1486}
1487
1352static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 1488static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1353{ 1489{
1354 struct rtable *rt = (struct rtable *)dst; 1490 struct rtable *rt = (struct rtable *)dst;
@@ -1508,33 +1644,51 @@ out: kfree_skb(skb);
1508 return 0; 1644 return 0;
1509} 1645}
1510 1646
1511static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 1647static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1512 struct sk_buff *skb, u32 mtu)
1513{ 1648{
1514 struct rtable *rt = (struct rtable *) dst; 1649 struct fib_result res;
1515
1516 dst_confirm(dst);
1517 1650
1518 if (mtu < ip_rt_min_pmtu) 1651 if (mtu < ip_rt_min_pmtu)
1519 mtu = ip_rt_min_pmtu; 1652 mtu = ip_rt_min_pmtu;
1520 1653
1654 if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
1655 struct fib_nh *nh = &FIB_RES_NH(res);
1656 struct fib_nh_exception *fnhe;
1657
1658 spin_lock_bh(&fnhe_lock);
1659 fnhe = find_or_create_fnhe(nh, fl4->daddr);
1660 if (fnhe) {
1661 fnhe->fnhe_pmtu = mtu;
1662 fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
1663 }
1664 spin_unlock_bh(&fnhe_lock);
1665 }
1521 rt->rt_pmtu = mtu; 1666 rt->rt_pmtu = mtu;
1522 dst_set_expires(&rt->dst, ip_rt_mtu_expires); 1667 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1523} 1668}
1524 1669
/*
 * PMTU update handler taking a dst_entry: casts dst to its rtable, builds
 * a flow key from skb/sk via ip_rt_build_flow_key(), and hands both to
 * __ip_rt_update_pmtu() together with the new mtu.
 */
1670static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1671 struct sk_buff *skb, u32 mtu)
1672{
1673 struct rtable *rt = (struct rtable *) dst;
1674 struct flowi4 fl4;
1675
1676 ip_rt_build_flow_key(&fl4, sk, skb);
1677 __ip_rt_update_pmtu(rt, &fl4, mtu);
1678}
1679
1525void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, 1680void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1526 int oif, u32 mark, u8 protocol, int flow_flags) 1681 int oif, u32 mark, u8 protocol, int flow_flags)
1527{ 1682{
1528 const struct iphdr *iph = (const struct iphdr *)skb->data; 1683 const struct iphdr *iph = (const struct iphdr *) skb->data;
1529 struct flowi4 fl4; 1684 struct flowi4 fl4;
1530 struct rtable *rt; 1685 struct rtable *rt;
1531 1686
1532 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, 1687 __build_flow_key(&fl4, NULL, iph, oif,
1533 protocol, flow_flags, 1688 RT_TOS(iph->tos), protocol, mark, flow_flags);
1534 iph->daddr, iph->saddr, 0, 0);
1535 rt = __ip_route_output_key(net, &fl4); 1689 rt = __ip_route_output_key(net, &fl4);
1536 if (!IS_ERR(rt)) { 1690 if (!IS_ERR(rt)) {
1537 ip_rt_update_pmtu(&rt->dst, NULL, skb, mtu); 1691 __ip_rt_update_pmtu(rt, &fl4, mtu);
1538 ip_rt_put(rt); 1692 ip_rt_put(rt);
1539 } 1693 }
1540} 1694}
@@ -1542,27 +1696,31 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1542 1696
1543void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) 1697void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1544{ 1698{
1545 const struct inet_sock *inet = inet_sk(sk); 1699 const struct iphdr *iph = (const struct iphdr *) skb->data;
1700 struct flowi4 fl4;
1701 struct rtable *rt;
1546 1702
1547 return ipv4_update_pmtu(skb, sock_net(sk), mtu, 1703 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1548 sk->sk_bound_dev_if, sk->sk_mark, 1704 rt = __ip_route_output_key(sock_net(sk), &fl4);
1549 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, 1705 if (!IS_ERR(rt)) {
1550 inet_sk_flowi_flags(sk)); 1706 __ip_rt_update_pmtu(rt, &fl4, mtu);
1707 ip_rt_put(rt);
1708 }
1551} 1709}
1552EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); 1710EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1553 1711
1554void ipv4_redirect(struct sk_buff *skb, struct net *net, 1712void ipv4_redirect(struct sk_buff *skb, struct net *net,
1555 int oif, u32 mark, u8 protocol, int flow_flags) 1713 int oif, u32 mark, u8 protocol, int flow_flags)
1556{ 1714{
1557 const struct iphdr *iph = (const struct iphdr *)skb->data; 1715 const struct iphdr *iph = (const struct iphdr *) skb->data;
1558 struct flowi4 fl4; 1716 struct flowi4 fl4;
1559 struct rtable *rt; 1717 struct rtable *rt;
1560 1718
1561 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, 1719 __build_flow_key(&fl4, NULL, iph, oif,
1562 protocol, flow_flags, iph->daddr, iph->saddr, 0, 0); 1720 RT_TOS(iph->tos), protocol, mark, flow_flags);
1563 rt = __ip_route_output_key(net, &fl4); 1721 rt = __ip_route_output_key(net, &fl4);
1564 if (!IS_ERR(rt)) { 1722 if (!IS_ERR(rt)) {
1565 ip_do_redirect(&rt->dst, NULL, skb); 1723 __ip_do_redirect(rt, skb, &fl4);
1566 ip_rt_put(rt); 1724 ip_rt_put(rt);
1567 } 1725 }
1568} 1726}
@@ -1570,12 +1728,16 @@ EXPORT_SYMBOL_GPL(ipv4_redirect);
1570 1728
1571void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) 1729void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1572{ 1730{
1573 const struct inet_sock *inet = inet_sk(sk); 1731 const struct iphdr *iph = (const struct iphdr *) skb->data;
1732 struct flowi4 fl4;
1733 struct rtable *rt;
1574 1734
1575 return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, 1735 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1576 sk->sk_mark, 1736 rt = __ip_route_output_key(sock_net(sk), &fl4);
1577 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, 1737 if (!IS_ERR(rt)) {
1578 inet_sk_flowi_flags(sk)); 1738 __ip_do_redirect(rt, skb, &fl4);
1739 ip_rt_put(rt);
1740 }
1579} 1741}
1580EXPORT_SYMBOL_GPL(ipv4_sk_redirect); 1742EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1581 1743
@@ -1722,14 +1884,46 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1722 dst_init_metrics(&rt->dst, fi->fib_metrics, true); 1884 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1723} 1885}
1724 1886
/*
 * Apply any stored exception for daddr on nexthop nh to route rt: copy a
 * not-yet-expired learned PMTU and a learned gateway into rt, and refresh
 * the entry's fnhe_stamp. rt_set_nexthop() only calls this when
 * nh->nh_exceptions is non-NULL.
 */
1887static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
1888{
1889 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1890 struct fib_nh_exception *fnhe;
1891 u32 hval;
1892
/*
 * Same bucket computation as find_or_create_fnhe().
 * NOTE(review): hval is not reduced below FNHE_HASH_SIZE before it is
 * used as an index into hash[] - a mask or modulo appears to be missing.
 */
1893 hval = (__force u32) daddr;
1894 hval ^= (hval >> 11) ^ (hval >> 22);
1895
1896 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1897 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1898 if (fnhe->fnhe_daddr == daddr) {
1899 if (fnhe->fnhe_pmtu) {
/*
 * NOTE(review): diff is jiffies - expires, which wraps to a huge
 * unsigned value whenever the time_before() branch below is taken.
 * If dst_set_expires() takes a relative timeout (as its other use
 * with ip_rt_mtu_expires suggests), expires - jiffies looks like the
 * intended value - confirm.
 */
1900 unsigned long expires = fnhe->fnhe_expires;
1901 unsigned long diff = jiffies - expires;
1902
1903 if (time_before(jiffies, expires)) {
1904 rt->rt_pmtu = fnhe->fnhe_pmtu;
1905 dst_set_expires(&rt->dst, diff);
1906 }
1907 }
/* A recorded gateway overrides whatever rt_gateway held before. */
1908 if (fnhe->fnhe_gw)
1909 rt->rt_gateway = fnhe->fnhe_gw;
/* Mark the entry as recently used so fnhe_oldest() prefers others. */
1910 fnhe->fnhe_stamp = jiffies;
1911 break;
1912 }
1913 }
1914}
1915
1725static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, 1916static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1726 const struct fib_result *res, 1917 const struct fib_result *res,
1727 struct fib_info *fi, u16 type, u32 itag) 1918 struct fib_info *fi, u16 type, u32 itag)
1728{ 1919{
1729 if (fi) { 1920 if (fi) {
1730 if (FIB_RES_GW(*res) && 1921 struct fib_nh *nh = &FIB_RES_NH(*res);
1731 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1922
1732 rt->rt_gateway = FIB_RES_GW(*res); 1923 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1924 rt->rt_gateway = nh->nh_gw;
1925 if (unlikely(nh->nh_exceptions))
1926 rt_bind_exception(rt, nh, fl4->daddr);
1733 rt_init_metrics(rt, fl4, fi); 1927 rt_init_metrics(rt, fl4, fi);
1734#ifdef CONFIG_IP_ROUTE_CLASSID 1928#ifdef CONFIG_IP_ROUTE_CLASSID
1735 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; 1929 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;