aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/fib_semantics.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2015-08-14 01:43:22 -0400
committerDavid S. Miller <davem@davemloft.net>2015-08-14 01:43:22 -0400
commitd52736e24fe2e927c26817256f8d1a3c8b5d51a0 (patch)
tree945b31e0d4bb51d479771ed3a70948f59f23a775 /net/ipv4/fib_semantics.c
parent0344338bd883e5e4a2f80409ed8260cd65d69e3b (diff)
parent193125dbd8eb292d88feb201f030889b488b0a02 (diff)
Merge branch 'vrf-lite'
David Ahern says: ==================== VRF-lite - v6 In the context of internet scale routing a requirement that always comes up is the need to partition the available routing tables into disjoint routing planes. A specific use case is the multi-tenancy problem where each tenant has their own unique routing tables and at the very least needs different default gateways. This patch allows the ability to create virtual router domains (aka VRFs; VRF-lite to be specific) in the Linux packet forwarding stack. The main observation is that through the use of rules and socket binding to interfaces, all the facilities that we need are already present in the infrastructure. What is missing is a handle that identifies a routing domain and can be used to gather applicable rules/tables and uniquify neighbor selection. The scheme used needs to preserve the notions of ECMP, and general routing principles. This driver is a cross between functionality that the IPVLAN driver and the Team drivers provide where a device is created and packets into/out of the routing domain are shuttled through this device. The device is then used as a handle to identify the applicable rules. The VRF device is thus the layer3 equivalent of a vlan device. The very important point to note is that this is only a Layer3 concept so L2 tools (e.g., LLDP) do not need to be run in each VRF, processes can run in unaware mode or select a VRF to be talking through. Also the behavioral model is a generalized application of the familiar VRF-Lite model with some performance paths that need optimization. (Specifically the output route selector that Roopa, Robert, Thomas and EricB are currently discussing on the MPLS thread) High Level points ================= 1. Simple overlay driver (minimal changes to current stack) * uses the existing fib tables and fib rules infrastructure 2. Modelled closely after the ipvlan driver 3. Uses current API and infrastructure. 
* Applications can use SO_BINDTODEVICE or cmsg device identifiers to pick VRF (ping, traceroute just work) * Standard IP Rules work, and since they are aggregated against the device, scale is manageable 4. Completely orthogonal to Namespaces and only provides separation in the routing plane (and ARP) N2 N1 (all configs here) +---------------+ +--------------+ | | |swp1 :10.0.1.1+----------------------+swp1 :10.0.1.2 | | | | | |swp2 :10.0.2.1+----------------------+swp2 :10.0.2.2 | | | +---------------+ | VRF 1 | | table 5 | | | +---------------+ | | | VRF 2 | N3 | table 6 | +---------------+ | | | | |swp3 :10.0.2.1+----------------------+swp1 :10.0.2.2 | | | | | |swp4 :10.0.3.1+----------------------+swp2 :10.0.3.2 | +--------------+ +---------------+ Given the topology above, the setup needed to get the basic VRF functions working would be Create the VRF devices and associate with a table ip link add vrf1 type vrf table 5 ip link add vrf2 type vrf table 6 Install the lookup rules that map table to VRF domain ip rule add pref 200 oif vrf1 lookup 5 ip rule add pref 200 iif vrf1 lookup 5 ip rule add pref 200 oif vrf2 lookup 6 ip rule add pref 200 iif vrf2 lookup 6 ip link set vrf1 up ip link set vrf2 up Enslave the routing member interfaces ip link set swp1 master vrf1 ip link set swp2 master vrf1 ip link set swp3 master vrf2 ip link set swp4 master vrf2 Connected and local routes are automatically moved from main and local tables to the VRF table. ping using VRF1 is simply ping -I vrf1 10.0.1.2 Design Highlights ================= If a device is enslaved to a VRF device (i.e., associated with a VRF) then: 1. Rx path The master device index is used as the iif for all lookups. 2. Tx path Similarly, for Tx the VRF device oif is used in the flow to direct lookups to the table associated with the VRF via its rule. From there the FLOWI_FLAG_VRFSRC flag is used to indicate that the oif should not be used for FIB table lookups. 3. 
Connected and local routes On link up for a device, connected and local routes are added to the table associated with the VRF device, rather than the local and main tables. 4. Socket lookups Sockets operating in the VRF must be bound to the VRF device. As such socket lookups compare the VRF device index to sk_bound_dev_if. 5. Neighbor entries Neighbor entries are not impacted by the VRF device. Entries are associated with a particular interface; the VRF association is indirect via the interface-to-VRF device enslavement. Version 6 - addressed comments from DaveM - added patch to properly set oif in ip_send_unicast_reply. Needs to be set to VRF device for proper FIB lookup - added patch to handle IP fragments Version 5 - dropped patch regarding socket lookups; no longer needed + removed vrf helpers no longer needed after this patch is dropped - removed dev_open and close operations + no need to reset vrf data on an ifdown and creates problems if a slave is deleted while the vrf interface is down (Thanks, Nikolay) - cleanups for sparse warnings + make C=2 is now clean for vrf driver Version 4 - builds are clean with and without VRF device enabled (no, yes and module) - tightened the driver implementation + device add/delete, slave add/remove, and module unload are all clean - fixed RCU references + with RCU and lock debugging enabled changes are clean through the suite of tests - TX path uses custom dst, so patch refactoring rtable allocation is dropped along with the patch adding rt_nexthop helper - dropped the task patch that adds default bind to interface for sockets and the associated chvrf example command + the patches are a convenience for running unmodified code. They are not needed for the core functionality. Any application with support for SO_BINDTODEVICE works properly with this patch set. 
Version 3 - addressed comments from first 2 RFCs with the exception of the name Nicolas: We will do the name conversion once we agree on what the correct name should be (vrf, mrf or something else) - packets flow through the VRF device in both directions allowing the following: - tcpdump -i vrf<n> - tc rules on vrf device - netfilter rules on vrf device TO-DO ===== 1. IPv6 2. ipsec, xfrms - dst patch accepted into ipsec-next; will post VRF patch once merge happens 3. listen filter to allow 1 socket to work with multiple VRF devices - i.e., bind to VRFs a, b, c only or NOT VRFs e, f, g Eric B: I have ipsec working with VRFs implemented using the VRF driver, including the worst case scenario of complete duplication in the networking config. Thanks to Nikolay for his many, many code reviews whipping the device driver into shape, and bug fixes and ideas from Hannes, Roopa Prabhu, Jon Toppins, Jamal. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/fib_semantics.c')
-rw-r--r--net/ipv4/fib_semantics.c44
1 files changed, 34 insertions, 10 deletions
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 558e196bae0f..b7f1d20a9615 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -670,16 +670,18 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
670 struct fib_result res; 670 struct fib_result res;
671 671
672 if (nh->nh_flags & RTNH_F_ONLINK) { 672 if (nh->nh_flags & RTNH_F_ONLINK) {
673 unsigned int addr_type;
673 674
674 if (cfg->fc_scope >= RT_SCOPE_LINK) 675 if (cfg->fc_scope >= RT_SCOPE_LINK)
675 return -EINVAL; 676 return -EINVAL;
676 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
677 return -EINVAL;
678 dev = __dev_get_by_index(net, nh->nh_oif); 677 dev = __dev_get_by_index(net, nh->nh_oif);
679 if (!dev) 678 if (!dev)
680 return -ENODEV; 679 return -ENODEV;
681 if (!(dev->flags & IFF_UP)) 680 if (!(dev->flags & IFF_UP))
682 return -ENETDOWN; 681 return -ENETDOWN;
682 addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw);
683 if (addr_type != RTN_UNICAST)
684 return -EINVAL;
683 if (!netif_carrier_ok(dev)) 685 if (!netif_carrier_ok(dev))
684 nh->nh_flags |= RTNH_F_LINKDOWN; 686 nh->nh_flags |= RTNH_F_LINKDOWN;
685 nh->nh_dev = dev; 687 nh->nh_dev = dev;
@@ -689,6 +691,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
689 } 691 }
690 rcu_read_lock(); 692 rcu_read_lock();
691 { 693 {
694 struct fib_table *tbl = NULL;
692 struct flowi4 fl4 = { 695 struct flowi4 fl4 = {
693 .daddr = nh->nh_gw, 696 .daddr = nh->nh_gw,
694 .flowi4_scope = cfg->fc_scope + 1, 697 .flowi4_scope = cfg->fc_scope + 1,
@@ -699,8 +702,16 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
699 /* It is not necessary, but requires a bit of thinking */ 702 /* It is not necessary, but requires a bit of thinking */
700 if (fl4.flowi4_scope < RT_SCOPE_LINK) 703 if (fl4.flowi4_scope < RT_SCOPE_LINK)
701 fl4.flowi4_scope = RT_SCOPE_LINK; 704 fl4.flowi4_scope = RT_SCOPE_LINK;
702 err = fib_lookup(net, &fl4, &res, 705
703 FIB_LOOKUP_IGNORE_LINKSTATE); 706 if (cfg->fc_table)
707 tbl = fib_get_table(net, cfg->fc_table);
708
709 if (tbl)
710 err = fib_table_lookup(tbl, &fl4, &res,
711 FIB_LOOKUP_IGNORE_LINKSTATE);
712 else
713 err = fib_lookup(net, &fl4, &res,
714 FIB_LOOKUP_IGNORE_LINKSTATE);
704 if (err) { 715 if (err) {
705 rcu_read_unlock(); 716 rcu_read_unlock();
706 return err; 717 return err;
@@ -836,6 +847,23 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
836 return nh->nh_saddr; 847 return nh->nh_saddr;
837} 848}
838 849
850static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
851{
852 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
853 fib_prefsrc != cfg->fc_dst) {
854 int tb_id = cfg->fc_table;
855
856 if (tb_id == RT_TABLE_MAIN)
857 tb_id = RT_TABLE_LOCAL;
858
859 if (inet_addr_type_table(cfg->fc_nlinfo.nl_net,
860 fib_prefsrc, tb_id) != RTN_LOCAL) {
861 return false;
862 }
863 }
864 return true;
865}
866
839struct fib_info *fib_create_info(struct fib_config *cfg) 867struct fib_info *fib_create_info(struct fib_config *cfg)
840{ 868{
841 int err; 869 int err;
@@ -1031,12 +1059,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
1031 fi->fib_flags |= RTNH_F_LINKDOWN; 1059 fi->fib_flags |= RTNH_F_LINKDOWN;
1032 } 1060 }
1033 1061
1034 if (fi->fib_prefsrc) { 1062 if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc))
1035 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || 1063 goto err_inval;
1036 fi->fib_prefsrc != cfg->fc_dst)
1037 if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
1038 goto err_inval;
1039 }
1040 1064
1041 change_nexthops(fi) { 1065 change_nexthops(fi) {
1042 fib_info_update_nh_saddr(net, nexthop_nh); 1066 fib_info_update_nh_saddr(net, nexthop_nh);