diff options
| author | Mahesh Bandewar <maheshb@google.com> | 2016-09-16 15:59:19 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2016-09-19 01:25:22 -0400 |
| commit | 4fbae7d83c98c30efcf0a2a2ac55fbb75ef5a1a5 (patch) | |
| tree | 3ea819d38ad4fbbae8d4db166f58451c2a78ee20 | |
| parent | e8bffe0cf964f0330595bb376b74921cccdaac88 (diff) | |
ipvlan: Introduce l3s mode
In a typical IPvlan L3 setup, the master is in the default-ns and
each slave is in a different (slave) ns. In this setup, egress
packet processing for traffic originating from a slave-ns will
hit all NF_HOOKs in the slave-ns as well as the default-ns. However, the
same is not true for ingress processing: all these NF_HOOKs are
hit only in the slave-ns, skipping them in the default-ns.
IPvlan in L3 mode is restrictive and if admins want to deploy
iptables rules in default-ns, this asymmetric data path makes it
impossible to do so.
This patch makes use of l3_rcv() (added as part of the l3mdev
enhancements) to perform an input route lookup on RX packets without
changing skb->dev, and then uses an nf_hook at NF_INET_LOCAL_IN
to change skb->dev just before handing the skb over to L4.
Signed-off-by: Mahesh Bandewar <maheshb@google.com>
CC: David Ahern <dsa@cumulusnetworks.com>
Reviewed-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
| -rw-r--r-- | Documentation/networking/ipvlan.txt | 7 | ||||
| -rw-r--r-- | drivers/net/Kconfig | 1 | ||||
| -rw-r--r-- | drivers/net/ipvlan/ipvlan.h | 6 | ||||
| -rw-r--r-- | drivers/net/ipvlan/ipvlan_core.c | 94 | ||||
| -rw-r--r-- | drivers/net/ipvlan/ipvlan_main.c | 87 | ||||
| -rw-r--r-- | include/uapi/linux/if_link.h | 1 |
6 files changed, 188 insertions, 8 deletions
diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt index 14422f8fcdc4..24196cef7c91 100644 --- a/Documentation/networking/ipvlan.txt +++ b/Documentation/networking/ipvlan.txt | |||
| @@ -22,7 +22,7 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module | |||
| 22 | There are no module parameters for this driver and it can be configured | 22 | There are no module parameters for this driver and it can be configured |
| 23 | using IProute2/ip utility. | 23 | using IProute2/ip utility. |
| 24 | 24 | ||
| 25 | ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | L3 } | 25 | ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | l3 | l3s } |
| 26 | 26 | ||
| 27 | e.g. ip link add link ipvl0 eth0 type ipvlan mode l2 | 27 | e.g. ip link add link ipvl0 eth0 type ipvlan mode l2 |
| 28 | 28 | ||
| @@ -48,6 +48,11 @@ master device for the L2 processing and routing from that instance will be | |||
| 48 | used before packets are queued on the outbound device. In this mode the slaves | 48 | used before packets are queued on the outbound device. In this mode the slaves |
| 49 | will not receive nor can send multicast / broadcast traffic. | 49 | will not receive nor can send multicast / broadcast traffic. |
| 50 | 50 | ||
| 51 | 4.3 L3S mode: | ||
| 52 | This is very similar to the L3 mode except that iptables (conn-tracking) | ||
| 53 | works in this mode and hence it is L3-symmetric (L3s). This will have slightly less | ||
| 54 | performance but that shouldn't matter since you are choosing this mode over plain-L3 | ||
| 55 | mode to make conn-tracking work. | ||
| 51 | 56 | ||
| 52 | 5. What to choose (macvlan vs. ipvlan)? | 57 | 5. What to choose (macvlan vs. ipvlan)? |
| 53 | These two devices are very similar in many regards and the specific use | 58 | These two devices are very similar in many regards and the specific use |
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 0c5415b05ea9..8768a625350d 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig | |||
| @@ -149,6 +149,7 @@ config IPVLAN | |||
| 149 | tristate "IP-VLAN support" | 149 | tristate "IP-VLAN support" |
| 150 | depends on INET | 150 | depends on INET |
| 151 | depends on IPV6 | 151 | depends on IPV6 |
| 152 | depends on NET_L3_MASTER_DEV | ||
| 152 | ---help--- | 153 | ---help--- |
| 153 | This allows one to create virtual devices off of a main interface | 154 | This allows one to create virtual devices off of a main interface |
| 154 | and packets will be delivered based on the dest L3 (IPv6/IPv4 addr) | 155 | and packets will be delivered based on the dest L3 (IPv6/IPv4 addr) |
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 695a5dc9ace3..7e0732f5ea07 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h | |||
| @@ -23,11 +23,13 @@ | |||
| 23 | #include <linux/if_vlan.h> | 23 | #include <linux/if_vlan.h> |
| 24 | #include <linux/ip.h> | 24 | #include <linux/ip.h> |
| 25 | #include <linux/inetdevice.h> | 25 | #include <linux/inetdevice.h> |
| 26 | #include <linux/netfilter.h> | ||
| 26 | #include <net/ip.h> | 27 | #include <net/ip.h> |
| 27 | #include <net/ip6_route.h> | 28 | #include <net/ip6_route.h> |
| 28 | #include <net/rtnetlink.h> | 29 | #include <net/rtnetlink.h> |
| 29 | #include <net/route.h> | 30 | #include <net/route.h> |
| 30 | #include <net/addrconf.h> | 31 | #include <net/addrconf.h> |
| 32 | #include <net/l3mdev.h> | ||
| 31 | 33 | ||
| 32 | #define IPVLAN_DRV "ipvlan" | 34 | #define IPVLAN_DRV "ipvlan" |
| 33 | #define IPV_DRV_VER "0.1" | 35 | #define IPV_DRV_VER "0.1" |
| @@ -124,4 +126,8 @@ struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, | |||
| 124 | const void *iaddr, bool is_v6); | 126 | const void *iaddr, bool is_v6); |
| 125 | bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); | 127 | bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); |
| 126 | void ipvlan_ht_addr_del(struct ipvl_addr *addr); | 128 | void ipvlan_ht_addr_del(struct ipvl_addr *addr); |
| 129 | struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, | ||
| 130 | u16 proto); | ||
| 131 | unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, | ||
| 132 | const struct nf_hook_state *state); | ||
| 127 | #endif /* __IPVLAN_H */ | 133 | #endif /* __IPVLAN_H */ |
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index b5f9511d819e..b4e990743e1d 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c | |||
| @@ -560,6 +560,7 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) | |||
| 560 | case IPVLAN_MODE_L2: | 560 | case IPVLAN_MODE_L2: |
| 561 | return ipvlan_xmit_mode_l2(skb, dev); | 561 | return ipvlan_xmit_mode_l2(skb, dev); |
| 562 | case IPVLAN_MODE_L3: | 562 | case IPVLAN_MODE_L3: |
| 563 | case IPVLAN_MODE_L3S: | ||
| 563 | return ipvlan_xmit_mode_l3(skb, dev); | 564 | return ipvlan_xmit_mode_l3(skb, dev); |
| 564 | } | 565 | } |
| 565 | 566 | ||
| @@ -664,6 +665,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) | |||
| 664 | return ipvlan_handle_mode_l2(pskb, port); | 665 | return ipvlan_handle_mode_l2(pskb, port); |
| 665 | case IPVLAN_MODE_L3: | 666 | case IPVLAN_MODE_L3: |
| 666 | return ipvlan_handle_mode_l3(pskb, port); | 667 | return ipvlan_handle_mode_l3(pskb, port); |
| 668 | case IPVLAN_MODE_L3S: | ||
| 669 | return RX_HANDLER_PASS; | ||
| 667 | } | 670 | } |
| 668 | 671 | ||
| 669 | /* Should not reach here */ | 672 | /* Should not reach here */ |
| @@ -672,3 +675,94 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) | |||
| 672 | kfree_skb(skb); | 675 | kfree_skb(skb); |
| 673 | return RX_HANDLER_CONSUMED; | 676 | return RX_HANDLER_CONSUMED; |
| 674 | } | 677 | } |
| 678 | |||
| 679 | static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb, | ||
| 680 | struct net_device *dev) | ||
| 681 | { | ||
| 682 | struct ipvl_addr *addr = NULL; | ||
| 683 | struct ipvl_port *port; | ||
| 684 | void *lyr3h; | ||
| 685 | int addr_type; | ||
| 686 | |||
| 687 | if (!dev || !netif_is_ipvlan_port(dev)) | ||
| 688 | goto out; | ||
| 689 | |||
| 690 | port = ipvlan_port_get_rcu(dev); | ||
| 691 | if (!port || port->mode != IPVLAN_MODE_L3S) | ||
| 692 | goto out; | ||
| 693 | |||
| 694 | lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); | ||
| 695 | if (!lyr3h) | ||
| 696 | goto out; | ||
| 697 | |||
| 698 | addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); | ||
| 699 | out: | ||
| 700 | return addr; | ||
| 701 | } | ||
| 702 | |||
| 703 | struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, | ||
| 704 | u16 proto) | ||
| 705 | { | ||
| 706 | struct ipvl_addr *addr; | ||
| 707 | struct net_device *sdev; | ||
| 708 | |||
| 709 | addr = ipvlan_skb_to_addr(skb, dev); | ||
| 710 | if (!addr) | ||
| 711 | goto out; | ||
| 712 | |||
| 713 | sdev = addr->master->dev; | ||
| 714 | switch (proto) { | ||
| 715 | case AF_INET: | ||
| 716 | { | ||
| 717 | int err; | ||
| 718 | struct iphdr *ip4h = ip_hdr(skb); | ||
| 719 | |||
| 720 | err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr, | ||
| 721 | ip4h->tos, sdev); | ||
| 722 | if (unlikely(err)) | ||
| 723 | goto out; | ||
| 724 | break; | ||
| 725 | } | ||
| 726 | case AF_INET6: | ||
| 727 | { | ||
| 728 | struct dst_entry *dst; | ||
| 729 | struct ipv6hdr *ip6h = ipv6_hdr(skb); | ||
| 730 | int flags = RT6_LOOKUP_F_HAS_SADDR; | ||
| 731 | struct flowi6 fl6 = { | ||
| 732 | .flowi6_iif = sdev->ifindex, | ||
| 733 | .daddr = ip6h->daddr, | ||
| 734 | .saddr = ip6h->saddr, | ||
| 735 | .flowlabel = ip6_flowinfo(ip6h), | ||
| 736 | .flowi6_mark = skb->mark, | ||
| 737 | .flowi6_proto = ip6h->nexthdr, | ||
| 738 | }; | ||
| 739 | |||
| 740 | skb_dst_drop(skb); | ||
| 741 | dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, flags); | ||
| 742 | skb_dst_set(skb, dst); | ||
| 743 | break; | ||
| 744 | } | ||
| 745 | default: | ||
| 746 | break; | ||
| 747 | } | ||
| 748 | |||
| 749 | out: | ||
| 750 | return skb; | ||
| 751 | } | ||
| 752 | |||
| 753 | unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, | ||
| 754 | const struct nf_hook_state *state) | ||
| 755 | { | ||
| 756 | struct ipvl_addr *addr; | ||
| 757 | unsigned int len; | ||
| 758 | |||
| 759 | addr = ipvlan_skb_to_addr(skb, skb->dev); | ||
| 760 | if (!addr) | ||
| 761 | goto out; | ||
| 762 | |||
| 763 | skb->dev = addr->master->dev; | ||
| 764 | len = skb->len + ETH_HLEN; | ||
| 765 | ipvlan_count_rx(addr->master, len, true, false); | ||
| 766 | out: | ||
| 767 | return NF_ACCEPT; | ||
| 768 | } | ||
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 18b4e8c7f68a..f442eb366863 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c | |||
| @@ -9,24 +9,87 @@ | |||
| 9 | 9 | ||
| 10 | #include "ipvlan.h" | 10 | #include "ipvlan.h" |
| 11 | 11 | ||
| 12 | static u32 ipvl_nf_hook_refcnt = 0; | ||
| 13 | |||
| 14 | static struct nf_hook_ops ipvl_nfops[] __read_mostly = { | ||
| 15 | { | ||
| 16 | .hook = ipvlan_nf_input, | ||
| 17 | .pf = NFPROTO_IPV4, | ||
| 18 | .hooknum = NF_INET_LOCAL_IN, | ||
| 19 | .priority = INT_MAX, | ||
| 20 | }, | ||
| 21 | { | ||
| 22 | .hook = ipvlan_nf_input, | ||
| 23 | .pf = NFPROTO_IPV6, | ||
| 24 | .hooknum = NF_INET_LOCAL_IN, | ||
| 25 | .priority = INT_MAX, | ||
| 26 | }, | ||
| 27 | }; | ||
| 28 | |||
| 29 | static struct l3mdev_ops ipvl_l3mdev_ops __read_mostly = { | ||
| 30 | .l3mdev_l3_rcv = ipvlan_l3_rcv, | ||
| 31 | }; | ||
| 32 | |||
| 12 | static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) | 33 | static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) |
| 13 | { | 34 | { |
| 14 | ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj; | 35 | ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj; |
| 15 | } | 36 | } |
| 16 | 37 | ||
| 17 | static void ipvlan_set_port_mode(struct ipvl_port *port, u16 nval) | 38 | static int ipvlan_register_nf_hook(void) |
| 39 | { | ||
| 40 | int err = 0; | ||
| 41 | |||
| 42 | if (!ipvl_nf_hook_refcnt) { | ||
| 43 | err = _nf_register_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); | ||
| 44 | if (!err) | ||
| 45 | ipvl_nf_hook_refcnt = 1; | ||
| 46 | } else { | ||
| 47 | ipvl_nf_hook_refcnt++; | ||
| 48 | } | ||
| 49 | |||
| 50 | return err; | ||
| 51 | } | ||
| 52 | |||
| 53 | static void ipvlan_unregister_nf_hook(void) | ||
| 54 | { | ||
| 55 | WARN_ON(!ipvl_nf_hook_refcnt); | ||
| 56 | |||
| 57 | ipvl_nf_hook_refcnt--; | ||
| 58 | if (!ipvl_nf_hook_refcnt) | ||
| 59 | _nf_unregister_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); | ||
| 60 | } | ||
| 61 | |||
| 62 | static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval) | ||
| 18 | { | 63 | { |
| 19 | struct ipvl_dev *ipvlan; | 64 | struct ipvl_dev *ipvlan; |
| 65 | struct net_device *mdev = port->dev; | ||
| 66 | int err = 0; | ||
| 20 | 67 | ||
| 68 | ASSERT_RTNL(); | ||
| 21 | if (port->mode != nval) { | 69 | if (port->mode != nval) { |
| 70 | if (nval == IPVLAN_MODE_L3S) { | ||
| 71 | /* New mode is L3S */ | ||
| 72 | err = ipvlan_register_nf_hook(); | ||
| 73 | if (!err) { | ||
| 74 | mdev->l3mdev_ops = &ipvl_l3mdev_ops; | ||
| 75 | mdev->priv_flags |= IFF_L3MDEV_MASTER; | ||
| 76 | } else | ||
| 77 | return err; | ||
| 78 | } else if (port->mode == IPVLAN_MODE_L3S) { | ||
| 79 | /* Old mode was L3S */ | ||
| 80 | mdev->priv_flags &= ~IFF_L3MDEV_MASTER; | ||
| 81 | ipvlan_unregister_nf_hook(); | ||
| 82 | mdev->l3mdev_ops = NULL; | ||
| 83 | } | ||
| 22 | list_for_each_entry(ipvlan, &port->ipvlans, pnode) { | 84 | list_for_each_entry(ipvlan, &port->ipvlans, pnode) { |
| 23 | if (nval == IPVLAN_MODE_L3) | 85 | if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) |
| 24 | ipvlan->dev->flags |= IFF_NOARP; | 86 | ipvlan->dev->flags |= IFF_NOARP; |
| 25 | else | 87 | else |
| 26 | ipvlan->dev->flags &= ~IFF_NOARP; | 88 | ipvlan->dev->flags &= ~IFF_NOARP; |
| 27 | } | 89 | } |
| 28 | port->mode = nval; | 90 | port->mode = nval; |
| 29 | } | 91 | } |
| 92 | return err; | ||
| 30 | } | 93 | } |
| 31 | 94 | ||
| 32 | static int ipvlan_port_create(struct net_device *dev) | 95 | static int ipvlan_port_create(struct net_device *dev) |
| @@ -74,6 +137,11 @@ static void ipvlan_port_destroy(struct net_device *dev) | |||
| 74 | struct ipvl_port *port = ipvlan_port_get_rtnl(dev); | 137 | struct ipvl_port *port = ipvlan_port_get_rtnl(dev); |
| 75 | 138 | ||
| 76 | dev->priv_flags &= ~IFF_IPVLAN_MASTER; | 139 | dev->priv_flags &= ~IFF_IPVLAN_MASTER; |
| 140 | if (port->mode == IPVLAN_MODE_L3S) { | ||
| 141 | dev->priv_flags &= ~IFF_L3MDEV_MASTER; | ||
| 142 | ipvlan_unregister_nf_hook(); | ||
| 143 | dev->l3mdev_ops = NULL; | ||
| 144 | } | ||
| 77 | netdev_rx_handler_unregister(dev); | 145 | netdev_rx_handler_unregister(dev); |
| 78 | cancel_work_sync(&port->wq); | 146 | cancel_work_sync(&port->wq); |
| 79 | __skb_queue_purge(&port->backlog); | 147 | __skb_queue_purge(&port->backlog); |
| @@ -132,7 +200,8 @@ static int ipvlan_open(struct net_device *dev) | |||
| 132 | struct net_device *phy_dev = ipvlan->phy_dev; | 200 | struct net_device *phy_dev = ipvlan->phy_dev; |
| 133 | struct ipvl_addr *addr; | 201 | struct ipvl_addr *addr; |
| 134 | 202 | ||
| 135 | if (ipvlan->port->mode == IPVLAN_MODE_L3) | 203 | if (ipvlan->port->mode == IPVLAN_MODE_L3 || |
| 204 | ipvlan->port->mode == IPVLAN_MODE_L3S) | ||
| 136 | dev->flags |= IFF_NOARP; | 205 | dev->flags |= IFF_NOARP; |
| 137 | else | 206 | else |
| 138 | dev->flags &= ~IFF_NOARP; | 207 | dev->flags &= ~IFF_NOARP; |
| @@ -372,13 +441,14 @@ static int ipvlan_nl_changelink(struct net_device *dev, | |||
| 372 | { | 441 | { |
| 373 | struct ipvl_dev *ipvlan = netdev_priv(dev); | 442 | struct ipvl_dev *ipvlan = netdev_priv(dev); |
| 374 | struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); | 443 | struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); |
| 444 | int err = 0; | ||
| 375 | 445 | ||
| 376 | if (data && data[IFLA_IPVLAN_MODE]) { | 446 | if (data && data[IFLA_IPVLAN_MODE]) { |
| 377 | u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); | 447 | u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); |
| 378 | 448 | ||
| 379 | ipvlan_set_port_mode(port, nmode); | 449 | err = ipvlan_set_port_mode(port, nmode); |
| 380 | } | 450 | } |
| 381 | return 0; | 451 | return err; |
| 382 | } | 452 | } |
| 383 | 453 | ||
| 384 | static size_t ipvlan_nl_getsize(const struct net_device *dev) | 454 | static size_t ipvlan_nl_getsize(const struct net_device *dev) |
| @@ -473,10 +543,13 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev, | |||
| 473 | unregister_netdevice(dev); | 543 | unregister_netdevice(dev); |
| 474 | return err; | 544 | return err; |
| 475 | } | 545 | } |
| 546 | err = ipvlan_set_port_mode(port, mode); | ||
| 547 | if (err) { | ||
| 548 | unregister_netdevice(dev); | ||
| 549 | return err; | ||
| 550 | } | ||
| 476 | 551 | ||
| 477 | list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); | 552 | list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); |
| 478 | ipvlan_set_port_mode(port, mode); | ||
| 479 | |||
| 480 | netif_stacked_transfer_operstate(phy_dev, dev); | 553 | netif_stacked_transfer_operstate(phy_dev, dev); |
| 481 | return 0; | 554 | return 0; |
| 482 | } | 555 | } |
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2351776a724f..7ec9e99d5491 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h | |||
| @@ -464,6 +464,7 @@ enum { | |||
| 464 | enum ipvlan_mode { | 464 | enum ipvlan_mode { |
| 465 | IPVLAN_MODE_L2 = 0, | 465 | IPVLAN_MODE_L2 = 0, |
| 466 | IPVLAN_MODE_L3, | 466 | IPVLAN_MODE_L3, |
| 467 | IPVLAN_MODE_L3S, | ||
| 467 | IPVLAN_MODE_MAX | 468 | IPVLAN_MODE_MAX |
| 468 | }; | 469 | }; |
| 469 | 470 | ||
