aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: stephen hemminger <stephen@networkplumber.org> 2013-05-16 07:35:20 -0400
committer: David S. Miller <davem@davemloft.net> 2013-05-17 17:06:29 -0400
commit: 553675fb5e9ce3d71aa6cb527f98cd34793c0dbc (patch)
tree: 6345ef556d6eeac40b9a287c9e70f559a99eae03
parent: e998fd413e5e7dac750dfe79c79cbf8f417d41b7 (diff)
vxlan: listen on multiple ports
Commit 823aa873bc782f1c51b1ce8ec6da7cfcaf93836e ("vxlan: allow choosing destination port per vxlan"; Author: stephen hemminger <stephen@networkplumber.org>; Date: Sat Apr 27 11:31:57 2013 +0000) introduced per-vxlan UDP port configuration but only did half of the necessary work. It added a per-vxlan destination port for sending, but overlooked the handling of multiple ports for incoming traffic.

This patch changes the listening port management to handle multiple incoming UDP ports. The earlier per-namespace structure is now a hash list per namespace.

It is also now possible to define the same virtual network id with different UDP port values, which can be useful for migration.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- drivers/net/vxlan.c 263
1 files changed, 181 insertions, 82 deletions
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index ba81f3c39a83..80d359c6310b 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -44,6 +44,8 @@
44 44
45#define VXLAN_VERSION "0.1" 45#define VXLAN_VERSION "0.1"
46 46
47#define PORT_HASH_BITS 8
48#define PORT_HASH_SIZE (1<<PORT_HASH_BITS)
47#define VNI_HASH_BITS 10 49#define VNI_HASH_BITS 10
48#define VNI_HASH_SIZE (1<<VNI_HASH_BITS) 50#define VNI_HASH_SIZE (1<<VNI_HASH_BITS)
49#define FDB_HASH_BITS 8 51#define FDB_HASH_BITS 8
@@ -76,13 +78,24 @@ static bool log_ecn_error = true;
76module_param(log_ecn_error, bool, 0644); 78module_param(log_ecn_error, bool, 0644);
77MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); 79MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
78 80
79/* per-net private data for this module */
80static unsigned int vxlan_net_id; 81static unsigned int vxlan_net_id;
81struct vxlan_net { 82
82 struct socket *sock; /* UDP encap socket */ 83/* per UDP socket information */
84struct vxlan_sock {
85 struct hlist_node hlist;
86 struct rcu_head rcu;
87 struct work_struct del_work;
88 unsigned int refcnt;
89 struct socket *sock;
83 struct hlist_head vni_list[VNI_HASH_SIZE]; 90 struct hlist_head vni_list[VNI_HASH_SIZE];
84}; 91};
85 92
93/* per-network namespace private data for this module */
94struct vxlan_net {
95 struct list_head vxlan_list;
96 struct hlist_head sock_list[PORT_HASH_SIZE];
97};
98
86struct vxlan_rdst { 99struct vxlan_rdst {
87 struct rcu_head rcu; 100 struct rcu_head rcu;
88 __be32 remote_ip; 101 __be32 remote_ip;
@@ -106,7 +119,9 @@ struct vxlan_fdb {
106 119
107/* Pseudo network device */ 120/* Pseudo network device */
108struct vxlan_dev { 121struct vxlan_dev {
109 struct hlist_node hlist; 122 struct hlist_node hlist; /* vni hash table */
123 struct list_head next; /* vxlan's per namespace list */
124 struct vxlan_sock *vn_sock; /* listening socket */
110 struct net_device *dev; 125 struct net_device *dev;
111 struct vxlan_rdst default_dst; /* default destination */ 126 struct vxlan_rdst default_dst; /* default destination */
112 __be32 saddr; /* source address */ 127 __be32 saddr; /* source address */
@@ -135,19 +150,43 @@ struct vxlan_dev {
135/* salt for hash table */ 150/* salt for hash table */
136static u32 vxlan_salt __read_mostly; 151static u32 vxlan_salt __read_mostly;
137 152
138static inline struct hlist_head *vni_head(struct net *net, u32 id) 153/* Virtual Network hash table head */
154static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id)
155{
156 return &vs->vni_list[hash_32(id, VNI_HASH_BITS)];
157}
158
159/* Socket hash table head */
160static inline struct hlist_head *vs_head(struct net *net, __be16 port)
139{ 161{
140 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 162 struct vxlan_net *vn = net_generic(net, vxlan_net_id);
141 163
142 return &vn->vni_list[hash_32(id, VNI_HASH_BITS)]; 164 return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
165}
166
167/* Find VXLAN socket based on network namespace and UDP port */
168static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port)
169{
170 struct vxlan_sock *vs;
171
172 hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
173 if (inet_sk(vs->sock->sk)->inet_sport == port)
174 return vs;
175 }
176 return NULL;
143} 177}
144 178
145/* Look up VNI in a per net namespace table */ 179/* Look up VNI in a per net namespace table */
146static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id) 180static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port)
147{ 181{
182 struct vxlan_sock *vs;
148 struct vxlan_dev *vxlan; 183 struct vxlan_dev *vxlan;
149 184
150 hlist_for_each_entry_rcu(vxlan, vni_head(net, id), hlist) { 185 vs = vxlan_find_port(net, port);
186 if (!vs)
187 return NULL;
188
189 hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) {
151 if (vxlan->default_dst.remote_vni == id) 190 if (vxlan->default_dst.remote_vni == id)
152 return vxlan; 191 return vxlan;
153 } 192 }
@@ -592,20 +631,18 @@ static void vxlan_snoop(struct net_device *dev,
592static bool vxlan_group_used(struct vxlan_net *vn, 631static bool vxlan_group_used(struct vxlan_net *vn,
593 const struct vxlan_dev *this) 632 const struct vxlan_dev *this)
594{ 633{
595 const struct vxlan_dev *vxlan; 634 struct vxlan_dev *vxlan;
596 unsigned h;
597 635
598 for (h = 0; h < VNI_HASH_SIZE; ++h) 636 list_for_each_entry(vxlan, &vn->vxlan_list, next) {
599 hlist_for_each_entry(vxlan, &vn->vni_list[h], hlist) { 637 if (vxlan == this)
600 if (vxlan == this) 638 continue;
601 continue;
602 639
603 if (!netif_running(vxlan->dev)) 640 if (!netif_running(vxlan->dev))
604 continue; 641 continue;
605 642
606 if (vxlan->default_dst.remote_ip == this->default_dst.remote_ip) 643 if (vxlan->default_dst.remote_ip == this->default_dst.remote_ip)
607 return true; 644 return true;
608 } 645 }
609 646
610 return false; 647 return false;
611} 648}
@@ -615,7 +652,7 @@ static int vxlan_join_group(struct net_device *dev)
615{ 652{
616 struct vxlan_dev *vxlan = netdev_priv(dev); 653 struct vxlan_dev *vxlan = netdev_priv(dev);
617 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); 654 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
618 struct sock *sk = vn->sock->sk; 655 struct sock *sk = vxlan->vn_sock->sock->sk;
619 struct ip_mreqn mreq = { 656 struct ip_mreqn mreq = {
620 .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, 657 .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip,
621 .imr_ifindex = vxlan->default_dst.remote_ifindex, 658 .imr_ifindex = vxlan->default_dst.remote_ifindex,
@@ -643,7 +680,7 @@ static int vxlan_leave_group(struct net_device *dev)
643 struct vxlan_dev *vxlan = netdev_priv(dev); 680 struct vxlan_dev *vxlan = netdev_priv(dev);
644 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); 681 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
645 int err = 0; 682 int err = 0;
646 struct sock *sk = vn->sock->sk; 683 struct sock *sk = vxlan->vn_sock->sock->sk;
647 struct ip_mreqn mreq = { 684 struct ip_mreqn mreq = {
648 .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, 685 .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip,
649 .imr_ifindex = vxlan->default_dst.remote_ifindex, 686 .imr_ifindex = vxlan->default_dst.remote_ifindex,
@@ -670,6 +707,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
670 struct vxlanhdr *vxh; 707 struct vxlanhdr *vxh;
671 struct vxlan_dev *vxlan; 708 struct vxlan_dev *vxlan;
672 struct pcpu_tstats *stats; 709 struct pcpu_tstats *stats;
710 __be16 port;
673 __u32 vni; 711 __u32 vni;
674 int err; 712 int err;
675 713
@@ -693,9 +731,11 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
693 731
694 /* Is this VNI defined? */ 732 /* Is this VNI defined? */
695 vni = ntohl(vxh->vx_vni) >> 8; 733 vni = ntohl(vxh->vx_vni) >> 8;
696 vxlan = vxlan_find_vni(sock_net(sk), vni); 734 port = inet_sk(sk)->inet_sport;
735 vxlan = vxlan_find_vni(sock_net(sk), vni, port);
697 if (!vxlan) { 736 if (!vxlan) {
698 netdev_dbg(skb->dev, "unknown vni %d\n", vni); 737 netdev_dbg(skb->dev, "unknown vni %d port %u\n",
738 vni, ntohs(port));
699 goto drop; 739 goto drop;
700 } 740 }
701 741
@@ -875,7 +915,7 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
875 return false; 915 return false;
876} 916}
877 917
878static void vxlan_sock_free(struct sk_buff *skb) 918static void vxlan_sock_put(struct sk_buff *skb)
879{ 919{
880 sock_put(skb->sk); 920 sock_put(skb->sk);
881} 921}
@@ -883,13 +923,13 @@ static void vxlan_sock_free(struct sk_buff *skb)
883/* On transmit, associate with the tunnel socket */ 923/* On transmit, associate with the tunnel socket */
884static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb) 924static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb)
885{ 925{
886 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); 926 struct vxlan_dev *vxlan = netdev_priv(dev);
887 struct sock *sk = vn->sock->sk; 927 struct sock *sk = vxlan->vn_sock->sock->sk;
888 928
889 skb_orphan(skb); 929 skb_orphan(skb);
890 sock_hold(sk); 930 sock_hold(sk);
891 skb->sk = sk; 931 skb->sk = sk;
892 skb->destructor = vxlan_sock_free; 932 skb->destructor = vxlan_sock_put;
893} 933}
894 934
895/* Compute source port for outgoing packet 935/* Compute source port for outgoing packet
@@ -1031,7 +1071,7 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
1031 struct vxlan_dev *dst_vxlan; 1071 struct vxlan_dev *dst_vxlan;
1032 1072
1033 ip_rt_put(rt); 1073 ip_rt_put(rt);
1034 dst_vxlan = vxlan_find_vni(dev_net(dev), vni); 1074 dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port);
1035 if (!dst_vxlan) 1075 if (!dst_vxlan)
1036 goto tx_error; 1076 goto tx_error;
1037 vxlan_encap_bypass(skb, vxlan, dst_vxlan); 1077 vxlan_encap_bypass(skb, vxlan, dst_vxlan);
@@ -1306,6 +1346,7 @@ static void vxlan_setup(struct net_device *dev)
1306 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 1346 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1307 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1347 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1308 1348
1349 INIT_LIST_HEAD(&vxlan->next);
1309 spin_lock_init(&vxlan->hash_lock); 1350 spin_lock_init(&vxlan->hash_lock);
1310 1351
1311 init_timer_deferrable(&vxlan->age_timer); 1352 init_timer_deferrable(&vxlan->age_timer);
@@ -1390,11 +1431,78 @@ static const struct ethtool_ops vxlan_ethtool_ops = {
1390 .get_link = ethtool_op_get_link, 1431 .get_link = ethtool_op_get_link,
1391}; 1432};
1392 1433
1434static void vxlan_del_work(struct work_struct *work)
1435{
1436 struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work);
1437
1438 sk_release_kernel(vs->sock->sk);
1439 kfree_rcu(vs, rcu);
1440}
1441
1442/* Create new listen socket if needed */
1443static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port)
1444{
1445 struct vxlan_sock *vs;
1446 struct sock *sk;
1447 struct sockaddr_in vxlan_addr = {
1448 .sin_family = AF_INET,
1449 .sin_addr.s_addr = htonl(INADDR_ANY),
1450 };
1451 int rc;
1452 unsigned h;
1453
1454 vs = kmalloc(sizeof(*vs), GFP_KERNEL);
1455 if (!vs)
1456 return ERR_PTR(-ENOMEM);
1457
1458 for (h = 0; h < VNI_HASH_SIZE; ++h)
1459 INIT_HLIST_HEAD(&vs->vni_list[h]);
1460
1461 INIT_WORK(&vs->del_work, vxlan_del_work);
1462
1463 /* Create UDP socket for encapsulation receive. */
1464 rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vs->sock);
1465 if (rc < 0) {
1466 pr_debug("UDP socket create failed\n");
1467 kfree(vs);
1468 return ERR_PTR(rc);
1469 }
1470
1471 /* Put in proper namespace */
1472 sk = vs->sock->sk;
1473 sk_change_net(sk, net);
1474
1475 vxlan_addr.sin_port = port;
1476
1477 rc = kernel_bind(vs->sock, (struct sockaddr *) &vxlan_addr,
1478 sizeof(vxlan_addr));
1479 if (rc < 0) {
1480 pr_debug("bind for UDP socket %pI4:%u (%d)\n",
1481 &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc);
1482 sk_release_kernel(sk);
1483 kfree(vs);
1484 return ERR_PTR(rc);
1485 }
1486
1487 /* Disable multicast loopback */
1488 inet_sk(sk)->mc_loop = 0;
1489
1490 /* Mark socket as an encapsulation socket. */
1491 udp_sk(sk)->encap_type = 1;
1492 udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
1493 udp_encap_enable();
1494
1495 vs->refcnt = 1;
1496 return vs;
1497}
1498
1393static int vxlan_newlink(struct net *net, struct net_device *dev, 1499static int vxlan_newlink(struct net *net, struct net_device *dev,
1394 struct nlattr *tb[], struct nlattr *data[]) 1500 struct nlattr *tb[], struct nlattr *data[])
1395{ 1501{
1502 struct vxlan_net *vn = net_generic(net, vxlan_net_id);
1396 struct vxlan_dev *vxlan = netdev_priv(dev); 1503 struct vxlan_dev *vxlan = netdev_priv(dev);
1397 struct vxlan_rdst *dst = &vxlan->default_dst; 1504 struct vxlan_rdst *dst = &vxlan->default_dst;
1505 struct vxlan_sock *vs;
1398 __u32 vni; 1506 __u32 vni;
1399 int err; 1507 int err;
1400 1508
@@ -1402,10 +1510,6 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
1402 return -EINVAL; 1510 return -EINVAL;
1403 1511
1404 vni = nla_get_u32(data[IFLA_VXLAN_ID]); 1512 vni = nla_get_u32(data[IFLA_VXLAN_ID]);
1405 if (vxlan_find_vni(net, vni)) {
1406 pr_info("duplicate VNI %u\n", vni);
1407 return -EEXIST;
1408 }
1409 dst->remote_vni = vni; 1513 dst->remote_vni = vni;
1410 1514
1411 if (data[IFLA_VXLAN_GROUP]) 1515 if (data[IFLA_VXLAN_GROUP])
@@ -1471,22 +1575,58 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
1471 if (data[IFLA_VXLAN_PORT]) 1575 if (data[IFLA_VXLAN_PORT])
1472 vxlan->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); 1576 vxlan->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
1473 1577
1578 if (vxlan_find_vni(net, vni, vxlan->dst_port)) {
1579 pr_info("duplicate VNI %u\n", vni);
1580 return -EEXIST;
1581 }
1582
1583 vs = vxlan_find_port(net, vxlan->dst_port);
1584 if (vs)
1585 ++vs->refcnt;
1586 else {
1587 /* Drop lock because socket create acquires RTNL lock */
1588 rtnl_unlock();
1589 vs = vxlan_socket_create(net, vxlan->dst_port);
1590 rtnl_lock();
1591 if (IS_ERR(vs))
1592 return PTR_ERR(vs);
1593
1594 hlist_add_head_rcu(&vs->hlist, vs_head(net, vxlan->dst_port));
1595 }
1596 vxlan->vn_sock = vs;
1597
1474 SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops); 1598 SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops);
1475 1599
1476 err = register_netdevice(dev); 1600 err = register_netdevice(dev);
1477 if (!err) 1601 if (err) {
1478 hlist_add_head_rcu(&vxlan->hlist, vni_head(net, dst->remote_vni)); 1602 if (--vs->refcnt == 0) {
1603 rtnl_unlock();
1604 sk_release_kernel(vs->sock->sk);
1605 kfree(vs);
1606 rtnl_lock();
1607 }
1608 return err;
1609 }
1479 1610
1480 return err; 1611 list_add(&vxlan->next, &vn->vxlan_list);
1612 hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
1613
1614 return 0;
1481} 1615}
1482 1616
1483static void vxlan_dellink(struct net_device *dev, struct list_head *head) 1617static void vxlan_dellink(struct net_device *dev, struct list_head *head)
1484{ 1618{
1485 struct vxlan_dev *vxlan = netdev_priv(dev); 1619 struct vxlan_dev *vxlan = netdev_priv(dev);
1620 struct vxlan_sock *vs = vxlan->vn_sock;
1486 1621
1487 hlist_del_rcu(&vxlan->hlist); 1622 hlist_del_rcu(&vxlan->hlist);
1488 1623 list_del(&vxlan->next);
1489 unregister_netdevice_queue(dev, head); 1624 unregister_netdevice_queue(dev, head);
1625
1626 if (--vs->refcnt == 0) {
1627 hlist_del_rcu(&vs->hlist);
1628 schedule_work(&vs->del_work);
1629 }
1490} 1630}
1491 1631
1492static size_t vxlan_get_size(const struct net_device *dev) 1632static size_t vxlan_get_size(const struct net_device *dev)
@@ -1572,46 +1712,12 @@ static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
1572static __net_init int vxlan_init_net(struct net *net) 1712static __net_init int vxlan_init_net(struct net *net)
1573{ 1713{
1574 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 1714 struct vxlan_net *vn = net_generic(net, vxlan_net_id);
1575 struct sock *sk;
1576 struct sockaddr_in vxlan_addr = {
1577 .sin_family = AF_INET,
1578 .sin_addr.s_addr = htonl(INADDR_ANY),
1579 };
1580 int rc;
1581 unsigned h; 1715 unsigned h;
1582 1716
1583 /* Create UDP socket for encapsulation receive. */ 1717 INIT_LIST_HEAD(&vn->vxlan_list);
1584 rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
1585 if (rc < 0) {
1586 pr_debug("UDP socket create failed\n");
1587 return rc;
1588 }
1589 /* Put in proper namespace */
1590 sk = vn->sock->sk;
1591 sk_change_net(sk, net);
1592
1593 vxlan_addr.sin_port = htons(vxlan_port);
1594
1595 rc = kernel_bind(vn->sock, (struct sockaddr *) &vxlan_addr,
1596 sizeof(vxlan_addr));
1597 if (rc < 0) {
1598 pr_debug("bind for UDP socket %pI4:%u (%d)\n",
1599 &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc);
1600 sk_release_kernel(sk);
1601 vn->sock = NULL;
1602 return rc;
1603 }
1604
1605 /* Disable multicast loopback */
1606 inet_sk(sk)->mc_loop = 0;
1607 1718
1608 /* Mark socket as an encapsulation socket. */ 1719 for (h = 0; h < PORT_HASH_SIZE; ++h)
1609 udp_sk(sk)->encap_type = 1; 1720 INIT_HLIST_HEAD(&vn->sock_list[h]);
1610 udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
1611 udp_encap_enable();
1612
1613 for (h = 0; h < VNI_HASH_SIZE; ++h)
1614 INIT_HLIST_HEAD(&vn->vni_list[h]);
1615 1721
1616 return 0; 1722 return 0;
1617} 1723}
@@ -1620,18 +1726,11 @@ static __net_exit void vxlan_exit_net(struct net *net)
1620{ 1726{
1621 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 1727 struct vxlan_net *vn = net_generic(net, vxlan_net_id);
1622 struct vxlan_dev *vxlan; 1728 struct vxlan_dev *vxlan;
1623 unsigned h;
1624 1729
1625 rtnl_lock(); 1730 rtnl_lock();
1626 for (h = 0; h < VNI_HASH_SIZE; ++h) 1731 list_for_each_entry(vxlan, &vn->vxlan_list, next)
1627 hlist_for_each_entry(vxlan, &vn->vni_list[h], hlist) 1732 dev_close(vxlan->dev);
1628 dev_close(vxlan->dev);
1629 rtnl_unlock(); 1733 rtnl_unlock();
1630
1631 if (vn->sock) {
1632 sk_release_kernel(vn->sock->sk);
1633 vn->sock = NULL;
1634 }
1635} 1734}
1636 1735
1637static struct pernet_operations vxlan_net_ops = { 1736static struct pernet_operations vxlan_net_ops = {