aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/net/vxlan.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/net/vxlan.c')
-rw-r--r--drivers/net/vxlan.c399
1 files changed, 235 insertions, 164 deletions
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 7cee7a3068ec..62a4438c6084 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -33,7 +33,7 @@
33#include <net/arp.h> 33#include <net/arp.h>
34#include <net/ndisc.h> 34#include <net/ndisc.h>
35#include <net/ip.h> 35#include <net/ip.h>
36#include <net/ipip.h> 36#include <net/ip_tunnels.h>
37#include <net/icmp.h> 37#include <net/icmp.h>
38#include <net/udp.h> 38#include <net/udp.h>
39#include <net/rtnetlink.h> 39#include <net/rtnetlink.h>
@@ -81,31 +81,30 @@ struct vxlan_net {
81 struct hlist_head vni_list[VNI_HASH_SIZE]; 81 struct hlist_head vni_list[VNI_HASH_SIZE];
82}; 82};
83 83
84struct vxlan_rdst {
85 struct rcu_head rcu;
86 __be32 remote_ip;
87 __be16 remote_port;
88 u32 remote_vni;
89 u32 remote_ifindex;
90 struct vxlan_rdst *remote_next;
91};
92
84/* Forwarding table entry */ 93/* Forwarding table entry */
85struct vxlan_fdb { 94struct vxlan_fdb {
86 struct hlist_node hlist; /* linked list of entries */ 95 struct hlist_node hlist; /* linked list of entries */
87 struct rcu_head rcu; 96 struct rcu_head rcu;
88 unsigned long updated; /* jiffies */ 97 unsigned long updated; /* jiffies */
89 unsigned long used; 98 unsigned long used;
90 __be32 remote_ip; 99 struct vxlan_rdst remote;
91 u16 state; /* see ndm_state */ 100 u16 state; /* see ndm_state */
92 u8 eth_addr[ETH_ALEN]; 101 u8 eth_addr[ETH_ALEN];
93}; 102};
94 103
95/* Per-cpu network traffic stats */
96struct vxlan_stats {
97 u64 rx_packets;
98 u64 rx_bytes;
99 u64 tx_packets;
100 u64 tx_bytes;
101 struct u64_stats_sync syncp;
102};
103
104/* Pseudo network device */ 104/* Pseudo network device */
105struct vxlan_dev { 105struct vxlan_dev {
106 struct hlist_node hlist; 106 struct hlist_node hlist;
107 struct net_device *dev; 107 struct net_device *dev;
108 struct vxlan_stats __percpu *stats;
109 __u32 vni; /* virtual network id */ 108 __u32 vni; /* virtual network id */
110 __be32 gaddr; /* multicast group */ 109 __be32 gaddr; /* multicast group */
111 __be32 saddr; /* source address */ 110 __be32 saddr; /* source address */
@@ -157,7 +156,8 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id)
157/* Fill in neighbour message in skbuff. */ 156/* Fill in neighbour message in skbuff. */
158static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, 157static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
159 const struct vxlan_fdb *fdb, 158 const struct vxlan_fdb *fdb,
160 u32 portid, u32 seq, int type, unsigned int flags) 159 u32 portid, u32 seq, int type, unsigned int flags,
160 const struct vxlan_rdst *rdst)
161{ 161{
162 unsigned long now = jiffies; 162 unsigned long now = jiffies;
163 struct nda_cacheinfo ci; 163 struct nda_cacheinfo ci;
@@ -176,7 +176,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
176 176
177 if (type == RTM_GETNEIGH) { 177 if (type == RTM_GETNEIGH) {
178 ndm->ndm_family = AF_INET; 178 ndm->ndm_family = AF_INET;
179 send_ip = fdb->remote_ip != 0; 179 send_ip = rdst->remote_ip != htonl(INADDR_ANY);
180 send_eth = !is_zero_ether_addr(fdb->eth_addr); 180 send_eth = !is_zero_ether_addr(fdb->eth_addr);
181 } else 181 } else
182 ndm->ndm_family = AF_BRIDGE; 182 ndm->ndm_family = AF_BRIDGE;
@@ -188,7 +188,17 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
188 if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) 188 if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
189 goto nla_put_failure; 189 goto nla_put_failure;
190 190
191 if (send_ip && nla_put_be32(skb, NDA_DST, fdb->remote_ip)) 191 if (send_ip && nla_put_be32(skb, NDA_DST, rdst->remote_ip))
192 goto nla_put_failure;
193
194 if (rdst->remote_port && rdst->remote_port != vxlan_port &&
195 nla_put_be16(skb, NDA_PORT, rdst->remote_port))
196 goto nla_put_failure;
197 if (rdst->remote_vni != vxlan->vni &&
198 nla_put_be32(skb, NDA_VNI, rdst->remote_vni))
199 goto nla_put_failure;
200 if (rdst->remote_ifindex &&
201 nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
192 goto nla_put_failure; 202 goto nla_put_failure;
193 203
194 ci.ndm_used = jiffies_to_clock_t(now - fdb->used); 204 ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
@@ -211,6 +221,9 @@ static inline size_t vxlan_nlmsg_size(void)
211 return NLMSG_ALIGN(sizeof(struct ndmsg)) 221 return NLMSG_ALIGN(sizeof(struct ndmsg))
212 + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ 222 + nla_total_size(ETH_ALEN) /* NDA_LLADDR */
213 + nla_total_size(sizeof(__be32)) /* NDA_DST */ 223 + nla_total_size(sizeof(__be32)) /* NDA_DST */
224 + nla_total_size(sizeof(__be32)) /* NDA_PORT */
225 + nla_total_size(sizeof(__be32)) /* NDA_VNI */
226 + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
214 + nla_total_size(sizeof(struct nda_cacheinfo)); 227 + nla_total_size(sizeof(struct nda_cacheinfo));
215} 228}
216 229
@@ -225,7 +238,7 @@ static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
225 if (skb == NULL) 238 if (skb == NULL)
226 goto errout; 239 goto errout;
227 240
228 err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0); 241 err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, &fdb->remote);
229 if (err < 0) { 242 if (err < 0) {
230 /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ 243 /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
231 WARN_ON(err == -EMSGSIZE); 244 WARN_ON(err == -EMSGSIZE);
@@ -247,7 +260,8 @@ static void vxlan_ip_miss(struct net_device *dev, __be32 ipa)
247 260
248 memset(&f, 0, sizeof f); 261 memset(&f, 0, sizeof f);
249 f.state = NUD_STALE; 262 f.state = NUD_STALE;
250 f.remote_ip = ipa; /* goes to NDA_DST */ 263 f.remote.remote_ip = ipa; /* goes to NDA_DST */
264 f.remote.remote_vni = VXLAN_N_VID;
251 265
252 vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH); 266 vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
253} 267}
@@ -300,10 +314,38 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
300 return NULL; 314 return NULL;
301} 315}
302 316
317/* Add/update destinations for multicast */
318static int vxlan_fdb_append(struct vxlan_fdb *f,
319 __be32 ip, __u32 port, __u32 vni, __u32 ifindex)
320{
321 struct vxlan_rdst *rd_prev, *rd;
322
323 rd_prev = NULL;
324 for (rd = &f->remote; rd; rd = rd->remote_next) {
325 if (rd->remote_ip == ip &&
326 rd->remote_port == port &&
327 rd->remote_vni == vni &&
328 rd->remote_ifindex == ifindex)
329 return 0;
330 rd_prev = rd;
331 }
332 rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
333 if (rd == NULL)
334 return -ENOBUFS;
335 rd->remote_ip = ip;
336 rd->remote_port = port;
337 rd->remote_vni = vni;
338 rd->remote_ifindex = ifindex;
339 rd->remote_next = NULL;
340 rd_prev->remote_next = rd;
341 return 1;
342}
343
303/* Add new entry to forwarding table -- assumes lock held */ 344/* Add new entry to forwarding table -- assumes lock held */
304static int vxlan_fdb_create(struct vxlan_dev *vxlan, 345static int vxlan_fdb_create(struct vxlan_dev *vxlan,
305 const u8 *mac, __be32 ip, 346 const u8 *mac, __be32 ip,
306 __u16 state, __u16 flags) 347 __u16 state, __u16 flags,
348 __u32 port, __u32 vni, __u32 ifindex)
307{ 349{
308 struct vxlan_fdb *f; 350 struct vxlan_fdb *f;
309 int notify = 0; 351 int notify = 0;
@@ -320,6 +362,14 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan,
320 f->updated = jiffies; 362 f->updated = jiffies;
321 notify = 1; 363 notify = 1;
322 } 364 }
365 if ((flags & NLM_F_APPEND) &&
366 is_multicast_ether_addr(f->eth_addr)) {
367 int rc = vxlan_fdb_append(f, ip, port, vni, ifindex);
368
369 if (rc < 0)
370 return rc;
371 notify |= rc;
372 }
323 } else { 373 } else {
324 if (!(flags & NLM_F_CREATE)) 374 if (!(flags & NLM_F_CREATE))
325 return -ENOENT; 375 return -ENOENT;
@@ -333,7 +383,11 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan,
333 return -ENOMEM; 383 return -ENOMEM;
334 384
335 notify = 1; 385 notify = 1;
336 f->remote_ip = ip; 386 f->remote.remote_ip = ip;
387 f->remote.remote_port = port;
388 f->remote.remote_vni = vni;
389 f->remote.remote_ifindex = ifindex;
390 f->remote.remote_next = NULL;
337 f->state = state; 391 f->state = state;
338 f->updated = f->used = jiffies; 392 f->updated = f->used = jiffies;
339 memcpy(f->eth_addr, mac, ETH_ALEN); 393 memcpy(f->eth_addr, mac, ETH_ALEN);
@@ -349,6 +403,19 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan,
349 return 0; 403 return 0;
350} 404}
351 405
406void vxlan_fdb_free(struct rcu_head *head)
407{
408 struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
409
410 while (f->remote.remote_next) {
411 struct vxlan_rdst *rd = f->remote.remote_next;
412
413 f->remote.remote_next = rd->remote_next;
414 kfree(rd);
415 }
416 kfree(f);
417}
418
352static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) 419static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
353{ 420{
354 netdev_dbg(vxlan->dev, 421 netdev_dbg(vxlan->dev,
@@ -358,7 +425,7 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
358 vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH); 425 vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);
359 426
360 hlist_del_rcu(&f->hlist); 427 hlist_del_rcu(&f->hlist);
361 kfree_rcu(f, rcu); 428 call_rcu(&f->rcu, vxlan_fdb_free);
362} 429}
363 430
364/* Add static entry (via netlink) */ 431/* Add static entry (via netlink) */
@@ -367,7 +434,9 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
367 const unsigned char *addr, u16 flags) 434 const unsigned char *addr, u16 flags)
368{ 435{
369 struct vxlan_dev *vxlan = netdev_priv(dev); 436 struct vxlan_dev *vxlan = netdev_priv(dev);
437 struct net *net = dev_net(vxlan->dev);
370 __be32 ip; 438 __be32 ip;
439 u32 port, vni, ifindex;
371 int err; 440 int err;
372 441
373 if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { 442 if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
@@ -384,8 +453,36 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
384 453
385 ip = nla_get_be32(tb[NDA_DST]); 454 ip = nla_get_be32(tb[NDA_DST]);
386 455
456 if (tb[NDA_PORT]) {
457 if (nla_len(tb[NDA_PORT]) != sizeof(u32))
458 return -EINVAL;
459 port = nla_get_u32(tb[NDA_PORT]);
460 } else
461 port = vxlan_port;
462
463 if (tb[NDA_VNI]) {
464 if (nla_len(tb[NDA_VNI]) != sizeof(u32))
465 return -EINVAL;
466 vni = nla_get_u32(tb[NDA_VNI]);
467 } else
468 vni = vxlan->vni;
469
470 if (tb[NDA_IFINDEX]) {
471 struct net_device *tdev;
472
473 if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
474 return -EINVAL;
475 ifindex = nla_get_u32(tb[NDA_IFINDEX]);
476 tdev = dev_get_by_index(net, ifindex);
477 if (!tdev)
478 return -EADDRNOTAVAIL;
479 dev_put(tdev);
480 } else
481 ifindex = 0;
482
387 spin_lock_bh(&vxlan->hash_lock); 483 spin_lock_bh(&vxlan->hash_lock);
388 err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags); 484 err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags, port,
485 vni, ifindex);
389 spin_unlock_bh(&vxlan->hash_lock); 486 spin_unlock_bh(&vxlan->hash_lock);
390 487
391 return err; 488 return err;
@@ -423,18 +520,21 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
423 int err; 520 int err;
424 521
425 hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { 522 hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
426 if (idx < cb->args[0]) 523 struct vxlan_rdst *rd;
427 goto skip; 524 for (rd = &f->remote; rd; rd = rd->remote_next) {
428 525 if (idx < cb->args[0])
429 err = vxlan_fdb_info(skb, vxlan, f, 526 goto skip;
430 NETLINK_CB(cb->skb).portid, 527
431 cb->nlh->nlmsg_seq, 528 err = vxlan_fdb_info(skb, vxlan, f,
432 RTM_NEWNEIGH, 529 NETLINK_CB(cb->skb).portid,
433 NLM_F_MULTI); 530 cb->nlh->nlmsg_seq,
434 if (err < 0) 531 RTM_NEWNEIGH,
435 break; 532 NLM_F_MULTI, rd);
533 if (err < 0)
534 break;
436skip: 535skip:
437 ++idx; 536 ++idx;
537 }
438 } 538 }
439 } 539 }
440 540
@@ -454,22 +554,23 @@ static void vxlan_snoop(struct net_device *dev,
454 f = vxlan_find_mac(vxlan, src_mac); 554 f = vxlan_find_mac(vxlan, src_mac);
455 if (likely(f)) { 555 if (likely(f)) {
456 f->used = jiffies; 556 f->used = jiffies;
457 if (likely(f->remote_ip == src_ip)) 557 if (likely(f->remote.remote_ip == src_ip))
458 return; 558 return;
459 559
460 if (net_ratelimit()) 560 if (net_ratelimit())
461 netdev_info(dev, 561 netdev_info(dev,
462 "%pM migrated from %pI4 to %pI4\n", 562 "%pM migrated from %pI4 to %pI4\n",
463 src_mac, &f->remote_ip, &src_ip); 563 src_mac, &f->remote.remote_ip, &src_ip);
464 564
465 f->remote_ip = src_ip; 565 f->remote.remote_ip = src_ip;
466 f->updated = jiffies; 566 f->updated = jiffies;
467 } else { 567 } else {
468 /* learned new entry */ 568 /* learned new entry */
469 spin_lock(&vxlan->hash_lock); 569 spin_lock(&vxlan->hash_lock);
470 err = vxlan_fdb_create(vxlan, src_mac, src_ip, 570 err = vxlan_fdb_create(vxlan, src_mac, src_ip,
471 NUD_REACHABLE, 571 NUD_REACHABLE,
472 NLM_F_EXCL|NLM_F_CREATE); 572 NLM_F_EXCL|NLM_F_CREATE,
573 vxlan_port, vxlan->vni, 0);
473 spin_unlock(&vxlan->hash_lock); 574 spin_unlock(&vxlan->hash_lock);
474 } 575 }
475} 576}
@@ -556,7 +657,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
556 struct iphdr *oip; 657 struct iphdr *oip;
557 struct vxlanhdr *vxh; 658 struct vxlanhdr *vxh;
558 struct vxlan_dev *vxlan; 659 struct vxlan_dev *vxlan;
559 struct vxlan_stats *stats; 660 struct pcpu_tstats *stats;
560 __u32 vni; 661 __u32 vni;
561 int err; 662 int err;
562 663
@@ -632,7 +733,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
632 } 733 }
633 } 734 }
634 735
635 stats = this_cpu_ptr(vxlan->stats); 736 stats = this_cpu_ptr(vxlan->dev->tstats);
636 u64_stats_update_begin(&stats->syncp); 737 u64_stats_update_begin(&stats->syncp);
637 stats->rx_packets++; 738 stats->rx_packets++;
638 stats->rx_bytes += skb->len; 739 stats->rx_bytes += skb->len;
@@ -691,7 +792,6 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
691 n = neigh_lookup(&arp_tbl, &tip, dev); 792 n = neigh_lookup(&arp_tbl, &tip, dev);
692 793
693 if (n) { 794 if (n) {
694 struct vxlan_dev *vxlan = netdev_priv(dev);
695 struct vxlan_fdb *f; 795 struct vxlan_fdb *f;
696 struct sk_buff *reply; 796 struct sk_buff *reply;
697 797
@@ -701,7 +801,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
701 } 801 }
702 802
703 f = vxlan_find_mac(vxlan, n->ha); 803 f = vxlan_find_mac(vxlan, n->ha);
704 if (f && f->remote_ip == 0) { 804 if (f && f->remote.remote_ip == htonl(INADDR_ANY)) {
705 /* bridge-local neighbor */ 805 /* bridge-local neighbor */
706 neigh_release(n); 806 neigh_release(n);
707 goto out; 807 goto out;
@@ -763,28 +863,6 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
763 return false; 863 return false;
764} 864}
765 865
766/* Extract dsfield from inner protocol */
767static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
768 const struct sk_buff *skb)
769{
770 if (skb->protocol == htons(ETH_P_IP))
771 return iph->tos;
772 else if (skb->protocol == htons(ETH_P_IPV6))
773 return ipv6_get_dsfield((const struct ipv6hdr *)iph);
774 else
775 return 0;
776}
777
778/* Propogate ECN bits out */
779static inline u8 vxlan_ecn_encap(u8 tos,
780 const struct iphdr *iph,
781 const struct sk_buff *skb)
782{
783 u8 inner = vxlan_get_dsfield(iph, skb);
784
785 return INET_ECN_encapsulate(tos, inner);
786}
787
788static void vxlan_sock_free(struct sk_buff *skb) 866static void vxlan_sock_free(struct sk_buff *skb)
789{ 867{
790 sock_put(skb->sk); 868 sock_put(skb->sk);
@@ -820,48 +898,40 @@ static u16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb)
820 return (((u64) hash * range) >> 32) + vxlan->port_min; 898 return (((u64) hash * range) >> 32) + vxlan->port_min;
821} 899}
822 900
823/* Transmit local packets over Vxlan 901static int handle_offloads(struct sk_buff *skb)
824 * 902{
825 * Outer IP header inherits ECN and DF from inner header. 903 if (skb_is_gso(skb)) {
826 * Outer UDP destination is the VXLAN assigned port. 904 int err = skb_unclone(skb, GFP_ATOMIC);
827 * source port is based on hash of flow 905 if (unlikely(err))
828 */ 906 return err;
829static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) 907
908 skb_shinfo(skb)->gso_type |= (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP);
909 } else if (skb->ip_summed != CHECKSUM_PARTIAL)
910 skb->ip_summed = CHECKSUM_NONE;
911
912 return 0;
913}
914
915static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
916 struct vxlan_rdst *rdst, bool did_rsc)
830{ 917{
831 struct vxlan_dev *vxlan = netdev_priv(dev); 918 struct vxlan_dev *vxlan = netdev_priv(dev);
832 struct rtable *rt; 919 struct rtable *rt;
833 const struct iphdr *old_iph; 920 const struct iphdr *old_iph;
834 struct ethhdr *eth;
835 struct iphdr *iph; 921 struct iphdr *iph;
836 struct vxlanhdr *vxh; 922 struct vxlanhdr *vxh;
837 struct udphdr *uh; 923 struct udphdr *uh;
838 struct flowi4 fl4; 924 struct flowi4 fl4;
839 unsigned int pkt_len = skb->len; 925 unsigned int pkt_len = skb->len;
840 __be32 dst; 926 __be32 dst;
841 __u16 src_port; 927 __u16 src_port, dst_port;
928 u32 vni;
842 __be16 df = 0; 929 __be16 df = 0;
843 __u8 tos, ttl; 930 __u8 tos, ttl;
844 int err;
845 bool did_rsc = false;
846 const struct vxlan_fdb *f;
847
848 skb_reset_mac_header(skb);
849 eth = eth_hdr(skb);
850 931
851 if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP) 932 dst_port = rdst->remote_port ? rdst->remote_port : vxlan_port;
852 return arp_reduce(dev, skb); 933 vni = rdst->remote_vni;
853 else if ((vxlan->flags&VXLAN_F_RSC) && ntohs(eth->h_proto) == ETH_P_IP) 934 dst = rdst->remote_ip;
854 did_rsc = route_shortcircuit(dev, skb);
855
856 f = vxlan_find_mac(vxlan, eth->h_dest);
857 if (f == NULL) {
858 did_rsc = false;
859 dst = vxlan->gaddr;
860 if (!dst && (vxlan->flags & VXLAN_F_L2MISS) &&
861 !is_multicast_ether_addr(eth->h_dest))
862 vxlan_fdb_miss(vxlan, eth->h_dest);
863 } else
864 dst = f->remote_ip;
865 935
866 if (!dst) { 936 if (!dst) {
867 if (did_rsc) { 937 if (did_rsc) {
@@ -871,8 +941,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
871 941
872 /* short-circuited back to local bridge */ 942 /* short-circuited back to local bridge */
873 if (netif_rx(skb) == NET_RX_SUCCESS) { 943 if (netif_rx(skb) == NET_RX_SUCCESS) {
874 struct vxlan_stats *stats = 944 struct pcpu_tstats *stats = this_cpu_ptr(dev->tstats);
875 this_cpu_ptr(vxlan->stats);
876 945
877 u64_stats_update_begin(&stats->syncp); 946 u64_stats_update_begin(&stats->syncp);
878 stats->tx_packets++; 947 stats->tx_packets++;
@@ -904,12 +973,12 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
904 973
905 tos = vxlan->tos; 974 tos = vxlan->tos;
906 if (tos == 1) 975 if (tos == 1)
907 tos = vxlan_get_dsfield(old_iph, skb); 976 tos = ip_tunnel_get_dsfield(old_iph, skb);
908 977
909 src_port = vxlan_src_port(vxlan, skb); 978 src_port = vxlan_src_port(vxlan, skb);
910 979
911 memset(&fl4, 0, sizeof(fl4)); 980 memset(&fl4, 0, sizeof(fl4));
912 fl4.flowi4_oif = vxlan->link; 981 fl4.flowi4_oif = rdst->remote_ifindex;
913 fl4.flowi4_tos = RT_TOS(tos); 982 fl4.flowi4_tos = RT_TOS(tos);
914 fl4.daddr = dst; 983 fl4.daddr = dst;
915 fl4.saddr = vxlan->saddr; 984 fl4.saddr = vxlan->saddr;
@@ -936,13 +1005,13 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
936 1005
937 vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); 1006 vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
938 vxh->vx_flags = htonl(VXLAN_FLAGS); 1007 vxh->vx_flags = htonl(VXLAN_FLAGS);
939 vxh->vx_vni = htonl(vxlan->vni << 8); 1008 vxh->vx_vni = htonl(vni << 8);
940 1009
941 __skb_push(skb, sizeof(*uh)); 1010 __skb_push(skb, sizeof(*uh));
942 skb_reset_transport_header(skb); 1011 skb_reset_transport_header(skb);
943 uh = udp_hdr(skb); 1012 uh = udp_hdr(skb);
944 1013
945 uh->dest = htons(vxlan_port); 1014 uh->dest = htons(dst_port);
946 uh->source = htons(src_port); 1015 uh->source = htons(src_port);
947 1016
948 uh->len = htons(skb->len); 1017 uh->len = htons(skb->len);
@@ -955,7 +1024,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
955 iph->ihl = sizeof(struct iphdr) >> 2; 1024 iph->ihl = sizeof(struct iphdr) >> 2;
956 iph->frag_off = df; 1025 iph->frag_off = df;
957 iph->protocol = IPPROTO_UDP; 1026 iph->protocol = IPPROTO_UDP;
958 iph->tos = vxlan_ecn_encap(tos, old_iph, skb); 1027 iph->tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
959 iph->daddr = dst; 1028 iph->daddr = dst;
960 iph->saddr = fl4.saddr; 1029 iph->saddr = fl4.saddr;
961 iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); 1030 iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
@@ -965,22 +1034,10 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
965 1034
966 vxlan_set_owner(dev, skb); 1035 vxlan_set_owner(dev, skb);
967 1036
968 /* See iptunnel_xmit() */ 1037 if (handle_offloads(skb))
969 if (skb->ip_summed != CHECKSUM_PARTIAL) 1038 goto drop;
970 skb->ip_summed = CHECKSUM_NONE;
971
972 err = ip_local_out(skb);
973 if (likely(net_xmit_eval(err) == 0)) {
974 struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
975 1039
976 u64_stats_update_begin(&stats->syncp); 1040 iptunnel_xmit(skb, dev);
977 stats->tx_packets++;
978 stats->tx_bytes += pkt_len;
979 u64_stats_update_end(&stats->syncp);
980 } else {
981 dev->stats.tx_errors++;
982 dev->stats.tx_aborted_errors++;
983 }
984 return NETDEV_TX_OK; 1041 return NETDEV_TX_OK;
985 1042
986drop: 1043drop:
@@ -994,6 +1051,64 @@ tx_free:
994 return NETDEV_TX_OK; 1051 return NETDEV_TX_OK;
995} 1052}
996 1053
1054/* Transmit local packets over Vxlan
1055 *
1056 * Outer IP header inherits ECN and DF from inner header.
1057 * Outer UDP destination is the VXLAN assigned port.
1058 * source port is based on hash of flow
1059 */
1060static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
1061{
1062 struct vxlan_dev *vxlan = netdev_priv(dev);
1063 struct ethhdr *eth;
1064 bool did_rsc = false;
1065 struct vxlan_rdst group, *rdst0, *rdst;
1066 struct vxlan_fdb *f;
1067 int rc1, rc;
1068
1069 skb_reset_mac_header(skb);
1070 eth = eth_hdr(skb);
1071
1072 if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP)
1073 return arp_reduce(dev, skb);
1074 else if ((vxlan->flags&VXLAN_F_RSC) && ntohs(eth->h_proto) == ETH_P_IP)
1075 did_rsc = route_shortcircuit(dev, skb);
1076
1077 f = vxlan_find_mac(vxlan, eth->h_dest);
1078 if (f == NULL) {
1079 did_rsc = false;
1080 group.remote_port = vxlan_port;
1081 group.remote_vni = vxlan->vni;
1082 group.remote_ip = vxlan->gaddr;
1083 group.remote_ifindex = vxlan->link;
1084 group.remote_next = 0;
1085 rdst0 = &group;
1086
1087 if (group.remote_ip == htonl(INADDR_ANY) &&
1088 (vxlan->flags & VXLAN_F_L2MISS) &&
1089 !is_multicast_ether_addr(eth->h_dest))
1090 vxlan_fdb_miss(vxlan, eth->h_dest);
1091 } else
1092 rdst0 = &f->remote;
1093
1094 rc = NETDEV_TX_OK;
1095
1096 /* if there are multiple destinations, send copies */
1097 for (rdst = rdst0->remote_next; rdst; rdst = rdst->remote_next) {
1098 struct sk_buff *skb1;
1099
1100 skb1 = skb_clone(skb, GFP_ATOMIC);
1101 rc1 = vxlan_xmit_one(skb1, dev, rdst, did_rsc);
1102 if (rc == NETDEV_TX_OK)
1103 rc = rc1;
1104 }
1105
1106 rc1 = vxlan_xmit_one(skb, dev, rdst0, did_rsc);
1107 if (rc == NETDEV_TX_OK)
1108 rc = rc1;
1109 return rc;
1110}
1111
997/* Walk the forwarding table and purge stale entries */ 1112/* Walk the forwarding table and purge stale entries */
998static void vxlan_cleanup(unsigned long arg) 1113static void vxlan_cleanup(unsigned long arg)
999{ 1114{
@@ -1034,10 +1149,8 @@ static void vxlan_cleanup(unsigned long arg)
1034/* Setup stats when device is created */ 1149/* Setup stats when device is created */
1035static int vxlan_init(struct net_device *dev) 1150static int vxlan_init(struct net_device *dev)
1036{ 1151{
1037 struct vxlan_dev *vxlan = netdev_priv(dev); 1152 dev->tstats = alloc_percpu(struct pcpu_tstats);
1038 1153 if (!dev->tstats)
1039 vxlan->stats = alloc_percpu(struct vxlan_stats);
1040 if (!vxlan->stats)
1041 return -ENOMEM; 1154 return -ENOMEM;
1042 1155
1043 return 0; 1156 return 0;
@@ -1093,49 +1206,6 @@ static int vxlan_stop(struct net_device *dev)
1093 return 0; 1206 return 0;
1094} 1207}
1095 1208
1096/* Merge per-cpu statistics */
1097static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev,
1098 struct rtnl_link_stats64 *stats)
1099{
1100 struct vxlan_dev *vxlan = netdev_priv(dev);
1101 struct vxlan_stats tmp, sum = { 0 };
1102 unsigned int cpu;
1103
1104 for_each_possible_cpu(cpu) {
1105 unsigned int start;
1106 const struct vxlan_stats *stats
1107 = per_cpu_ptr(vxlan->stats, cpu);
1108
1109 do {
1110 start = u64_stats_fetch_begin_bh(&stats->syncp);
1111 memcpy(&tmp, stats, sizeof(tmp));
1112 } while (u64_stats_fetch_retry_bh(&stats->syncp, start));
1113
1114 sum.tx_bytes += tmp.tx_bytes;
1115 sum.tx_packets += tmp.tx_packets;
1116 sum.rx_bytes += tmp.rx_bytes;
1117 sum.rx_packets += tmp.rx_packets;
1118 }
1119
1120 stats->tx_bytes = sum.tx_bytes;
1121 stats->tx_packets = sum.tx_packets;
1122 stats->rx_bytes = sum.rx_bytes;
1123 stats->rx_packets = sum.rx_packets;
1124
1125 stats->multicast = dev->stats.multicast;
1126 stats->rx_length_errors = dev->stats.rx_length_errors;
1127 stats->rx_frame_errors = dev->stats.rx_frame_errors;
1128 stats->rx_errors = dev->stats.rx_errors;
1129
1130 stats->tx_dropped = dev->stats.tx_dropped;
1131 stats->tx_carrier_errors = dev->stats.tx_carrier_errors;
1132 stats->tx_aborted_errors = dev->stats.tx_aborted_errors;
1133 stats->collisions = dev->stats.collisions;
1134 stats->tx_errors = dev->stats.tx_errors;
1135
1136 return stats;
1137}
1138
1139/* Stub, nothing needs to be done. */ 1209/* Stub, nothing needs to be done. */
1140static void vxlan_set_multicast_list(struct net_device *dev) 1210static void vxlan_set_multicast_list(struct net_device *dev)
1141{ 1211{
@@ -1146,7 +1216,7 @@ static const struct net_device_ops vxlan_netdev_ops = {
1146 .ndo_open = vxlan_open, 1216 .ndo_open = vxlan_open,
1147 .ndo_stop = vxlan_stop, 1217 .ndo_stop = vxlan_stop,
1148 .ndo_start_xmit = vxlan_xmit, 1218 .ndo_start_xmit = vxlan_xmit,
1149 .ndo_get_stats64 = vxlan_stats64, 1219 .ndo_get_stats64 = ip_tunnel_get_stats64,
1150 .ndo_set_rx_mode = vxlan_set_multicast_list, 1220 .ndo_set_rx_mode = vxlan_set_multicast_list,
1151 .ndo_change_mtu = eth_change_mtu, 1221 .ndo_change_mtu = eth_change_mtu,
1152 .ndo_validate_addr = eth_validate_addr, 1222 .ndo_validate_addr = eth_validate_addr,
@@ -1163,9 +1233,7 @@ static struct device_type vxlan_type = {
1163 1233
1164static void vxlan_free(struct net_device *dev) 1234static void vxlan_free(struct net_device *dev)
1165{ 1235{
1166 struct vxlan_dev *vxlan = netdev_priv(dev); 1236 free_percpu(dev->tstats);
1167
1168 free_percpu(vxlan->stats);
1169 free_netdev(dev); 1237 free_netdev(dev);
1170} 1238}
1171 1239
@@ -1189,8 +1257,10 @@ static void vxlan_setup(struct net_device *dev)
1189 dev->features |= NETIF_F_NETNS_LOCAL; 1257 dev->features |= NETIF_F_NETNS_LOCAL;
1190 dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; 1258 dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM;
1191 dev->features |= NETIF_F_RXCSUM; 1259 dev->features |= NETIF_F_RXCSUM;
1260 dev->features |= NETIF_F_GSO_SOFTWARE;
1192 1261
1193 dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; 1262 dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
1263 dev->hw_features |= NETIF_F_GSO_SOFTWARE;
1194 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 1264 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1195 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1265 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1196 1266
@@ -1555,6 +1625,7 @@ static void __exit vxlan_cleanup_module(void)
1555{ 1625{
1556 rtnl_link_unregister(&vxlan_link_ops); 1626 rtnl_link_unregister(&vxlan_link_ops);
1557 unregister_pernet_device(&vxlan_net_ops); 1627 unregister_pernet_device(&vxlan_net_ops);
1628 rcu_barrier();
1558} 1629}
1559module_exit(vxlan_cleanup_module); 1630module_exit(vxlan_cleanup_module);
1560 1631