author     stephen hemminger <shemminger@vyatta.com>    2012-10-01 08:32:35 -0400
committer  David S. Miller <davem@davemloft.net>    2012-10-01 18:39:45 -0400
commit     d342894c5d2f8c7df194c793ec4059656e09ca31 (patch)
tree       13dec930ffdfa590c869488203584ea0c6b5cf7c /drivers/net/vxlan.c
parent     193ba924524e6afe192217982b2c2d67e4715d33 (diff)
vxlan: virtual extensible lan
This is an implementation of Virtual eXtensible Local Area Network (VXLAN) as described in the draft RFC:
http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02

The driver integrates Virtual Tunnel Endpoint (VTEP) functionality that learns MAC to IP address mappings.

This implementation has so far been tested only against the Linux userspace implementation using TAP, not against other vendors' equipment.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
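For reference, the on-the-wire format is minimal: each Ethernet frame is carried inside outer IP and UDP headers followed by an 8-byte VXLAN header, a flags word whose only required bit is 0x08000000 and a 24-bit Virtual Network Identifier (VNI) stored in the upper three bytes of vx_vni (the low byte is reserved and must be zero). Below is a minimal standalone userspace sketch, not part of the patch, that mirrors the header packing in vxlan_xmit() and the reserved-bits check in vxlan_udp_encap_recv():

	/* Hypothetical demo of the VXLAN header layout; not kernel code. */
	#include <stdint.h>
	#include <stdio.h>
	#include <arpa/inet.h>

	struct vxlanhdr {
		uint32_t vx_flags;	/* only 0x08000000 (the "I" flag) may be set */
		uint32_t vx_vni;	/* VNI in bits 8..31, low byte reserved */
	};

	int main(void)
	{
		struct vxlanhdr vxh;
		uint32_t vni = 42;

		/* Encapsulation: set the required flag, shift the VNI up by 8 */
		vxh.vx_flags = htonl(0x08000000);
		vxh.vx_vni = htonl(vni << 8);

		/* Decapsulation: drop packets with any reserved bit set */
		if (vxh.vx_flags != htonl(0x08000000) ||
		    (vxh.vx_vni & htonl(0xff))) {
			fprintf(stderr, "invalid vxlan header\n");
			return 1;
		}
		printf("vni = %u\n", ntohl(vxh.vx_vni) >> 8);
		return 0;
	}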
Diffstat (limited to 'drivers/net/vxlan.c')
-rw-r--r--    drivers/net/vxlan.c    1217
1 file changed, 1217 insertions(+), 0 deletions(-)
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
new file mode 100644
index 000000000000..f87a98f1aec2
--- /dev/null
+++ b/drivers/net/vxlan.c
@@ -0,0 +1,1217 @@
1/*
2 * VXLAN: Virtual eXtensible Local Area Network
3 *
4 * Copyright (c) 2012 Vyatta Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * TODO
11 * - use IANA UDP port number (when defined)
12 * - IPv6 (not in RFC)
13 */
14
15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16
17#include <linux/kernel.h>
18#include <linux/types.h>
19#include <linux/module.h>
20#include <linux/errno.h>
21#include <linux/slab.h>
22#include <linux/skbuff.h>
23#include <linux/rculist.h>
24#include <linux/netdevice.h>
25#include <linux/in.h>
26#include <linux/ip.h>
27#include <linux/udp.h>
28#include <linux/igmp.h>
29#include <linux/etherdevice.h>
30#include <linux/if_ether.h>
31#include <linux/version.h>
32#include <linux/hash.h>
33#include <net/ip.h>
34#include <net/icmp.h>
35#include <net/udp.h>
36#include <net/rtnetlink.h>
37#include <net/route.h>
38#include <net/dsfield.h>
39#include <net/inet_ecn.h>
40#include <net/net_namespace.h>
41#include <net/netns/generic.h>
42
43#define VXLAN_VERSION "0.1"
44
45#define VNI_HASH_BITS 10
46#define VNI_HASH_SIZE (1<<VNI_HASH_BITS)
47#define FDB_HASH_BITS 8
48#define FDB_HASH_SIZE (1<<FDB_HASH_BITS)
49#define FDB_AGE_DEFAULT 300 /* 5 min */
50#define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */
51
52#define VXLAN_N_VID (1u << 24)
53#define VXLAN_VID_MASK (VXLAN_N_VID - 1)
54/* VLAN + IP header + UDP + VXLAN */
55#define VXLAN_HEADROOM (4 + 20 + 8 + 8)
56
57#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */
58
59/* VXLAN protocol header */
60struct vxlanhdr {
61 __be32 vx_flags;
62 __be32 vx_vni;
63};
64
65/* UDP port for VXLAN traffic. */
66static unsigned int vxlan_port __read_mostly = 8472;
67module_param_named(udp_port, vxlan_port, uint, 0444);
68MODULE_PARM_DESC(udp_port, "Destination UDP port");
69
70static bool log_ecn_error = true;
71module_param(log_ecn_error, bool, 0644);
72MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
73
74/* per-net private data for this module */
75static unsigned int vxlan_net_id;
76struct vxlan_net {
77 struct socket *sock; /* UDP encap socket */
78 struct hlist_head vni_list[VNI_HASH_SIZE];
79};
80
81/* Forwarding table entry */
82struct vxlan_fdb {
83 struct hlist_node hlist; /* linked list of entries */
84 struct rcu_head rcu;
85 unsigned long updated; /* jiffies */
86 unsigned long used;
87 __be32 remote_ip;
88 u16 state; /* see ndm_state */
89 u8 eth_addr[ETH_ALEN];
90};
91
92/* Per-cpu network traffic stats */
93struct vxlan_stats {
94 u64 rx_packets;
95 u64 rx_bytes;
96 u64 tx_packets;
97 u64 tx_bytes;
98 struct u64_stats_sync syncp;
99};
100
101/* Pseudo network device */
102struct vxlan_dev {
103 struct hlist_node hlist;
104 struct net_device *dev;
105 struct vxlan_stats __percpu *stats;
106 __u32 vni; /* virtual network id */
107 __be32 gaddr; /* multicast group */
108 __be32 saddr; /* source address */
109 unsigned int link; /* link to multicast over */
110 __u8 tos; /* TOS override */
111 __u8 ttl;
112 bool learn;
113
114 unsigned long age_interval;
115 struct timer_list age_timer;
116 spinlock_t hash_lock;
117 unsigned int addrcnt;
118 unsigned int addrmax;
119 unsigned int addrexceeded;
120
121 struct hlist_head fdb_head[FDB_HASH_SIZE];
122};
123
124/* salt for hash table */
125static u32 vxlan_salt __read_mostly;
126
127static inline struct hlist_head *vni_head(struct net *net, u32 id)
128{
129 struct vxlan_net *vn = net_generic(net, vxlan_net_id);
130
131 return &vn->vni_list[hash_32(id, VNI_HASH_BITS)];
132}
133
134/* Look up VNI in a per net namespace table */
135static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id)
136{
137 struct vxlan_dev *vxlan;
138 struct hlist_node *node;
139
140 hlist_for_each_entry_rcu(vxlan, node, vni_head(net, id), hlist) {
141 if (vxlan->vni == id)
142 return vxlan;
143 }
144
145 return NULL;
146}
147
148/* Fill in neighbour message in skbuff. */
149static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
150 const struct vxlan_fdb *fdb,
151 u32 portid, u32 seq, int type, unsigned int flags)
152{
153 unsigned long now = jiffies;
154 struct nda_cacheinfo ci;
155 struct nlmsghdr *nlh;
156 struct ndmsg *ndm;
157
158 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
159 if (nlh == NULL)
160 return -EMSGSIZE;
161
162 ndm = nlmsg_data(nlh);
163 memset(ndm, 0, sizeof(*ndm));
164 ndm->ndm_family = AF_BRIDGE;
165 ndm->ndm_state = fdb->state;
166 ndm->ndm_ifindex = vxlan->dev->ifindex;
167 ndm->ndm_flags = NTF_SELF;
168 ndm->ndm_type = NDA_DST;
169
170 if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
171 goto nla_put_failure;
172
173 if (nla_put_be32(skb, NDA_DST, fdb->remote_ip))
174 goto nla_put_failure;
175
176 ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
177 ci.ndm_confirmed = 0;
178 ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated);
179 ci.ndm_refcnt = 0;
180
181 if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
182 goto nla_put_failure;
183
184 return nlmsg_end(skb, nlh);
185
186nla_put_failure:
187 nlmsg_cancel(skb, nlh);
188 return -EMSGSIZE;
189}
190
191static inline size_t vxlan_nlmsg_size(void)
192{
193 return NLMSG_ALIGN(sizeof(struct ndmsg))
194 + nla_total_size(ETH_ALEN) /* NDA_LLADDR */
195 + nla_total_size(sizeof(__be32)) /* NDA_DST */
196 + nla_total_size(sizeof(struct nda_cacheinfo));
197}
198
199static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
200 const struct vxlan_fdb *fdb, int type)
201{
202 struct net *net = dev_net(vxlan->dev);
203 struct sk_buff *skb;
204 int err = -ENOBUFS;
205
206 skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
207 if (skb == NULL)
208 goto errout;
209
210 err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0);
211 if (err < 0) {
212 /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
213 WARN_ON(err == -EMSGSIZE);
214 kfree_skb(skb);
215 goto errout;
216 }
217
218 rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
219 return;
220errout:
221 if (err < 0)
222 rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
223}
224
225/* Hash Ethernet address */
226static u32 eth_hash(const unsigned char *addr)
227{
228 u64 value = get_unaligned((u64 *)addr);
229
230 /* only want 6 bytes */
231#ifdef __BIG_ENDIAN
232 value <<= 16;
233#else
234 value >>= 16;
235#endif
236 return hash_64(value, FDB_HASH_BITS);
237}
238
239/* Hash chain to use given mac address */
240static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
241 const u8 *mac)
242{
243 return &vxlan->fdb_head[eth_hash(mac)];
244}
245
246/* Look up Ethernet address in forwarding table */
247static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
248 const u8 *mac)
249
250{
251 struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
252 struct vxlan_fdb *f;
253 struct hlist_node *node;
254
255 hlist_for_each_entry_rcu(f, node, head, hlist) {
256 if (compare_ether_addr(mac, f->eth_addr) == 0)
257 return f;
258 }
259
260 return NULL;
261}
262
263/* Add new entry to forwarding table -- assumes lock held */
264static int vxlan_fdb_create(struct vxlan_dev *vxlan,
265 const u8 *mac, __be32 ip,
266 __u16 state, __u16 flags)
267{
268 struct vxlan_fdb *f;
269 int notify = 0;
270
271 f = vxlan_find_mac(vxlan, mac);
272 if (f) {
273 if (flags & NLM_F_EXCL) {
274 netdev_dbg(vxlan->dev,
275 "lost race to create %pM\n", mac);
276 return -EEXIST;
277 }
278 if (f->state != state) {
279 f->state = state;
280 f->updated = jiffies;
281 notify = 1;
282 }
283 } else {
284 if (!(flags & NLM_F_CREATE))
285 return -ENOENT;
286
287 if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax)
288 return -ENOSPC;
289
290 netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
291 f = kmalloc(sizeof(*f), GFP_ATOMIC);
292 if (!f)
293 return -ENOMEM;
294
295 notify = 1;
296 f->remote_ip = ip;
297 f->state = state;
298 f->updated = f->used = jiffies;
299 memcpy(f->eth_addr, mac, ETH_ALEN);
300
301 ++vxlan->addrcnt;
302 hlist_add_head_rcu(&f->hlist,
303 vxlan_fdb_head(vxlan, mac));
304 }
305
306 if (notify)
307 vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
308
309 return 0;
310}
311
312static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
313{
314 netdev_dbg(vxlan->dev,
315 "delete %pM\n", f->eth_addr);
316
317 --vxlan->addrcnt;
318 vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);
319
320 hlist_del_rcu(&f->hlist);
321 kfree_rcu(f, rcu);
322}
323
324/* Add static entry (via netlink) */
325static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
326 struct net_device *dev,
327 const unsigned char *addr, u16 flags)
328{
329 struct vxlan_dev *vxlan = netdev_priv(dev);
330 __be32 ip;
331 int err;
332
333 if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
334 pr_info("RTM_NEWNEIGH with invalid state %#x\n",
335 ndm->ndm_state);
336 return -EINVAL;
337 }
338
339 if (tb[NDA_DST] == NULL)
340 return -EINVAL;
341
342 if (nla_len(tb[NDA_DST]) != sizeof(__be32))
343 return -EAFNOSUPPORT;
344
345 ip = nla_get_be32(tb[NDA_DST]);
346
347 spin_lock_bh(&vxlan->hash_lock);
348 err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags);
349 spin_unlock_bh(&vxlan->hash_lock);
350
351 return err;
352}
353
354/* Delete entry (via netlink) */
355static int vxlan_fdb_delete(struct ndmsg *ndm, struct net_device *dev,
356 const unsigned char *addr)
357{
358 struct vxlan_dev *vxlan = netdev_priv(dev);
359 struct vxlan_fdb *f;
360 int err = -ENOENT;
361
362 spin_lock_bh(&vxlan->hash_lock);
363 f = vxlan_find_mac(vxlan, addr);
364 if (f) {
365 vxlan_fdb_destroy(vxlan, f);
366 err = 0;
367 }
368 spin_unlock_bh(&vxlan->hash_lock);
369
370 return err;
371}
372
373/* Dump forwarding table */
374static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
375 struct net_device *dev, int idx)
376{
377 struct vxlan_dev *vxlan = netdev_priv(dev);
378 unsigned int h;
379
380 for (h = 0; h < FDB_HASH_SIZE; ++h) {
381 struct vxlan_fdb *f;
382 struct hlist_node *n;
383 int err;
384
385 hlist_for_each_entry_rcu(f, n, &vxlan->fdb_head[h], hlist) {
386 if (idx < cb->args[0])
387 goto skip;
388
389 err = vxlan_fdb_info(skb, vxlan, f,
390 NETLINK_CB(cb->skb).portid,
391 cb->nlh->nlmsg_seq,
392 RTM_NEWNEIGH,
393 NLM_F_MULTI);
394 if (err < 0)
395 break;
396skip:
397 ++idx;
398 }
399 }
400
401 return idx;
402}
403
404/* Watch incoming packets to learn the mapping between Ethernet address
405 * and tunnel endpoint.
406 */
407static void vxlan_snoop(struct net_device *dev,
408 __be32 src_ip, const u8 *src_mac)
409{
410 struct vxlan_dev *vxlan = netdev_priv(dev);
411 struct vxlan_fdb *f;
412 int err;
413
414 f = vxlan_find_mac(vxlan, src_mac);
415 if (likely(f)) {
416 f->used = jiffies;
417 if (likely(f->remote_ip == src_ip))
418 return;
419
420 if (net_ratelimit())
421 netdev_info(dev,
422 "%pM migrated from %pI4 to %pI4\n",
423 src_mac, &f->remote_ip, &src_ip);
424
425 f->remote_ip = src_ip;
426 f->updated = jiffies;
427 } else {
428 /* learned new entry */
429 spin_lock(&vxlan->hash_lock);
430 err = vxlan_fdb_create(vxlan, src_mac, src_ip,
431 NUD_REACHABLE,
432 NLM_F_EXCL|NLM_F_CREATE);
433 spin_unlock(&vxlan->hash_lock);
434 }
435}
436
437
438/* See if the multicast group is already in use by another ID */
439static bool vxlan_group_used(struct vxlan_net *vn,
440 const struct vxlan_dev *this)
441{
442 const struct vxlan_dev *vxlan;
443 struct hlist_node *node;
444 unsigned h;
445
446 for (h = 0; h < VNI_HASH_SIZE; ++h)
447 hlist_for_each_entry(vxlan, node, &vn->vni_list[h], hlist) {
448 if (vxlan == this)
449 continue;
450
451 if (!netif_running(vxlan->dev))
452 continue;
453
454 if (vxlan->gaddr == this->gaddr)
455 return true;
456 }
457
458 return false;
459}
460
461/* kernel equivalent to IP_ADD_MEMBERSHIP */
462static int vxlan_join_group(struct net_device *dev)
463{
464 struct vxlan_dev *vxlan = netdev_priv(dev);
465 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
466 struct sock *sk = vn->sock->sk;
467 struct ip_mreqn mreq = {
468 .imr_multiaddr.s_addr = vxlan->gaddr,
469 };
470 int err;
471
472 /* Already a member of group */
473 if (vxlan_group_used(vn, vxlan))
474 return 0;
475
476 /* Need to drop RTNL to call multicast join */
477 rtnl_unlock();
478 lock_sock(sk);
479 err = ip_mc_join_group(sk, &mreq);
480 release_sock(sk);
481 rtnl_lock();
482
483 return err;
484}
485
486
487/* kernel equivalent to IP_DROP_MEMBERSHIP */
488static int vxlan_leave_group(struct net_device *dev)
489{
490 struct vxlan_dev *vxlan = netdev_priv(dev);
491 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
492 int err = 0;
493 struct sock *sk = vn->sock->sk;
494 struct ip_mreqn mreq = {
495 .imr_multiaddr.s_addr = vxlan->gaddr,
496 };
497
498 /* Only leave group when last vxlan is done. */
499 if (vxlan_group_used(vn, vxlan))
500 return 0;
501
502 /* Need to drop RTNL to call multicast leave */
503 rtnl_unlock();
504 lock_sock(sk);
505 err = ip_mc_leave_group(sk, &mreq);
506 release_sock(sk);
507 rtnl_lock();
508
509 return err;
510}
511
512/* Callback from net/ipv4/udp.c to receive packets */
513static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
514{
515 struct iphdr *oip;
516 struct vxlanhdr *vxh;
517 struct vxlan_dev *vxlan;
518 struct vxlan_stats *stats;
519 __u32 vni;
520 int err;
521
522 /* pop off outer UDP header */
523 __skb_pull(skb, sizeof(struct udphdr));
524
525 /* Need VXLAN and inner Ethernet headers to be present */
526 if (!pskb_may_pull(skb, sizeof(struct vxlanhdr)))
527 goto error;
528
529 /* Drop packets with reserved bits set */
530 vxh = (struct vxlanhdr *) skb->data;
531 if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
532 (vxh->vx_vni & htonl(0xff))) {
533 netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
534 ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
535 goto error;
536 }
537
538 __skb_pull(skb, sizeof(struct vxlanhdr));
539 skb_postpull_rcsum(skb, eth_hdr(skb), sizeof(struct vxlanhdr));
540
541 /* Is this VNI defined? */
542 vni = ntohl(vxh->vx_vni) >> 8;
543 vxlan = vxlan_find_vni(sock_net(sk), vni);
544 if (!vxlan) {
545 netdev_dbg(skb->dev, "unknown vni %d\n", vni);
546 goto drop;
547 }
548
549 if (!pskb_may_pull(skb, ETH_HLEN)) {
550 vxlan->dev->stats.rx_length_errors++;
551 vxlan->dev->stats.rx_errors++;
552 goto drop;
553 }
554
555 /* Re-examine inner Ethernet packet */
556 oip = ip_hdr(skb);
557 skb->protocol = eth_type_trans(skb, vxlan->dev);
558 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
559
560 /* Ignore packet loops (and multicast echo) */
561 if (compare_ether_addr(eth_hdr(skb)->h_source,
562 vxlan->dev->dev_addr) == 0)
563 goto drop;
564
565 if (vxlan->learn)
566 vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);
567
568 __skb_tunnel_rx(skb, vxlan->dev);
569 skb_reset_network_header(skb);
570
571 err = IP_ECN_decapsulate(oip, skb);
572 if (unlikely(err)) {
573 if (log_ecn_error)
574 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
575 &oip->saddr, oip->tos);
576 if (err > 1) {
577 ++vxlan->dev->stats.rx_frame_errors;
578 ++vxlan->dev->stats.rx_errors;
579 goto drop;
580 }
581 }
582
583 stats = this_cpu_ptr(vxlan->stats);
584 u64_stats_update_begin(&stats->syncp);
585 stats->rx_packets++;
586 stats->rx_bytes += skb->len;
587 u64_stats_update_end(&stats->syncp);
588
589 netif_rx(skb);
590
591 return 0;
592error:
593 /* Put UDP header back */
594 __skb_push(skb, sizeof(struct udphdr));
595
596 return 1;
597drop:
598 /* Consume bad packet */
599 kfree_skb(skb);
600 return 0;
601}
602
603/* Extract dsfield from inner protocol */
604static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
605 const struct sk_buff *skb)
606{
607 if (skb->protocol == htons(ETH_P_IP))
608 return iph->tos;
609 else if (skb->protocol == htons(ETH_P_IPV6))
610 return ipv6_get_dsfield((const struct ipv6hdr *)iph);
611 else
612 return 0;
613}
614
615/* Propagate ECN bits out */
616static inline u8 vxlan_ecn_encap(u8 tos,
617 const struct iphdr *iph,
618 const struct sk_buff *skb)
619{
620 u8 inner = vxlan_get_dsfield(iph, skb);
621
622 return INET_ECN_encapsulate(tos, inner);
623}
624
625/* Transmit local packets over VXLAN
626 *
627 * Outer IP header inherits ECN and DF from the inner header.
628 * Outer UDP destination is the assigned VXLAN port.
629 * Outer UDP source port is based on the flow hash if available,
630 * otherwise a random value is used.
631 */
632static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
633{
634 struct vxlan_dev *vxlan = netdev_priv(dev);
635 struct rtable *rt;
636 const struct ethhdr *eth;
637 const struct iphdr *old_iph;
638 struct iphdr *iph;
639 struct vxlanhdr *vxh;
640 struct udphdr *uh;
641 struct flowi4 fl4;
642 struct vxlan_fdb *f;
643 unsigned int pkt_len = skb->len;
644 u32 hash;
645 __be32 dst;
646 __be16 df = 0;
647 __u8 tos, ttl;
648 int err;
649
650 /* Need space for new headers (invalidates iph ptr) */
651 if (skb_cow_head(skb, VXLAN_HEADROOM))
652 goto drop;
653
654 eth = (void *)skb->data;
655 old_iph = ip_hdr(skb);
656
657 if (!is_multicast_ether_addr(eth->h_dest) &&
658 (f = vxlan_find_mac(vxlan, eth->h_dest)))
659 dst = f->remote_ip;
660 else if (vxlan->gaddr)
661 dst = vxlan->gaddr;
662 else
663 goto drop;
664
665 ttl = vxlan->ttl;
666 if (!ttl && IN_MULTICAST(ntohl(dst)))
667 ttl = 1;
668
669 tos = vxlan->tos;
670 if (tos == 1) /* 1 means inherit TOS from the inner header */
671 tos = vxlan_get_dsfield(old_iph, skb);
672
673 hash = skb_get_rxhash(skb);
674
675 rt = ip_route_output_gre(dev_net(dev), &fl4, dst,
676 vxlan->saddr, vxlan->vni,
677 RT_TOS(tos), vxlan->link);
678 if (IS_ERR(rt)) {
679 netdev_dbg(dev, "no route to %pI4\n", &dst);
680 dev->stats.tx_carrier_errors++;
681 goto tx_error;
682 }
683
684 if (rt->dst.dev == dev) {
685 netdev_dbg(dev, "circular route to %pI4\n", &dst);
686 ip_rt_put(rt);
687 dev->stats.collisions++;
688 goto tx_error;
689 }
690
691 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
692 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
693 IPSKB_REROUTED);
694 skb_dst_drop(skb);
695 skb_dst_set(skb, &rt->dst);
696
697 vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
698 vxh->vx_flags = htonl(VXLAN_FLAGS);
699 vxh->vx_vni = htonl(vxlan->vni << 8);
700
701 __skb_push(skb, sizeof(*uh));
702 skb_reset_transport_header(skb);
703 uh = udp_hdr(skb);
704
705 uh->dest = htons(vxlan_port);
706 uh->source = hash ? : random32();
707
708 uh->len = htons(skb->len);
709 uh->check = 0;
710
711 __skb_push(skb, sizeof(*iph));
712 skb_reset_network_header(skb);
713 iph = ip_hdr(skb);
714 iph->version = 4;
715 iph->ihl = sizeof(struct iphdr) >> 2;
716 iph->frag_off = df;
717 iph->protocol = IPPROTO_UDP;
718 iph->tos = vxlan_ecn_encap(tos, old_iph, skb);
719 iph->daddr = fl4.daddr;
720 iph->saddr = fl4.saddr;
721 iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
722
723 /* See __IPTUNNEL_XMIT */
724 skb->ip_summed = CHECKSUM_NONE;
725 ip_select_ident(iph, &rt->dst, NULL);
726
727 err = ip_local_out(skb);
728 if (likely(net_xmit_eval(err) == 0)) {
729 struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
730
731 u64_stats_update_begin(&stats->syncp);
732 stats->tx_packets++;
733 stats->tx_bytes += pkt_len;
734 u64_stats_update_end(&stats->syncp);
735 } else {
736 dev->stats.tx_errors++;
737 dev->stats.tx_aborted_errors++;
738 }
739 return NETDEV_TX_OK;
740
741drop:
742 dev->stats.tx_dropped++;
743 goto tx_free;
744
745tx_error:
746 dev->stats.tx_errors++;
747tx_free:
748 dev_kfree_skb(skb);
749 return NETDEV_TX_OK;
750}
751
752/* Walk the forwarding table and purge stale entries */
753static void vxlan_cleanup(unsigned long arg)
754{
755 struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
756 unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
757 unsigned int h;
758
759 if (!netif_running(vxlan->dev))
760 return;
761
762 spin_lock_bh(&vxlan->hash_lock);
763 for (h = 0; h < FDB_HASH_SIZE; ++h) {
764 struct hlist_node *p, *n;
765 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
766 struct vxlan_fdb *f
767 = container_of(p, struct vxlan_fdb, hlist);
768 unsigned long timeout;
769
770 if (f->state == NUD_PERMANENT)
771 continue;
772
773 timeout = f->used + vxlan->age_interval * HZ;
774 if (time_before_eq(timeout, jiffies)) {
775 netdev_dbg(vxlan->dev,
776 "garbage collect %pM\n",
777 f->eth_addr);
778 f->state = NUD_STALE;
779 vxlan_fdb_destroy(vxlan, f);
780 } else if (time_before(timeout, next_timer))
781 next_timer = timeout;
782 }
783 }
784 spin_unlock_bh(&vxlan->hash_lock);
785
786 mod_timer(&vxlan->age_timer, next_timer);
787}
788
789/* Set up per-cpu stats when the device is created */
790static int vxlan_init(struct net_device *dev)
791{
792 struct vxlan_dev *vxlan = netdev_priv(dev);
793
794 vxlan->stats = alloc_percpu(struct vxlan_stats);
795 if (!vxlan->stats)
796 return -ENOMEM;
797
798 return 0;
799}
800
801/* Start ageing timer and join group when device is brought up */
802static int vxlan_open(struct net_device *dev)
803{
804 struct vxlan_dev *vxlan = netdev_priv(dev);
805 int err;
806
807 if (vxlan->gaddr) {
808 err = vxlan_join_group(dev);
809 if (err)
810 return err;
811 }
812
813 if (vxlan->age_interval)
814 mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
815
816 return 0;
817}
818
819/* Purge the forwarding table */
820static void vxlan_flush(struct vxlan_dev *vxlan)
821{
822 unsigned h;
823
824 spin_lock_bh(&vxlan->hash_lock);
825 for (h = 0; h < FDB_HASH_SIZE; ++h) {
826 struct hlist_node *p, *n;
827 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
828 struct vxlan_fdb *f
829 = container_of(p, struct vxlan_fdb, hlist);
830 vxlan_fdb_destroy(vxlan, f);
831 }
832 }
833 spin_unlock_bh(&vxlan->hash_lock);
834}
835
836/* Cleanup timer and forwarding table on shutdown */
837static int vxlan_stop(struct net_device *dev)
838{
839 struct vxlan_dev *vxlan = netdev_priv(dev);
840
841 if (vxlan->gaddr)
842 vxlan_leave_group(dev);
843
844 del_timer_sync(&vxlan->age_timer);
845
846 vxlan_flush(vxlan);
847
848 return 0;
849}
850
851/* Merge per-cpu statistics */
852static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev,
853 struct rtnl_link_stats64 *stats)
854{
855 struct vxlan_dev *vxlan = netdev_priv(dev);
856 struct vxlan_stats tmp, sum = { 0 };
857 unsigned int cpu;
858
859 for_each_possible_cpu(cpu) {
860 unsigned int start;
861 const struct vxlan_stats *stats
862 = per_cpu_ptr(vxlan->stats, cpu);
863
864 do {
865 start = u64_stats_fetch_begin_bh(&stats->syncp);
866 memcpy(&tmp, stats, sizeof(tmp));
867 } while (u64_stats_fetch_retry_bh(&stats->syncp, start));
868
869 sum.tx_bytes += tmp.tx_bytes;
870 sum.tx_packets += tmp.tx_packets;
871 sum.rx_bytes += tmp.rx_bytes;
872 sum.rx_packets += tmp.rx_packets;
873 }
874
875 stats->tx_bytes = sum.tx_bytes;
876 stats->tx_packets = sum.tx_packets;
877 stats->rx_bytes = sum.rx_bytes;
878 stats->rx_packets = sum.rx_packets;
879
880 stats->multicast = dev->stats.multicast;
881 stats->rx_length_errors = dev->stats.rx_length_errors;
882 stats->rx_frame_errors = dev->stats.rx_frame_errors;
883 stats->rx_errors = dev->stats.rx_errors;
884
885 stats->tx_dropped = dev->stats.tx_dropped;
886 stats->tx_carrier_errors = dev->stats.tx_carrier_errors;
887 stats->tx_aborted_errors = dev->stats.tx_aborted_errors;
888 stats->collisions = dev->stats.collisions;
889 stats->tx_errors = dev->stats.tx_errors;
890
891 return stats;
892}
893
894/* Stub, nothing needs to be done. */
895static void vxlan_set_multicast_list(struct net_device *dev)
896{
897}
898
899static const struct net_device_ops vxlan_netdev_ops = {
900 .ndo_init = vxlan_init,
901 .ndo_open = vxlan_open,
902 .ndo_stop = vxlan_stop,
903 .ndo_start_xmit = vxlan_xmit,
904 .ndo_get_stats64 = vxlan_stats64,
905 .ndo_set_rx_mode = vxlan_set_multicast_list,
906 .ndo_change_mtu = eth_change_mtu,
907 .ndo_validate_addr = eth_validate_addr,
908 .ndo_set_mac_address = eth_mac_addr,
909 .ndo_fdb_add = vxlan_fdb_add,
910 .ndo_fdb_del = vxlan_fdb_delete,
911 .ndo_fdb_dump = vxlan_fdb_dump,
912};
913
914/* Info for udev: this is a virtual tunnel endpoint */
915static struct device_type vxlan_type = {
916 .name = "vxlan",
917};
918
919static void vxlan_free(struct net_device *dev)
920{
921 struct vxlan_dev *vxlan = netdev_priv(dev);
922
923 free_percpu(vxlan->stats);
924 free_netdev(dev);
925}
926
927/* Initialize the device structure. */
928static void vxlan_setup(struct net_device *dev)
929{
930 struct vxlan_dev *vxlan = netdev_priv(dev);
931 unsigned h;
932
933 eth_hw_addr_random(dev);
934 ether_setup(dev);
935
936 dev->netdev_ops = &vxlan_netdev_ops;
937 dev->destructor = vxlan_free;
938 SET_NETDEV_DEVTYPE(dev, &vxlan_type);
939
940 dev->tx_queue_len = 0;
941 dev->features |= NETIF_F_LLTX;
942 dev->features |= NETIF_F_NETNS_LOCAL;
943 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
944
945 spin_lock_init(&vxlan->hash_lock);
946
947 init_timer_deferrable(&vxlan->age_timer);
948 vxlan->age_timer.function = vxlan_cleanup;
949 vxlan->age_timer.data = (unsigned long) vxlan;
950
951 vxlan->dev = dev;
952
953 for (h = 0; h < FDB_HASH_SIZE; ++h)
954 INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
955}
956
957static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
958 [IFLA_VXLAN_ID] = { .type = NLA_U32 },
959 [IFLA_VXLAN_GROUP] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
960 [IFLA_VXLAN_LINK] = { .type = NLA_U32 },
961 [IFLA_VXLAN_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
962 [IFLA_VXLAN_TOS] = { .type = NLA_U8 },
963 [IFLA_VXLAN_TTL] = { .type = NLA_U8 },
964 [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 },
965 [IFLA_VXLAN_AGEING] = { .type = NLA_U32 },
966 [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 },
967};
968
969static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
970{
971 if (tb[IFLA_ADDRESS]) {
972 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
973 pr_debug("invalid link address (not ethernet)\n");
974 return -EINVAL;
975 }
976
977 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
978 pr_debug("invalid all zero ethernet address\n");
979 return -EADDRNOTAVAIL;
980 }
981 }
982
983 if (!data)
984 return -EINVAL;
985
986 if (data[IFLA_VXLAN_ID]) {
987 __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
988 if (id >= VXLAN_VID_MASK)
989 return -ERANGE;
990 }
991
992 if (data[IFLA_VXLAN_GROUP]) {
993 __be32 gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
994 if (!IN_MULTICAST(ntohl(gaddr))) {
995 pr_debug("group address is not IPv4 multicast\n");
996 return -EADDRNOTAVAIL;
997 }
998 }
999 return 0;
1000}
1001
1002static int vxlan_newlink(struct net *net, struct net_device *dev,
1003 struct nlattr *tb[], struct nlattr *data[])
1004{
1005 struct vxlan_dev *vxlan = netdev_priv(dev);
1006 __u32 vni;
1007 int err;
1008
1009 if (!data[IFLA_VXLAN_ID])
1010 return -EINVAL;
1011
1012 vni = nla_get_u32(data[IFLA_VXLAN_ID]);
1013 if (vxlan_find_vni(net, vni)) {
1014 pr_info("duplicate VNI %u\n", vni);
1015 return -EEXIST;
1016 }
1017 vxlan->vni = vni;
1018
1019 if (data[IFLA_VXLAN_GROUP])
1020 vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
1021
1022 if (data[IFLA_VXLAN_LOCAL])
1023 vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
1024
1025 if (data[IFLA_VXLAN_LINK]) {
1026 vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]);
1027
1028 if (!tb[IFLA_MTU]) {
1029 struct net_device *lowerdev = __dev_get_by_index(net, vxlan->link);
1030 if (lowerdev) /* guard against a stale ifindex */
1031 dev->mtu = lowerdev->mtu - VXLAN_HEADROOM;
1032 }
1033 }
1034
1035 if (data[IFLA_VXLAN_TOS])
1036 vxlan->tos = nla_get_u8(data[IFLA_VXLAN_TOS]);
1037
1038 if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
1039 vxlan->learn = true;
1040
1041 if (data[IFLA_VXLAN_AGEING])
1042 vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
1043 else
1044 vxlan->age_interval = FDB_AGE_DEFAULT;
1045
1046 if (data[IFLA_VXLAN_LIMIT])
1047 vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
1048
1049 err = register_netdevice(dev);
1050 if (!err)
1051 hlist_add_head_rcu(&vxlan->hlist, vni_head(net, vxlan->vni));
1052
1053 return err;
1054}
1055
1056static void vxlan_dellink(struct net_device *dev, struct list_head *head)
1057{
1058 struct vxlan_dev *vxlan = netdev_priv(dev);
1059
1060 hlist_del_rcu(&vxlan->hlist);
1061
1062 unregister_netdevice_queue(dev, head);
1063}
1064
1065static size_t vxlan_get_size(const struct net_device *dev)
1066{
1067
1068 return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */
1069 nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_GROUP */
1070 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
1071 nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LOCAL */
1072 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */
1073 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */
1074 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */
1075 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
1076 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
1077 0;
1078}
1079
1080static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
1081{
1082 const struct vxlan_dev *vxlan = netdev_priv(dev);
1083
1084 if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni))
1085 goto nla_put_failure;
1086
1087 if (vxlan->gaddr && nla_put_be32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr))
1088 goto nla_put_failure;
1089
1090 if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link))
1091 goto nla_put_failure;
1092
1093 if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
1094 goto nla_put_failure;
1095
1096 if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
1097 nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
1098 nla_put_u8(skb, IFLA_VXLAN_LEARNING, vxlan->learn) ||
1099 nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
1100 nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax))
1101 goto nla_put_failure;
1102
1103 return 0;
1104
1105nla_put_failure:
1106 return -EMSGSIZE;
1107}
1108
1109static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
1110 .kind = "vxlan",
1111 .maxtype = IFLA_VXLAN_MAX,
1112 .policy = vxlan_policy,
1113 .priv_size = sizeof(struct vxlan_dev),
1114 .setup = vxlan_setup,
1115 .validate = vxlan_validate,
1116 .newlink = vxlan_newlink,
1117 .dellink = vxlan_dellink,
1118 .get_size = vxlan_get_size,
1119 .fill_info = vxlan_fill_info,
1120};
1121
1122static __net_init int vxlan_init_net(struct net *net)
1123{
1124 struct vxlan_net *vn = net_generic(net, vxlan_net_id);
1125 struct sock *sk;
1126 struct sockaddr_in vxlan_addr = {
1127 .sin_family = AF_INET,
1128 .sin_addr.s_addr = htonl(INADDR_ANY),
1129 };
1130 int rc;
1131 unsigned h;
1132
1133 /* Create UDP socket for encapsulation receive. */
1134 rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
1135 if (rc < 0) {
1136 pr_debug("UDP socket create failed\n");
1137 return rc;
1138 }
1139
1140 vxlan_addr.sin_port = htons(vxlan_port);
1141
1142 rc = kernel_bind(vn->sock, (struct sockaddr *) &vxlan_addr,
1143 sizeof(vxlan_addr));
1144 if (rc < 0) {
1145 pr_debug("bind for UDP socket %pI4:%u (%d)\n",
1146 &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc);
1147 sock_release(vn->sock);
1148 vn->sock = NULL;
1149 return rc;
1150 }
1151
1152 /* Disable multicast loopback */
1153 sk = vn->sock->sk;
1154 inet_sk(sk)->mc_loop = 0;
1155
1156 /* Mark socket as an encapsulation socket. */
1157 udp_sk(sk)->encap_type = 1;
1158 udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
1159 udp_encap_enable();
1160
1161 for (h = 0; h < VNI_HASH_SIZE; ++h)
1162 INIT_HLIST_HEAD(&vn->vni_list[h]);
1163
1164 return 0;
1165}
1166
1167static __net_exit void vxlan_exit_net(struct net *net)
1168{
1169 struct vxlan_net *vn = net_generic(net, vxlan_net_id);
1170
1171 if (vn->sock) {
1172 sock_release(vn->sock);
1173 vn->sock = NULL;
1174 }
1175}
1176
1177static struct pernet_operations vxlan_net_ops = {
1178 .init = vxlan_init_net,
1179 .exit = vxlan_exit_net,
1180 .id = &vxlan_net_id,
1181 .size = sizeof(struct vxlan_net),
1182};
1183
1184static int __init vxlan_init_module(void)
1185{
1186 int rc;
1187
1188 get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
1189
1190 rc = register_pernet_device(&vxlan_net_ops);
1191 if (rc)
1192 goto out1;
1193
1194 rc = rtnl_link_register(&vxlan_link_ops);
1195 if (rc)
1196 goto out2;
1197
1198 return 0;
1199
1200out2:
1201 unregister_pernet_device(&vxlan_net_ops);
1202out1:
1203 return rc;
1204}
1205module_init(vxlan_init_module);
1206
1207static void __exit vxlan_cleanup_module(void)
1208{
1209 rtnl_link_unregister(&vxlan_link_ops);
1210 unregister_pernet_device(&vxlan_net_ops);
1211}
1212module_exit(vxlan_cleanup_module);
1213
1214MODULE_LICENSE("GPL");
1215MODULE_VERSION(VXLAN_VERSION);
1216MODULE_AUTHOR("Stephen Hemminger <shemminger@vyatta.com>");
1217MODULE_ALIAS_RTNL_LINK("vxlan");