diff options
Diffstat (limited to 'drivers/net/vxlan.c')
-rw-r--r-- | drivers/net/vxlan.c | 1219 |
1 files changed, 1219 insertions, 0 deletions
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c new file mode 100644 index 000000000000..51de9edb55f5 --- /dev/null +++ b/drivers/net/vxlan.c | |||
@@ -0,0 +1,1219 @@ | |||
1 | /* | ||
2 | * VXLAN: Virtual eXtensible Local Area Network | ||
3 | * | ||
4 | * Copyright (c) 2012 Vyatta Inc. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | * | ||
10 | * TODO | ||
11 | * - use IANA UDP port number (when defined) | ||
12 | * - IPv6 (not in RFC) | ||
13 | */ | ||
14 | |||
15 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
16 | |||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/types.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/errno.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/skbuff.h> | ||
23 | #include <linux/rculist.h> | ||
24 | #include <linux/netdevice.h> | ||
25 | #include <linux/in.h> | ||
26 | #include <linux/ip.h> | ||
27 | #include <linux/udp.h> | ||
28 | #include <linux/igmp.h> | ||
29 | #include <linux/etherdevice.h> | ||
30 | #include <linux/if_ether.h> | ||
31 | #include <linux/version.h> | ||
32 | #include <linux/hash.h> | ||
33 | #include <net/ip.h> | ||
34 | #include <net/icmp.h> | ||
35 | #include <net/udp.h> | ||
36 | #include <net/rtnetlink.h> | ||
37 | #include <net/route.h> | ||
38 | #include <net/dsfield.h> | ||
39 | #include <net/inet_ecn.h> | ||
40 | #include <net/net_namespace.h> | ||
41 | #include <net/netns/generic.h> | ||
42 | |||
#define VXLAN_VERSION	"0.1"

/* Hash table geometry: each table has 2^BITS buckets. */
#define VNI_HASH_BITS	10
#define VNI_HASH_SIZE	(1 << VNI_HASH_BITS)
#define FDB_HASH_BITS	8
#define FDB_HASH_SIZE	(1 << FDB_HASH_BITS)

/* Forwarding entry ageing. */
#define FDB_AGE_DEFAULT		300		/* 5 min */
#define FDB_AGE_INTERVAL	(10 * HZ)	/* rescan interval */

/* The virtual network id is a 24 bit quantity. */
#define VXLAN_N_VID	(1u << 24)
#define VXLAN_VID_MASK	(VXLAN_N_VID - 1)

/* Encapsulation overhead: VLAN + IP header + UDP + VXLAN */
#define VXLAN_HEADROOM	(4 + 20 + 8 + 8)

#define VXLAN_FLAGS	0x08000000	/* struct vxlanhdr.vx_flags required value. */
58 | |||
59 | /* VXLAN protocol header */ | ||
60 | struct vxlanhdr { | ||
61 | __be32 vx_flags; | ||
62 | __be32 vx_vni; | ||
63 | }; | ||
64 | |||
65 | /* UDP port for VXLAN traffic. */ | ||
66 | static unsigned int vxlan_port __read_mostly = 8472; | ||
67 | module_param_named(udp_port, vxlan_port, uint, 0444); | ||
68 | MODULE_PARM_DESC(udp_port, "Destination UDP port"); | ||
69 | |||
70 | static bool log_ecn_error = true; | ||
71 | module_param(log_ecn_error, bool, 0644); | ||
72 | MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); | ||
73 | |||
74 | /* per-net private data for this module */ | ||
75 | static unsigned int vxlan_net_id; | ||
76 | struct vxlan_net { | ||
77 | struct socket *sock; /* UDP encap socket */ | ||
78 | struct hlist_head vni_list[VNI_HASH_SIZE]; | ||
79 | }; | ||
80 | |||
81 | /* Forwarding table entry */ | ||
82 | struct vxlan_fdb { | ||
83 | struct hlist_node hlist; /* linked list of entries */ | ||
84 | struct rcu_head rcu; | ||
85 | unsigned long updated; /* jiffies */ | ||
86 | unsigned long used; | ||
87 | __be32 remote_ip; | ||
88 | u16 state; /* see ndm_state */ | ||
89 | u8 eth_addr[ETH_ALEN]; | ||
90 | }; | ||
91 | |||
92 | /* Per-cpu network traffic stats */ | ||
93 | struct vxlan_stats { | ||
94 | u64 rx_packets; | ||
95 | u64 rx_bytes; | ||
96 | u64 tx_packets; | ||
97 | u64 tx_bytes; | ||
98 | struct u64_stats_sync syncp; | ||
99 | }; | ||
100 | |||
101 | /* Pseudo network device */ | ||
102 | struct vxlan_dev { | ||
103 | struct hlist_node hlist; | ||
104 | struct net_device *dev; | ||
105 | struct vxlan_stats __percpu *stats; | ||
106 | __u32 vni; /* virtual network id */ | ||
107 | __be32 gaddr; /* multicast group */ | ||
108 | __be32 saddr; /* source address */ | ||
109 | unsigned int link; /* link to multicast over */ | ||
110 | __u8 tos; /* TOS override */ | ||
111 | __u8 ttl; | ||
112 | bool learn; | ||
113 | |||
114 | unsigned long age_interval; | ||
115 | struct timer_list age_timer; | ||
116 | spinlock_t hash_lock; | ||
117 | unsigned int addrcnt; | ||
118 | unsigned int addrmax; | ||
119 | unsigned int addrexceeded; | ||
120 | |||
121 | struct hlist_head fdb_head[FDB_HASH_SIZE]; | ||
122 | }; | ||
123 | |||
124 | /* salt for hash table */ | ||
125 | static u32 vxlan_salt __read_mostly; | ||
126 | |||
127 | static inline struct hlist_head *vni_head(struct net *net, u32 id) | ||
128 | { | ||
129 | struct vxlan_net *vn = net_generic(net, vxlan_net_id); | ||
130 | |||
131 | return &vn->vni_list[hash_32(id, VNI_HASH_BITS)]; | ||
132 | } | ||
133 | |||
134 | /* Look up VNI in a per net namespace table */ | ||
135 | static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id) | ||
136 | { | ||
137 | struct vxlan_dev *vxlan; | ||
138 | struct hlist_node *node; | ||
139 | |||
140 | hlist_for_each_entry_rcu(vxlan, node, vni_head(net, id), hlist) { | ||
141 | if (vxlan->vni == id) | ||
142 | return vxlan; | ||
143 | } | ||
144 | |||
145 | return NULL; | ||
146 | } | ||
147 | |||
148 | /* Fill in neighbour message in skbuff. */ | ||
149 | static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, | ||
150 | const struct vxlan_fdb *fdb, | ||
151 | u32 portid, u32 seq, int type, unsigned int flags) | ||
152 | { | ||
153 | unsigned long now = jiffies; | ||
154 | struct nda_cacheinfo ci; | ||
155 | struct nlmsghdr *nlh; | ||
156 | struct ndmsg *ndm; | ||
157 | |||
158 | nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); | ||
159 | if (nlh == NULL) | ||
160 | return -EMSGSIZE; | ||
161 | |||
162 | ndm = nlmsg_data(nlh); | ||
163 | memset(ndm, 0, sizeof(*ndm)); | ||
164 | ndm->ndm_family = AF_BRIDGE; | ||
165 | ndm->ndm_state = fdb->state; | ||
166 | ndm->ndm_ifindex = vxlan->dev->ifindex; | ||
167 | ndm->ndm_flags = NTF_SELF; | ||
168 | ndm->ndm_type = NDA_DST; | ||
169 | |||
170 | if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) | ||
171 | goto nla_put_failure; | ||
172 | |||
173 | if (nla_put_be32(skb, NDA_DST, fdb->remote_ip)) | ||
174 | goto nla_put_failure; | ||
175 | |||
176 | ci.ndm_used = jiffies_to_clock_t(now - fdb->used); | ||
177 | ci.ndm_confirmed = 0; | ||
178 | ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); | ||
179 | ci.ndm_refcnt = 0; | ||
180 | |||
181 | if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) | ||
182 | goto nla_put_failure; | ||
183 | |||
184 | return nlmsg_end(skb, nlh); | ||
185 | |||
186 | nla_put_failure: | ||
187 | nlmsg_cancel(skb, nlh); | ||
188 | return -EMSGSIZE; | ||
189 | } | ||
190 | |||
191 | static inline size_t vxlan_nlmsg_size(void) | ||
192 | { | ||
193 | return NLMSG_ALIGN(sizeof(struct ndmsg)) | ||
194 | + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ | ||
195 | + nla_total_size(sizeof(__be32)) /* NDA_DST */ | ||
196 | + nla_total_size(sizeof(struct nda_cacheinfo)); | ||
197 | } | ||
198 | |||
199 | static void vxlan_fdb_notify(struct vxlan_dev *vxlan, | ||
200 | const struct vxlan_fdb *fdb, int type) | ||
201 | { | ||
202 | struct net *net = dev_net(vxlan->dev); | ||
203 | struct sk_buff *skb; | ||
204 | int err = -ENOBUFS; | ||
205 | |||
206 | skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC); | ||
207 | if (skb == NULL) | ||
208 | goto errout; | ||
209 | |||
210 | err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0); | ||
211 | if (err < 0) { | ||
212 | /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ | ||
213 | WARN_ON(err == -EMSGSIZE); | ||
214 | kfree_skb(skb); | ||
215 | goto errout; | ||
216 | } | ||
217 | |||
218 | rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); | ||
219 | return; | ||
220 | errout: | ||
221 | if (err < 0) | ||
222 | rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); | ||
223 | } | ||
224 | |||
225 | /* Hash Ethernet address */ | ||
226 | static u32 eth_hash(const unsigned char *addr) | ||
227 | { | ||
228 | u64 value = get_unaligned((u64 *)addr); | ||
229 | |||
230 | /* only want 6 bytes */ | ||
231 | #ifdef __BIG_ENDIAN | ||
232 | value <<= 16; | ||
233 | #else | ||
234 | value >>= 16; | ||
235 | #endif | ||
236 | return hash_64(value, FDB_HASH_BITS); | ||
237 | } | ||
238 | |||
239 | /* Hash chain to use given mac address */ | ||
240 | static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, | ||
241 | const u8 *mac) | ||
242 | { | ||
243 | return &vxlan->fdb_head[eth_hash(mac)]; | ||
244 | } | ||
245 | |||
246 | /* Look up Ethernet address in forwarding table */ | ||
247 | static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, | ||
248 | const u8 *mac) | ||
249 | |||
250 | { | ||
251 | struct hlist_head *head = vxlan_fdb_head(vxlan, mac); | ||
252 | struct vxlan_fdb *f; | ||
253 | struct hlist_node *node; | ||
254 | |||
255 | hlist_for_each_entry_rcu(f, node, head, hlist) { | ||
256 | if (compare_ether_addr(mac, f->eth_addr) == 0) | ||
257 | return f; | ||
258 | } | ||
259 | |||
260 | return NULL; | ||
261 | } | ||
262 | |||
263 | /* Add new entry to forwarding table -- assumes lock held */ | ||
264 | static int vxlan_fdb_create(struct vxlan_dev *vxlan, | ||
265 | const u8 *mac, __be32 ip, | ||
266 | __u16 state, __u16 flags) | ||
267 | { | ||
268 | struct vxlan_fdb *f; | ||
269 | int notify = 0; | ||
270 | |||
271 | f = vxlan_find_mac(vxlan, mac); | ||
272 | if (f) { | ||
273 | if (flags & NLM_F_EXCL) { | ||
274 | netdev_dbg(vxlan->dev, | ||
275 | "lost race to create %pM\n", mac); | ||
276 | return -EEXIST; | ||
277 | } | ||
278 | if (f->state != state) { | ||
279 | f->state = state; | ||
280 | f->updated = jiffies; | ||
281 | notify = 1; | ||
282 | } | ||
283 | } else { | ||
284 | if (!(flags & NLM_F_CREATE)) | ||
285 | return -ENOENT; | ||
286 | |||
287 | if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax) | ||
288 | return -ENOSPC; | ||
289 | |||
290 | netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip); | ||
291 | f = kmalloc(sizeof(*f), GFP_ATOMIC); | ||
292 | if (!f) | ||
293 | return -ENOMEM; | ||
294 | |||
295 | notify = 1; | ||
296 | f->remote_ip = ip; | ||
297 | f->state = state; | ||
298 | f->updated = f->used = jiffies; | ||
299 | memcpy(f->eth_addr, mac, ETH_ALEN); | ||
300 | |||
301 | ++vxlan->addrcnt; | ||
302 | hlist_add_head_rcu(&f->hlist, | ||
303 | vxlan_fdb_head(vxlan, mac)); | ||
304 | } | ||
305 | |||
306 | if (notify) | ||
307 | vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH); | ||
308 | |||
309 | return 0; | ||
310 | } | ||
311 | |||
312 | static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) | ||
313 | { | ||
314 | netdev_dbg(vxlan->dev, | ||
315 | "delete %pM\n", f->eth_addr); | ||
316 | |||
317 | --vxlan->addrcnt; | ||
318 | vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH); | ||
319 | |||
320 | hlist_del_rcu(&f->hlist); | ||
321 | kfree_rcu(f, rcu); | ||
322 | } | ||
323 | |||
324 | /* Add static entry (via netlink) */ | ||
325 | static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], | ||
326 | struct net_device *dev, | ||
327 | const unsigned char *addr, u16 flags) | ||
328 | { | ||
329 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
330 | __be32 ip; | ||
331 | int err; | ||
332 | |||
333 | if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { | ||
334 | pr_info("RTM_NEWNEIGH with invalid state %#x\n", | ||
335 | ndm->ndm_state); | ||
336 | return -EINVAL; | ||
337 | } | ||
338 | |||
339 | if (tb[NDA_DST] == NULL) | ||
340 | return -EINVAL; | ||
341 | |||
342 | if (nla_len(tb[NDA_DST]) != sizeof(__be32)) | ||
343 | return -EAFNOSUPPORT; | ||
344 | |||
345 | ip = nla_get_be32(tb[NDA_DST]); | ||
346 | |||
347 | spin_lock_bh(&vxlan->hash_lock); | ||
348 | err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags); | ||
349 | spin_unlock_bh(&vxlan->hash_lock); | ||
350 | |||
351 | return err; | ||
352 | } | ||
353 | |||
354 | /* Delete entry (via netlink) */ | ||
355 | static int vxlan_fdb_delete(struct ndmsg *ndm, struct net_device *dev, | ||
356 | const unsigned char *addr) | ||
357 | { | ||
358 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
359 | struct vxlan_fdb *f; | ||
360 | int err = -ENOENT; | ||
361 | |||
362 | spin_lock_bh(&vxlan->hash_lock); | ||
363 | f = vxlan_find_mac(vxlan, addr); | ||
364 | if (f) { | ||
365 | vxlan_fdb_destroy(vxlan, f); | ||
366 | err = 0; | ||
367 | } | ||
368 | spin_unlock_bh(&vxlan->hash_lock); | ||
369 | |||
370 | return err; | ||
371 | } | ||
372 | |||
373 | /* Dump forwarding table */ | ||
374 | static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, | ||
375 | struct net_device *dev, int idx) | ||
376 | { | ||
377 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
378 | unsigned int h; | ||
379 | |||
380 | for (h = 0; h < FDB_HASH_SIZE; ++h) { | ||
381 | struct vxlan_fdb *f; | ||
382 | struct hlist_node *n; | ||
383 | int err; | ||
384 | |||
385 | hlist_for_each_entry_rcu(f, n, &vxlan->fdb_head[h], hlist) { | ||
386 | if (idx < cb->args[0]) | ||
387 | goto skip; | ||
388 | |||
389 | err = vxlan_fdb_info(skb, vxlan, f, | ||
390 | NETLINK_CB(cb->skb).portid, | ||
391 | cb->nlh->nlmsg_seq, | ||
392 | RTM_NEWNEIGH, | ||
393 | NLM_F_MULTI); | ||
394 | if (err < 0) | ||
395 | break; | ||
396 | skip: | ||
397 | ++idx; | ||
398 | } | ||
399 | } | ||
400 | |||
401 | return idx; | ||
402 | } | ||
403 | |||
404 | /* Watch incoming packets to learn mapping between Ethernet address | ||
405 | * and Tunnel endpoint. | ||
406 | */ | ||
407 | static void vxlan_snoop(struct net_device *dev, | ||
408 | __be32 src_ip, const u8 *src_mac) | ||
409 | { | ||
410 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
411 | struct vxlan_fdb *f; | ||
412 | int err; | ||
413 | |||
414 | f = vxlan_find_mac(vxlan, src_mac); | ||
415 | if (likely(f)) { | ||
416 | f->used = jiffies; | ||
417 | if (likely(f->remote_ip == src_ip)) | ||
418 | return; | ||
419 | |||
420 | if (net_ratelimit()) | ||
421 | netdev_info(dev, | ||
422 | "%pM migrated from %pI4 to %pI4\n", | ||
423 | src_mac, &f->remote_ip, &src_ip); | ||
424 | |||
425 | f->remote_ip = src_ip; | ||
426 | f->updated = jiffies; | ||
427 | } else { | ||
428 | /* learned new entry */ | ||
429 | spin_lock(&vxlan->hash_lock); | ||
430 | err = vxlan_fdb_create(vxlan, src_mac, src_ip, | ||
431 | NUD_REACHABLE, | ||
432 | NLM_F_EXCL|NLM_F_CREATE); | ||
433 | spin_unlock(&vxlan->hash_lock); | ||
434 | } | ||
435 | } | ||
436 | |||
437 | |||
438 | /* See if multicast group is already in use by other ID */ | ||
439 | static bool vxlan_group_used(struct vxlan_net *vn, | ||
440 | const struct vxlan_dev *this) | ||
441 | { | ||
442 | const struct vxlan_dev *vxlan; | ||
443 | struct hlist_node *node; | ||
444 | unsigned h; | ||
445 | |||
446 | for (h = 0; h < VNI_HASH_SIZE; ++h) | ||
447 | hlist_for_each_entry(vxlan, node, &vn->vni_list[h], hlist) { | ||
448 | if (vxlan == this) | ||
449 | continue; | ||
450 | |||
451 | if (!netif_running(vxlan->dev)) | ||
452 | continue; | ||
453 | |||
454 | if (vxlan->gaddr == this->gaddr) | ||
455 | return true; | ||
456 | } | ||
457 | |||
458 | return false; | ||
459 | } | ||
460 | |||
461 | /* kernel equivalent to IP_ADD_MEMBERSHIP */ | ||
462 | static int vxlan_join_group(struct net_device *dev) | ||
463 | { | ||
464 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
465 | struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); | ||
466 | struct sock *sk = vn->sock->sk; | ||
467 | struct ip_mreqn mreq = { | ||
468 | .imr_multiaddr.s_addr = vxlan->gaddr, | ||
469 | }; | ||
470 | int err; | ||
471 | |||
472 | /* Already a member of group */ | ||
473 | if (vxlan_group_used(vn, vxlan)) | ||
474 | return 0; | ||
475 | |||
476 | /* Need to drop RTNL to call multicast join */ | ||
477 | rtnl_unlock(); | ||
478 | lock_sock(sk); | ||
479 | err = ip_mc_join_group(sk, &mreq); | ||
480 | release_sock(sk); | ||
481 | rtnl_lock(); | ||
482 | |||
483 | return err; | ||
484 | } | ||
485 | |||
486 | |||
487 | /* kernel equivalent to IP_DROP_MEMBERSHIP */ | ||
488 | static int vxlan_leave_group(struct net_device *dev) | ||
489 | { | ||
490 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
491 | struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); | ||
492 | int err = 0; | ||
493 | struct sock *sk = vn->sock->sk; | ||
494 | struct ip_mreqn mreq = { | ||
495 | .imr_multiaddr.s_addr = vxlan->gaddr, | ||
496 | }; | ||
497 | |||
498 | /* Only leave group when last vxlan is done. */ | ||
499 | if (vxlan_group_used(vn, vxlan)) | ||
500 | return 0; | ||
501 | |||
502 | /* Need to drop RTNL to call multicast leave */ | ||
503 | rtnl_unlock(); | ||
504 | lock_sock(sk); | ||
505 | err = ip_mc_leave_group(sk, &mreq); | ||
506 | release_sock(sk); | ||
507 | rtnl_lock(); | ||
508 | |||
509 | return err; | ||
510 | } | ||
511 | |||
512 | /* Callback from net/ipv4/udp.c to receive packets */ | ||
513 | static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) | ||
514 | { | ||
515 | struct iphdr *oip; | ||
516 | struct vxlanhdr *vxh; | ||
517 | struct vxlan_dev *vxlan; | ||
518 | struct vxlan_stats *stats; | ||
519 | __u32 vni; | ||
520 | int err; | ||
521 | |||
522 | /* pop off outer UDP header */ | ||
523 | __skb_pull(skb, sizeof(struct udphdr)); | ||
524 | |||
525 | /* Need Vxlan and inner Ethernet header to be present */ | ||
526 | if (!pskb_may_pull(skb, sizeof(struct vxlanhdr))) | ||
527 | goto error; | ||
528 | |||
529 | /* Drop packets with reserved bits set */ | ||
530 | vxh = (struct vxlanhdr *) skb->data; | ||
531 | if (vxh->vx_flags != htonl(VXLAN_FLAGS) || | ||
532 | (vxh->vx_vni & htonl(0xff))) { | ||
533 | netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", | ||
534 | ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); | ||
535 | goto error; | ||
536 | } | ||
537 | |||
538 | __skb_pull(skb, sizeof(struct vxlanhdr)); | ||
539 | skb_postpull_rcsum(skb, eth_hdr(skb), sizeof(struct vxlanhdr)); | ||
540 | |||
541 | /* Is this VNI defined? */ | ||
542 | vni = ntohl(vxh->vx_vni) >> 8; | ||
543 | vxlan = vxlan_find_vni(sock_net(sk), vni); | ||
544 | if (!vxlan) { | ||
545 | netdev_dbg(skb->dev, "unknown vni %d\n", vni); | ||
546 | goto drop; | ||
547 | } | ||
548 | |||
549 | if (!pskb_may_pull(skb, ETH_HLEN)) { | ||
550 | vxlan->dev->stats.rx_length_errors++; | ||
551 | vxlan->dev->stats.rx_errors++; | ||
552 | goto drop; | ||
553 | } | ||
554 | |||
555 | /* Re-examine inner Ethernet packet */ | ||
556 | oip = ip_hdr(skb); | ||
557 | skb->protocol = eth_type_trans(skb, vxlan->dev); | ||
558 | skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); | ||
559 | |||
560 | /* Ignore packet loops (and multicast echo) */ | ||
561 | if (compare_ether_addr(eth_hdr(skb)->h_source, | ||
562 | vxlan->dev->dev_addr) == 0) | ||
563 | goto drop; | ||
564 | |||
565 | if (vxlan->learn) | ||
566 | vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source); | ||
567 | |||
568 | __skb_tunnel_rx(skb, vxlan->dev); | ||
569 | skb_reset_network_header(skb); | ||
570 | |||
571 | err = IP_ECN_decapsulate(oip, skb); | ||
572 | if (unlikely(err)) { | ||
573 | if (log_ecn_error) | ||
574 | net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", | ||
575 | &oip->saddr, oip->tos); | ||
576 | if (err > 1) { | ||
577 | ++vxlan->dev->stats.rx_frame_errors; | ||
578 | ++vxlan->dev->stats.rx_errors; | ||
579 | goto drop; | ||
580 | } | ||
581 | } | ||
582 | |||
583 | stats = this_cpu_ptr(vxlan->stats); | ||
584 | u64_stats_update_begin(&stats->syncp); | ||
585 | stats->rx_packets++; | ||
586 | stats->rx_bytes += skb->len; | ||
587 | u64_stats_update_end(&stats->syncp); | ||
588 | |||
589 | netif_rx(skb); | ||
590 | |||
591 | return 0; | ||
592 | error: | ||
593 | /* Put UDP header back */ | ||
594 | __skb_push(skb, sizeof(struct udphdr)); | ||
595 | |||
596 | return 1; | ||
597 | drop: | ||
598 | /* Consume bad packet */ | ||
599 | kfree_skb(skb); | ||
600 | return 0; | ||
601 | } | ||
602 | |||
603 | /* Extract dsfield from inner protocol */ | ||
604 | static inline u8 vxlan_get_dsfield(const struct iphdr *iph, | ||
605 | const struct sk_buff *skb) | ||
606 | { | ||
607 | if (skb->protocol == htons(ETH_P_IP)) | ||
608 | return iph->tos; | ||
609 | else if (skb->protocol == htons(ETH_P_IPV6)) | ||
610 | return ipv6_get_dsfield((const struct ipv6hdr *)iph); | ||
611 | else | ||
612 | return 0; | ||
613 | } | ||
614 | |||
615 | /* Propogate ECN bits out */ | ||
616 | static inline u8 vxlan_ecn_encap(u8 tos, | ||
617 | const struct iphdr *iph, | ||
618 | const struct sk_buff *skb) | ||
619 | { | ||
620 | u8 inner = vxlan_get_dsfield(iph, skb); | ||
621 | |||
622 | return INET_ECN_encapsulate(tos, inner); | ||
623 | } | ||
624 | |||
625 | /* Transmit local packets over Vxlan | ||
626 | * | ||
627 | * Outer IP header inherits ECN and DF from inner header. | ||
628 | * Outer UDP destination is the VXLAN assigned port. | ||
629 | * source port is based on hash of flow if available | ||
630 | * otherwise use a random value | ||
631 | */ | ||
632 | static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) | ||
633 | { | ||
634 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
635 | struct rtable *rt; | ||
636 | const struct ethhdr *eth; | ||
637 | const struct iphdr *old_iph; | ||
638 | struct iphdr *iph; | ||
639 | struct vxlanhdr *vxh; | ||
640 | struct udphdr *uh; | ||
641 | struct flowi4 fl4; | ||
642 | struct vxlan_fdb *f; | ||
643 | unsigned int pkt_len = skb->len; | ||
644 | u32 hash; | ||
645 | __be32 dst; | ||
646 | __be16 df = 0; | ||
647 | __u8 tos, ttl; | ||
648 | int err; | ||
649 | |||
650 | /* Need space for new headers (invalidates iph ptr) */ | ||
651 | if (skb_cow_head(skb, VXLAN_HEADROOM)) | ||
652 | goto drop; | ||
653 | |||
654 | eth = (void *)skb->data; | ||
655 | old_iph = ip_hdr(skb); | ||
656 | |||
657 | if (!is_multicast_ether_addr(eth->h_dest) && | ||
658 | (f = vxlan_find_mac(vxlan, eth->h_dest))) | ||
659 | dst = f->remote_ip; | ||
660 | else if (vxlan->gaddr) { | ||
661 | dst = vxlan->gaddr; | ||
662 | } else | ||
663 | goto drop; | ||
664 | |||
665 | ttl = vxlan->ttl; | ||
666 | if (!ttl && IN_MULTICAST(ntohl(dst))) | ||
667 | ttl = 1; | ||
668 | |||
669 | tos = vxlan->tos; | ||
670 | if (tos == 1) | ||
671 | tos = vxlan_get_dsfield(old_iph, skb); | ||
672 | |||
673 | hash = skb_get_rxhash(skb); | ||
674 | |||
675 | rt = ip_route_output_gre(dev_net(dev), &fl4, dst, | ||
676 | vxlan->saddr, vxlan->vni, | ||
677 | RT_TOS(tos), vxlan->link); | ||
678 | if (IS_ERR(rt)) { | ||
679 | netdev_dbg(dev, "no route to %pI4\n", &dst); | ||
680 | dev->stats.tx_carrier_errors++; | ||
681 | goto tx_error; | ||
682 | } | ||
683 | |||
684 | if (rt->dst.dev == dev) { | ||
685 | netdev_dbg(dev, "circular route to %pI4\n", &dst); | ||
686 | ip_rt_put(rt); | ||
687 | dev->stats.collisions++; | ||
688 | goto tx_error; | ||
689 | } | ||
690 | |||
691 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | ||
692 | IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | | ||
693 | IPSKB_REROUTED); | ||
694 | skb_dst_drop(skb); | ||
695 | skb_dst_set(skb, &rt->dst); | ||
696 | |||
697 | vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); | ||
698 | vxh->vx_flags = htonl(VXLAN_FLAGS); | ||
699 | vxh->vx_vni = htonl(vxlan->vni << 8); | ||
700 | |||
701 | __skb_push(skb, sizeof(*uh)); | ||
702 | skb_reset_transport_header(skb); | ||
703 | uh = udp_hdr(skb); | ||
704 | |||
705 | uh->dest = htons(vxlan_port); | ||
706 | uh->source = hash ? :random32(); | ||
707 | |||
708 | uh->len = htons(skb->len); | ||
709 | uh->check = 0; | ||
710 | |||
711 | __skb_push(skb, sizeof(*iph)); | ||
712 | skb_reset_network_header(skb); | ||
713 | iph = ip_hdr(skb); | ||
714 | iph->version = 4; | ||
715 | iph->ihl = sizeof(struct iphdr) >> 2; | ||
716 | iph->frag_off = df; | ||
717 | iph->protocol = IPPROTO_UDP; | ||
718 | iph->tos = vxlan_ecn_encap(tos, old_iph, skb); | ||
719 | iph->daddr = fl4.daddr; | ||
720 | iph->saddr = fl4.saddr; | ||
721 | iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); | ||
722 | |||
723 | /* See __IPTUNNEL_XMIT */ | ||
724 | skb->ip_summed = CHECKSUM_NONE; | ||
725 | ip_select_ident(iph, &rt->dst, NULL); | ||
726 | |||
727 | err = ip_local_out(skb); | ||
728 | if (likely(net_xmit_eval(err) == 0)) { | ||
729 | struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats); | ||
730 | |||
731 | u64_stats_update_begin(&stats->syncp); | ||
732 | stats->tx_packets++; | ||
733 | stats->tx_bytes += pkt_len; | ||
734 | u64_stats_update_end(&stats->syncp); | ||
735 | } else { | ||
736 | dev->stats.tx_errors++; | ||
737 | dev->stats.tx_aborted_errors++; | ||
738 | } | ||
739 | return NETDEV_TX_OK; | ||
740 | |||
741 | drop: | ||
742 | dev->stats.tx_dropped++; | ||
743 | goto tx_free; | ||
744 | |||
745 | tx_error: | ||
746 | dev->stats.tx_errors++; | ||
747 | tx_free: | ||
748 | dev_kfree_skb(skb); | ||
749 | return NETDEV_TX_OK; | ||
750 | } | ||
751 | |||
752 | /* Walk the forwarding table and purge stale entries */ | ||
753 | static void vxlan_cleanup(unsigned long arg) | ||
754 | { | ||
755 | struct vxlan_dev *vxlan = (struct vxlan_dev *) arg; | ||
756 | unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; | ||
757 | unsigned int h; | ||
758 | |||
759 | if (!netif_running(vxlan->dev)) | ||
760 | return; | ||
761 | |||
762 | spin_lock_bh(&vxlan->hash_lock); | ||
763 | for (h = 0; h < FDB_HASH_SIZE; ++h) { | ||
764 | struct hlist_node *p, *n; | ||
765 | hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { | ||
766 | struct vxlan_fdb *f | ||
767 | = container_of(p, struct vxlan_fdb, hlist); | ||
768 | unsigned long timeout; | ||
769 | |||
770 | if (f->state == NUD_PERMANENT) | ||
771 | continue; | ||
772 | |||
773 | timeout = f->used + vxlan->age_interval * HZ; | ||
774 | if (time_before_eq(timeout, jiffies)) { | ||
775 | netdev_dbg(vxlan->dev, | ||
776 | "garbage collect %pM\n", | ||
777 | f->eth_addr); | ||
778 | f->state = NUD_STALE; | ||
779 | vxlan_fdb_destroy(vxlan, f); | ||
780 | } else if (time_before(timeout, next_timer)) | ||
781 | next_timer = timeout; | ||
782 | } | ||
783 | } | ||
784 | spin_unlock_bh(&vxlan->hash_lock); | ||
785 | |||
786 | mod_timer(&vxlan->age_timer, next_timer); | ||
787 | } | ||
788 | |||
789 | /* Setup stats when device is created */ | ||
790 | static int vxlan_init(struct net_device *dev) | ||
791 | { | ||
792 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
793 | |||
794 | vxlan->stats = alloc_percpu(struct vxlan_stats); | ||
795 | if (!vxlan->stats) | ||
796 | return -ENOMEM; | ||
797 | |||
798 | return 0; | ||
799 | } | ||
800 | |||
801 | /* Start ageing timer and join group when device is brought up */ | ||
802 | static int vxlan_open(struct net_device *dev) | ||
803 | { | ||
804 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
805 | int err; | ||
806 | |||
807 | if (vxlan->gaddr) { | ||
808 | err = vxlan_join_group(dev); | ||
809 | if (err) | ||
810 | return err; | ||
811 | } | ||
812 | |||
813 | if (vxlan->age_interval) | ||
814 | mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); | ||
815 | |||
816 | return 0; | ||
817 | } | ||
818 | |||
819 | /* Purge the forwarding table */ | ||
820 | static void vxlan_flush(struct vxlan_dev *vxlan) | ||
821 | { | ||
822 | unsigned h; | ||
823 | |||
824 | spin_lock_bh(&vxlan->hash_lock); | ||
825 | for (h = 0; h < FDB_HASH_SIZE; ++h) { | ||
826 | struct hlist_node *p, *n; | ||
827 | hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { | ||
828 | struct vxlan_fdb *f | ||
829 | = container_of(p, struct vxlan_fdb, hlist); | ||
830 | vxlan_fdb_destroy(vxlan, f); | ||
831 | } | ||
832 | } | ||
833 | spin_unlock_bh(&vxlan->hash_lock); | ||
834 | } | ||
835 | |||
836 | /* Cleanup timer and forwarding table on shutdown */ | ||
837 | static int vxlan_stop(struct net_device *dev) | ||
838 | { | ||
839 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
840 | |||
841 | if (vxlan->gaddr) | ||
842 | vxlan_leave_group(dev); | ||
843 | |||
844 | del_timer_sync(&vxlan->age_timer); | ||
845 | |||
846 | vxlan_flush(vxlan); | ||
847 | |||
848 | return 0; | ||
849 | } | ||
850 | |||
851 | /* Merge per-cpu statistics */ | ||
852 | static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev, | ||
853 | struct rtnl_link_stats64 *stats) | ||
854 | { | ||
855 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
856 | struct vxlan_stats tmp, sum = { 0 }; | ||
857 | unsigned int cpu; | ||
858 | |||
859 | for_each_possible_cpu(cpu) { | ||
860 | unsigned int start; | ||
861 | const struct vxlan_stats *stats | ||
862 | = per_cpu_ptr(vxlan->stats, cpu); | ||
863 | |||
864 | do { | ||
865 | start = u64_stats_fetch_begin_bh(&stats->syncp); | ||
866 | memcpy(&tmp, stats, sizeof(tmp)); | ||
867 | } while (u64_stats_fetch_retry_bh(&stats->syncp, start)); | ||
868 | |||
869 | sum.tx_bytes += tmp.tx_bytes; | ||
870 | sum.tx_packets += tmp.tx_packets; | ||
871 | sum.rx_bytes += tmp.rx_bytes; | ||
872 | sum.rx_packets += tmp.rx_packets; | ||
873 | } | ||
874 | |||
875 | stats->tx_bytes = sum.tx_bytes; | ||
876 | stats->tx_packets = sum.tx_packets; | ||
877 | stats->rx_bytes = sum.rx_bytes; | ||
878 | stats->rx_packets = sum.rx_packets; | ||
879 | |||
880 | stats->multicast = dev->stats.multicast; | ||
881 | stats->rx_length_errors = dev->stats.rx_length_errors; | ||
882 | stats->rx_frame_errors = dev->stats.rx_frame_errors; | ||
883 | stats->rx_errors = dev->stats.rx_errors; | ||
884 | |||
885 | stats->tx_dropped = dev->stats.tx_dropped; | ||
886 | stats->tx_carrier_errors = dev->stats.tx_carrier_errors; | ||
887 | stats->tx_aborted_errors = dev->stats.tx_aborted_errors; | ||
888 | stats->collisions = dev->stats.collisions; | ||
889 | stats->tx_errors = dev->stats.tx_errors; | ||
890 | |||
891 | return stats; | ||
892 | } | ||
893 | |||
/*
 * Stub, nothing needs to be done.  Installed as the ndo_set_rx_mode
 * handler of the vxlan device.
 */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}
898 | |||
899 | static const struct net_device_ops vxlan_netdev_ops = { | ||
900 | .ndo_init = vxlan_init, | ||
901 | .ndo_open = vxlan_open, | ||
902 | .ndo_stop = vxlan_stop, | ||
903 | .ndo_start_xmit = vxlan_xmit, | ||
904 | .ndo_get_stats64 = vxlan_stats64, | ||
905 | .ndo_set_rx_mode = vxlan_set_multicast_list, | ||
906 | .ndo_change_mtu = eth_change_mtu, | ||
907 | .ndo_validate_addr = eth_validate_addr, | ||
908 | .ndo_set_mac_address = eth_mac_addr, | ||
909 | .ndo_fdb_add = vxlan_fdb_add, | ||
910 | .ndo_fdb_del = vxlan_fdb_delete, | ||
911 | .ndo_fdb_dump = vxlan_fdb_dump, | ||
912 | }; | ||
913 | |||
914 | /* Info for udev, that this is a virtual tunnel endpoint */ | ||
915 | static struct device_type vxlan_type = { | ||
916 | .name = "vxlan", | ||
917 | }; | ||
918 | |||
919 | static void vxlan_free(struct net_device *dev) | ||
920 | { | ||
921 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
922 | |||
923 | free_percpu(vxlan->stats); | ||
924 | free_netdev(dev); | ||
925 | } | ||
926 | |||
927 | /* Initialize the device structure. */ | ||
928 | static void vxlan_setup(struct net_device *dev) | ||
929 | { | ||
930 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
931 | unsigned h; | ||
932 | |||
933 | eth_hw_addr_random(dev); | ||
934 | ether_setup(dev); | ||
935 | |||
936 | dev->netdev_ops = &vxlan_netdev_ops; | ||
937 | dev->destructor = vxlan_free; | ||
938 | SET_NETDEV_DEVTYPE(dev, &vxlan_type); | ||
939 | |||
940 | dev->tx_queue_len = 0; | ||
941 | dev->features |= NETIF_F_LLTX; | ||
942 | dev->features |= NETIF_F_NETNS_LOCAL; | ||
943 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | ||
944 | |||
945 | spin_lock_init(&vxlan->hash_lock); | ||
946 | |||
947 | init_timer_deferrable(&vxlan->age_timer); | ||
948 | vxlan->age_timer.function = vxlan_cleanup; | ||
949 | vxlan->age_timer.data = (unsigned long) vxlan; | ||
950 | |||
951 | vxlan->dev = dev; | ||
952 | |||
953 | for (h = 0; h < FDB_HASH_SIZE; ++h) | ||
954 | INIT_HLIST_HEAD(&vxlan->fdb_head[h]); | ||
955 | } | ||
956 | |||
957 | static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { | ||
958 | [IFLA_VXLAN_ID] = { .type = NLA_U32 }, | ||
959 | [IFLA_VXLAN_GROUP] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, | ||
960 | [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, | ||
961 | [IFLA_VXLAN_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, | ||
962 | [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, | ||
963 | [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, | ||
964 | [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, | ||
965 | [IFLA_VXLAN_AGEING] = { .type = NLA_U32 }, | ||
966 | [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 }, | ||
967 | }; | ||
968 | |||
969 | static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) | ||
970 | { | ||
971 | if (tb[IFLA_ADDRESS]) { | ||
972 | if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { | ||
973 | pr_debug("invalid link address (not ethernet)\n"); | ||
974 | return -EINVAL; | ||
975 | } | ||
976 | |||
977 | if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { | ||
978 | pr_debug("invalid all zero ethernet address\n"); | ||
979 | return -EADDRNOTAVAIL; | ||
980 | } | ||
981 | } | ||
982 | |||
983 | if (!data) | ||
984 | return -EINVAL; | ||
985 | |||
986 | if (data[IFLA_VXLAN_ID]) { | ||
987 | __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); | ||
988 | if (id >= VXLAN_VID_MASK) | ||
989 | return -ERANGE; | ||
990 | } | ||
991 | |||
992 | if (data[IFLA_VXLAN_GROUP]) { | ||
993 | __be32 gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]); | ||
994 | if (!IN_MULTICAST(ntohl(gaddr))) { | ||
995 | pr_debug("group address is not IPv4 multicast\n"); | ||
996 | return -EADDRNOTAVAIL; | ||
997 | } | ||
998 | } | ||
999 | return 0; | ||
1000 | } | ||
1001 | |||
1002 | static int vxlan_newlink(struct net *net, struct net_device *dev, | ||
1003 | struct nlattr *tb[], struct nlattr *data[]) | ||
1004 | { | ||
1005 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
1006 | __u32 vni; | ||
1007 | int err; | ||
1008 | |||
1009 | if (!data[IFLA_VXLAN_ID]) | ||
1010 | return -EINVAL; | ||
1011 | |||
1012 | vni = nla_get_u32(data[IFLA_VXLAN_ID]); | ||
1013 | if (vxlan_find_vni(net, vni)) { | ||
1014 | pr_info("duplicate VNI %u\n", vni); | ||
1015 | return -EEXIST; | ||
1016 | } | ||
1017 | vxlan->vni = vni; | ||
1018 | |||
1019 | if (data[IFLA_VXLAN_GROUP]) | ||
1020 | vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]); | ||
1021 | |||
1022 | if (data[IFLA_VXLAN_LOCAL]) | ||
1023 | vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]); | ||
1024 | |||
1025 | if (data[IFLA_VXLAN_LINK]) { | ||
1026 | vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]); | ||
1027 | |||
1028 | if (!tb[IFLA_MTU]) { | ||
1029 | struct net_device *lowerdev; | ||
1030 | lowerdev = __dev_get_by_index(net, vxlan->link); | ||
1031 | dev->mtu = lowerdev->mtu - VXLAN_HEADROOM; | ||
1032 | } | ||
1033 | } | ||
1034 | |||
1035 | if (data[IFLA_VXLAN_TOS]) | ||
1036 | vxlan->tos = nla_get_u8(data[IFLA_VXLAN_TOS]); | ||
1037 | |||
1038 | if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING])) | ||
1039 | vxlan->learn = true; | ||
1040 | |||
1041 | if (data[IFLA_VXLAN_AGEING]) | ||
1042 | vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); | ||
1043 | else | ||
1044 | vxlan->age_interval = FDB_AGE_DEFAULT; | ||
1045 | |||
1046 | if (data[IFLA_VXLAN_LIMIT]) | ||
1047 | vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); | ||
1048 | |||
1049 | err = register_netdevice(dev); | ||
1050 | if (!err) | ||
1051 | hlist_add_head_rcu(&vxlan->hlist, vni_head(net, vxlan->vni)); | ||
1052 | |||
1053 | return err; | ||
1054 | } | ||
1055 | |||
1056 | static void vxlan_dellink(struct net_device *dev, struct list_head *head) | ||
1057 | { | ||
1058 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
1059 | |||
1060 | hlist_del_rcu(&vxlan->hlist); | ||
1061 | |||
1062 | unregister_netdevice_queue(dev, head); | ||
1063 | } | ||
1064 | |||
1065 | static size_t vxlan_get_size(const struct net_device *dev) | ||
1066 | { | ||
1067 | |||
1068 | return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ | ||
1069 | nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */ | ||
1070 | nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ | ||
1071 | nla_total_size(sizeof(__be32))+ /* IFLA_VXLAN_LOCAL */ | ||
1072 | nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ | ||
1073 | nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ | ||
1074 | nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ | ||
1075 | nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ | ||
1076 | nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ | ||
1077 | 0; | ||
1078 | } | ||
1079 | |||
1080 | static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) | ||
1081 | { | ||
1082 | const struct vxlan_dev *vxlan = netdev_priv(dev); | ||
1083 | |||
1084 | if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni)) | ||
1085 | goto nla_put_failure; | ||
1086 | |||
1087 | if (vxlan->gaddr && nla_put_u32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr)) | ||
1088 | goto nla_put_failure; | ||
1089 | |||
1090 | if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link)) | ||
1091 | goto nla_put_failure; | ||
1092 | |||
1093 | if (vxlan->saddr && nla_put_u32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr)) | ||
1094 | goto nla_put_failure; | ||
1095 | |||
1096 | if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) || | ||
1097 | nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) || | ||
1098 | nla_put_u8(skb, IFLA_VXLAN_LEARNING, vxlan->learn) || | ||
1099 | nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) || | ||
1100 | nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax)) | ||
1101 | goto nla_put_failure; | ||
1102 | |||
1103 | return 0; | ||
1104 | |||
1105 | nla_put_failure: | ||
1106 | return -EMSGSIZE; | ||
1107 | } | ||
1108 | |||
1109 | static struct rtnl_link_ops vxlan_link_ops __read_mostly = { | ||
1110 | .kind = "vxlan", | ||
1111 | .maxtype = IFLA_VXLAN_MAX, | ||
1112 | .policy = vxlan_policy, | ||
1113 | .priv_size = sizeof(struct vxlan_dev), | ||
1114 | .setup = vxlan_setup, | ||
1115 | .validate = vxlan_validate, | ||
1116 | .newlink = vxlan_newlink, | ||
1117 | .dellink = vxlan_dellink, | ||
1118 | .get_size = vxlan_get_size, | ||
1119 | .fill_info = vxlan_fill_info, | ||
1120 | }; | ||
1121 | |||
1122 | static __net_init int vxlan_init_net(struct net *net) | ||
1123 | { | ||
1124 | struct vxlan_net *vn = net_generic(net, vxlan_net_id); | ||
1125 | struct sock *sk; | ||
1126 | struct sockaddr_in vxlan_addr = { | ||
1127 | .sin_family = AF_INET, | ||
1128 | .sin_addr.s_addr = htonl(INADDR_ANY), | ||
1129 | }; | ||
1130 | int rc; | ||
1131 | unsigned h; | ||
1132 | |||
1133 | /* Create UDP socket for encapsulation receive. */ | ||
1134 | rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock); | ||
1135 | if (rc < 0) { | ||
1136 | pr_debug("UDP socket create failed\n"); | ||
1137 | return rc; | ||
1138 | } | ||
1139 | /* Put in proper namespace */ | ||
1140 | sk = vn->sock->sk; | ||
1141 | sk_change_net(sk, net); | ||
1142 | |||
1143 | vxlan_addr.sin_port = htons(vxlan_port); | ||
1144 | |||
1145 | rc = kernel_bind(vn->sock, (struct sockaddr *) &vxlan_addr, | ||
1146 | sizeof(vxlan_addr)); | ||
1147 | if (rc < 0) { | ||
1148 | pr_debug("bind for UDP socket %pI4:%u (%d)\n", | ||
1149 | &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc); | ||
1150 | sk_release_kernel(sk); | ||
1151 | vn->sock = NULL; | ||
1152 | return rc; | ||
1153 | } | ||
1154 | |||
1155 | /* Disable multicast loopback */ | ||
1156 | inet_sk(sk)->mc_loop = 0; | ||
1157 | |||
1158 | /* Mark socket as an encapsulation socket. */ | ||
1159 | udp_sk(sk)->encap_type = 1; | ||
1160 | udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; | ||
1161 | udp_encap_enable(); | ||
1162 | |||
1163 | for (h = 0; h < VNI_HASH_SIZE; ++h) | ||
1164 | INIT_HLIST_HEAD(&vn->vni_list[h]); | ||
1165 | |||
1166 | return 0; | ||
1167 | } | ||
1168 | |||
1169 | static __net_exit void vxlan_exit_net(struct net *net) | ||
1170 | { | ||
1171 | struct vxlan_net *vn = net_generic(net, vxlan_net_id); | ||
1172 | |||
1173 | if (vn->sock) { | ||
1174 | sk_release_kernel(vn->sock->sk); | ||
1175 | vn->sock = NULL; | ||
1176 | } | ||
1177 | } | ||
1178 | |||
1179 | static struct pernet_operations vxlan_net_ops = { | ||
1180 | .init = vxlan_init_net, | ||
1181 | .exit = vxlan_exit_net, | ||
1182 | .id = &vxlan_net_id, | ||
1183 | .size = sizeof(struct vxlan_net), | ||
1184 | }; | ||
1185 | |||
1186 | static int __init vxlan_init_module(void) | ||
1187 | { | ||
1188 | int rc; | ||
1189 | |||
1190 | get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); | ||
1191 | |||
1192 | rc = register_pernet_device(&vxlan_net_ops); | ||
1193 | if (rc) | ||
1194 | goto out1; | ||
1195 | |||
1196 | rc = rtnl_link_register(&vxlan_link_ops); | ||
1197 | if (rc) | ||
1198 | goto out2; | ||
1199 | |||
1200 | return 0; | ||
1201 | |||
1202 | out2: | ||
1203 | unregister_pernet_device(&vxlan_net_ops); | ||
1204 | out1: | ||
1205 | return rc; | ||
1206 | } | ||
1207 | module_init(vxlan_init_module); | ||
1208 | |||
1209 | static void __exit vxlan_cleanup_module(void) | ||
1210 | { | ||
1211 | rtnl_link_unregister(&vxlan_link_ops); | ||
1212 | unregister_pernet_device(&vxlan_net_ops); | ||
1213 | } | ||
1214 | module_exit(vxlan_cleanup_module); | ||
1215 | |||
1216 | MODULE_LICENSE("GPL"); | ||
1217 | MODULE_VERSION(VXLAN_VERSION); | ||
1218 | MODULE_AUTHOR("Stephen Hemminger <shemminger@vyatta.com>"); | ||
1219 | MODULE_ALIAS_RTNL_LINK("vxlan"); | ||