diff options
Diffstat (limited to 'drivers/net/vxlan.c')
-rw-r--r-- | drivers/net/vxlan.c | 399 |
1 files changed, 235 insertions, 164 deletions
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 7cee7a3068ec..62a4438c6084 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c | |||
@@ -33,7 +33,7 @@ | |||
33 | #include <net/arp.h> | 33 | #include <net/arp.h> |
34 | #include <net/ndisc.h> | 34 | #include <net/ndisc.h> |
35 | #include <net/ip.h> | 35 | #include <net/ip.h> |
36 | #include <net/ipip.h> | 36 | #include <net/ip_tunnels.h> |
37 | #include <net/icmp.h> | 37 | #include <net/icmp.h> |
38 | #include <net/udp.h> | 38 | #include <net/udp.h> |
39 | #include <net/rtnetlink.h> | 39 | #include <net/rtnetlink.h> |
@@ -81,31 +81,30 @@ struct vxlan_net { | |||
81 | struct hlist_head vni_list[VNI_HASH_SIZE]; | 81 | struct hlist_head vni_list[VNI_HASH_SIZE]; |
82 | }; | 82 | }; |
83 | 83 | ||
/*
 * One remote tunnel endpoint of a forwarding entry.
 *
 * The first endpoint is embedded in struct vxlan_fdb; further endpoints
 * are heap-allocated by vxlan_fdb_append() and chained via remote_next.
 *
 * rcu:            no visible hunk references this member; chained
 *                 entries are released with plain kfree() from
 *                 vxlan_fdb_free().  NOTE(review): confirm it is needed.
 * remote_ip:      outer IPv4 destination.  INADDR_ANY is tested for in
 *                 the transmit and ARP-proxy paths as "no remote".
 * remote_port:    outer UDP destination port; 0 makes the transmit path
 *                 fall back to vxlan_port.  NOTE(review): declared
 *                 __be16, yet the transmit path applies htons() to it
 *                 and vxlan_fdb_add() stores a host-order u32 here;
 *                 confirm the intended byte order.
 * remote_vni:     VNI placed in the VXLAN header (shifted and converted
 *                 with htonl() at transmit time).
 * remote_ifindex: used as flowi4_oif for the route lookup; 0 when no
 *                 interface was requested.
 * remote_next:    next endpoint in the singly linked list, NULL at end.
 */
84 | struct vxlan_rdst { | |
85 | struct rcu_head rcu; | |
86 | __be32 remote_ip; | |
87 | __be16 remote_port; | |
88 | u32 remote_vni; | |
89 | u32 remote_ifindex; | |
90 | struct vxlan_rdst *remote_next; | |
91 | }; | |
92 | |||
84 | /* Forwarding table entry */ | 93 | /* Forwarding table entry */ |
/*
 * Keyed by eth_addr and linked into a vxlan->fdb_head[] bucket through
 * hlist.  The rcu head is what vxlan_fdb_destroy() hands to call_rcu()
 * so that the entry is freed by vxlan_fdb_free().
 *
 * In the new (right-hand) column the single remote_ip is replaced by an
 * embedded struct vxlan_rdst: "remote" is the first destination and any
 * additional destinations hang off remote.remote_next.
 *
 * updated and used hold jiffies values written when the entry is
 * created, changed or hit by vxlan_snoop().
 */
85 | struct vxlan_fdb { | 94 | struct vxlan_fdb { |
86 | struct hlist_node hlist; /* linked list of entries */ | 95 | struct hlist_node hlist; /* linked list of entries */ |
87 | struct rcu_head rcu; | 96 | struct rcu_head rcu; |
88 | unsigned long updated; /* jiffies */ | 97 | unsigned long updated; /* jiffies */ |
89 | unsigned long used; | 98 | unsigned long used; |
90 | __be32 remote_ip; | 99 | struct vxlan_rdst remote; |
91 | u16 state; /* see ndm_state */ | 100 | u16 state; /* see ndm_state */ |
92 | u8 eth_addr[ETH_ALEN]; | 101 | u8 eth_addr[ETH_ALEN]; |
93 | }; | 102 | }; |
94 | 103 | ||
95 | /* Per-cpu network traffic stats */ | ||
96 | struct vxlan_stats { | ||
97 | u64 rx_packets; | ||
98 | u64 rx_bytes; | ||
99 | u64 tx_packets; | ||
100 | u64 tx_bytes; | ||
101 | struct u64_stats_sync syncp; | ||
102 | }; | ||
103 | |||
104 | /* Pseudo network device */ | 104 | /* Pseudo network device */ |
105 | struct vxlan_dev { | 105 | struct vxlan_dev { |
106 | struct hlist_node hlist; | 106 | struct hlist_node hlist; |
107 | struct net_device *dev; | 107 | struct net_device *dev; |
108 | struct vxlan_stats __percpu *stats; | ||
109 | __u32 vni; /* virtual network id */ | 108 | __u32 vni; /* virtual network id */ |
110 | __be32 gaddr; /* multicast group */ | 109 | __be32 gaddr; /* multicast group */ |
111 | __be32 saddr; /* source address */ | 110 | __be32 saddr; /* source address */ |
@@ -157,7 +156,8 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id) | |||
157 | /* Fill in neighbour message in skbuff. */ | 156 | /* Fill in neighbour message in skbuff. */ |
158 | static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, | 157 | static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, |
159 | const struct vxlan_fdb *fdb, | 158 | const struct vxlan_fdb *fdb, |
160 | u32 portid, u32 seq, int type, unsigned int flags) | 159 | u32 portid, u32 seq, int type, unsigned int flags, |
160 | const struct vxlan_rdst *rdst) | ||
161 | { | 161 | { |
162 | unsigned long now = jiffies; | 162 | unsigned long now = jiffies; |
163 | struct nda_cacheinfo ci; | 163 | struct nda_cacheinfo ci; |
@@ -176,7 +176,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, | |||
176 | 176 | ||
177 | if (type == RTM_GETNEIGH) { | 177 | if (type == RTM_GETNEIGH) { |
178 | ndm->ndm_family = AF_INET; | 178 | ndm->ndm_family = AF_INET; |
179 | send_ip = fdb->remote_ip != 0; | 179 | send_ip = rdst->remote_ip != htonl(INADDR_ANY); |
180 | send_eth = !is_zero_ether_addr(fdb->eth_addr); | 180 | send_eth = !is_zero_ether_addr(fdb->eth_addr); |
181 | } else | 181 | } else |
182 | ndm->ndm_family = AF_BRIDGE; | 182 | ndm->ndm_family = AF_BRIDGE; |
@@ -188,7 +188,17 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, | |||
188 | if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) | 188 | if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) |
189 | goto nla_put_failure; | 189 | goto nla_put_failure; |
190 | 190 | ||
191 | if (send_ip && nla_put_be32(skb, NDA_DST, fdb->remote_ip)) | 191 | if (send_ip && nla_put_be32(skb, NDA_DST, rdst->remote_ip)) |
192 | goto nla_put_failure; | ||
193 | |||
194 | if (rdst->remote_port && rdst->remote_port != vxlan_port && | ||
195 | nla_put_be16(skb, NDA_PORT, rdst->remote_port)) | ||
196 | goto nla_put_failure; | ||
197 | if (rdst->remote_vni != vxlan->vni && | ||
198 | nla_put_be32(skb, NDA_VNI, rdst->remote_vni)) | ||
199 | goto nla_put_failure; | ||
200 | if (rdst->remote_ifindex && | ||
201 | nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) | ||
192 | goto nla_put_failure; | 202 | goto nla_put_failure; |
193 | 203 | ||
194 | ci.ndm_used = jiffies_to_clock_t(now - fdb->used); | 204 | ci.ndm_used = jiffies_to_clock_t(now - fdb->used); |
@@ -211,6 +221,9 @@ static inline size_t vxlan_nlmsg_size(void) | |||
211 | return NLMSG_ALIGN(sizeof(struct ndmsg)) | 221 | return NLMSG_ALIGN(sizeof(struct ndmsg)) |
212 | + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ | 222 | + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ |
213 | + nla_total_size(sizeof(__be32)) /* NDA_DST */ | 223 | + nla_total_size(sizeof(__be32)) /* NDA_DST */ |
224 | + nla_total_size(sizeof(__be32)) /* NDA_PORT */ | ||
225 | + nla_total_size(sizeof(__be32)) /* NDA_VNI */ | ||
226 | + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ | ||
214 | + nla_total_size(sizeof(struct nda_cacheinfo)); | 227 | + nla_total_size(sizeof(struct nda_cacheinfo)); |
215 | } | 228 | } |
216 | 229 | ||
@@ -225,7 +238,7 @@ static void vxlan_fdb_notify(struct vxlan_dev *vxlan, | |||
225 | if (skb == NULL) | 238 | if (skb == NULL) |
226 | goto errout; | 239 | goto errout; |
227 | 240 | ||
228 | err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0); | 241 | err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, &fdb->remote); |
229 | if (err < 0) { | 242 | if (err < 0) { |
230 | /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ | 243 | /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ |
231 | WARN_ON(err == -EMSGSIZE); | 244 | WARN_ON(err == -EMSGSIZE); |
@@ -247,7 +260,8 @@ static void vxlan_ip_miss(struct net_device *dev, __be32 ipa) | |||
247 | 260 | ||
248 | memset(&f, 0, sizeof f); | 261 | memset(&f, 0, sizeof f); |
249 | f.state = NUD_STALE; | 262 | f.state = NUD_STALE; |
250 | f.remote_ip = ipa; /* goes to NDA_DST */ | 263 | f.remote.remote_ip = ipa; /* goes to NDA_DST */ |
264 | f.remote.remote_vni = VXLAN_N_VID; | ||
251 | 265 | ||
252 | vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH); | 266 | vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH); |
253 | } | 267 | } |
@@ -300,10 +314,38 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, | |||
300 | return NULL; | 314 | return NULL; |
301 | } | 315 | } |
302 | 316 | ||
317 | /* Add/update destinations for multicast */ | |
/*
 * Append (ip, port, vni, ifindex) to the destination list of @f unless
 * an identical destination is already present.
 *
 * Returns 0 if the destination already exists, 1 if a new one was
 * linked at the tail, or -ENOBUFS if the GFP_ATOMIC allocation fails.
 * The only visible caller, vxlan_fdb_create(), ORs a positive result
 * into its notify flag.
 *
 * NOTE(review): @port is __u32 but is stored in the __be16 remote_port
 * member, so a value above 0xffff would be truncated on store and the
 * duplicate test below would then never match it.  vxlan_fdb_add()
 * passes a u32 read from NDA_PORT; confirm whether a range check or a
 * narrower parameter type is wanted.
 */
318 | static int vxlan_fdb_append(struct vxlan_fdb *f, | |
319 | __be32 ip, __u32 port, __u32 vni, __u32 ifindex) | |
320 | { | |
321 | struct vxlan_rdst *rd_prev, *rd; | |
322 | | |
323 | rd_prev = NULL; | |
/* Walk starts at the embedded head, so rd_prev is non-NULL after it. */
324 | for (rd = &f->remote; rd; rd = rd->remote_next) { | |
325 | if (rd->remote_ip == ip && | |
326 | rd->remote_port == port && | |
327 | rd->remote_vni == vni && | |
328 | rd->remote_ifindex == ifindex) | |
329 | return 0; | |
330 | rd_prev = rd; | |
331 | } | |
332 | rd = kmalloc(sizeof(*rd), GFP_ATOMIC); | |
333 | if (rd == NULL) | |
334 | return -ENOBUFS; | |
335 | rd->remote_ip = ip; | |
336 | rd->remote_port = port; | |
337 | rd->remote_vni = vni; | |
338 | rd->remote_ifindex = ifindex; | |
339 | rd->remote_next = NULL; | |
/*
 * NOTE(review): the new node is linked with a plain store while
 * vxlan_xmit() and vxlan_fdb_dump() walk remote_next; confirm whether
 * an RCU publish primitive is required here.
 */
340 | rd_prev->remote_next = rd; | |
341 | return 1; | |
342 | } | |
343 | |||
303 | /* Add new entry to forwarding table -- assumes lock held */ | 344 | /* Add new entry to forwarding table -- assumes lock held */ |
304 | static int vxlan_fdb_create(struct vxlan_dev *vxlan, | 345 | static int vxlan_fdb_create(struct vxlan_dev *vxlan, |
305 | const u8 *mac, __be32 ip, | 346 | const u8 *mac, __be32 ip, |
306 | __u16 state, __u16 flags) | 347 | __u16 state, __u16 flags, |
348 | __u32 port, __u32 vni, __u32 ifindex) | ||
307 | { | 349 | { |
308 | struct vxlan_fdb *f; | 350 | struct vxlan_fdb *f; |
309 | int notify = 0; | 351 | int notify = 0; |
@@ -320,6 +362,14 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, | |||
320 | f->updated = jiffies; | 362 | f->updated = jiffies; |
321 | notify = 1; | 363 | notify = 1; |
322 | } | 364 | } |
365 | if ((flags & NLM_F_APPEND) && | ||
366 | is_multicast_ether_addr(f->eth_addr)) { | ||
367 | int rc = vxlan_fdb_append(f, ip, port, vni, ifindex); | ||
368 | |||
369 | if (rc < 0) | ||
370 | return rc; | ||
371 | notify |= rc; | ||
372 | } | ||
323 | } else { | 373 | } else { |
324 | if (!(flags & NLM_F_CREATE)) | 374 | if (!(flags & NLM_F_CREATE)) |
325 | return -ENOENT; | 375 | return -ENOENT; |
@@ -333,7 +383,11 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, | |||
333 | return -ENOMEM; | 383 | return -ENOMEM; |
334 | 384 | ||
335 | notify = 1; | 385 | notify = 1; |
336 | f->remote_ip = ip; | 386 | f->remote.remote_ip = ip; |
387 | f->remote.remote_port = port; | ||
388 | f->remote.remote_vni = vni; | ||
389 | f->remote.remote_ifindex = ifindex; | ||
390 | f->remote.remote_next = NULL; | ||
337 | f->state = state; | 391 | f->state = state; |
338 | f->updated = f->used = jiffies; | 392 | f->updated = f->used = jiffies; |
339 | memcpy(f->eth_addr, mac, ETH_ALEN); | 393 | memcpy(f->eth_addr, mac, ETH_ALEN); |
@@ -349,6 +403,19 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, | |||
349 | return 0; | 403 | return 0; |
350 | } | 404 | } |
351 | 405 | ||
/*
 * RCU callback registered by vxlan_fdb_destroy() through call_rcu().
 *
 * Recovers the forwarding entry from its rcu head, frees every chained
 * destination after the embedded first one, then frees the entry.
 *
 * NOTE(review): the function has external linkage although the only
 * reference in the visible hunks is the call_rcu() in this file;
 * confirm whether it should be static.
 */
406 | void vxlan_fdb_free(struct rcu_head *head) | |
407 | { | |
408 | struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); | |
409 | | |
/* Pop and free list nodes one at a time; f->remote itself lives in f. */
410 | while (f->remote.remote_next) { | |
411 | struct vxlan_rdst *rd = f->remote.remote_next; | |
412 | | |
413 | f->remote.remote_next = rd->remote_next; | |
414 | kfree(rd); | |
415 | } | |
416 | kfree(f); | |
417 | } | |
418 | |||
352 | static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) | 419 | static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) |
353 | { | 420 | { |
354 | netdev_dbg(vxlan->dev, | 421 | netdev_dbg(vxlan->dev, |
@@ -358,7 +425,7 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) | |||
358 | vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH); | 425 | vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH); |
359 | 426 | ||
360 | hlist_del_rcu(&f->hlist); | 427 | hlist_del_rcu(&f->hlist); |
361 | kfree_rcu(f, rcu); | 428 | call_rcu(&f->rcu, vxlan_fdb_free); |
362 | } | 429 | } |
363 | 430 | ||
364 | /* Add static entry (via netlink) */ | 431 | /* Add static entry (via netlink) */ |
@@ -367,7 +434,9 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], | |||
367 | const unsigned char *addr, u16 flags) | 434 | const unsigned char *addr, u16 flags) |
368 | { | 435 | { |
369 | struct vxlan_dev *vxlan = netdev_priv(dev); | 436 | struct vxlan_dev *vxlan = netdev_priv(dev); |
437 | struct net *net = dev_net(vxlan->dev); | ||
370 | __be32 ip; | 438 | __be32 ip; |
439 | u32 port, vni, ifindex; | ||
371 | int err; | 440 | int err; |
372 | 441 | ||
373 | if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { | 442 | if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { |
@@ -384,8 +453,36 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], | |||
384 | 453 | ||
385 | ip = nla_get_be32(tb[NDA_DST]); | 454 | ip = nla_get_be32(tb[NDA_DST]); |
386 | 455 | ||
456 | if (tb[NDA_PORT]) { | ||
457 | if (nla_len(tb[NDA_PORT]) != sizeof(u32)) | ||
458 | return -EINVAL; | ||
459 | port = nla_get_u32(tb[NDA_PORT]); | ||
460 | } else | ||
461 | port = vxlan_port; | ||
462 | |||
463 | if (tb[NDA_VNI]) { | ||
464 | if (nla_len(tb[NDA_VNI]) != sizeof(u32)) | ||
465 | return -EINVAL; | ||
466 | vni = nla_get_u32(tb[NDA_VNI]); | ||
467 | } else | ||
468 | vni = vxlan->vni; | ||
469 | |||
470 | if (tb[NDA_IFINDEX]) { | ||
471 | struct net_device *tdev; | ||
472 | |||
473 | if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) | ||
474 | return -EINVAL; | ||
475 | ifindex = nla_get_u32(tb[NDA_IFINDEX]); | ||
476 | tdev = dev_get_by_index(net, ifindex); | ||
477 | if (!tdev) | ||
478 | return -EADDRNOTAVAIL; | ||
479 | dev_put(tdev); | ||
480 | } else | ||
481 | ifindex = 0; | ||
482 | |||
387 | spin_lock_bh(&vxlan->hash_lock); | 483 | spin_lock_bh(&vxlan->hash_lock); |
388 | err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags); | 484 | err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags, port, |
485 | vni, ifindex); | ||
389 | spin_unlock_bh(&vxlan->hash_lock); | 486 | spin_unlock_bh(&vxlan->hash_lock); |
390 | 487 | ||
391 | return err; | 488 | return err; |
@@ -423,18 +520,21 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, | |||
423 | int err; | 520 | int err; |
424 | 521 | ||
425 | hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { | 522 | hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { |
426 | if (idx < cb->args[0]) | 523 | struct vxlan_rdst *rd; |
427 | goto skip; | 524 | for (rd = &f->remote; rd; rd = rd->remote_next) { |
428 | 525 | if (idx < cb->args[0]) | |
429 | err = vxlan_fdb_info(skb, vxlan, f, | 526 | goto skip; |
430 | NETLINK_CB(cb->skb).portid, | 527 | |
431 | cb->nlh->nlmsg_seq, | 528 | err = vxlan_fdb_info(skb, vxlan, f, |
432 | RTM_NEWNEIGH, | 529 | NETLINK_CB(cb->skb).portid, |
433 | NLM_F_MULTI); | 530 | cb->nlh->nlmsg_seq, |
434 | if (err < 0) | 531 | RTM_NEWNEIGH, |
435 | break; | 532 | NLM_F_MULTI, rd); |
533 | if (err < 0) | ||
534 | break; | ||
436 | skip: | 535 | skip: |
437 | ++idx; | 536 | ++idx; |
537 | } | ||
438 | } | 538 | } |
439 | } | 539 | } |
440 | 540 | ||
@@ -454,22 +554,23 @@ static void vxlan_snoop(struct net_device *dev, | |||
454 | f = vxlan_find_mac(vxlan, src_mac); | 554 | f = vxlan_find_mac(vxlan, src_mac); |
455 | if (likely(f)) { | 555 | if (likely(f)) { |
456 | f->used = jiffies; | 556 | f->used = jiffies; |
457 | if (likely(f->remote_ip == src_ip)) | 557 | if (likely(f->remote.remote_ip == src_ip)) |
458 | return; | 558 | return; |
459 | 559 | ||
460 | if (net_ratelimit()) | 560 | if (net_ratelimit()) |
461 | netdev_info(dev, | 561 | netdev_info(dev, |
462 | "%pM migrated from %pI4 to %pI4\n", | 562 | "%pM migrated from %pI4 to %pI4\n", |
463 | src_mac, &f->remote_ip, &src_ip); | 563 | src_mac, &f->remote.remote_ip, &src_ip); |
464 | 564 | ||
465 | f->remote_ip = src_ip; | 565 | f->remote.remote_ip = src_ip; |
466 | f->updated = jiffies; | 566 | f->updated = jiffies; |
467 | } else { | 567 | } else { |
468 | /* learned new entry */ | 568 | /* learned new entry */ |
469 | spin_lock(&vxlan->hash_lock); | 569 | spin_lock(&vxlan->hash_lock); |
470 | err = vxlan_fdb_create(vxlan, src_mac, src_ip, | 570 | err = vxlan_fdb_create(vxlan, src_mac, src_ip, |
471 | NUD_REACHABLE, | 571 | NUD_REACHABLE, |
472 | NLM_F_EXCL|NLM_F_CREATE); | 572 | NLM_F_EXCL|NLM_F_CREATE, |
573 | vxlan_port, vxlan->vni, 0); | ||
473 | spin_unlock(&vxlan->hash_lock); | 574 | spin_unlock(&vxlan->hash_lock); |
474 | } | 575 | } |
475 | } | 576 | } |
@@ -556,7 +657,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) | |||
556 | struct iphdr *oip; | 657 | struct iphdr *oip; |
557 | struct vxlanhdr *vxh; | 658 | struct vxlanhdr *vxh; |
558 | struct vxlan_dev *vxlan; | 659 | struct vxlan_dev *vxlan; |
559 | struct vxlan_stats *stats; | 660 | struct pcpu_tstats *stats; |
560 | __u32 vni; | 661 | __u32 vni; |
561 | int err; | 662 | int err; |
562 | 663 | ||
@@ -632,7 +733,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) | |||
632 | } | 733 | } |
633 | } | 734 | } |
634 | 735 | ||
635 | stats = this_cpu_ptr(vxlan->stats); | 736 | stats = this_cpu_ptr(vxlan->dev->tstats); |
636 | u64_stats_update_begin(&stats->syncp); | 737 | u64_stats_update_begin(&stats->syncp); |
637 | stats->rx_packets++; | 738 | stats->rx_packets++; |
638 | stats->rx_bytes += skb->len; | 739 | stats->rx_bytes += skb->len; |
@@ -691,7 +792,6 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb) | |||
691 | n = neigh_lookup(&arp_tbl, &tip, dev); | 792 | n = neigh_lookup(&arp_tbl, &tip, dev); |
692 | 793 | ||
693 | if (n) { | 794 | if (n) { |
694 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
695 | struct vxlan_fdb *f; | 795 | struct vxlan_fdb *f; |
696 | struct sk_buff *reply; | 796 | struct sk_buff *reply; |
697 | 797 | ||
@@ -701,7 +801,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb) | |||
701 | } | 801 | } |
702 | 802 | ||
703 | f = vxlan_find_mac(vxlan, n->ha); | 803 | f = vxlan_find_mac(vxlan, n->ha); |
704 | if (f && f->remote_ip == 0) { | 804 | if (f && f->remote.remote_ip == htonl(INADDR_ANY)) { |
705 | /* bridge-local neighbor */ | 805 | /* bridge-local neighbor */ |
706 | neigh_release(n); | 806 | neigh_release(n); |
707 | goto out; | 807 | goto out; |
@@ -763,28 +863,6 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) | |||
763 | return false; | 863 | return false; |
764 | } | 864 | } |
765 | 865 | ||
766 | /* Extract dsfield from inner protocol */ | ||
767 | static inline u8 vxlan_get_dsfield(const struct iphdr *iph, | ||
768 | const struct sk_buff *skb) | ||
769 | { | ||
770 | if (skb->protocol == htons(ETH_P_IP)) | ||
771 | return iph->tos; | ||
772 | else if (skb->protocol == htons(ETH_P_IPV6)) | ||
773 | return ipv6_get_dsfield((const struct ipv6hdr *)iph); | ||
774 | else | ||
775 | return 0; | ||
776 | } | ||
777 | |||
778 | /* Propogate ECN bits out */ | ||
779 | static inline u8 vxlan_ecn_encap(u8 tos, | ||
780 | const struct iphdr *iph, | ||
781 | const struct sk_buff *skb) | ||
782 | { | ||
783 | u8 inner = vxlan_get_dsfield(iph, skb); | ||
784 | |||
785 | return INET_ECN_encapsulate(tos, inner); | ||
786 | } | ||
787 | |||
788 | static void vxlan_sock_free(struct sk_buff *skb) | 866 | static void vxlan_sock_free(struct sk_buff *skb) |
789 | { | 867 | { |
790 | sock_put(skb->sk); | 868 | sock_put(skb->sk); |
@@ -820,48 +898,40 @@ static u16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb) | |||
820 | return (((u64) hash * range) >> 32) + vxlan->port_min; | 898 | return (((u64) hash * range) >> 32) + vxlan->port_min; |
821 | } | 899 | } |
822 | 900 | ||
823 | /* Transmit local packets over Vxlan | 901 | static int handle_offloads(struct sk_buff *skb) |
824 | * | 902 | { |
825 | * Outer IP header inherits ECN and DF from inner header. | 903 | if (skb_is_gso(skb)) { |
826 | * Outer UDP destination is the VXLAN assigned port. | 904 | int err = skb_unclone(skb, GFP_ATOMIC); |
827 | * source port is based on hash of flow | 905 | if (unlikely(err)) |
828 | */ | 906 | return err; |
829 | static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) | 907 | |
908 | skb_shinfo(skb)->gso_type |= (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP); | ||
909 | } else if (skb->ip_summed != CHECKSUM_PARTIAL) | ||
910 | skb->ip_summed = CHECKSUM_NONE; | ||
911 | |||
912 | return 0; | ||
913 | } | ||
914 | |||
915 | static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, | ||
916 | struct vxlan_rdst *rdst, bool did_rsc) | ||
830 | { | 917 | { |
831 | struct vxlan_dev *vxlan = netdev_priv(dev); | 918 | struct vxlan_dev *vxlan = netdev_priv(dev); |
832 | struct rtable *rt; | 919 | struct rtable *rt; |
833 | const struct iphdr *old_iph; | 920 | const struct iphdr *old_iph; |
834 | struct ethhdr *eth; | ||
835 | struct iphdr *iph; | 921 | struct iphdr *iph; |
836 | struct vxlanhdr *vxh; | 922 | struct vxlanhdr *vxh; |
837 | struct udphdr *uh; | 923 | struct udphdr *uh; |
838 | struct flowi4 fl4; | 924 | struct flowi4 fl4; |
839 | unsigned int pkt_len = skb->len; | 925 | unsigned int pkt_len = skb->len; |
840 | __be32 dst; | 926 | __be32 dst; |
841 | __u16 src_port; | 927 | __u16 src_port, dst_port; |
928 | u32 vni; | ||
842 | __be16 df = 0; | 929 | __be16 df = 0; |
843 | __u8 tos, ttl; | 930 | __u8 tos, ttl; |
844 | int err; | ||
845 | bool did_rsc = false; | ||
846 | const struct vxlan_fdb *f; | ||
847 | |||
848 | skb_reset_mac_header(skb); | ||
849 | eth = eth_hdr(skb); | ||
850 | 931 | ||
851 | if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP) | 932 | dst_port = rdst->remote_port ? rdst->remote_port : vxlan_port; |
852 | return arp_reduce(dev, skb); | 933 | vni = rdst->remote_vni; |
853 | else if ((vxlan->flags&VXLAN_F_RSC) && ntohs(eth->h_proto) == ETH_P_IP) | 934 | dst = rdst->remote_ip; |
854 | did_rsc = route_shortcircuit(dev, skb); | ||
855 | |||
856 | f = vxlan_find_mac(vxlan, eth->h_dest); | ||
857 | if (f == NULL) { | ||
858 | did_rsc = false; | ||
859 | dst = vxlan->gaddr; | ||
860 | if (!dst && (vxlan->flags & VXLAN_F_L2MISS) && | ||
861 | !is_multicast_ether_addr(eth->h_dest)) | ||
862 | vxlan_fdb_miss(vxlan, eth->h_dest); | ||
863 | } else | ||
864 | dst = f->remote_ip; | ||
865 | 935 | ||
866 | if (!dst) { | 936 | if (!dst) { |
867 | if (did_rsc) { | 937 | if (did_rsc) { |
@@ -871,8 +941,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) | |||
871 | 941 | ||
872 | /* short-circuited back to local bridge */ | 942 | /* short-circuited back to local bridge */ |
873 | if (netif_rx(skb) == NET_RX_SUCCESS) { | 943 | if (netif_rx(skb) == NET_RX_SUCCESS) { |
874 | struct vxlan_stats *stats = | 944 | struct pcpu_tstats *stats = this_cpu_ptr(dev->tstats); |
875 | this_cpu_ptr(vxlan->stats); | ||
876 | 945 | ||
877 | u64_stats_update_begin(&stats->syncp); | 946 | u64_stats_update_begin(&stats->syncp); |
878 | stats->tx_packets++; | 947 | stats->tx_packets++; |
@@ -904,12 +973,12 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) | |||
904 | 973 | ||
905 | tos = vxlan->tos; | 974 | tos = vxlan->tos; |
906 | if (tos == 1) | 975 | if (tos == 1) |
907 | tos = vxlan_get_dsfield(old_iph, skb); | 976 | tos = ip_tunnel_get_dsfield(old_iph, skb); |
908 | 977 | ||
909 | src_port = vxlan_src_port(vxlan, skb); | 978 | src_port = vxlan_src_port(vxlan, skb); |
910 | 979 | ||
911 | memset(&fl4, 0, sizeof(fl4)); | 980 | memset(&fl4, 0, sizeof(fl4)); |
912 | fl4.flowi4_oif = vxlan->link; | 981 | fl4.flowi4_oif = rdst->remote_ifindex; |
913 | fl4.flowi4_tos = RT_TOS(tos); | 982 | fl4.flowi4_tos = RT_TOS(tos); |
914 | fl4.daddr = dst; | 983 | fl4.daddr = dst; |
915 | fl4.saddr = vxlan->saddr; | 984 | fl4.saddr = vxlan->saddr; |
@@ -936,13 +1005,13 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) | |||
936 | 1005 | ||
937 | vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); | 1006 | vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); |
938 | vxh->vx_flags = htonl(VXLAN_FLAGS); | 1007 | vxh->vx_flags = htonl(VXLAN_FLAGS); |
939 | vxh->vx_vni = htonl(vxlan->vni << 8); | 1008 | vxh->vx_vni = htonl(vni << 8); |
940 | 1009 | ||
941 | __skb_push(skb, sizeof(*uh)); | 1010 | __skb_push(skb, sizeof(*uh)); |
942 | skb_reset_transport_header(skb); | 1011 | skb_reset_transport_header(skb); |
943 | uh = udp_hdr(skb); | 1012 | uh = udp_hdr(skb); |
944 | 1013 | ||
945 | uh->dest = htons(vxlan_port); | 1014 | uh->dest = htons(dst_port); |
946 | uh->source = htons(src_port); | 1015 | uh->source = htons(src_port); |
947 | 1016 | ||
948 | uh->len = htons(skb->len); | 1017 | uh->len = htons(skb->len); |
@@ -955,7 +1024,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) | |||
955 | iph->ihl = sizeof(struct iphdr) >> 2; | 1024 | iph->ihl = sizeof(struct iphdr) >> 2; |
956 | iph->frag_off = df; | 1025 | iph->frag_off = df; |
957 | iph->protocol = IPPROTO_UDP; | 1026 | iph->protocol = IPPROTO_UDP; |
958 | iph->tos = vxlan_ecn_encap(tos, old_iph, skb); | 1027 | iph->tos = ip_tunnel_ecn_encap(tos, old_iph, skb); |
959 | iph->daddr = dst; | 1028 | iph->daddr = dst; |
960 | iph->saddr = fl4.saddr; | 1029 | iph->saddr = fl4.saddr; |
961 | iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); | 1030 | iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); |
@@ -965,22 +1034,10 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) | |||
965 | 1034 | ||
966 | vxlan_set_owner(dev, skb); | 1035 | vxlan_set_owner(dev, skb); |
967 | 1036 | ||
968 | /* See iptunnel_xmit() */ | 1037 | if (handle_offloads(skb)) |
969 | if (skb->ip_summed != CHECKSUM_PARTIAL) | 1038 | goto drop; |
970 | skb->ip_summed = CHECKSUM_NONE; | ||
971 | |||
972 | err = ip_local_out(skb); | ||
973 | if (likely(net_xmit_eval(err) == 0)) { | ||
974 | struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats); | ||
975 | 1039 | ||
976 | u64_stats_update_begin(&stats->syncp); | 1040 | iptunnel_xmit(skb, dev); |
977 | stats->tx_packets++; | ||
978 | stats->tx_bytes += pkt_len; | ||
979 | u64_stats_update_end(&stats->syncp); | ||
980 | } else { | ||
981 | dev->stats.tx_errors++; | ||
982 | dev->stats.tx_aborted_errors++; | ||
983 | } | ||
984 | return NETDEV_TX_OK; | 1041 | return NETDEV_TX_OK; |
985 | 1042 | ||
986 | drop: | 1043 | drop: |
@@ -994,6 +1051,64 @@ tx_free: | |||
994 | return NETDEV_TX_OK; | 1051 | return NETDEV_TX_OK; |
995 | } | 1052 | } |
996 | 1053 | ||
1054 | /* Transmit local packets over Vxlan | |
1055 | * | |
1056 | * Outer IP header inherits ECN and DF from inner header. | |
1057 | * Outer UDP destination is the VXLAN assigned port. | |
1058 | * source port is based on hash of flow | |
1059 | */ | |
/*
 * Flow, as far as this function shows it:
 *  - ARP frames go to arp_reduce() when VXLAN_F_PROXY is set;
 *  - IPv4 frames are passed to route_shortcircuit() when VXLAN_F_RSC is
 *    set, and its result is remembered in did_rsc;
 *  - the destination MAC is looked up in the forwarding table.  On a
 *    miss a temporary destination ("group") is built on the stack from
 *    the device defaults (vxlan_port, vni, gaddr, link);
 *  - a clone of the frame is sent to every destination after the first,
 *    then the original skb is sent to the first destination.
 *
 * Returns the first result from vxlan_xmit_one() that is not
 * NETDEV_TX_OK, otherwise NETDEV_TX_OK.
 *
 * NOTE(review): the sentence above about the UDP destination predates
 * per-destination ports; vxlan_xmit_one() uses rdst->remote_port when
 * it is non-zero.
 */
1060 | static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) | |
1061 | { | |
1062 | struct vxlan_dev *vxlan = netdev_priv(dev); | |
1063 | struct ethhdr *eth; | |
1064 | bool did_rsc = false; | |
1065 | struct vxlan_rdst group, *rdst0, *rdst; | |
1066 | struct vxlan_fdb *f; | |
1067 | int rc1, rc; | |
1068 | | |
1069 | skb_reset_mac_header(skb); | |
1070 | eth = eth_hdr(skb); | |
1071 | | |
1072 | if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP) | |
1073 | return arp_reduce(dev, skb); | |
1074 | else if ((vxlan->flags&VXLAN_F_RSC) && ntohs(eth->h_proto) == ETH_P_IP) | |
1075 | did_rsc = route_shortcircuit(dev, skb); | |
1076 | | |
1077 | f = vxlan_find_mac(vxlan, eth->h_dest); | |
1078 | if (f == NULL) { | |
1079 | did_rsc = false; | |
1080 | group.remote_port = vxlan_port; | |
1081 | group.remote_vni = vxlan->vni; | |
1082 | group.remote_ip = vxlan->gaddr; | |
1083 | group.remote_ifindex = vxlan->link; | |
/* NOTE(review): integer 0 used as a null pointer; NULL is used elsewhere. */
1084 | group.remote_next = 0; | |
1085 | rdst0 = &group; | |
1086 | | |
/* Report an L2 miss only for unicast MACs with no group address set. */
1087 | if (group.remote_ip == htonl(INADDR_ANY) && | |
1088 | (vxlan->flags & VXLAN_F_L2MISS) && | |
1089 | !is_multicast_ether_addr(eth->h_dest)) | |
1090 | vxlan_fdb_miss(vxlan, eth->h_dest); | |
1091 | } else | |
1092 | rdst0 = &f->remote; | |
1093 | | |
1094 | rc = NETDEV_TX_OK; | |
1095 | | |
1096 | /* if there are multiple destinations, send copies */ | |
1097 | for (rdst = rdst0->remote_next; rdst; rdst = rdst->remote_next) { | |
1098 | struct sk_buff *skb1; | |
1099 | | |
/*
 * NOTE(review): the result of skb_clone() is not checked before it is
 * handed to vxlan_xmit_one(), which reads skb->len in its declarations.
 * Presumably a GFP_ATOMIC clone can fail and return NULL; confirm and
 * skip the destination in that case.
 */
1100 | skb1 = skb_clone(skb, GFP_ATOMIC); | |
1101 | rc1 = vxlan_xmit_one(skb1, dev, rdst, did_rsc); | |
1102 | if (rc == NETDEV_TX_OK) | |
1103 | rc = rc1; | |
1104 | } | |
1105 | | |
/* The original skb is consumed last, by the first destination. */
1106 | rc1 = vxlan_xmit_one(skb, dev, rdst0, did_rsc); | |
1107 | if (rc == NETDEV_TX_OK) | |
1108 | rc = rc1; | |
1109 | return rc; | |
1110 | } | |
1111 | |||
997 | /* Walk the forwarding table and purge stale entries */ | 1112 | /* Walk the forwarding table and purge stale entries */ |
998 | static void vxlan_cleanup(unsigned long arg) | 1113 | static void vxlan_cleanup(unsigned long arg) |
999 | { | 1114 | { |
@@ -1034,10 +1149,8 @@ static void vxlan_cleanup(unsigned long arg) | |||
1034 | /* Setup stats when device is created */ | 1149 | /* Setup stats when device is created */ |
1035 | static int vxlan_init(struct net_device *dev) | 1150 | static int vxlan_init(struct net_device *dev) |
1036 | { | 1151 | { |
1037 | struct vxlan_dev *vxlan = netdev_priv(dev); | 1152 | dev->tstats = alloc_percpu(struct pcpu_tstats); |
1038 | 1153 | if (!dev->tstats) | |
1039 | vxlan->stats = alloc_percpu(struct vxlan_stats); | ||
1040 | if (!vxlan->stats) | ||
1041 | return -ENOMEM; | 1154 | return -ENOMEM; |
1042 | 1155 | ||
1043 | return 0; | 1156 | return 0; |
@@ -1093,49 +1206,6 @@ static int vxlan_stop(struct net_device *dev) | |||
1093 | return 0; | 1206 | return 0; |
1094 | } | 1207 | } |
1095 | 1208 | ||
1096 | /* Merge per-cpu statistics */ | ||
1097 | static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev, | ||
1098 | struct rtnl_link_stats64 *stats) | ||
1099 | { | ||
1100 | struct vxlan_dev *vxlan = netdev_priv(dev); | ||
1101 | struct vxlan_stats tmp, sum = { 0 }; | ||
1102 | unsigned int cpu; | ||
1103 | |||
1104 | for_each_possible_cpu(cpu) { | ||
1105 | unsigned int start; | ||
1106 | const struct vxlan_stats *stats | ||
1107 | = per_cpu_ptr(vxlan->stats, cpu); | ||
1108 | |||
1109 | do { | ||
1110 | start = u64_stats_fetch_begin_bh(&stats->syncp); | ||
1111 | memcpy(&tmp, stats, sizeof(tmp)); | ||
1112 | } while (u64_stats_fetch_retry_bh(&stats->syncp, start)); | ||
1113 | |||
1114 | sum.tx_bytes += tmp.tx_bytes; | ||
1115 | sum.tx_packets += tmp.tx_packets; | ||
1116 | sum.rx_bytes += tmp.rx_bytes; | ||
1117 | sum.rx_packets += tmp.rx_packets; | ||
1118 | } | ||
1119 | |||
1120 | stats->tx_bytes = sum.tx_bytes; | ||
1121 | stats->tx_packets = sum.tx_packets; | ||
1122 | stats->rx_bytes = sum.rx_bytes; | ||
1123 | stats->rx_packets = sum.rx_packets; | ||
1124 | |||
1125 | stats->multicast = dev->stats.multicast; | ||
1126 | stats->rx_length_errors = dev->stats.rx_length_errors; | ||
1127 | stats->rx_frame_errors = dev->stats.rx_frame_errors; | ||
1128 | stats->rx_errors = dev->stats.rx_errors; | ||
1129 | |||
1130 | stats->tx_dropped = dev->stats.tx_dropped; | ||
1131 | stats->tx_carrier_errors = dev->stats.tx_carrier_errors; | ||
1132 | stats->tx_aborted_errors = dev->stats.tx_aborted_errors; | ||
1133 | stats->collisions = dev->stats.collisions; | ||
1134 | stats->tx_errors = dev->stats.tx_errors; | ||
1135 | |||
1136 | return stats; | ||
1137 | } | ||
1138 | |||
1139 | /* Stub, nothing needs to be done. */ | 1209 | /* Stub, nothing needs to be done. */ |
1140 | static void vxlan_set_multicast_list(struct net_device *dev) | 1210 | static void vxlan_set_multicast_list(struct net_device *dev) |
1141 | { | 1211 | { |
@@ -1146,7 +1216,7 @@ static const struct net_device_ops vxlan_netdev_ops = { | |||
1146 | .ndo_open = vxlan_open, | 1216 | .ndo_open = vxlan_open, |
1147 | .ndo_stop = vxlan_stop, | 1217 | .ndo_stop = vxlan_stop, |
1148 | .ndo_start_xmit = vxlan_xmit, | 1218 | .ndo_start_xmit = vxlan_xmit, |
1149 | .ndo_get_stats64 = vxlan_stats64, | 1219 | .ndo_get_stats64 = ip_tunnel_get_stats64, |
1150 | .ndo_set_rx_mode = vxlan_set_multicast_list, | 1220 | .ndo_set_rx_mode = vxlan_set_multicast_list, |
1151 | .ndo_change_mtu = eth_change_mtu, | 1221 | .ndo_change_mtu = eth_change_mtu, |
1152 | .ndo_validate_addr = eth_validate_addr, | 1222 | .ndo_validate_addr = eth_validate_addr, |
@@ -1163,9 +1233,7 @@ static struct device_type vxlan_type = { | |||
1163 | 1233 | ||
/*
 * Release the per-cpu statistics and then the net_device itself.
 *
 * Left column: the counters lived in vxlan->stats (private data).
 * Right column: they live in dev->tstats, which vxlan_init() allocates
 * with alloc_percpu(), so the netdev_priv() lookup is no longer needed.
 */
1164 | static void vxlan_free(struct net_device *dev) | 1234 | static void vxlan_free(struct net_device *dev) |
1165 | { | 1235 | { |
1166 | struct vxlan_dev *vxlan = netdev_priv(dev); | 1236 | free_percpu(dev->tstats); |
1167 | | |
1168 | free_percpu(vxlan->stats); | |
1169 | free_netdev(dev); | 1237 | free_netdev(dev); |
1170 | } | 1238 | } |
1171 | 1239 | ||
@@ -1189,8 +1257,10 @@ static void vxlan_setup(struct net_device *dev) | |||
1189 | dev->features |= NETIF_F_NETNS_LOCAL; | 1257 | dev->features |= NETIF_F_NETNS_LOCAL; |
1190 | dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; | 1258 | dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; |
1191 | dev->features |= NETIF_F_RXCSUM; | 1259 | dev->features |= NETIF_F_RXCSUM; |
1260 | dev->features |= NETIF_F_GSO_SOFTWARE; | ||
1192 | 1261 | ||
1193 | dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; | 1262 | dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; |
1263 | dev->hw_features |= NETIF_F_GSO_SOFTWARE; | ||
1194 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 1264 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; |
1195 | dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; | 1265 | dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; |
1196 | 1266 | ||
@@ -1555,6 +1625,7 @@ static void __exit vxlan_cleanup_module(void) | |||
1555 | { | 1625 | { |
1556 | rtnl_link_unregister(&vxlan_link_ops); | 1626 | rtnl_link_unregister(&vxlan_link_ops); |
1557 | unregister_pernet_device(&vxlan_net_ops); | 1627 | unregister_pernet_device(&vxlan_net_ops); |
1628 | rcu_barrier(); | ||
1558 | } | 1629 | } |
1559 | module_exit(vxlan_cleanup_module); | 1630 | module_exit(vxlan_cleanup_module); |
1560 | 1631 | ||