aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Ahern <dsa@cumulusnetworks.com>2016-05-10 14:19:50 -0400
committerDavid S. Miller <davem@davemloft.net>2016-05-11 19:31:40 -0400
commit74b20582ac389ee9f18a6fcc0eef244658ce8de0 (patch)
tree63f32ea105bb23a61a15c91615ef5cf8895e7bd7
parentca4aa976f04d14bc7da60dce0e2afc34c9f0f1d2 (diff)
net: l3mdev: Add hook in ip and ipv6
Currently the VRF driver uses the rx_handler to switch the skb device to the VRF device. Switching the dev prior to the ip / ipv6 layer means the VRF driver has to duplicate IP/IPv6 processing which adds overhead and makes features such as retaining the ingress device index more complicated than necessary.

This patch moves the hook to the L3 layer just after the first NF_HOOK for PRE_ROUTING. This location makes exposing the original ingress device trivial (next patch) and allows adding other NF_HOOKs to the VRF driver in the future.

dev_queue_xmit_nit is exported so that the VRF driver can cycle the skb with the switched device through the packet taps to maintain current behavior (tcpdump can be used on either the vrf device or the enslaved devices).

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/net/vrf.c189
-rw-r--r--include/linux/ipv6.h17
-rw-r--r--include/linux/netdevice.h2
-rw-r--r--include/net/l3mdev.h42
-rw-r--r--include/net/tcp.h4
-rw-r--r--net/core/dev.c3
-rw-r--r--net/ipv4/ip_input.c7
-rw-r--r--net/ipv6/ip6_input.c7
8 files changed, 170 insertions, 101 deletions
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index c8db55aa8280..0ea29345eb2e 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -42,9 +42,6 @@
42#define DRV_NAME "vrf" 42#define DRV_NAME "vrf"
43#define DRV_VERSION "1.0" 43#define DRV_VERSION "1.0"
44 44
45#define vrf_master_get_rcu(dev) \
46 ((struct net_device *)rcu_dereference(dev->rx_handler_data))
47
48struct net_vrf { 45struct net_vrf {
49 struct rtable *rth; 46 struct rtable *rth;
50 struct rt6_info *rt6; 47 struct rt6_info *rt6;
@@ -60,90 +57,12 @@ struct pcpu_dstats {
60 struct u64_stats_sync syncp; 57 struct u64_stats_sync syncp;
61}; 58};
62 59
63/* neighbor handling is done with actual device; do not want
64 * to flip skb->dev for those ndisc packets. This really fails
65 * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
66 * a start.
67 */
68#if IS_ENABLED(CONFIG_IPV6)
69static bool check_ipv6_frame(const struct sk_buff *skb)
70{
71 const struct ipv6hdr *ipv6h;
72 struct ipv6hdr _ipv6h;
73 bool rc = true;
74
75 ipv6h = skb_header_pointer(skb, 0, sizeof(_ipv6h), &_ipv6h);
76 if (!ipv6h)
77 goto out;
78
79 if (ipv6h->nexthdr == NEXTHDR_ICMP) {
80 const struct icmp6hdr *icmph;
81 struct icmp6hdr _icmph;
82
83 icmph = skb_header_pointer(skb, sizeof(_ipv6h),
84 sizeof(_icmph), &_icmph);
85 if (!icmph)
86 goto out;
87
88 switch (icmph->icmp6_type) {
89 case NDISC_ROUTER_SOLICITATION:
90 case NDISC_ROUTER_ADVERTISEMENT:
91 case NDISC_NEIGHBOUR_SOLICITATION:
92 case NDISC_NEIGHBOUR_ADVERTISEMENT:
93 case NDISC_REDIRECT:
94 rc = false;
95 break;
96 }
97 }
98
99out:
100 return rc;
101}
102#else
103static bool check_ipv6_frame(const struct sk_buff *skb)
104{
105 return false;
106}
107#endif
108
109static bool is_ip_rx_frame(struct sk_buff *skb)
110{
111 switch (skb->protocol) {
112 case htons(ETH_P_IP):
113 return true;
114 case htons(ETH_P_IPV6):
115 return check_ipv6_frame(skb);
116 }
117 return false;
118}
119
120static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) 60static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
121{ 61{
122 vrf_dev->stats.tx_errors++; 62 vrf_dev->stats.tx_errors++;
123 kfree_skb(skb); 63 kfree_skb(skb);
124} 64}
125 65
126/* note: already called with rcu_read_lock */
127static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb)
128{
129 struct sk_buff *skb = *pskb;
130
131 if (is_ip_rx_frame(skb)) {
132 struct net_device *dev = vrf_master_get_rcu(skb->dev);
133 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
134
135 u64_stats_update_begin(&dstats->syncp);
136 dstats->rx_pkts++;
137 dstats->rx_bytes += skb->len;
138 u64_stats_update_end(&dstats->syncp);
139
140 skb->dev = dev;
141
142 return RX_HANDLER_ANOTHER;
143 }
144 return RX_HANDLER_PASS;
145}
146
147static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev, 66static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
148 struct rtnl_link_stats64 *stats) 67 struct rtnl_link_stats64 *stats)
149{ 68{
@@ -506,28 +425,14 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
506{ 425{
507 int ret; 426 int ret;
508 427
509 /* register the packet handler for slave ports */
510 ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev);
511 if (ret) {
512 netdev_err(port_dev,
513 "Device %s failed to register rx_handler\n",
514 port_dev->name);
515 goto out_fail;
516 }
517
518 ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL); 428 ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL);
519 if (ret < 0) 429 if (ret < 0)
520 goto out_unregister; 430 return ret;
521 431
522 port_dev->priv_flags |= IFF_L3MDEV_SLAVE; 432 port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
523 cycle_netdev(port_dev); 433 cycle_netdev(port_dev);
524 434
525 return 0; 435 return 0;
526
527out_unregister:
528 netdev_rx_handler_unregister(port_dev);
529out_fail:
530 return ret;
531} 436}
532 437
533static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev) 438static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
@@ -544,8 +449,6 @@ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
544 netdev_upper_dev_unlink(port_dev, dev); 449 netdev_upper_dev_unlink(port_dev, dev);
545 port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; 450 port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
546 451
547 netdev_rx_handler_unregister(port_dev);
548
549 cycle_netdev(port_dev); 452 cycle_netdev(port_dev);
550 453
551 return 0; 454 return 0;
@@ -670,6 +573,95 @@ static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4)
670} 573}
671 574
672#if IS_ENABLED(CONFIG_IPV6) 575#if IS_ENABLED(CONFIG_IPV6)
576/* neighbor handling is done with actual device; do not want
577 * to flip skb->dev for those ndisc packets. This really fails
578 * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
579 * a start.
580 */
581static bool ipv6_ndisc_frame(const struct sk_buff *skb)
582{
583 const struct ipv6hdr *iph = ipv6_hdr(skb);
584 bool rc = false;
585
586 if (iph->nexthdr == NEXTHDR_ICMP) {
587 const struct icmp6hdr *icmph;
588 struct icmp6hdr _icmph;
589
590 icmph = skb_header_pointer(skb, sizeof(*iph),
591 sizeof(_icmph), &_icmph);
592 if (!icmph)
593 goto out;
594
595 switch (icmph->icmp6_type) {
596 case NDISC_ROUTER_SOLICITATION:
597 case NDISC_ROUTER_ADVERTISEMENT:
598 case NDISC_NEIGHBOUR_SOLICITATION:
599 case NDISC_NEIGHBOUR_ADVERTISEMENT:
600 case NDISC_REDIRECT:
601 rc = true;
602 break;
603 }
604 }
605
606out:
607 return rc;
608}
609
610static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
611 struct sk_buff *skb)
612{
613 /* if packet is NDISC keep the ingress interface */
614 if (!ipv6_ndisc_frame(skb)) {
615 skb->dev = vrf_dev;
616 skb->skb_iif = vrf_dev->ifindex;
617
618 skb_push(skb, skb->mac_len);
619 dev_queue_xmit_nit(skb, vrf_dev);
620 skb_pull(skb, skb->mac_len);
621
622 IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
623 }
624
625 return skb;
626}
627
628#else
629static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
630 struct sk_buff *skb)
631{
632 return skb;
633}
634#endif
635
636static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
637 struct sk_buff *skb)
638{
639 skb->dev = vrf_dev;
640 skb->skb_iif = vrf_dev->ifindex;
641
642 skb_push(skb, skb->mac_len);
643 dev_queue_xmit_nit(skb, vrf_dev);
644 skb_pull(skb, skb->mac_len);
645
646 return skb;
647}
648
649/* called with rcu lock held */
650static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
651 struct sk_buff *skb,
652 u16 proto)
653{
654 switch (proto) {
655 case AF_INET:
656 return vrf_ip_rcv(vrf_dev, skb);
657 case AF_INET6:
658 return vrf_ip6_rcv(vrf_dev, skb);
659 }
660
661 return skb;
662}
663
664#if IS_ENABLED(CONFIG_IPV6)
673static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, 665static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
674 const struct flowi6 *fl6) 666 const struct flowi6 *fl6)
675{ 667{
@@ -690,6 +682,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = {
690 .l3mdev_fib_table = vrf_fib_table, 682 .l3mdev_fib_table = vrf_fib_table,
691 .l3mdev_get_rtable = vrf_get_rtable, 683 .l3mdev_get_rtable = vrf_get_rtable,
692 .l3mdev_get_saddr = vrf_get_saddr, 684 .l3mdev_get_saddr = vrf_get_saddr,
685 .l3mdev_l3_rcv = vrf_l3_rcv,
693#if IS_ENABLED(CONFIG_IPV6) 686#if IS_ENABLED(CONFIG_IPV6)
694 .l3mdev_get_rt6_dst = vrf_get_rt6_dst, 687 .l3mdev_get_rt6_dst = vrf_get_rt6_dst,
695#endif 688#endif
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 58d6e158755f..5c91b0b055d4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -118,14 +118,29 @@ struct inet6_skb_parm {
118#define IP6SKB_ROUTERALERT 8 118#define IP6SKB_ROUTERALERT 8
119#define IP6SKB_FRAGMENTED 16 119#define IP6SKB_FRAGMENTED 16
120#define IP6SKB_HOPBYHOP 32 120#define IP6SKB_HOPBYHOP 32
121#define IP6SKB_L3SLAVE 64
121}; 122};
122 123
124#if defined(CONFIG_NET_L3_MASTER_DEV)
125static inline bool skb_l3mdev_slave(__u16 flags)
126{
127 return flags & IP6SKB_L3SLAVE;
128}
129#else
130static inline bool skb_l3mdev_slave(__u16 flags)
131{
132 return false;
133}
134#endif
135
123#define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) 136#define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb))
124#define IP6CBMTU(skb) ((struct ip6_mtuinfo *)((skb)->cb)) 137#define IP6CBMTU(skb) ((struct ip6_mtuinfo *)((skb)->cb))
125 138
126static inline int inet6_iif(const struct sk_buff *skb) 139static inline int inet6_iif(const struct sk_buff *skb)
127{ 140{
128 return IP6CB(skb)->iif; 141 bool l3_slave = skb_l3mdev_slave(IP6CB(skb)->flags);
142
143 return l3_slave ? skb->skb_iif : IP6CB(skb)->iif;
129} 144}
130 145
131struct tcp6_request_sock { 146struct tcp6_request_sock {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 63580e6d0df4..c2f5112f08f7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3258,6 +3258,8 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
3258bool is_skb_forwardable(const struct net_device *dev, 3258bool is_skb_forwardable(const struct net_device *dev,
3259 const struct sk_buff *skb); 3259 const struct sk_buff *skb);
3260 3260
3261void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
3262
3261extern int netdev_budget; 3263extern int netdev_budget;
3262 3264
3263/* Called by rtnetlink.c:rtnl_unlock() */ 3265/* Called by rtnetlink.c:rtnl_unlock() */
diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index 78872bd1dc2c..374388dc01c8 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -25,6 +25,8 @@
25 25
26struct l3mdev_ops { 26struct l3mdev_ops {
27 u32 (*l3mdev_fib_table)(const struct net_device *dev); 27 u32 (*l3mdev_fib_table)(const struct net_device *dev);
28 struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev,
29 struct sk_buff *skb, u16 proto);
28 30
29 /* IPv4 ops */ 31 /* IPv4 ops */
30 struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev, 32 struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev,
@@ -134,6 +136,34 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4);
134 136
135struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6); 137struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6);
136 138
139static inline
140struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto)
141{
142 struct net_device *master = NULL;
143
144 if (netif_is_l3_slave(skb->dev))
145 master = netdev_master_upper_dev_get_rcu(skb->dev);
146 else if (netif_is_l3_master(skb->dev))
147 master = skb->dev;
148
149 if (master && master->l3mdev_ops->l3mdev_l3_rcv)
150 skb = master->l3mdev_ops->l3mdev_l3_rcv(master, skb, proto);
151
152 return skb;
153}
154
155static inline
156struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
157{
158 return l3mdev_l3_rcv(skb, AF_INET);
159}
160
161static inline
162struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
163{
164 return l3mdev_l3_rcv(skb, AF_INET6);
165}
166
137#else 167#else
138 168
139static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev) 169static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev)
@@ -194,6 +224,18 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6)
194{ 224{
195 return NULL; 225 return NULL;
196} 226}
227
228static inline
229struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
230{
231 return skb;
232}
233
234static inline
235struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
236{
237 return skb;
238}
197#endif 239#endif
198 240
199#endif /* _NET_L3MDEV_H_ */ 241#endif /* _NET_L3MDEV_H_ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c9ab561387c4..0bcc70f4e1fb 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -786,7 +786,9 @@ struct tcp_skb_cb {
786 */ 786 */
787static inline int tcp_v6_iif(const struct sk_buff *skb) 787static inline int tcp_v6_iif(const struct sk_buff *skb)
788{ 788{
789 return TCP_SKB_CB(skb)->header.h6.iif; 789 bool l3_slave = skb_l3mdev_slave(TCP_SKB_CB(skb)->header.h6.flags);
790
791 return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
790} 792}
791#endif 793#endif
792 794
diff --git a/net/core/dev.c b/net/core/dev.c
index c7490339315c..12436d1312ca 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1850,7 +1850,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1850 * taps currently in use. 1850 * taps currently in use.
1851 */ 1851 */
1852 1852
1853static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) 1853void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1854{ 1854{
1855 struct packet_type *ptype; 1855 struct packet_type *ptype;
1856 struct sk_buff *skb2 = NULL; 1856 struct sk_buff *skb2 = NULL;
@@ -1907,6 +1907,7 @@ out_unlock:
1907 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); 1907 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1908 rcu_read_unlock(); 1908 rcu_read_unlock();
1909} 1909}
1910EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1910 1911
1911/** 1912/**
1912 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change 1913 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 751c0658e194..37375eedeef9 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -313,6 +313,13 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
313 const struct iphdr *iph = ip_hdr(skb); 313 const struct iphdr *iph = ip_hdr(skb);
314 struct rtable *rt; 314 struct rtable *rt;
315 315
316 /* if ingress device is enslaved to an L3 master device pass the
317 * skb to its handler for processing
318 */
319 skb = l3mdev_ip_rcv(skb);
320 if (!skb)
321 return NET_RX_SUCCESS;
322
316 if (net->ipv4.sysctl_ip_early_demux && 323 if (net->ipv4.sysctl_ip_early_demux &&
317 !skb_dst(skb) && 324 !skb_dst(skb) &&
318 !skb->sk && 325 !skb->sk &&
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6ed56012005d..f185cbcda114 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -49,6 +49,13 @@
49 49
50int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) 50int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
51{ 51{
52 /* if ingress device is enslaved to an L3 master device pass the
53 * skb to its handler for processing
54 */
55 skb = l3mdev_ip6_rcv(skb);
56 if (!skb)
57 return NET_RX_SUCCESS;
58
52 if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { 59 if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
53 const struct inet6_protocol *ipprot; 60 const struct inet6_protocol *ipprot;
54 61