diff options
author | David S. Miller <davem@davemloft.net> | 2016-11-12 23:38:08 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2016-11-12 23:38:08 -0500 |
commit | 79774d6bfacb40699ecd5a343e5d4ac5a9cdd173 (patch) | |
tree | 5732e718ec730c9da67ad4521dca5bfd2068c2d0 | |
parent | 23dd8315485acae0acf4452509e2be9fc587d72c (diff) | |
parent | 90e02896f1a4627b14624245fbcbc19f8fd916cb (diff) |
Merge branch 'fix-bpf_redirect'
Martin KaFai Lau says:
====================
bpf: Fix bpf_redirect to an ipip/ip6tnl dev
This patch set fixes a bug in bpf_redirect(dev, flags) when dev is an
ipip/ip6tnl device. The current problem is that IP-EthHdr-IP is sent out
instead of IP-IP.
Patch 1 adds a dev->type test similar to dev_is_mac_header_xmit()
in act_mirred.c, which is only available in net-next. We can consider
refactoring it once this patch is pulled into net-next from net.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/netdevice.h | 15 | ||||
-rw-r--r-- | net/core/dev.c | 17 | ||||
-rw-r--r-- | net/core/filter.c | 68 | ||||
-rw-r--r-- | samples/bpf/Makefile | 4 | ||||
-rwxr-xr-x | samples/bpf/tc_l2_redirect.sh | 173 | ||||
-rw-r--r-- | samples/bpf/tc_l2_redirect_kern.c | 236 | ||||
-rw-r--r-- | samples/bpf/tc_l2_redirect_user.c | 73 |
7 files changed, 567 insertions, 19 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 91ee3643ccc8..bf04a46f6d5b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h | |||
@@ -3354,6 +3354,21 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); | |||
3354 | bool is_skb_forwardable(const struct net_device *dev, | 3354 | bool is_skb_forwardable(const struct net_device *dev, |
3355 | const struct sk_buff *skb); | 3355 | const struct sk_buff *skb); |
3356 | 3356 | ||
3357 | static __always_inline int ____dev_forward_skb(struct net_device *dev, | ||
3358 | struct sk_buff *skb) | ||
3359 | { | ||
3360 | if (skb_orphan_frags(skb, GFP_ATOMIC) || | ||
3361 | unlikely(!is_skb_forwardable(dev, skb))) { | ||
3362 | atomic_long_inc(&dev->rx_dropped); | ||
3363 | kfree_skb(skb); | ||
3364 | return NET_RX_DROP; | ||
3365 | } | ||
3366 | |||
3367 | skb_scrub_packet(skb, true); | ||
3368 | skb->priority = 0; | ||
3369 | return 0; | ||
3370 | } | ||
3371 | |||
3357 | void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); | 3372 | void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); |
3358 | 3373 | ||
3359 | extern int netdev_budget; | 3374 | extern int netdev_budget; |
diff --git a/net/core/dev.c b/net/core/dev.c index eaad4c28069f..6666b28b6815 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -1766,19 +1766,14 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable); | |||
1766 | 1766 | ||
1767 | int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) | 1767 | int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) |
1768 | { | 1768 | { |
1769 | if (skb_orphan_frags(skb, GFP_ATOMIC) || | 1769 | int ret = ____dev_forward_skb(dev, skb); |
1770 | unlikely(!is_skb_forwardable(dev, skb))) { | ||
1771 | atomic_long_inc(&dev->rx_dropped); | ||
1772 | kfree_skb(skb); | ||
1773 | return NET_RX_DROP; | ||
1774 | } | ||
1775 | 1770 | ||
1776 | skb_scrub_packet(skb, true); | 1771 | if (likely(!ret)) { |
1777 | skb->priority = 0; | 1772 | skb->protocol = eth_type_trans(skb, dev); |
1778 | skb->protocol = eth_type_trans(skb, dev); | 1773 | skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); |
1779 | skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); | 1774 | } |
1780 | 1775 | ||
1781 | return 0; | 1776 | return ret; |
1782 | } | 1777 | } |
1783 | EXPORT_SYMBOL_GPL(__dev_forward_skb); | 1778 | EXPORT_SYMBOL_GPL(__dev_forward_skb); |
1784 | 1779 | ||
diff --git a/net/core/filter.c b/net/core/filter.c index 00351cdf7d0c..b391209838ef 100644 --- a/net/core/filter.c +++ b/net/core/filter.c | |||
@@ -1628,6 +1628,19 @@ static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) | |||
1628 | return dev_forward_skb(dev, skb); | 1628 | return dev_forward_skb(dev, skb); |
1629 | } | 1629 | } |
1630 | 1630 | ||
1631 | static inline int __bpf_rx_skb_no_mac(struct net_device *dev, | ||
1632 | struct sk_buff *skb) | ||
1633 | { | ||
1634 | int ret = ____dev_forward_skb(dev, skb); | ||
1635 | |||
1636 | if (likely(!ret)) { | ||
1637 | skb->dev = dev; | ||
1638 | ret = netif_rx(skb); | ||
1639 | } | ||
1640 | |||
1641 | return ret; | ||
1642 | } | ||
1643 | |||
1631 | static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) | 1644 | static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) |
1632 | { | 1645 | { |
1633 | int ret; | 1646 | int ret; |
@@ -1647,6 +1660,51 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) | |||
1647 | return ret; | 1660 | return ret; |
1648 | } | 1661 | } |
1649 | 1662 | ||
1663 | static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, | ||
1664 | u32 flags) | ||
1665 | { | ||
1666 | /* skb->mac_len is not set on normal egress */ | ||
1667 | unsigned int mlen = skb->network_header - skb->mac_header; | ||
1668 | |||
1669 | __skb_pull(skb, mlen); | ||
1670 | |||
1671 | /* At ingress, the mac header has already been pulled once. | ||
1672 | * At egress, skb_pospull_rcsum has to be done in case that | ||
1673 | * the skb is originated from ingress (i.e. a forwarded skb) | ||
1674 | * to ensure that rcsum starts at net header. | ||
1675 | */ | ||
1676 | if (!skb_at_tc_ingress(skb)) | ||
1677 | skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); | ||
1678 | skb_pop_mac_header(skb); | ||
1679 | skb_reset_mac_len(skb); | ||
1680 | return flags & BPF_F_INGRESS ? | ||
1681 | __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); | ||
1682 | } | ||
1683 | |||
1684 | static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, | ||
1685 | u32 flags) | ||
1686 | { | ||
1687 | bpf_push_mac_rcsum(skb); | ||
1688 | return flags & BPF_F_INGRESS ? | ||
1689 | __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); | ||
1690 | } | ||
1691 | |||
1692 | static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, | ||
1693 | u32 flags) | ||
1694 | { | ||
1695 | switch (dev->type) { | ||
1696 | case ARPHRD_TUNNEL: | ||
1697 | case ARPHRD_TUNNEL6: | ||
1698 | case ARPHRD_SIT: | ||
1699 | case ARPHRD_IPGRE: | ||
1700 | case ARPHRD_VOID: | ||
1701 | case ARPHRD_NONE: | ||
1702 | return __bpf_redirect_no_mac(skb, dev, flags); | ||
1703 | default: | ||
1704 | return __bpf_redirect_common(skb, dev, flags); | ||
1705 | } | ||
1706 | } | ||
1707 | |||
1650 | BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) | 1708 | BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) |
1651 | { | 1709 | { |
1652 | struct net_device *dev; | 1710 | struct net_device *dev; |
@@ -1675,10 +1733,7 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) | |||
1675 | return -ENOMEM; | 1733 | return -ENOMEM; |
1676 | } | 1734 | } |
1677 | 1735 | ||
1678 | bpf_push_mac_rcsum(clone); | 1736 | return __bpf_redirect(clone, dev, flags); |
1679 | |||
1680 | return flags & BPF_F_INGRESS ? | ||
1681 | __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone); | ||
1682 | } | 1737 | } |
1683 | 1738 | ||
1684 | static const struct bpf_func_proto bpf_clone_redirect_proto = { | 1739 | static const struct bpf_func_proto bpf_clone_redirect_proto = { |
@@ -1722,10 +1777,7 @@ int skb_do_redirect(struct sk_buff *skb) | |||
1722 | return -EINVAL; | 1777 | return -EINVAL; |
1723 | } | 1778 | } |
1724 | 1779 | ||
1725 | bpf_push_mac_rcsum(skb); | 1780 | return __bpf_redirect(skb, dev, ri->flags); |
1726 | |||
1727 | return ri->flags & BPF_F_INGRESS ? | ||
1728 | __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); | ||
1729 | } | 1781 | } |
1730 | 1782 | ||
1731 | static const struct bpf_func_proto bpf_redirect_proto = { | 1783 | static const struct bpf_func_proto bpf_redirect_proto = { |
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 12b7304d55dc..72c58675973e 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile | |||
@@ -27,6 +27,7 @@ hostprogs-y += xdp2 | |||
27 | hostprogs-y += test_current_task_under_cgroup | 27 | hostprogs-y += test_current_task_under_cgroup |
28 | hostprogs-y += trace_event | 28 | hostprogs-y += trace_event |
29 | hostprogs-y += sampleip | 29 | hostprogs-y += sampleip |
30 | hostprogs-y += tc_l2_redirect | ||
30 | 31 | ||
31 | test_verifier-objs := test_verifier.o libbpf.o | 32 | test_verifier-objs := test_verifier.o libbpf.o |
32 | test_maps-objs := test_maps.o libbpf.o | 33 | test_maps-objs := test_maps.o libbpf.o |
@@ -56,6 +57,7 @@ test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \ | |||
56 | test_current_task_under_cgroup_user.o | 57 | test_current_task_under_cgroup_user.o |
57 | trace_event-objs := bpf_load.o libbpf.o trace_event_user.o | 58 | trace_event-objs := bpf_load.o libbpf.o trace_event_user.o |
58 | sampleip-objs := bpf_load.o libbpf.o sampleip_user.o | 59 | sampleip-objs := bpf_load.o libbpf.o sampleip_user.o |
60 | tc_l2_redirect-objs := bpf_load.o libbpf.o tc_l2_redirect_user.o | ||
59 | 61 | ||
60 | # Tell kbuild to always build the programs | 62 | # Tell kbuild to always build the programs |
61 | always := $(hostprogs-y) | 63 | always := $(hostprogs-y) |
@@ -72,6 +74,7 @@ always += test_probe_write_user_kern.o | |||
72 | always += trace_output_kern.o | 74 | always += trace_output_kern.o |
73 | always += tcbpf1_kern.o | 75 | always += tcbpf1_kern.o |
74 | always += tcbpf2_kern.o | 76 | always += tcbpf2_kern.o |
77 | always += tc_l2_redirect_kern.o | ||
75 | always += lathist_kern.o | 78 | always += lathist_kern.o |
76 | always += offwaketime_kern.o | 79 | always += offwaketime_kern.o |
77 | always += spintest_kern.o | 80 | always += spintest_kern.o |
@@ -111,6 +114,7 @@ HOSTLOADLIBES_xdp2 += -lelf | |||
111 | HOSTLOADLIBES_test_current_task_under_cgroup += -lelf | 114 | HOSTLOADLIBES_test_current_task_under_cgroup += -lelf |
112 | HOSTLOADLIBES_trace_event += -lelf | 115 | HOSTLOADLIBES_trace_event += -lelf |
113 | HOSTLOADLIBES_sampleip += -lelf | 116 | HOSTLOADLIBES_sampleip += -lelf |
117 | HOSTLOADLIBES_tc_l2_redirect += -l elf | ||
114 | 118 | ||
115 | # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: | 119 | # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: |
116 | # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang | 120 | # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang |
diff --git a/samples/bpf/tc_l2_redirect.sh b/samples/bpf/tc_l2_redirect.sh new file mode 100755 index 000000000000..80a05591a140 --- /dev/null +++ b/samples/bpf/tc_l2_redirect.sh | |||
@@ -0,0 +1,173 @@ | |||
1 | #!/bin/bash | ||
2 | |||
3 | [[ -z $TC ]] && TC='tc' | ||
4 | [[ -z $IP ]] && IP='ip' | ||
5 | |||
6 | REDIRECT_USER='./tc_l2_redirect' | ||
7 | REDIRECT_BPF='./tc_l2_redirect_kern.o' | ||
8 | |||
9 | RP_FILTER=$(< /proc/sys/net/ipv4/conf/all/rp_filter) | ||
10 | IPV6_FORWARDING=$(< /proc/sys/net/ipv6/conf/all/forwarding) | ||
11 | |||
# Build the topology shared by all test cases.
#   $1 - tunnel type under test; "ipip" routes ns2's return traffic via
#        ipt2, any other value routes it via ip6t2.
# Creates namespaces ns1/ns2 and the veth pairs ve1<->vens1, ve2<->vens2,
# configures addresses, tunnel devices and the drop_non_tun_vip filter in
# ns2, then addresses ve1/ve2 in the root namespace and attaches the
# l2_to_iptun_ingress_forward program to ve2 ingress.
function config_common {
	local tun_type=$1
	local n

	for n in 1 2; do $IP netns add ns$n; done
	for n in 1 2; do $IP link add ve$n type veth peer name vens$n; done
	for n in 1 2; do $IP link set dev ve$n up; done
	for n in 1 2; do $IP link set dev ve$n mtu 1500; done
	for n in 1 2; do $IP link set dev vens$n netns ns$n; done

	# ns1: the client side.
	$IP -n ns1 link set dev lo up
	$IP -n ns1 link set dev vens1 up
	$IP -n ns1 addr add 10.1.1.101/24 dev vens1
	$IP -n ns1 addr add 2401:db01::65/64 dev vens1 nodad
	$IP -n ns1 route add default via 10.1.1.1 dev vens1
	$IP -n ns1 route add default via 2401:db01::1 dev vens1

	# ns2: owns the VIP addresses on lo and the tunnel endpoints.
	$IP -n ns2 link set dev lo up
	$IP -n ns2 link set dev vens2 up
	$IP -n ns2 addr add 10.2.1.102/24 dev vens2
	$IP -n ns2 addr add 2401:db02::66/64 dev vens2 nodad
	$IP -n ns2 addr add 10.10.1.102 dev lo
	$IP -n ns2 addr add 2401:face::66/64 dev lo nodad
	$IP -n ns2 link add ipt2 type ipip local 10.2.1.102 remote 10.2.1.1
	$IP -n ns2 link add ip6t2 type ip6tnl mode any local 2401:db02::66 remote 2401:db02::1
	$IP -n ns2 link set dev ipt2 up
	$IP -n ns2 link set dev ip6t2 up
	$IP netns exec ns2 $TC qdisc add dev vens2 clsact
	$IP netns exec ns2 $TC filter add dev vens2 ingress bpf da obj $REDIRECT_BPF sec drop_non_tun_vip
	case $tun_type in
	ipip)
		$IP -n ns2 route add 10.1.1.0/24 dev ipt2
		$IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
		$IP netns exec ns2 sysctl -q -w net.ipv4.conf.ipt2.rp_filter=0
		;;
	*)
		$IP -n ns2 route add 10.1.1.0/24 dev ip6t2
		$IP -n ns2 route add 2401:db01::/64 dev ip6t2
		$IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
		$IP netns exec ns2 sysctl -q -w net.ipv4.conf.ip6t2.rp_filter=0
		;;
	esac

	# Root namespace: gateway addresses for both veth pairs.
	$IP addr add 10.1.1.1/24 dev ve1
	$IP addr add 2401:db01::1/64 dev ve1 nodad
	$IP addr add 10.2.1.1/24 dev ve2
	$IP addr add 2401:db02::1/64 dev ve2 nodad

	$TC qdisc add dev ve2 clsact
	$TC filter add dev ve2 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_forward

	sysctl -q -w net.ipv4.conf.all.rp_filter=0
	sysctl -q -w net.ipv6.conf.all.forwarding=1
}
67 | |||
# Remove the namespaces, links and pinned map the tests create and restore
# the rp_filter/forwarding sysctls saved at startup.  Runs with errexit
# (and, when DEBUG is set, xtrace) turned off so that missing objects are
# ignored; errexit is enabled again before returning.
function cleanup {
	set +e
	[[ -z $DEBUG ]] || set +x
	local ns dev

	for ns in ns1 ns2; do
		$IP netns delete $ns >& /dev/null
	done
	for dev in ve1 ve2 ipt ip6t; do
		$IP link del $dev >& /dev/null
	done
	sysctl -q -w net.ipv4.conf.all.rp_filter=$RP_FILTER
	sysctl -q -w net.ipv6.conf.all.forwarding=$IPV6_FORWARDING
	rm -f /sys/fs/bpf/tc/globals/tun_iface
	[[ -z $DEBUG ]] || set -x
	set -e
}
83 | |||
# Test redirecting L2 traffic into an "external" ipip device.
#   $1 - where the redirect program is attached: "egress" puts it on ve2
#        egress, anything else puts it on ve1 ingress.
# Prints "l2_to_ipip <dir>: " followed by "OK" when every step succeeds.
function l2_to_ipip {
	local dir=$1
	local pinned_map=/sys/fs/bpf/tc/globals/tun_iface

	echo -n "l2_to_ipip $dir: "

	config_common ipip

	$IP link add ipt type ipip external
	$IP link set dev ipt up
	sysctl -q -w net.ipv4.conf.ipt.rp_filter=0
	sysctl -q -w net.ipv4.conf.ipt.forwarding=1

	case $dir in
	egress)
		$IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
		$TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
		sysctl -q -w net.ipv4.conf.ve1.forwarding=1
		;;
	*)
		$TC qdisc add dev ve1 clsact
		$TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
		;;
	esac

	# Tell the BPF programs which ifindex to redirect to.
	$REDIRECT_USER -U $pinned_map -i $(< /sys/class/net/ipt/ifindex)

	$IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null

	if [[ $dir == "egress" ]]; then
		# test direct egress to ve2 (i.e. not forwarding from
		# ve1 to ve2).
		ping -c1 10.10.1.102 >& /dev/null
	fi

	cleanup

	echo "OK"
}
119 | |||
# Test redirecting L2 traffic into an "external" ip6tnl device, for both
# IPv4 and IPv6 inner packets.
#   $1 - where the redirect program is attached: "egress" puts it on ve2
#        egress, anything else puts it on ve1 ingress.
# Prints "l2_to_ip6tnl <dir>: " followed by "OK" when every step succeeds.
function l2_to_ip6tnl {
	local dir=$1
	local pinned_map=/sys/fs/bpf/tc/globals/tun_iface

	echo -n "l2_to_ip6tnl $dir: "

	config_common ip6tnl

	$IP link add ip6t type ip6tnl mode any external
	$IP link set dev ip6t up
	sysctl -q -w net.ipv4.conf.ip6t.rp_filter=0
	sysctl -q -w net.ipv4.conf.ip6t.forwarding=1

	case $dir in
	egress)
		$IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
		$IP route add 2401:face::/64 via 2401:db02::66 dev ve2
		$TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
		sysctl -q -w net.ipv4.conf.ve1.forwarding=1
		;;
	*)
		$TC qdisc add dev ve1 clsact
		$TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
		;;
	esac

	# Tell the BPF programs which ifindex to redirect to.
	$REDIRECT_USER -U $pinned_map -i $(< /sys/class/net/ip6t/ifindex)

	$IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null
	$IP netns exec ns1 ping -6 -c1 2401:face::66 >& /dev/null

	if [[ $dir == "egress" ]]; then
		# test direct egress to ve2 (i.e. not forwarding from
		# ve1 to ve2).
		ping -c1 10.10.1.102 >& /dev/null
		ping -6 -c1 2401:face::66 >& /dev/null
	fi

	cleanup

	echo "OK"
}
158 | |||
# Start from a clean state (this also turns on errexit), then run every
# requested test in every requested direction.
#   $1 - optional space-separated list of test functions
#   $2 - optional space-separated list of directions
cleanup
test_names=${1-"l2_to_ipip l2_to_ip6tnl"}
test_dirs=${2-"ingress egress"}

for t in $test_names; do
	for d in $test_dirs; do
		$t $d
	done
done
diff --git a/samples/bpf/tc_l2_redirect_kern.c b/samples/bpf/tc_l2_redirect_kern.c new file mode 100644 index 000000000000..92a44729dbe4 --- /dev/null +++ b/samples/bpf/tc_l2_redirect_kern.c | |||
@@ -0,0 +1,236 @@ | |||
1 | /* Copyright (c) 2016 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <uapi/linux/bpf.h> | ||
8 | #include <uapi/linux/if_ether.h> | ||
9 | #include <uapi/linux/if_packet.h> | ||
10 | #include <uapi/linux/ip.h> | ||
11 | #include <uapi/linux/ipv6.h> | ||
12 | #include <uapi/linux/in.h> | ||
13 | #include <uapi/linux/tcp.h> | ||
14 | #include <uapi/linux/filter.h> | ||
15 | #include <uapi/linux/pkt_cls.h> | ||
16 | #include <net/ipv6.h> | ||
17 | #include "bpf_helpers.h" | ||
18 | |||
19 | #define _htonl __builtin_bswap32 | ||
20 | |||
21 | #define PIN_GLOBAL_NS 2 | ||
22 | struct bpf_elf_map { | ||
23 | __u32 type; | ||
24 | __u32 size_key; | ||
25 | __u32 size_value; | ||
26 | __u32 max_elem; | ||
27 | __u32 flags; | ||
28 | __u32 id; | ||
29 | __u32 pinning; | ||
30 | }; | ||
31 | |||
32 | /* copy of 'struct ethhdr' without __packed */ | ||
33 | struct eth_hdr { | ||
34 | unsigned char h_dest[ETH_ALEN]; | ||
35 | unsigned char h_source[ETH_ALEN]; | ||
36 | unsigned short h_proto; | ||
37 | }; | ||
38 | |||
39 | struct bpf_elf_map SEC("maps") tun_iface = { | ||
40 | .type = BPF_MAP_TYPE_ARRAY, | ||
41 | .size_key = sizeof(int), | ||
42 | .size_value = sizeof(int), | ||
43 | .pinning = PIN_GLOBAL_NS, | ||
44 | .max_elem = 1, | ||
45 | }; | ||
46 | |||
47 | static __always_inline bool is_vip_addr(__be16 eth_proto, __be32 daddr) | ||
48 | { | ||
49 | if (eth_proto == htons(ETH_P_IP)) | ||
50 | return (_htonl(0xffffff00) & daddr) == _htonl(0x0a0a0100); | ||
51 | else if (eth_proto == htons(ETH_P_IPV6)) | ||
52 | return (daddr == _htonl(0x2401face)); | ||
53 | |||
54 | return false; | ||
55 | } | ||
56 | |||
57 | SEC("l2_to_iptun_ingress_forward") | ||
58 | int _l2_to_iptun_ingress_forward(struct __sk_buff *skb) | ||
59 | { | ||
60 | struct bpf_tunnel_key tkey = {}; | ||
61 | void *data = (void *)(long)skb->data; | ||
62 | struct eth_hdr *eth = data; | ||
63 | void *data_end = (void *)(long)skb->data_end; | ||
64 | int key = 0, *ifindex; | ||
65 | |||
66 | int ret; | ||
67 | |||
68 | if (data + sizeof(*eth) > data_end) | ||
69 | return TC_ACT_OK; | ||
70 | |||
71 | ifindex = bpf_map_lookup_elem(&tun_iface, &key); | ||
72 | if (!ifindex) | ||
73 | return TC_ACT_OK; | ||
74 | |||
75 | if (eth->h_proto == htons(ETH_P_IP)) { | ||
76 | char fmt4[] = "ingress forward to ifindex:%d daddr4:%x\n"; | ||
77 | struct iphdr *iph = data + sizeof(*eth); | ||
78 | |||
79 | if (data + sizeof(*eth) + sizeof(*iph) > data_end) | ||
80 | return TC_ACT_OK; | ||
81 | |||
82 | if (iph->protocol != IPPROTO_IPIP) | ||
83 | return TC_ACT_OK; | ||
84 | |||
85 | bpf_trace_printk(fmt4, sizeof(fmt4), *ifindex, | ||
86 | _htonl(iph->daddr)); | ||
87 | return bpf_redirect(*ifindex, BPF_F_INGRESS); | ||
88 | } else if (eth->h_proto == htons(ETH_P_IPV6)) { | ||
89 | char fmt6[] = "ingress forward to ifindex:%d daddr6:%x::%x\n"; | ||
90 | struct ipv6hdr *ip6h = data + sizeof(*eth); | ||
91 | |||
92 | if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) | ||
93 | return TC_ACT_OK; | ||
94 | |||
95 | if (ip6h->nexthdr != IPPROTO_IPIP && | ||
96 | ip6h->nexthdr != IPPROTO_IPV6) | ||
97 | return TC_ACT_OK; | ||
98 | |||
99 | bpf_trace_printk(fmt6, sizeof(fmt6), *ifindex, | ||
100 | _htonl(ip6h->daddr.s6_addr32[0]), | ||
101 | _htonl(ip6h->daddr.s6_addr32[3])); | ||
102 | return bpf_redirect(*ifindex, BPF_F_INGRESS); | ||
103 | } | ||
104 | |||
105 | return TC_ACT_OK; | ||
106 | } | ||
107 | |||
108 | SEC("l2_to_iptun_ingress_redirect") | ||
109 | int _l2_to_iptun_ingress_redirect(struct __sk_buff *skb) | ||
110 | { | ||
111 | struct bpf_tunnel_key tkey = {}; | ||
112 | void *data = (void *)(long)skb->data; | ||
113 | struct eth_hdr *eth = data; | ||
114 | void *data_end = (void *)(long)skb->data_end; | ||
115 | int key = 0, *ifindex; | ||
116 | |||
117 | int ret; | ||
118 | |||
119 | if (data + sizeof(*eth) > data_end) | ||
120 | return TC_ACT_OK; | ||
121 | |||
122 | ifindex = bpf_map_lookup_elem(&tun_iface, &key); | ||
123 | if (!ifindex) | ||
124 | return TC_ACT_OK; | ||
125 | |||
126 | if (eth->h_proto == htons(ETH_P_IP)) { | ||
127 | char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n"; | ||
128 | struct iphdr *iph = data + sizeof(*eth); | ||
129 | __be32 daddr = iph->daddr; | ||
130 | |||
131 | if (data + sizeof(*eth) + sizeof(*iph) > data_end) | ||
132 | return TC_ACT_OK; | ||
133 | |||
134 | if (!is_vip_addr(eth->h_proto, daddr)) | ||
135 | return TC_ACT_OK; | ||
136 | |||
137 | bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(daddr), *ifindex); | ||
138 | } else { | ||
139 | return TC_ACT_OK; | ||
140 | } | ||
141 | |||
142 | tkey.tunnel_id = 10000; | ||
143 | tkey.tunnel_ttl = 64; | ||
144 | tkey.remote_ipv4 = 0x0a020166; /* 10.2.1.102 */ | ||
145 | bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0); | ||
146 | return bpf_redirect(*ifindex, 0); | ||
147 | } | ||
148 | |||
149 | SEC("l2_to_ip6tun_ingress_redirect") | ||
150 | int _l2_to_ip6tun_ingress_redirect(struct __sk_buff *skb) | ||
151 | { | ||
152 | struct bpf_tunnel_key tkey = {}; | ||
153 | void *data = (void *)(long)skb->data; | ||
154 | struct eth_hdr *eth = data; | ||
155 | void *data_end = (void *)(long)skb->data_end; | ||
156 | int key = 0, *ifindex; | ||
157 | |||
158 | if (data + sizeof(*eth) > data_end) | ||
159 | return TC_ACT_OK; | ||
160 | |||
161 | ifindex = bpf_map_lookup_elem(&tun_iface, &key); | ||
162 | if (!ifindex) | ||
163 | return TC_ACT_OK; | ||
164 | |||
165 | if (eth->h_proto == htons(ETH_P_IP)) { | ||
166 | char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n"; | ||
167 | struct iphdr *iph = data + sizeof(*eth); | ||
168 | |||
169 | if (data + sizeof(*eth) + sizeof(*iph) > data_end) | ||
170 | return TC_ACT_OK; | ||
171 | |||
172 | if (!is_vip_addr(eth->h_proto, iph->daddr)) | ||
173 | return TC_ACT_OK; | ||
174 | |||
175 | bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(iph->daddr), | ||
176 | *ifindex); | ||
177 | } else if (eth->h_proto == htons(ETH_P_IPV6)) { | ||
178 | char fmt6[] = "e/ingress redirect daddr6:%x to ifindex:%d\n"; | ||
179 | struct ipv6hdr *ip6h = data + sizeof(*eth); | ||
180 | |||
181 | if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) | ||
182 | return TC_ACT_OK; | ||
183 | |||
184 | if (!is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0])) | ||
185 | return TC_ACT_OK; | ||
186 | |||
187 | bpf_trace_printk(fmt6, sizeof(fmt6), | ||
188 | _htonl(ip6h->daddr.s6_addr32[0]), *ifindex); | ||
189 | } else { | ||
190 | return TC_ACT_OK; | ||
191 | } | ||
192 | |||
193 | tkey.tunnel_id = 10000; | ||
194 | tkey.tunnel_ttl = 64; | ||
195 | /* 2401:db02:0:0:0:0:0:66 */ | ||
196 | tkey.remote_ipv6[0] = _htonl(0x2401db02); | ||
197 | tkey.remote_ipv6[1] = 0; | ||
198 | tkey.remote_ipv6[2] = 0; | ||
199 | tkey.remote_ipv6[3] = _htonl(0x00000066); | ||
200 | bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), BPF_F_TUNINFO_IPV6); | ||
201 | return bpf_redirect(*ifindex, 0); | ||
202 | } | ||
203 | |||
204 | SEC("drop_non_tun_vip") | ||
205 | int _drop_non_tun_vip(struct __sk_buff *skb) | ||
206 | { | ||
207 | struct bpf_tunnel_key tkey = {}; | ||
208 | void *data = (void *)(long)skb->data; | ||
209 | struct eth_hdr *eth = data; | ||
210 | void *data_end = (void *)(long)skb->data_end; | ||
211 | |||
212 | if (data + sizeof(*eth) > data_end) | ||
213 | return TC_ACT_OK; | ||
214 | |||
215 | if (eth->h_proto == htons(ETH_P_IP)) { | ||
216 | struct iphdr *iph = data + sizeof(*eth); | ||
217 | |||
218 | if (data + sizeof(*eth) + sizeof(*iph) > data_end) | ||
219 | return TC_ACT_OK; | ||
220 | |||
221 | if (is_vip_addr(eth->h_proto, iph->daddr)) | ||
222 | return TC_ACT_SHOT; | ||
223 | } else if (eth->h_proto == htons(ETH_P_IPV6)) { | ||
224 | struct ipv6hdr *ip6h = data + sizeof(*eth); | ||
225 | |||
226 | if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) | ||
227 | return TC_ACT_OK; | ||
228 | |||
229 | if (is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0])) | ||
230 | return TC_ACT_SHOT; | ||
231 | } | ||
232 | |||
233 | return TC_ACT_OK; | ||
234 | } | ||
235 | |||
236 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/tc_l2_redirect_user.c b/samples/bpf/tc_l2_redirect_user.c new file mode 100644 index 000000000000..4013c5337b91 --- /dev/null +++ b/samples/bpf/tc_l2_redirect_user.c | |||
@@ -0,0 +1,73 @@ | |||
1 | /* Copyright (c) 2016 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/unistd.h> | ||
8 | #include <linux/bpf.h> | ||
9 | |||
10 | #include <stdlib.h> | ||
11 | #include <stdio.h> | ||
12 | #include <unistd.h> | ||
13 | #include <string.h> | ||
14 | #include <errno.h> | ||
15 | |||
16 | #include "libbpf.h" | ||
17 | |||
/* Print the command-line help to stdout. */
static void usage(void)
{
	printf("Usage: tc_l2_redirect [...]\n");
	printf("       -U <file>   Update an already pinned BPF array\n");
	printf("       -i <ifindex> Interface index\n");
	printf("       -h          Display this help\n");
}
25 | |||
26 | int main(int argc, char **argv) | ||
27 | { | ||
28 | const char *pinned_file = NULL; | ||
29 | int ifindex = -1; | ||
30 | int array_key = 0; | ||
31 | int array_fd = -1; | ||
32 | int ret = -1; | ||
33 | int opt; | ||
34 | |||
35 | while ((opt = getopt(argc, argv, "F:U:i:")) != -1) { | ||
36 | switch (opt) { | ||
37 | /* General args */ | ||
38 | case 'U': | ||
39 | pinned_file = optarg; | ||
40 | break; | ||
41 | case 'i': | ||
42 | ifindex = atoi(optarg); | ||
43 | break; | ||
44 | default: | ||
45 | usage(); | ||
46 | goto out; | ||
47 | } | ||
48 | } | ||
49 | |||
50 | if (ifindex < 0 || !pinned_file) { | ||
51 | usage(); | ||
52 | goto out; | ||
53 | } | ||
54 | |||
55 | array_fd = bpf_obj_get(pinned_file); | ||
56 | if (array_fd < 0) { | ||
57 | fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n", | ||
58 | pinned_file, strerror(errno), errno); | ||
59 | goto out; | ||
60 | } | ||
61 | |||
62 | /* bpf_tunnel_key.remote_ipv4 expects host byte orders */ | ||
63 | ret = bpf_update_elem(array_fd, &array_key, &ifindex, 0); | ||
64 | if (ret) { | ||
65 | perror("bpf_update_elem"); | ||
66 | goto out; | ||
67 | } | ||
68 | |||
69 | out: | ||
70 | if (array_fd != -1) | ||
71 | close(array_fd); | ||
72 | return ret; | ||
73 | } | ||