diff options
-rw-r--r-- | include/uapi/linux/bpf.h | 10 | ||||
-rw-r--r-- | net/core/filter.c | 28 | ||||
-rw-r--r-- | samples/bpf/Makefile | 5 | ||||
-rwxr-xr-x | samples/bpf/do_hbm_test.sh | 436 | ||||
-rw-r--r-- | samples/bpf/hbm.c | 441 | ||||
-rw-r--r-- | samples/bpf/hbm.h | 31 | ||||
-rw-r--r-- | samples/bpf/hbm_kern.h | 137 | ||||
-rw-r--r-- | samples/bpf/hbm_out_kern.c | 157 | ||||
-rw-r--r-- | tools/include/uapi/linux/bpf.h | 10 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/bpf_helpers.h | 2 |
10 files changed, 1255 insertions, 2 deletions
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e308e90ffea..3c38ac9a92a7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h | |||
@@ -2359,6 +2359,13 @@ union bpf_attr { | |||
2359 | * Return | 2359 | * Return |
2360 | * A **struct bpf_tcp_sock** pointer on success, or NULL in | 2360 | * A **struct bpf_tcp_sock** pointer on success, or NULL in |
2361 | * case of failure. | 2361 | * case of failure. |
2362 | * | ||
2363 | * int bpf_skb_ecn_set_ce(struct sk_buff *skb) | ||
2364 | * Description | ||
2365 | * Sets ECN of IP header to ce (congestion encountered) if | ||
2366 | * current value is ect (ECN capable). Works with IPv6 and IPv4. | ||
2367 | * Return | ||
2368 | * 1 if set, 0 if not set. | ||
2362 | */ | 2369 | */ |
2363 | #define __BPF_FUNC_MAPPER(FN) \ | 2370 | #define __BPF_FUNC_MAPPER(FN) \ |
2364 | FN(unspec), \ | 2371 | FN(unspec), \ |
@@ -2457,7 +2464,8 @@ union bpf_attr { | |||
2457 | FN(spin_lock), \ | 2464 | FN(spin_lock), \ |
2458 | FN(spin_unlock), \ | 2465 | FN(spin_unlock), \ |
2459 | FN(sk_fullsock), \ | 2466 | FN(sk_fullsock), \ |
2460 | FN(tcp_sock), | 2467 | FN(tcp_sock), \ |
2468 | FN(skb_ecn_set_ce), | ||
2461 | 2469 | ||
2462 | /* integer value in 'imm' field of BPF_CALL instruction selects which helper | 2470 | /* integer value in 'imm' field of BPF_CALL instruction selects which helper |
2463 | * function eBPF program intends to call | 2471 | * function eBPF program intends to call |
diff --git a/net/core/filter.c b/net/core/filter.c index 85749f6ec789..558ca72f2254 100644 --- a/net/core/filter.c +++ b/net/core/filter.c | |||
@@ -5426,6 +5426,32 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = { | |||
5426 | .arg1_type = ARG_PTR_TO_SOCK_COMMON, | 5426 | .arg1_type = ARG_PTR_TO_SOCK_COMMON, |
5427 | }; | 5427 | }; |
5428 | 5428 | ||
5429 | BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) | ||
5430 | { | ||
5431 | unsigned int iphdr_len; | ||
5432 | |||
5433 | if (skb->protocol == cpu_to_be16(ETH_P_IP)) | ||
5434 | iphdr_len = sizeof(struct iphdr); | ||
5435 | else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) | ||
5436 | iphdr_len = sizeof(struct ipv6hdr); | ||
5437 | else | ||
5438 | return 0; | ||
5439 | |||
5440 | if (skb_headlen(skb) < iphdr_len) | ||
5441 | return 0; | ||
5442 | |||
5443 | if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) | ||
5444 | return 0; | ||
5445 | |||
5446 | return INET_ECN_set_ce(skb); | ||
5447 | } | ||
5448 | |||
5449 | static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { | ||
5450 | .func = bpf_skb_ecn_set_ce, | ||
5451 | .gpl_only = false, | ||
5452 | .ret_type = RET_INTEGER, | ||
5453 | .arg1_type = ARG_PTR_TO_CTX, | ||
5454 | }; | ||
5429 | #endif /* CONFIG_INET */ | 5455 | #endif /* CONFIG_INET */ |
5430 | 5456 | ||
5431 | bool bpf_helper_changes_pkt_data(void *func) | 5457 | bool bpf_helper_changes_pkt_data(void *func) |
@@ -5585,6 +5611,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) | |||
5585 | #ifdef CONFIG_INET | 5611 | #ifdef CONFIG_INET |
5586 | case BPF_FUNC_tcp_sock: | 5612 | case BPF_FUNC_tcp_sock: |
5587 | return &bpf_tcp_sock_proto; | 5613 | return &bpf_tcp_sock_proto; |
5614 | case BPF_FUNC_skb_ecn_set_ce: | ||
5615 | return &bpf_skb_ecn_set_ce_proto; | ||
5588 | #endif | 5616 | #endif |
5589 | default: | 5617 | default: |
5590 | return sk_filter_func_proto(func_id, prog); | 5618 | return sk_filter_func_proto(func_id, prog); |
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 0c62ac39c697..65e667bdf979 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile | |||
@@ -52,6 +52,7 @@ hostprogs-y += xdpsock | |||
52 | hostprogs-y += xdp_fwd | 52 | hostprogs-y += xdp_fwd |
53 | hostprogs-y += task_fd_query | 53 | hostprogs-y += task_fd_query |
54 | hostprogs-y += xdp_sample_pkts | 54 | hostprogs-y += xdp_sample_pkts |
55 | hostprogs-y += hbm | ||
55 | 56 | ||
56 | # Libbpf dependencies | 57 | # Libbpf dependencies |
57 | LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a | 58 | LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a |
@@ -107,6 +108,7 @@ xdpsock-objs := xdpsock_user.o | |||
107 | xdp_fwd-objs := xdp_fwd_user.o | 108 | xdp_fwd-objs := xdp_fwd_user.o |
108 | task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) | 109 | task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) |
109 | xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) | 110 | xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) |
111 | hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) | ||
110 | 112 | ||
111 | # Tell kbuild to always build the programs | 113 | # Tell kbuild to always build the programs |
112 | always := $(hostprogs-y) | 114 | always := $(hostprogs-y) |
@@ -164,6 +166,7 @@ always += xdp_adjust_tail_kern.o | |||
164 | always += xdp_fwd_kern.o | 166 | always += xdp_fwd_kern.o |
165 | always += task_fd_query_kern.o | 167 | always += task_fd_query_kern.o |
166 | always += xdp_sample_pkts_kern.o | 168 | always += xdp_sample_pkts_kern.o |
169 | always += hbm_out_kern.o | ||
167 | 170 | ||
168 | KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include | 171 | KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include |
169 | KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ | 172 | KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ |
@@ -263,6 +266,8 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) | |||
263 | $(src)/*.c: verify_target_bpf $(LIBBPF) | 266 | $(src)/*.c: verify_target_bpf $(LIBBPF) |
264 | 267 | ||
265 | $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h | 268 | $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h |
269 | $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h | ||
270 | $(obj)/hbm.o: $(src)/hbm.h | ||
266 | 271 | ||
267 | # asm/sysreg.h - inline assembly used by it is incompatible with llvm. | 272 | # asm/sysreg.h - inline assembly used by it is incompatible with llvm. |
268 | # But, there is no easy way to fix it, so just exclude it since it is | 273 | # But, there is no easy way to fix it, so just exclude it since it is |
diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh new file mode 100755 index 000000000000..56c8b4115c95 --- /dev/null +++ b/samples/bpf/do_hbm_test.sh | |||
@@ -0,0 +1,436 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | # | ||
4 | # Copyright (c) 2019 Facebook | ||
5 | # | ||
6 | # This program is free software; you can redistribute it and/or | ||
7 | # modify it under the terms of version 2 of the GNU General Public | ||
8 | # License as published by the Free Software Foundation. | ||
9 | |||
# Print the help text for this script and terminate the script.
# Reads the global $name (script path) for the synopsis line.
Usage() {
  cat <<EOF
Script for testing HBM (Host Bandwidth Manager) framework.
It creates a cgroup to use for testing and loads a BPF program to limit
egress or ingress bandwidth. It then uses iperf3 or netperf to create
loads. The output is the goodput in Mbps (unless -D was used).

USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]
             [-d=<delay>|--delay=<delay>] [--debug] [-E]
             [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id>]
             [-l] [-N] [-p=<port>|--port=<port>] [-P]
             [-q=<qdisc>] [-r=<rate>|--rate=<rate>] [-R]
             [-s=<server>|--server=<server>] [-S|--stats]
             [-t=<time>|--time=<time>] [-w] [cubic|dctcp]
  Where:
    out               egress (default)
    -b or --bpf       BPF program filename to load and attach.
                      Default is hbm_out_kern.o for egress.
    -c or --cc        TCP congestion control (cubic or dctcp)
    --debug           print BPF trace buffer
    -d or --delay     add a delay in ms using netem
    -D                In addition to the goodput in Mbps, it also outputs
                      other detailed information. This information is
                      test dependent (i.e. iperf3 or netperf).
    -E                enable ECN (not required for dctcp)
    -f or --flows     number of concurrent flows (default=1)
    -i or --id        cgroup id (an integer, default is 1)
    -N                use netperf instead of iperf3
    -l                do not limit flows using loopback
    -h                Help
    -p or --port      iperf3 port (default is 5201)
    -P                use an iperf3 instance for each flow
    -q                use the specified qdisc
    -r or --rate      rate in Mbps (default is 1Gbps)
    -R                Use TCP_RR for netperf. 1st flow has req
                      size of 10KB, rest of 1MB. Reply in all
                      cases is 1 byte.
                      More detailed output for each flow can be found
                      in the files netperf.<cg>.<flow>, where <cg> is the
                      cgroup id as specified with the -i flag, and <flow>
                      is the flow id starting at 1 and increasing by 1 for
                      each flow (as specified by -f).
    -s or --server    hostname of netperf server. Used to create netperf
                      test traffic between two hosts (default is within host)
                      netserver must be running on the host.
    -S or --stats     whether to update hbm stats (default is yes).
    -t or --time      duration of iperf3 in seconds (default=5)
    -w                Work conserving flag. cgroup can increase its
                      bandwidth beyond the rate limit specified
                      while there is available bandwidth. Current
                      implementation assumes there is only one NIC
                      (eth0), but can be extended to support multiple
                      NICs.
    cubic or dctcp    specify which TCP CC to use

EOF
  exit
}
65 | |||
66 | #set -x | ||
67 | |||
68 | debug_flag=0 | ||
69 | args="$@" | ||
70 | name="$0" | ||
71 | netem=0 | ||
72 | cc=x | ||
73 | dir="-o" | ||
74 | dir_name="out" | ||
75 | dur=5 | ||
76 | flows=1 | ||
77 | id=1 | ||
78 | prog="" | ||
79 | port=5201 | ||
80 | rate=1000 | ||
81 | multi_iperf=0 | ||
82 | flow_cnt=1 | ||
83 | use_netperf=0 | ||
84 | rr=0 | ||
85 | ecn=0 | ||
86 | details=0 | ||
87 | server="" | ||
88 | qdisc="" | ||
89 | flags="" | ||
90 | do_stats=0 | ||
91 | |||
# Start ./hbm in the background and print its PID on stdout.
# hbm.out receives the exact command line, a separator line, and then all
# stdout/stderr produced by hbm. Because hbm's output is redirected to the
# file, the PID is the only thing a caller capturing stdout will see.
# NOTE(review): $dbg is not assigned in the visible part of this script, so
# it presumably expands to nothing - confirm before relying on it.
function start_hbm () {
  local hbm_cmd="./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog"

  rm -f hbm.out
  printf '%s\n \n' "$hbm_cmd" > hbm.out
  # Deliberately unquoted: the command string must be split into words.
  $hbm_cmd >> hbm.out 2>&1 &
  echo $!
}
99 | |||
# Parse the script's command line (captured earlier in the global $args) and
# set the corresponding global option variables. Options taking a value use
# the form -x=<value> or --long=<value>; the text after the first '=' is the
# value. -h/--help prints the usage text; any unrecognized word is reported
# and then the usage text is printed. Usage terminates the script.
processArgs () {
  for i in $args ; do
    case $i in
    # Support for upcoming ingress rate limiting
    #in)
    #  dir="-i"
    #  dir_name="in"
    #  ;;
    out)
      dir="-o"
      dir_name="out"
      ;;
    -b=*|--bpf=*)
      prog="${i#*=}"
      ;;
    -c=*|--cc=*)
      cc="${i#*=}"
      ;;
    --debug)
      flags="$flags -d"
      debug_flag=1
      ;;
    -d=*|--delay=*)
      netem="${i#*=}"
      ;;
    -D)
      details=1
      ;;
    -E)
      ecn=1
      ;;
    # Support for upcoming fq Early Departure Time egress rate limiting
    #--edt)
    #  prog="hbm_out_edt_kern.o"
    #  qdisc="fq"
    #  ;;
    -f=*|--flows=*)
      flows="${i#*=}"
      ;;
    -h|--help)
      # Documented in the usage text; previously reported as an unknown arg.
      Usage
      ;;
    -i=*|--id=*)
      id="${i#*=}"
      ;;
    -l)
      flags="$flags -l"
      ;;
    -N)
      use_netperf=1
      ;;
    -p=*|--port=*)
      port="${i#*=}"
      ;;
    -P)
      multi_iperf=1
      ;;
    -q=*)
      qdisc="${i#*=}"
      ;;
    -r=*|--rate=*)
      rate="${i#*=}"
      ;;
    -R)
      rr=1
      ;;
    -s=*|--server=*)
      server="${i#*=}"
      ;;
    -S|--stats)
      flags="$flags -s"
      do_stats=1
      ;;
    -t=*|--time=*)
      dur="${i#*=}"
      ;;
    -w)
      flags="$flags -w"
      ;;
    cubic)
      cc=cubic
      ;;
    dctcp)
      cc=dctcp
      ;;
    *)
      echo "Unknown arg:$i"
      Usage
      ;;
    esac
  done
}
189 | |||
190 | processArgs | ||
191 | |||
192 | if [ $debug_flag -eq 1 ] ; then | ||
193 | rm -f hbm_out.log | ||
194 | fi | ||
195 | |||
196 | hbm_pid=$(start_hbm) | ||
197 | usleep 100000 | ||
198 | |||
199 | host=`hostname` | ||
200 | cg_base_dir=/sys/fs/cgroup | ||
201 | cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id" | ||
202 | |||
203 | echo $$ >> $cg_dir/cgroup.procs | ||
204 | |||
205 | ulimit -l unlimited | ||
206 | |||
207 | rm -f ss.out | ||
208 | rm -f hbm.[0-9]*.$dir_name | ||
209 | if [ $ecn -ne 0 ] ; then | ||
210 | sysctl -w -q -n net.ipv4.tcp_ecn=1 | ||
211 | fi | ||
212 | |||
213 | if [ $use_netperf -eq 0 ] ; then | ||
214 | cur_cc=`sysctl -n net.ipv4.tcp_congestion_control` | ||
215 | if [ "$cc" != "x" ] ; then | ||
216 | sysctl -w -q -n net.ipv4.tcp_congestion_control=$cc | ||
217 | fi | ||
218 | fi | ||
219 | |||
220 | if [ "$netem" -ne "0" ] ; then | ||
221 | if [ "$qdisc" != "" ] ; then | ||
222 | echo "WARNING: Ignoring -q options because -d option used" | ||
223 | fi | ||
224 | tc qdisc del dev lo root > /dev/null 2>&1 | ||
225 | tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1 | ||
226 | elif [ "$qdisc" != "" ] ; then | ||
227 | tc qdisc del dev lo root > /dev/null 2>&1 | ||
228 | tc qdisc add dev lo root $qdisc > /dev/null 2>&1 | ||
229 | fi | ||
230 | |||
231 | n=0 | ||
232 | m=$[$dur * 5] | ||
233 | hn="::1" | ||
234 | if [ $use_netperf -ne 0 ] ; then | ||
235 | if [ "$server" != "" ] ; then | ||
236 | hn=$server | ||
237 | fi | ||
238 | fi | ||
239 | |||
240 | ( ping6 -i 0.2 -c $m $hn > ping.out 2>&1 ) & | ||
241 | |||
242 | if [ $use_netperf -ne 0 ] ; then | ||
243 | begNetserverPid=`ps ax | grep netserver | grep --invert-match "grep" | \ | ||
244 | awk '{ print $1 }'` | ||
245 | if [ "$begNetserverPid" == "" ] ; then | ||
246 | if [ "$server" == "" ] ; then | ||
247 | ( ./netserver > /dev/null 2>&1) & | ||
248 | usleep 100000 | ||
249 | fi | ||
250 | fi | ||
251 | flow_cnt=1 | ||
252 | if [ "$server" == "" ] ; then | ||
253 | np_server=$host | ||
254 | else | ||
255 | np_server=$server | ||
256 | fi | ||
257 | if [ "$cc" == "x" ] ; then | ||
258 | np_cc="" | ||
259 | else | ||
260 | np_cc="-K $cc,$cc" | ||
261 | fi | ||
262 | replySize=1 | ||
263 | while [ $flow_cnt -le $flows ] ; do | ||
264 | if [ $rr -ne 0 ] ; then | ||
265 | reqSize=1M | ||
266 | if [ $flow_cnt -eq 1 ] ; then | ||
267 | reqSize=10K | ||
268 | fi | ||
269 | if [ "$dir" == "-i" ] ; then | ||
270 | replySize=$reqSize | ||
271 | reqSize=1 | ||
272 | fi | ||
273 | ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r $reqSize,$replySize $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & | ||
274 | else | ||
275 | if [ "$dir" == "-i" ] ; then | ||
276 | ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r 1,10M $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & | ||
277 | else | ||
278 | ( ./netperf -H $np_server -l $dur -f m -j -t TCP_STREAM -- $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & | ||
279 | fi | ||
280 | fi | ||
281 | flow_cnt=$[flow_cnt+1] | ||
282 | done | ||
283 | |||
284 | # sleep for duration of test (plus some buffer) | ||
285 | n=$[dur+2] | ||
286 | sleep $n | ||
287 | |||
288 | # force graceful termination of netperf | ||
289 | pids=`pgrep netperf` | ||
290 | for p in $pids ; do | ||
291 | kill -SIGALRM $p | ||
292 | done | ||
293 | |||
294 | flow_cnt=1 | ||
295 | rate=0 | ||
296 | if [ $details -ne 0 ] ; then | ||
297 | echo "" | ||
298 | echo "Details for HBM in cgroup $id" | ||
299 | if [ $do_stats -eq 1 ] ; then | ||
300 | if [ -e hbm.$id.$dir_name ] ; then | ||
301 | cat hbm.$id.$dir_name | ||
302 | fi | ||
303 | fi | ||
304 | fi | ||
305 | while [ $flow_cnt -le $flows ] ; do | ||
306 | if [ "$dir" == "-i" ] ; then | ||
307 | r=`cat netperf.$id.$flow_cnt | grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"` | ||
308 | else | ||
309 | r=`cat netperf.$id.$flow_cnt | grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"` | ||
310 | fi | ||
311 | echo "rate for flow $flow_cnt: $r" | ||
312 | rate=$[rate+r] | ||
313 | if [ $details -ne 0 ] ; then | ||
314 | echo "-----" | ||
315 | echo "Details for cgroup $id, flow $flow_cnt" | ||
316 | cat netperf.$id.$flow_cnt | ||
317 | fi | ||
318 | flow_cnt=$[flow_cnt+1] | ||
319 | done | ||
320 | if [ $details -ne 0 ] ; then | ||
321 | echo "" | ||
322 | delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` | ||
323 | echo "PING AVG DELAY:$delay" | ||
324 | echo "AGGREGATE_GOODPUT:$rate" | ||
325 | else | ||
326 | echo $rate | ||
327 | fi | ||
328 | elif [ $multi_iperf -eq 0 ] ; then | ||
329 | (iperf3 -s -p $port -1 > /dev/null 2>&1) & | ||
330 | usleep 100000 | ||
331 | iperf3 -c $host -p $port -i 0 -P $flows -f m -t $dur > iperf.$id | ||
332 | rates=`grep receiver iperf.$id | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*"` | ||
333 | rate=`echo $rates | grep -o "[0-9]*$"` | ||
334 | |||
335 | if [ $details -ne 0 ] ; then | ||
336 | echo "" | ||
337 | echo "Details for HBM in cgroup $id" | ||
338 | if [ $do_stats -eq 1 ] ; then | ||
339 | if [ -e hbm.$id.$dir_name ] ; then | ||
340 | cat hbm.$id.$dir_name | ||
341 | fi | ||
342 | fi | ||
343 | delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` | ||
344 | echo "PING AVG DELAY:$delay" | ||
345 | echo "AGGREGATE_GOODPUT:$rate" | ||
346 | else | ||
347 | echo $rate | ||
348 | fi | ||
349 | else | ||
350 | flow_cnt=1 | ||
351 | while [ $flow_cnt -le $flows ] ; do | ||
352 | (iperf3 -s -p $port -1 > /dev/null 2>&1) & | ||
353 | ( iperf3 -c $host -p $port -i 0 -P 1 -f m -t $dur | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" | grep -o "[0-9]*$" > iperf3.$id.$flow_cnt ) & | ||
354 | port=$[port+1] | ||
355 | flow_cnt=$[flow_cnt+1] | ||
356 | done | ||
357 | n=$[dur+1] | ||
358 | sleep $n | ||
359 | flow_cnt=1 | ||
360 | rate=0 | ||
361 | if [ $details -ne 0 ] ; then | ||
362 | echo "" | ||
363 | echo "Details for HBM in cgroup $id" | ||
364 | if [ $do_stats -eq 1 ] ; then | ||
365 | if [ -e hbm.$id.$dir_name ] ; then | ||
366 | cat hbm.$id.$dir_name | ||
367 | fi | ||
368 | fi | ||
369 | fi | ||
370 | |||
371 | while [ $flow_cnt -le $flows ] ; do | ||
372 | r=`cat iperf3.$id.$flow_cnt` | ||
373 | # echo "rate for flow $flow_cnt: $r" | ||
374 | if [ $details -ne 0 ] ; then | ||
375 | echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r" | ||
376 | fi | ||
377 | rate=$[rate+r] | ||
378 | flow_cnt=$[flow_cnt+1] | ||
379 | done | ||
380 | if [ $details -ne 0 ] ; then | ||
381 | delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` | ||
382 | echo "PING AVG DELAY:$delay" | ||
383 | echo "AGGREGATE_GOODPUT:$rate" | ||
384 | else | ||
385 | echo $rate | ||
386 | fi | ||
387 | fi | ||
388 | |||
389 | if [ $use_netperf -eq 0 ] ; then | ||
390 | sysctl -w -q -n net.ipv4.tcp_congestion_control=$cur_cc | ||
391 | fi | ||
392 | if [ $ecn -ne 0 ] ; then | ||
393 | sysctl -w -q -n net.ipv4.tcp_ecn=0 | ||
394 | fi | ||
395 | if [ "$netem" -ne "0" ] ; then | ||
396 | tc qdisc del dev lo root > /dev/null 2>&1 | ||
397 | fi | ||
398 | |||
399 | sleep 2 | ||
400 | |||
401 | hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'` | ||
402 | if [ "$hbmPid" == "$hbm_pid" ] ; then | ||
403 | kill $hbm_pid | ||
404 | fi | ||
405 | |||
406 | sleep 1 | ||
407 | |||
408 | # Detach any BPF programs that may have lingered | ||
409 | ttx=`bpftool cgroup tree | grep hbm` | ||
410 | v=2 | ||
411 | for x in $ttx ; do | ||
412 | if [ "${x:0:36}" == "/sys/fs/cgroup/cgroup-test-work-dir/" ] ; then | ||
413 | cg=$x ; v=0 | ||
414 | else | ||
415 | if [ $v -eq 0 ] ; then | ||
416 | id=$x ; v=1 | ||
417 | else | ||
418 | if [ $v -eq 1 ] ; then | ||
419 | type=$x ; bpftool cgroup detach $cg $type id $id | ||
420 | v=0 | ||
421 | fi | ||
422 | fi | ||
423 | fi | ||
424 | done | ||
425 | |||
426 | if [ $use_netperf -ne 0 ] ; then | ||
427 | if [ "$server" == "" ] ; then | ||
428 | if [ "$begNetserverPid" == "" ] ; then | ||
429 | netserverPid=`ps ax | grep netserver | grep --invert-match "grep" | awk '{ print $1 }'` | ||
430 | if [ "$netserverPid" != "" ] ; then | ||
431 | kill $netserverPid | ||
432 | fi | ||
433 | fi | ||
434 | fi | ||
435 | fi | ||
436 | exit | ||
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c new file mode 100644 index 000000000000..8408ccb7409f --- /dev/null +++ b/samples/bpf/hbm.c | |||
@@ -0,0 +1,441 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* Copyright (c) 2019 Facebook | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of version 2 of the GNU General Public | ||
6 | * License as published by the Free Software Foundation. | ||
7 | * | ||
8 | * Example program for Host Bandwidth Management | ||
9 | * | ||
10 | * This program loads a cgroup skb BPF program to enforce cgroup output | ||
11 | * (egress) or input (ingress) bandwidth limits. | ||
12 | * | ||
13 | * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] | ||
14 | * Where: | ||
15 | * -d Print BPF trace debug buffer | ||
16 | * -l Also limit flows doing loopback | ||
17 | * -n <#> To create cgroup \"/hbm#\" and attach prog | ||
18 | * Default is /hbm1 | ||
19 | * -r <rate> Rate limit in Mbps | ||
20 | * -s Get HBM stats (marked, dropped, etc.) | ||
21 | * -t <time> Exit after specified seconds (default is 0) | ||
22 | * -w Work conserving flag. cgroup can increase its bandwidth | ||
23 | * beyond the rate limit specified while there is available | ||
24 | * bandwidth. Current implementation assumes there is only | ||
25 | * one NIC (eth0), but can be extended to support multiple NICs. | ||
26 | * Currently only supported for egress. | ||
27 | * -h Print this info | ||
28 | * prog BPF program file name. Name defaults to hbm_out_kern.o | ||
29 | */ | ||
30 | |||
31 | #define _GNU_SOURCE | ||
32 | |||
33 | #include <stdio.h> | ||
34 | #include <stdlib.h> | ||
35 | #include <assert.h> | ||
36 | #include <sys/resource.h> | ||
37 | #include <sys/time.h> | ||
38 | #include <unistd.h> | ||
39 | #include <errno.h> | ||
40 | #include <fcntl.h> | ||
41 | #include <linux/unistd.h> | ||
42 | |||
43 | #include <linux/bpf.h> | ||
44 | #include <bpf/bpf.h> | ||
45 | |||
46 | #include "bpf_load.h" | ||
47 | #include "bpf_rlimit.h" | ||
48 | #include "cgroup_helpers.h" | ||
49 | #include "hbm.h" | ||
50 | #include "bpf_util.h" | ||
51 | #include "bpf/bpf.h" | ||
52 | #include "bpf/libbpf.h" | ||
53 | |||
54 | bool outFlag = true; | ||
55 | int minRate = 1000; /* cgroup rate limit in Mbps */ | ||
56 | int rate = 1000; /* can grow if rate conserving is enabled */ | ||
57 | int dur = 1; | ||
58 | bool stats_flag; | ||
59 | bool loopback_flag; | ||
60 | bool debugFlag; | ||
61 | bool work_conserving_flag; | ||
62 | |||
63 | static void Usage(void); | ||
64 | static void read_trace_pipe2(void); | ||
65 | static void do_error(char *msg, bool errno_flag); | ||
66 | |||
67 | #define DEBUGFS "/sys/kernel/debug/tracing/" | ||
68 | |||
69 | struct bpf_object *obj; | ||
70 | int bpfprog_fd; | ||
71 | int cgroup_storage_fd; | ||
72 | |||
73 | static void read_trace_pipe2(void) | ||
74 | { | ||
75 | int trace_fd; | ||
76 | FILE *outf; | ||
77 | char *outFname = "hbm_out.log"; | ||
78 | |||
79 | trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); | ||
80 | if (trace_fd < 0) { | ||
81 | printf("Error opening trace_pipe\n"); | ||
82 | return; | ||
83 | } | ||
84 | |||
85 | // Future support of ingress | ||
86 | // if (!outFlag) | ||
87 | // outFname = "hbm_in.log"; | ||
88 | outf = fopen(outFname, "w"); | ||
89 | |||
90 | if (outf == NULL) | ||
91 | printf("Error creating %s\n", outFname); | ||
92 | |||
93 | while (1) { | ||
94 | static char buf[4097]; | ||
95 | ssize_t sz; | ||
96 | |||
97 | sz = read(trace_fd, buf, sizeof(buf) - 1); | ||
98 | if (sz > 0) { | ||
99 | buf[sz] = 0; | ||
100 | puts(buf); | ||
101 | if (outf != NULL) { | ||
102 | fprintf(outf, "%s\n", buf); | ||
103 | fflush(outf); | ||
104 | } | ||
105 | } | ||
106 | } | ||
107 | } | ||
108 | |||
/*
 * Report a fatal error on stdout and terminate the process with status 1.
 *
 * @msg:        text describing what failed
 * @errno_flag: when true, the current value of errno is appended
 */
static void do_error(char *msg, bool errno_flag)
{
	if (!errno_flag)
		printf("ERROR: %s\n", msg);
	else
		printf("ERROR: %s, errno: %d\n", msg, errno);

	exit(1);
}
117 | |||
118 | static int prog_load(char *prog) | ||
119 | { | ||
120 | struct bpf_prog_load_attr prog_load_attr = { | ||
121 | .prog_type = BPF_PROG_TYPE_CGROUP_SKB, | ||
122 | .file = prog, | ||
123 | .expected_attach_type = BPF_CGROUP_INET_EGRESS, | ||
124 | }; | ||
125 | int map_fd; | ||
126 | struct bpf_map *map; | ||
127 | |||
128 | int ret = 0; | ||
129 | |||
130 | if (access(prog, O_RDONLY) < 0) { | ||
131 | printf("Error accessing file %s: %s\n", prog, strerror(errno)); | ||
132 | return 1; | ||
133 | } | ||
134 | if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd)) | ||
135 | ret = 1; | ||
136 | if (!ret) { | ||
137 | map = bpf_object__find_map_by_name(obj, "queue_stats"); | ||
138 | map_fd = bpf_map__fd(map); | ||
139 | if (map_fd < 0) { | ||
140 | printf("Map not found: %s\n", strerror(map_fd)); | ||
141 | ret = 1; | ||
142 | } | ||
143 | } | ||
144 | |||
145 | if (ret) { | ||
146 | printf("ERROR: load_bpf_file failed for: %s\n", prog); | ||
147 | printf(" Output from verifier:\n%s\n------\n", bpf_log_buf); | ||
148 | ret = -1; | ||
149 | } else { | ||
150 | ret = map_fd; | ||
151 | } | ||
152 | |||
153 | return ret; | ||
154 | } | ||
155 | |||
156 | static int run_bpf_prog(char *prog, int cg_id) | ||
157 | { | ||
158 | int map_fd; | ||
159 | int rc = 0; | ||
160 | int key = 0; | ||
161 | int cg1 = 0; | ||
162 | int type = BPF_CGROUP_INET_EGRESS; | ||
163 | char cg_dir[100]; | ||
164 | struct hbm_queue_stats qstats = {0}; | ||
165 | |||
166 | sprintf(cg_dir, "/hbm%d", cg_id); | ||
167 | map_fd = prog_load(prog); | ||
168 | if (map_fd == -1) | ||
169 | return 1; | ||
170 | |||
171 | if (setup_cgroup_environment()) { | ||
172 | printf("ERROR: setting cgroup environment\n"); | ||
173 | goto err; | ||
174 | } | ||
175 | cg1 = create_and_get_cgroup(cg_dir); | ||
176 | if (!cg1) { | ||
177 | printf("ERROR: create_and_get_cgroup\n"); | ||
178 | goto err; | ||
179 | } | ||
180 | if (join_cgroup(cg_dir)) { | ||
181 | printf("ERROR: join_cgroup\n"); | ||
182 | goto err; | ||
183 | } | ||
184 | |||
185 | qstats.rate = rate; | ||
186 | qstats.stats = stats_flag ? 1 : 0; | ||
187 | qstats.loopback = loopback_flag ? 1 : 0; | ||
188 | if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { | ||
189 | printf("ERROR: Could not update map element\n"); | ||
190 | goto err; | ||
191 | } | ||
192 | |||
193 | if (!outFlag) | ||
194 | type = BPF_CGROUP_INET_INGRESS; | ||
195 | if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) { | ||
196 | printf("ERROR: bpf_prog_attach fails!\n"); | ||
197 | log_err("Attaching prog"); | ||
198 | goto err; | ||
199 | } | ||
200 | |||
201 | if (work_conserving_flag) { | ||
202 | struct timeval t0, t_last, t_new; | ||
203 | FILE *fin; | ||
204 | unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; | ||
205 | signed long long last_cg_tx_bytes, new_cg_tx_bytes; | ||
206 | signed long long delta_time, delta_bytes, delta_rate; | ||
207 | int delta_ms; | ||
208 | #define DELTA_RATE_CHECK 10000 /* in us */ | ||
209 | #define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ | ||
210 | |||
211 | bpf_map_lookup_elem(map_fd, &key, &qstats); | ||
212 | if (gettimeofday(&t0, NULL) < 0) | ||
213 | do_error("gettimeofday failed", true); | ||
214 | t_last = t0; | ||
215 | fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); | ||
216 | if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) | ||
217 | do_error("fscanf fails", false); | ||
218 | fclose(fin); | ||
219 | last_cg_tx_bytes = qstats.bytes_total; | ||
220 | while (true) { | ||
221 | usleep(DELTA_RATE_CHECK); | ||
222 | if (gettimeofday(&t_new, NULL) < 0) | ||
223 | do_error("gettimeofday failed", true); | ||
224 | delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + | ||
225 | (t_new.tv_usec - t0.tv_usec)/1000; | ||
226 | if (delta_ms > dur * 1000) | ||
227 | break; | ||
228 | delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + | ||
229 | (t_new.tv_usec - t_last.tv_usec); | ||
230 | if (delta_time == 0) | ||
231 | continue; | ||
232 | t_last = t_new; | ||
233 | fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", | ||
234 | "r"); | ||
235 | if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) | ||
236 | do_error("fscanf fails", false); | ||
237 | fclose(fin); | ||
238 | printf(" new_eth_tx_bytes:%llu\n", | ||
239 | new_eth_tx_bytes); | ||
240 | bpf_map_lookup_elem(map_fd, &key, &qstats); | ||
241 | new_cg_tx_bytes = qstats.bytes_total; | ||
242 | delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; | ||
243 | last_eth_tx_bytes = new_eth_tx_bytes; | ||
244 | delta_rate = (delta_bytes * 8000000) / delta_time; | ||
245 | printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", | ||
246 | delta_ms, delta_rate/1000000000.0, | ||
247 | rate/1000.0); | ||
248 | if (delta_rate < RATE_THRESHOLD) { | ||
249 | /* can increase cgroup rate limit, but first | ||
250 | * check if we are using the current limit. | ||
251 | * Currently increasing by 6.25%, unknown | ||
252 | * if that is the optimal rate. | ||
253 | */ | ||
254 | int rate_diff100; | ||
255 | |||
256 | delta_bytes = new_cg_tx_bytes - | ||
257 | last_cg_tx_bytes; | ||
258 | last_cg_tx_bytes = new_cg_tx_bytes; | ||
259 | delta_rate = (delta_bytes * 8000000) / | ||
260 | delta_time; | ||
261 | printf(" rate:%.3fGbps", | ||
262 | delta_rate/1000000000.0); | ||
263 | rate_diff100 = (((long long)rate)*1000000 - | ||
264 | delta_rate) * 100 / | ||
265 | (((long long) rate) * 1000000); | ||
266 | printf(" rdiff:%d", rate_diff100); | ||
267 | if (rate_diff100 <= 3) { | ||
268 | rate += (rate >> 4); | ||
269 | if (rate > RATE_THRESHOLD / 1000000) | ||
270 | rate = RATE_THRESHOLD / 1000000; | ||
271 | qstats.rate = rate; | ||
272 | printf(" INC\n"); | ||
273 | } else { | ||
274 | printf("\n"); | ||
275 | } | ||
276 | } else { | ||
277 | /* Need to decrease cgroup rate limit. | ||
278 | * Currently decreasing by 12.5%, unknown | ||
279 | * if that is optimal | ||
280 | */ | ||
281 | printf(" DEC\n"); | ||
282 | rate -= (rate >> 3); | ||
283 | if (rate < minRate) | ||
284 | rate = minRate; | ||
285 | qstats.rate = rate; | ||
286 | } | ||
287 | if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) | ||
288 | do_error("update map element fails", false); | ||
289 | } | ||
290 | } else { | ||
291 | sleep(dur); | ||
292 | } | ||
293 | // Get stats! | ||
294 | if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) { | ||
295 | char fname[100]; | ||
296 | FILE *fout; | ||
297 | |||
298 | if (!outFlag) | ||
299 | sprintf(fname, "hbm.%d.in", cg_id); | ||
300 | else | ||
301 | sprintf(fname, "hbm.%d.out", cg_id); | ||
302 | fout = fopen(fname, "w"); | ||
303 | fprintf(fout, "id:%d\n", cg_id); | ||
304 | fprintf(fout, "ERROR: Could not lookup queue_stats\n"); | ||
305 | } else if (stats_flag && qstats.lastPacketTime > | ||
306 | qstats.firstPacketTime) { | ||
307 | long long delta_us = (qstats.lastPacketTime - | ||
308 | qstats.firstPacketTime)/1000; | ||
309 | unsigned int rate_mbps = ((qstats.bytes_total - | ||
310 | qstats.bytes_dropped) * 8 / | ||
311 | delta_us); | ||
312 | double percent_pkts, percent_bytes; | ||
313 | char fname[100]; | ||
314 | FILE *fout; | ||
315 | |||
316 | // Future support of ingress | ||
317 | // if (!outFlag) | ||
318 | // sprintf(fname, "hbm.%d.in", cg_id); | ||
319 | // else | ||
320 | sprintf(fname, "hbm.%d.out", cg_id); | ||
321 | fout = fopen(fname, "w"); | ||
322 | fprintf(fout, "id:%d\n", cg_id); | ||
323 | fprintf(fout, "rate_mbps:%d\n", rate_mbps); | ||
324 | fprintf(fout, "duration:%.1f secs\n", | ||
325 | (qstats.lastPacketTime - qstats.firstPacketTime) / | ||
326 | 1000000000.0); | ||
327 | fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); | ||
328 | fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / | ||
329 | 1000000)); | ||
330 | fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); | ||
331 | fprintf(fout, "bytes_dropped_MB:%d\n", | ||
332 | (int)(qstats.bytes_dropped / | ||
333 | 1000000)); | ||
334 | // Marked Pkts and Bytes | ||
335 | percent_pkts = (qstats.pkts_marked * 100.0) / | ||
336 | (qstats.pkts_total + 1); | ||
337 | percent_bytes = (qstats.bytes_marked * 100.0) / | ||
338 | (qstats.bytes_total + 1); | ||
339 | fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); | ||
340 | fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); | ||
341 | |||
342 | // Dropped Pkts and Bytes | ||
343 | percent_pkts = (qstats.pkts_dropped * 100.0) / | ||
344 | (qstats.pkts_total + 1); | ||
345 | percent_bytes = (qstats.bytes_dropped * 100.0) / | ||
346 | (qstats.bytes_total + 1); | ||
347 | fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); | ||
348 | fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); | ||
349 | fclose(fout); | ||
350 | } | ||
351 | |||
352 | if (debugFlag) | ||
353 | read_trace_pipe2(); | ||
354 | return rc; | ||
355 | err: | ||
356 | rc = 1; | ||
357 | |||
358 | if (cg1) | ||
359 | close(cg1); | ||
360 | cleanup_cgroup_environment(); | ||
361 | |||
362 | return rc; | ||
363 | } | ||
364 | |||
/*
 * Print the command-line help text of the hbm loader to stdout.
 *
 * Takes no arguments and returns nothing; main() calls it for -h and
 * whenever option parsing fails.
 */
static void Usage(void)
{
	printf("This program loads a cgroup skb BPF program to enforce\n"
	       "cgroup output (egress) bandwidth limits.\n\n"
	       "USAGE: hbm [-o] [-d] [-l] [-n <id>] [-r <rate>] [-s]\n"
	       " [-t <secs>] [-w] [-h] [prog]\n"
	       " Where:\n"
	       " -o indicates egress direction (default)\n"
	       " -d print BPF trace debug buffer\n"
	       " -l also limit flows using loopback\n"
	       " -n <#> to create cgroup \"/hbm#\" and attach prog\n"
	       " Default is /hbm1\n"
	       " -r <rate> Rate in Mbps\n"
	       " -s Update HBM stats\n"
	       " -t <time> Exit after specified seconds (default is 0)\n"
	       " -w Work conserving flag. cgroup can increase\n"
	       " bandwidth beyond the rate limit specified\n"
	       " while there is available bandwidth. Current\n"
	       " implementation assumes there is only eth0\n"
	       " but can be extended to support multiple NICs\n"
	       " -h print this info\n"
	       " prog BPF program file name. Name defaults to\n"
	       " hbm_out_kern.o\n");
}
389 | |||
390 | int main(int argc, char **argv) | ||
391 | { | ||
392 | char *prog = "hbm_out_kern.o"; | ||
393 | int k; | ||
394 | int cg_id = 1; | ||
395 | char *optstring = "iodln:r:st:wh"; | ||
396 | |||
397 | while ((k = getopt(argc, argv, optstring)) != -1) { | ||
398 | switch (k) { | ||
399 | case'o': | ||
400 | break; | ||
401 | case 'd': | ||
402 | debugFlag = true; | ||
403 | break; | ||
404 | case 'l': | ||
405 | loopback_flag = true; | ||
406 | break; | ||
407 | case 'n': | ||
408 | cg_id = atoi(optarg); | ||
409 | break; | ||
410 | case 'r': | ||
411 | minRate = atoi(optarg) * 1.024; | ||
412 | rate = minRate; | ||
413 | break; | ||
414 | case 's': | ||
415 | stats_flag = true; | ||
416 | break; | ||
417 | case 't': | ||
418 | dur = atoi(optarg); | ||
419 | break; | ||
420 | case 'w': | ||
421 | work_conserving_flag = true; | ||
422 | break; | ||
423 | case '?': | ||
424 | if (optopt == 'n' || optopt == 'r' || optopt == 't') | ||
425 | fprintf(stderr, | ||
426 | "Option -%c requires an argument.\n\n", | ||
427 | optopt); | ||
428 | case 'h': | ||
429 | // fallthrough | ||
430 | default: | ||
431 | Usage(); | ||
432 | return 0; | ||
433 | } | ||
434 | } | ||
435 | |||
436 | if (optind < argc) | ||
437 | prog = argv[optind]; | ||
438 | printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); | ||
439 | |||
440 | return run_bpf_prog(prog, cg_id); | ||
441 | } | ||
diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h new file mode 100644 index 000000000000..518e8147d084 --- /dev/null +++ b/samples/bpf/hbm.h | |||
@@ -0,0 +1,31 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 | ||
2 | * | ||
3 | * Copyright (c) 2019 Facebook | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of version 2 of the GNU General Public | ||
7 | * License as published by the Free Software Foundation. | ||
8 | * | ||
9 | * Include file for Host Bandwidth Management (HBM) programs | ||
10 | */ | ||
/*
 * State of one virtual queue used by the HBM BPF program for rate limiting.
 * The BPF program reads and updates credit and lasttime while holding lock,
 * and treats lasttime == 0 as "not yet initialized".
 */
11 | struct hbm_vqueue { | ||
12 | struct bpf_spin_lock lock; | ||
13 | /* 4 byte hole */ | ||
/* Time of the last credit refill */
14 | unsigned long long lasttime; /* In ns */ | ||
/* Remaining credit; goes negative when sending faster than the rate */
15 | int credit; /* In bytes */ | ||
16 | unsigned int rate; /* In bytes per NS << 20 */ | ||
17 | }; | ||
18 | |||
/*
 * Control values and counters exchanged between the hbm user program and
 * the HBM BPF program through the queue_stats map.
 */
struct hbm_queue_stats {
	/* Rate limit, in Mbps */
	unsigned long rate;
	/* Set to have the BPF program maintain the counters below */
	unsigned long stats:1;
	/* Set to also limit flows using loopback */
	unsigned long loopback:1;
	/* Packets/bytes flagged as congested or dropped */
	unsigned long long pkts_marked;
	unsigned long long bytes_marked;
	/* Packets/bytes dropped */
	unsigned long long pkts_dropped;
	unsigned long long bytes_dropped;
	/* Packets/bytes seen */
	unsigned long long pkts_total;
	unsigned long long bytes_total;
	/* Timestamps of the first and the most recent accounted packet */
	unsigned long long firstPacketTime;
	unsigned long long lastPacketTime;
};
diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h new file mode 100644 index 000000000000..c5635d924193 --- /dev/null +++ b/samples/bpf/hbm_kern.h | |||
@@ -0,0 +1,137 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 | ||
2 | * | ||
3 | * Copyright (c) 2019 Facebook | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of version 2 of the GNU General Public | ||
7 | * License as published by the Free Software Foundation. | ||
8 | * | ||
9 | * Include file for sample Host Bandwidth Manager (HBM) BPF programs | ||
10 | */ | ||
11 | #define KBUILD_MODNAME "foo" | ||
12 | #include <stddef.h> | ||
13 | #include <stdbool.h> | ||
14 | #include <uapi/linux/bpf.h> | ||
15 | #include <uapi/linux/if_ether.h> | ||
16 | #include <uapi/linux/if_packet.h> | ||
17 | #include <uapi/linux/ip.h> | ||
18 | #include <uapi/linux/ipv6.h> | ||
19 | #include <uapi/linux/in.h> | ||
20 | #include <uapi/linux/tcp.h> | ||
21 | #include <uapi/linux/filter.h> | ||
22 | #include <uapi/linux/pkt_cls.h> | ||
23 | #include <net/ipv6.h> | ||
24 | #include <net/inet_ecn.h> | ||
25 | #include "bpf_endian.h" | ||
26 | #include "bpf_helpers.h" | ||
27 | #include "hbm.h" | ||
28 | |||
/* Verdicts returned by the cgroup skb program */
#define DROP_PKT	0
#define ALLOW_PKT	1

#define TCP_ECN_OK	1

/*
 * Debug tracing. With HBM_DEBUG set to 1, bpf_printk() forwards to
 * bpf_trace_printk(); otherwise it expands to nothing.
 */
#define HBM_DEBUG	0
#if HBM_DEBUG
#define bpf_printk(fmt, ...)					\
({								\
	char ____fmt[] = fmt;					\
	bpf_trace_printk(____fmt, sizeof(____fmt),		\
			 ##__VA_ARGS__);			\
})
#else
#define bpf_printk(fmt, ...)
#endif

/* Packet sizes, in bytes */
#define MAX_BYTES_PER_PACKET	1500
#define LARGE_PKT_THRESH	120

/* Credit handed to a new queue and the cap applied on refill, in bytes */
#define INITIAL_CREDIT_PACKETS	100
#define INIT_CREDIT	(INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET)
#define MAX_CREDIT	(100 * MAX_BYTES_PER_PACKET)

/*
 * Credit deficits (in bytes) compared against the negated credit:
 * marking starts at MARK_THRESH, packets longer than LARGE_PKT_THRESH are
 * dropped from LARGE_PKT_DROP_THRESH on, and everything from DROP_THRESH on.
 */
#define MARK_THRESH	(40 * MAX_BYTES_PER_PACKET)
#define DROP_THRESH	(80 * 5 * MAX_BYTES_PER_PACKET)
#define LARGE_PKT_DROP_THRESH	(DROP_THRESH - (15 * MAX_BYTES_PER_PACKET))
#define MARK_REGION_SIZE	(LARGE_PKT_DROP_THRESH - MARK_THRESH)

/* Credit earned over delta ns; rate is in bytes per ns << 20 */
#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
57 | |||
/*
 * Cgroup-storage map holding one struct hbm_vqueue (the virtual queue
 * state) as its value.
 */
58 | struct bpf_map_def SEC("maps") queue_state = { | ||
59 | .type = BPF_MAP_TYPE_CGROUP_STORAGE, | ||
60 | .key_size = sizeof(struct bpf_cgroup_storage_key), | ||
61 | .value_size = sizeof(struct hbm_vqueue), | ||
62 | }; | ||
/* Record the key and value types of queue_state */
63 | BPF_ANNOTATE_KV_PAIR(queue_state, struct bpf_cgroup_storage_key, | ||
64 | struct hbm_vqueue); | ||
65 | |||
/*
 * Single-entry array map holding one struct hbm_queue_stats: the rate and
 * flags plus the packet and byte counters.
 */
66 | struct bpf_map_def SEC("maps") queue_stats = { | ||
67 | .type = BPF_MAP_TYPE_ARRAY, | ||
68 | .key_size = sizeof(u32), | ||
69 | .value_size = sizeof(struct hbm_queue_stats), | ||
70 | .max_entries = 1, | ||
71 | }; | ||
/* Record the key and value types of queue_stats */
72 | BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats); | ||
73 | |||
/* Per-packet classification filled in by hbm_get_pkt_info() */
74 | struct hbm_pkt_info { | ||
/* True when the IP version field is 4 or 6 */
75 | bool is_ip; | ||
/* True when the protocol / next-header field is 6 */
76 | bool is_tcp; | ||
/* ECN bits of the IP header; 0 for non-IP packets */
77 | short ecn; | ||
78 | }; | ||
79 | |||
80 | static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, | ||
81 | struct hbm_pkt_info *pkti) | ||
82 | { | ||
83 | struct iphdr iph; | ||
84 | struct ipv6hdr *ip6h; | ||
85 | |||
86 | bpf_skb_load_bytes(skb, 0, &iph, 12); | ||
87 | if (iph.version == 6) { | ||
88 | ip6h = (struct ipv6hdr *)&iph; | ||
89 | pkti->is_ip = true; | ||
90 | pkti->is_tcp = (ip6h->nexthdr == 6); | ||
91 | pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK; | ||
92 | } else if (iph.version == 4) { | ||
93 | pkti->is_ip = true; | ||
94 | pkti->is_tcp = (iph.protocol == 6); | ||
95 | pkti->ecn = iph.tos & INET_ECN_MASK; | ||
96 | } else { | ||
97 | pkti->is_ip = false; | ||
98 | pkti->is_tcp = false; | ||
99 | pkti->ecn = 0; | ||
100 | } | ||
101 | } | ||
102 | |||
103 | static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) | ||
104 | { | ||
105 | bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); | ||
106 | qdp->lasttime = bpf_ktime_get_ns(); | ||
107 | qdp->credit = INIT_CREDIT; | ||
108 | qdp->rate = rate * 128; | ||
109 | } | ||
110 | |||
111 | static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, | ||
112 | int len, | ||
113 | unsigned long long curtime, | ||
114 | bool congestion_flag, | ||
115 | bool drop_flag) | ||
116 | { | ||
117 | if (qsp != NULL) { | ||
118 | // Following is needed for work conserving | ||
119 | __sync_add_and_fetch(&(qsp->bytes_total), len); | ||
120 | if (qsp->stats) { | ||
121 | // Optionally update statistics | ||
122 | if (qsp->firstPacketTime == 0) | ||
123 | qsp->firstPacketTime = curtime; | ||
124 | qsp->lastPacketTime = curtime; | ||
125 | __sync_add_and_fetch(&(qsp->pkts_total), 1); | ||
126 | if (congestion_flag || drop_flag) { | ||
127 | __sync_add_and_fetch(&(qsp->pkts_marked), 1); | ||
128 | __sync_add_and_fetch(&(qsp->bytes_marked), len); | ||
129 | } | ||
130 | if (drop_flag) { | ||
131 | __sync_add_and_fetch(&(qsp->pkts_dropped), 1); | ||
132 | __sync_add_and_fetch(&(qsp->bytes_dropped), | ||
133 | len); | ||
134 | } | ||
135 | } | ||
136 | } | ||
137 | } | ||
diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c new file mode 100644 index 000000000000..f806863d0b79 --- /dev/null +++ b/samples/bpf/hbm_out_kern.c | |||
@@ -0,0 +1,157 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* Copyright (c) 2019 Facebook | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of version 2 of the GNU General Public | ||
6 | * License as published by the Free Software Foundation. | ||
7 | * | ||
8 | * Sample Host Bandwidth Manager (HBM) BPF program. | ||
9 | * | ||
10 | * A cgroup skb BPF egress program to limit cgroup output bandwidth. | ||
11 | * It uses a modified virtual token bucket queue to limit average | ||
12 | * egress bandwidth. The implementation uses credits instead of tokens. | ||
13 | * Negative credits imply that queueing would have happened (this is | ||
14 | * a virtual queue, so no queueing is done by it. However, queueing may | ||
15 | * occur at the actual qdisc, which is not used for rate limiting). | ||
16 | * | ||
17 | * This implementation uses 3 thresholds, one to start marking packets and | ||
18 | * the other two to drop packets: | ||
19 | * CREDIT | ||
20 | * - <--------------------------|------------------------> + | ||
21 | * | | | 0 | ||
22 | * | Large pkt | | ||
23 | * | drop thresh | | ||
24 | * Small pkt drop Mark threshold | ||
25 | * thresh | ||
26 | * | ||
27 | * The effect of marking depends on the type of packet: | ||
28 | * a) If the packet is ECN enabled and it is a TCP packet, then the packet | ||
29 | * is ECN marked. | ||
30 | * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr | ||
31 | * to reduce the congestion window. The current implementation uses a linear | ||
32 | * distribution (0% probability at marking threshold, 100% probability | ||
33 | * at drop threshold). | ||
34 | * c) If the packet is not a TCP packet, then it is dropped. | ||
35 | * | ||
36 | * If the credit is below the drop threshold, the packet is dropped. If it | ||
37 | * is a TCP packet, then it also calls tcp_cwr since packets dropped by | ||
38 | * a cgroup skb BPF program do not automatically trigger a call to | ||
39 | * tcp_cwr in the current kernel code. | ||
40 | * | ||
41 | * This BPF program actually uses 2 drop thresholds, one threshold | ||
42 | * for larger packets (> 120 bytes) and another for smaller packets. This | ||
43 | * protects smaller packets such as SYNs, ACKs, etc. | ||
44 | * | ||
45 | * The default bandwidth limit is set at 1Gbps but this can be changed by | ||
46 | * a user program through a shared BPF map. In addition, by default this BPF | ||
47 | * program does not limit connections using loopback. This behavior can be | ||
48 | * overwritten by the user program. There is also an option to calculate | ||
49 | * some statistics, such as percent of packets marked or dropped, which | ||
50 | * the user program can access. | ||
51 | * | ||
52 | * A later patch provides such a program (hbm.c) | ||
53 | */ | ||
54 | |||
55 | #include "hbm_kern.h" | ||
56 | |||
57 | SEC("cgroup_skb/egress") | ||
58 | int _hbm_out_cg(struct __sk_buff *skb) | ||
59 | { | ||
60 | struct hbm_pkt_info pkti; | ||
61 | int len = skb->len; | ||
62 | unsigned int queue_index = 0; | ||
63 | unsigned long long curtime; | ||
64 | int credit; | ||
65 | signed long long delta = 0, zero = 0; | ||
66 | int max_credit = MAX_CREDIT; | ||
67 | bool congestion_flag = false; | ||
68 | bool drop_flag = false; | ||
69 | bool cwr_flag = false; | ||
70 | struct hbm_vqueue *qdp; | ||
71 | struct hbm_queue_stats *qsp = NULL; | ||
72 | int rv = ALLOW_PKT; | ||
73 | |||
74 | qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); | ||
75 | if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) | ||
76 | return ALLOW_PKT; | ||
77 | |||
78 | hbm_get_pkt_info(skb, &pkti); | ||
79 | |||
80 | // We may want to account for the length of headers in len | ||
81 | // calculation, like ETH header + overhead, specially if it | ||
82 | // is a gso packet. But I am not doing it right now. | ||
83 | |||
84 | qdp = bpf_get_local_storage(&queue_state, 0); | ||
85 | if (!qdp) | ||
86 | return ALLOW_PKT; | ||
87 | else if (qdp->lasttime == 0) | ||
88 | hbm_init_vqueue(qdp, 1024); | ||
89 | |||
90 | curtime = bpf_ktime_get_ns(); | ||
91 | |||
92 | // Begin critical section | ||
93 | bpf_spin_lock(&qdp->lock); | ||
94 | credit = qdp->credit; | ||
95 | delta = curtime - qdp->lasttime; | ||
96 | /* delta < 0 implies that another process with a curtime greater | ||
97 | * than ours beat us to the critical section and already added | ||
98 | * the new credit, so we should not add it ourselves | ||
99 | */ | ||
100 | if (delta > 0) { | ||
101 | qdp->lasttime = curtime; | ||
102 | credit += CREDIT_PER_NS(delta, qdp->rate); | ||
103 | if (credit > MAX_CREDIT) | ||
104 | credit = MAX_CREDIT; | ||
105 | } | ||
106 | credit -= len; | ||
107 | qdp->credit = credit; | ||
108 | bpf_spin_unlock(&qdp->lock); | ||
109 | // End critical section | ||
110 | |||
111 | // Check if we should update rate | ||
112 | if (qsp != NULL && (qsp->rate * 128) != qdp->rate) { | ||
113 | qdp->rate = qsp->rate * 128; | ||
114 | bpf_printk("Updating rate: %d (1sec:%llu bits)\n", | ||
115 | (int)qdp->rate, | ||
116 | CREDIT_PER_NS(1000000000, qdp->rate) * 8); | ||
117 | } | ||
118 | |||
119 | // Set flags (drop, congestion, cwr) | ||
120 | // Dropping => we are congested, so ignore congestion flag | ||
121 | if (credit < -DROP_THRESH || | ||
122 | (len > LARGE_PKT_THRESH && | ||
123 | credit < -LARGE_PKT_DROP_THRESH)) { | ||
124 | // Very congested, set drop flag | ||
125 | drop_flag = true; | ||
126 | } else if (credit < 0) { | ||
127 | // Congested, set congestion flag | ||
128 | if (pkti.ecn) { | ||
129 | if (credit < -MARK_THRESH) | ||
130 | congestion_flag = true; | ||
131 | else | ||
132 | congestion_flag = false; | ||
133 | } else { | ||
134 | congestion_flag = true; | ||
135 | } | ||
136 | } | ||
137 | |||
138 | if (congestion_flag) { | ||
139 | if (!bpf_skb_ecn_set_ce(skb)) { | ||
140 | if (len > LARGE_PKT_THRESH) { | ||
141 | // Problem if too many small packets? | ||
142 | drop_flag = true; | ||
143 | } | ||
144 | } | ||
145 | } | ||
146 | |||
147 | if (drop_flag) | ||
148 | rv = DROP_PKT; | ||
149 | |||
150 | hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag); | ||
151 | |||
152 | if (rv == DROP_PKT) | ||
153 | __sync_add_and_fetch(&(qdp->credit), len); | ||
154 | |||
155 | return rv; | ||
156 | } | ||
157 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2e308e90ffea..3c38ac9a92a7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h | |||
@@ -2359,6 +2359,13 @@ union bpf_attr { | |||
2359 | * Return | 2359 | * Return |
2360 | * A **struct bpf_tcp_sock** pointer on success, or NULL in | 2360 | * A **struct bpf_tcp_sock** pointer on success, or NULL in |
2361 | * case of failure. | 2361 | * case of failure. |
2362 | * | ||
2363 | * int bpf_skb_ecn_set_ce(struct sk_buff *skb) | ||
2364 | * Description | ||
2365 | * Sets ECN of IP header to ce (congestion encountered) if | ||
2366 | * current value is ect (ECN capable). Works with IPv6 and IPv4. | ||
2367 | * Return | ||
2368 | * 1 if set, 0 if not set. | ||
2362 | */ | 2369 | */ |
2363 | #define __BPF_FUNC_MAPPER(FN) \ | 2370 | #define __BPF_FUNC_MAPPER(FN) \ |
2364 | FN(unspec), \ | 2371 | FN(unspec), \ |
@@ -2457,7 +2464,8 @@ union bpf_attr { | |||
2457 | FN(spin_lock), \ | 2464 | FN(spin_lock), \ |
2458 | FN(spin_unlock), \ | 2465 | FN(spin_unlock), \ |
2459 | FN(sk_fullsock), \ | 2466 | FN(sk_fullsock), \ |
2460 | FN(tcp_sock), | 2467 | FN(tcp_sock), \ |
2468 | FN(skb_ecn_set_ce), | ||
2461 | 2469 | ||
2462 | /* integer value in 'imm' field of BPF_CALL instruction selects which helper | 2470 | /* integer value in 'imm' field of BPF_CALL instruction selects which helper |
2463 | * function eBPF program intends to call | 2471 | * function eBPF program intends to call |
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 026bea831e03..c9433a496d54 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h | |||
@@ -180,6 +180,8 @@ static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = | |||
180 | (void *) BPF_FUNC_sk_fullsock; | 180 | (void *) BPF_FUNC_sk_fullsock; |
181 | static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = | 181 | static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = |
182 | (void *) BPF_FUNC_tcp_sock; | 182 | (void *) BPF_FUNC_tcp_sock; |
183 | static int (*bpf_skb_ecn_set_ce)(void *ctx) = | ||
184 | (void *) BPF_FUNC_skb_ecn_set_ce; | ||
183 | 185 | ||
184 | /* llvm builtin functions that eBPF C program may use to | 186 | /* llvm builtin functions that eBPF C program may use to |
185 | * emit BPF_LD_ABS and BPF_LD_IND instructions | 187 | * emit BPF_LD_ABS and BPF_LD_IND instructions |