aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/uapi/linux/bpf.h10
-rw-r--r--net/core/filter.c28
-rw-r--r--samples/bpf/Makefile5
-rwxr-xr-xsamples/bpf/do_hbm_test.sh436
-rw-r--r--samples/bpf/hbm.c441
-rw-r--r--samples/bpf/hbm.h31
-rw-r--r--samples/bpf/hbm_kern.h137
-rw-r--r--samples/bpf/hbm_out_kern.c157
-rw-r--r--tools/include/uapi/linux/bpf.h10
-rw-r--r--tools/testing/selftests/bpf/bpf_helpers.h2
10 files changed, 1255 insertions, 2 deletions
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2e308e90ffea..3c38ac9a92a7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2359,6 +2359,13 @@ union bpf_attr {
2359 * Return 2359 * Return
2360 * A **struct bpf_tcp_sock** pointer on success, or NULL in 2360 * A **struct bpf_tcp_sock** pointer on success, or NULL in
2361 * case of failure. 2361 * case of failure.
2362 *
2363 * int bpf_skb_ecn_set_ce(struct sk_buff *skb)
2364 * Description
2365 * Sets ECN of IP header to ce (congestion encountered) if
2366 * current value is ect (ECN capable). Works with IPv6 and IPv4.
2367 * Return
2368 * 1 if set, 0 if not set.
2362 */ 2369 */
2363#define __BPF_FUNC_MAPPER(FN) \ 2370#define __BPF_FUNC_MAPPER(FN) \
2364 FN(unspec), \ 2371 FN(unspec), \
@@ -2457,7 +2464,8 @@ union bpf_attr {
2457 FN(spin_lock), \ 2464 FN(spin_lock), \
2458 FN(spin_unlock), \ 2465 FN(spin_unlock), \
2459 FN(sk_fullsock), \ 2466 FN(sk_fullsock), \
2460 FN(tcp_sock), 2467 FN(tcp_sock), \
2468 FN(skb_ecn_set_ce),
2461 2469
2462/* integer value in 'imm' field of BPF_CALL instruction selects which helper 2470/* integer value in 'imm' field of BPF_CALL instruction selects which helper
2463 * function eBPF program intends to call 2471 * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 85749f6ec789..558ca72f2254 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5426,6 +5426,32 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = {
5426 .arg1_type = ARG_PTR_TO_SOCK_COMMON, 5426 .arg1_type = ARG_PTR_TO_SOCK_COMMON,
5427}; 5427};
5428 5428
5429BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
5430{
5431 unsigned int iphdr_len;
5432
5433 if (skb->protocol == cpu_to_be16(ETH_P_IP))
5434 iphdr_len = sizeof(struct iphdr);
5435 else if (skb->protocol == cpu_to_be16(ETH_P_IPV6))
5436 iphdr_len = sizeof(struct ipv6hdr);
5437 else
5438 return 0;
5439
5440 if (skb_headlen(skb) < iphdr_len)
5441 return 0;
5442
5443 if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len))
5444 return 0;
5445
5446 return INET_ECN_set_ce(skb);
5447}
5448
5449static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
5450 .func = bpf_skb_ecn_set_ce,
5451 .gpl_only = false,
5452 .ret_type = RET_INTEGER,
5453 .arg1_type = ARG_PTR_TO_CTX,
5454};
5429#endif /* CONFIG_INET */ 5455#endif /* CONFIG_INET */
5430 5456
5431bool bpf_helper_changes_pkt_data(void *func) 5457bool bpf_helper_changes_pkt_data(void *func)
@@ -5585,6 +5611,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5585#ifdef CONFIG_INET 5611#ifdef CONFIG_INET
5586 case BPF_FUNC_tcp_sock: 5612 case BPF_FUNC_tcp_sock:
5587 return &bpf_tcp_sock_proto; 5613 return &bpf_tcp_sock_proto;
5614 case BPF_FUNC_skb_ecn_set_ce:
5615 return &bpf_skb_ecn_set_ce_proto;
5588#endif 5616#endif
5589 default: 5617 default:
5590 return sk_filter_func_proto(func_id, prog); 5618 return sk_filter_func_proto(func_id, prog);
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 0c62ac39c697..65e667bdf979 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -52,6 +52,7 @@ hostprogs-y += xdpsock
52hostprogs-y += xdp_fwd 52hostprogs-y += xdp_fwd
53hostprogs-y += task_fd_query 53hostprogs-y += task_fd_query
54hostprogs-y += xdp_sample_pkts 54hostprogs-y += xdp_sample_pkts
55hostprogs-y += hbm
55 56
56# Libbpf dependencies 57# Libbpf dependencies
57LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a 58LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -107,6 +108,7 @@ xdpsock-objs := xdpsock_user.o
107xdp_fwd-objs := xdp_fwd_user.o 108xdp_fwd-objs := xdp_fwd_user.o
108task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) 109task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
109xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) 110xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
111hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
110 112
111# Tell kbuild to always build the programs 113# Tell kbuild to always build the programs
112always := $(hostprogs-y) 114always := $(hostprogs-y)
@@ -164,6 +166,7 @@ always += xdp_adjust_tail_kern.o
164always += xdp_fwd_kern.o 166always += xdp_fwd_kern.o
165always += task_fd_query_kern.o 167always += task_fd_query_kern.o
166always += xdp_sample_pkts_kern.o 168always += xdp_sample_pkts_kern.o
169always += hbm_out_kern.o
167 170
168KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include 171KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
169KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ 172KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -263,6 +266,8 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF)
263$(src)/*.c: verify_target_bpf $(LIBBPF) 266$(src)/*.c: verify_target_bpf $(LIBBPF)
264 267
265$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h 268$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
269$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
270$(obj)/hbm.o: $(src)/hbm.h
266 271
267# asm/sysreg.h - inline assembly used by it is incompatible with llvm. 272# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
268# But, there is no easy way to fix it, so just exclude it since it is 273# But, there is no easy way to fix it, so just exclude it since it is
diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh
new file mode 100755
index 000000000000..56c8b4115c95
--- /dev/null
+++ b/samples/bpf/do_hbm_test.sh
@@ -0,0 +1,436 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# Copyright (c) 2019 Facebook
5#
6# This program is free software; you can redistribute it and/or
7# modify it under the terms of version 2 of the GNU General Public
8# License as published by the Free Software Foundation.
9
# Print the help text for this script and exit.
# Reads $name (the script name) for the USAGE line.
Usage() {
	echo "Script for testing HBM (Host Bandwidth Manager) framework."
	echo "It creates a cgroup to use for testing and loads a BPF program to limit"
	echo "egress or ingress bandwidth. It then uses iperf3 or netperf to create"
	echo "loads. The output is the goodput in Mbps (unless -D was used)."
	echo ""
	echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]"
	echo "             [-d=<delay>|--delay=<delay>] [--debug] [-E]"
	echo "             [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id>]"
	echo "             [-l] [-N] [-p=<port>|--port=<port>] [-P]"
	echo "             [-q=<qdisc>] [-r=<rate>|--rate=<rate>] [-R]"
	echo "             [-s=<server>|--server=<server>] [-S|--stats]"
	echo "             [-t=<time>|--time=<time>] [-w] [cubic|dctcp]"
	echo "  Where:"
	echo "    out               egress (default)"
	echo "    -b or --bpf       BPF program filename to load and attach."
	echo "                      Default is hbm_out_kern.o for egress."
	echo "    -c or --cc        TCP congestion control (cubic or dctcp)"
	echo "    --debug           print BPF trace buffer"
	echo "    -d or --delay     add a delay in ms using netem"
	echo "    -D                In addition to the goodput in Mbps, it also outputs"
	echo "                      other detailed information. This information is"
	echo "                      test dependent (i.e. iperf3 or netperf)."
	echo "    -E                enable ECN (not required for dctcp)"
	echo "    -f or --flows     number of concurrent flows (default=1)"
	echo "    -i or --id        cgroup id (an integer, default is 1)"
	echo "    -N                use netperf instead of iperf3"
	echo "    -l                do not limit flows using loopback"
	echo "    -h                Help"
	echo "    -p or --port      iperf3 port (default is 5201)"
	echo "    -P                use an iperf3 instance for each flow"
	echo "    -q                use the specified qdisc"
	echo "    -r or --rate      rate in Mbps (default is 1Gbps)"
	echo "    -R                Use TCP_RR for netperf. 1st flow has req"
	echo "                      size of 10KB, rest of 1MB. Reply in all"
	echo "                      cases is 1 byte."
	echo "                      More detailed output for each flow can be found"
	echo "                      in the files netperf.<cg>.<flow>, where <cg> is the"
	echo "                      cgroup id as specified with the -i flag, and <flow>"
	echo "                      is the flow id starting at 1 and increasing by 1 for"
	echo "                      each flow (as specified by -f)."
	echo "    -s or --server    hostname of netperf server. Used to create netperf"
	echo "                      test traffic between two hosts (default is within host)"
	echo "                      netserver must be running on the host."
	echo "    -S or --stats     update hbm stats (default is no)."
	echo "    -t or --time      duration of iperf3 in seconds (default=5)"
	echo "    -w                Work conserving flag. cgroup can increase its"
	echo "                      bandwidth beyond the rate limit specified"
	echo "                      while there is available bandwidth. Current"
	echo "                      implementation assumes there is only one NIC"
	echo "                      (eth0), but can be extended to support multiple"
	echo "                      NICs."
	echo "    cubic or dctcp    specify which TCP CC to use"
	echo " "
	exit
}
65
66#set -x
67
68debug_flag=0
69args="$@"
70name="$0"
71netem=0
72cc=x
73dir="-o"
74dir_name="out"
75dur=5
76flows=1
77id=1
78prog=""
79port=5201
80rate=1000
81multi_iperf=0
82flow_cnt=1
83use_netperf=0
84rr=0
85ecn=0
86details=0
87server=""
88qdisc=""
89flags=""
90do_stats=0
91
# Start the hbm loader in the background and print its PID on stdout.
# The exact command line is recorded as the first line of hbm.out, followed
# by everything hbm writes to stdout/stderr.
start_hbm () {
	hbm_cmd="./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog"

	rm -f hbm.out
	echo "$hbm_cmd" > hbm.out
	echo " " >> hbm.out
	# Intentionally unquoted: the string must be split back into words.
	$hbm_cmd >> hbm.out 2>&1 &
	echo $!
}
99
# Walk the saved command line ($args) and set the script's global option
# variables. Options that hbm itself understands (-d, -l, -s, -w) are
# accumulated in $flags. Any unrecognised word prints the help and exits.
processArgs () {
	for arg in $args ; do
		case $arg in
		# Support for upcoming ingress rate limiting
		#in)
		#	dir="-i"
		#	dir_name="in"
		#	;;
		out)
			dir="-o"
			dir_name="out"
			;;
		-b=*|--bpf=*)
			prog="${arg#*=}"
			;;
		-c=*|--cc=*)
			cc="${arg#*=}"
			;;
		--debug)
			flags="$flags -d"
			debug_flag=1
			;;
		-d=*|--delay=*)
			netem="${arg#*=}"
			;;
		-D)
			details=1
			;;
		-E)
			ecn=1
			;;
		# Support for upcoming fq Early Departure Time egress rate limiting
		#--edt)
		#	prog="hbm_out_edt_kern.o"
		#	qdisc="fq"
		#	;;
		-f=*|--flows=*)
			flows="${arg#*=}"
			;;
		-i=*|--id=*)
			id="${arg#*=}"
			;;
		-l)
			flags="$flags -l"
			;;
		-N)
			use_netperf=1
			;;
		-p=*|--port=*)
			port="${arg#*=}"
			;;
		-P)
			multi_iperf=1
			;;
		-q=*)
			qdisc="${arg#*=}"
			;;
		-r=*|--rate=*)
			rate="${arg#*=}"
			;;
		-R)
			rr=1
			;;
		-s=*|--server=*)
			server="${arg#*=}"
			;;
		-S|--stats)
			flags="$flags -s"
			do_stats=1
			;;
		-t=*|--time=*)
			dur="${arg#*=}"
			;;
		-w)
			flags="$flags -w"
			;;
		cubic|dctcp)
			# Bare congestion-control name: same effect as -c=<name>
			cc=$arg
			;;
		*)
			echo "Unknown arg:$arg"
			Usage
			;;
		esac
	done
}
189
190processArgs
191
192if [ $debug_flag -eq 1 ] ; then
193 rm -f hbm_out.log
194fi
195
196hbm_pid=$(start_hbm)
197usleep 100000
198
199host=`hostname`
200cg_base_dir=/sys/fs/cgroup
201cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id"
202
203echo $$ >> $cg_dir/cgroup.procs
204
205ulimit -l unlimited
206
207rm -f ss.out
208rm -f hbm.[0-9]*.$dir_name
209if [ $ecn -ne 0 ] ; then
210 sysctl -w -q -n net.ipv4.tcp_ecn=1
211fi
212
213if [ $use_netperf -eq 0 ] ; then
214 cur_cc=`sysctl -n net.ipv4.tcp_congestion_control`
215 if [ "$cc" != "x" ] ; then
216 sysctl -w -q -n net.ipv4.tcp_congestion_control=$cc
217 fi
218fi
219
220if [ "$netem" -ne "0" ] ; then
221 if [ "$qdisc" != "" ] ; then
222 echo "WARNING: Ignoring -q options because -d option used"
223 fi
224 tc qdisc del dev lo root > /dev/null 2>&1
225 tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1
226elif [ "$qdisc" != "" ] ; then
227 tc qdisc del dev lo root > /dev/null 2>&1
228 tc qdisc add dev lo root $qdisc > /dev/null 2>&1
229fi
230
231n=0
232m=$[$dur * 5]
233hn="::1"
234if [ $use_netperf -ne 0 ] ; then
235 if [ "$server" != "" ] ; then
236 hn=$server
237 fi
238fi
239
240( ping6 -i 0.2 -c $m $hn > ping.out 2>&1 ) &
241
242if [ $use_netperf -ne 0 ] ; then
243 begNetserverPid=`ps ax | grep netserver | grep --invert-match "grep" | \
244 awk '{ print $1 }'`
245 if [ "$begNetserverPid" == "" ] ; then
246 if [ "$server" == "" ] ; then
247 ( ./netserver > /dev/null 2>&1) &
248 usleep 100000
249 fi
250 fi
251 flow_cnt=1
252 if [ "$server" == "" ] ; then
253 np_server=$host
254 else
255 np_server=$server
256 fi
257 if [ "$cc" == "x" ] ; then
258 np_cc=""
259 else
260 np_cc="-K $cc,$cc"
261 fi
262 replySize=1
263 while [ $flow_cnt -le $flows ] ; do
264 if [ $rr -ne 0 ] ; then
265 reqSize=1M
266 if [ $flow_cnt -eq 1 ] ; then
267 reqSize=10K
268 fi
269 if [ "$dir" == "-i" ] ; then
270 replySize=$reqSize
271 reqSize=1
272 fi
273 ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r $reqSize,$replySize $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
274 else
275 if [ "$dir" == "-i" ] ; then
276 ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r 1,10M $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
277 else
278 ( ./netperf -H $np_server -l $dur -f m -j -t TCP_STREAM -- $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
279 fi
280 fi
281 flow_cnt=$[flow_cnt+1]
282 done
283
284# sleep for duration of test (plus some buffer)
285 n=$[dur+2]
286 sleep $n
287
288# force graceful termination of netperf
289 pids=`pgrep netperf`
290 for p in $pids ; do
291 kill -SIGALRM $p
292 done
293
294 flow_cnt=1
295 rate=0
296 if [ $details -ne 0 ] ; then
297 echo ""
298 echo "Details for HBM in cgroup $id"
299 if [ $do_stats -eq 1 ] ; then
300 if [ -e hbm.$id.$dir_name ] ; then
301 cat hbm.$id.$dir_name
302 fi
303 fi
304 fi
305 while [ $flow_cnt -le $flows ] ; do
306 if [ "$dir" == "-i" ] ; then
307 r=`cat netperf.$id.$flow_cnt | grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"`
308 else
309 r=`cat netperf.$id.$flow_cnt | grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"`
310 fi
311 echo "rate for flow $flow_cnt: $r"
312 rate=$[rate+r]
313 if [ $details -ne 0 ] ; then
314 echo "-----"
315 echo "Details for cgroup $id, flow $flow_cnt"
316 cat netperf.$id.$flow_cnt
317 fi
318 flow_cnt=$[flow_cnt+1]
319 done
320 if [ $details -ne 0 ] ; then
321 echo ""
322 delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
323 echo "PING AVG DELAY:$delay"
324 echo "AGGREGATE_GOODPUT:$rate"
325 else
326 echo $rate
327 fi
328elif [ $multi_iperf -eq 0 ] ; then
329 (iperf3 -s -p $port -1 > /dev/null 2>&1) &
330 usleep 100000
331 iperf3 -c $host -p $port -i 0 -P $flows -f m -t $dur > iperf.$id
332 rates=`grep receiver iperf.$id | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*"`
333 rate=`echo $rates | grep -o "[0-9]*$"`
334
335 if [ $details -ne 0 ] ; then
336 echo ""
337 echo "Details for HBM in cgroup $id"
338 if [ $do_stats -eq 1 ] ; then
339 if [ -e hbm.$id.$dir_name ] ; then
340 cat hbm.$id.$dir_name
341 fi
342 fi
343 delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
344 echo "PING AVG DELAY:$delay"
345 echo "AGGREGATE_GOODPUT:$rate"
346 else
347 echo $rate
348 fi
349else
350 flow_cnt=1
351 while [ $flow_cnt -le $flows ] ; do
352 (iperf3 -s -p $port -1 > /dev/null 2>&1) &
353 ( iperf3 -c $host -p $port -i 0 -P 1 -f m -t $dur | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" | grep -o "[0-9]*$" > iperf3.$id.$flow_cnt ) &
354 port=$[port+1]
355 flow_cnt=$[flow_cnt+1]
356 done
357 n=$[dur+1]
358 sleep $n
359 flow_cnt=1
360 rate=0
361 if [ $details -ne 0 ] ; then
362 echo ""
363 echo "Details for HBM in cgroup $id"
364 if [ $do_stats -eq 1 ] ; then
365 if [ -e hbm.$id.$dir_name ] ; then
366 cat hbm.$id.$dir_name
367 fi
368 fi
369 fi
370
371 while [ $flow_cnt -le $flows ] ; do
372 r=`cat iperf3.$id.$flow_cnt`
373# echo "rate for flow $flow_cnt: $r"
374 if [ $details -ne 0 ] ; then
375 echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r"
376 fi
377 rate=$[rate+r]
378 flow_cnt=$[flow_cnt+1]
379 done
380 if [ $details -ne 0 ] ; then
381 delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
382 echo "PING AVG DELAY:$delay"
383 echo "AGGREGATE_GOODPUT:$rate"
384 else
385 echo $rate
386 fi
387fi
388
389if [ $use_netperf -eq 0 ] ; then
390 sysctl -w -q -n net.ipv4.tcp_congestion_control=$cur_cc
391fi
392if [ $ecn -ne 0 ] ; then
393 sysctl -w -q -n net.ipv4.tcp_ecn=0
394fi
395if [ "$netem" -ne "0" ] ; then
396 tc qdisc del dev lo root > /dev/null 2>&1
397fi
398
399sleep 2
400
401hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'`
402if [ "$hbmPid" == "$hbm_pid" ] ; then
403 kill $hbm_pid
404fi
405
406sleep 1
407
408# Detach any BPF programs that may have lingered
409ttx=`bpftool cgroup tree | grep hbm`
410v=2
411for x in $ttx ; do
412 if [ "${x:0:36}" == "/sys/fs/cgroup/cgroup-test-work-dir/" ] ; then
413 cg=$x ; v=0
414 else
415 if [ $v -eq 0 ] ; then
416 id=$x ; v=1
417 else
418 if [ $v -eq 1 ] ; then
419 type=$x ; bpftool cgroup detach $cg $type id $id
420 v=0
421 fi
422 fi
423 fi
424done
425
426if [ $use_netperf -ne 0 ] ; then
427 if [ "$server" == "" ] ; then
428 if [ "$begNetserverPid" == "" ] ; then
429 netserverPid=`ps ax | grep netserver | grep --invert-match "grep" | awk '{ print $1 }'`
430 if [ "$netserverPid" != "" ] ; then
431 kill $netserverPid
432 fi
433 fi
434 fi
435fi
436exit
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
new file mode 100644
index 000000000000..8408ccb7409f
--- /dev/null
+++ b/samples/bpf/hbm.c
@@ -0,0 +1,441 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2019 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * Example program for Host Bandwidth Management
9 *
10 * This program loads a cgroup skb BPF program to enforce cgroup output
11 * (egress) or input (ingress) bandwidth limits.
12 *
13 * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog]
14 * Where:
15 * -d Print BPF trace debug buffer
16 * -l Also limit flows doing loopback
17 * -n <#> To create cgroup \"/hbm#\" and attach prog
18 * Default is /hbm1
19 * -r <rate> Rate limit in Mbps
20 * -s Get HBM stats (marked, dropped, etc.)
21 * -t <time> Exit after specified seconds (default is 1)
22 * -w Work conserving flag. cgroup can increase its bandwidth
23 * beyond the rate limit specified while there is available
24 * bandwidth. Current implementation assumes there is only one
25 * NIC (eth0), but can be extended to support multiple NICs.
26 * Currently only supported for egress.
27 * -h Print this info
28 * prog BPF program file name. Name defaults to hbm_out_kern.o
29 */
30
31#define _GNU_SOURCE
32
33#include <stdio.h>
34#include <stdlib.h>
35#include <assert.h>
36#include <sys/resource.h>
37#include <sys/time.h>
38#include <unistd.h>
39#include <errno.h>
40#include <fcntl.h>
41#include <linux/unistd.h>
42
43#include <linux/bpf.h>
44#include <bpf/bpf.h>
45
46#include "bpf_load.h"
47#include "bpf_rlimit.h"
48#include "cgroup_helpers.h"
49#include "hbm.h"
50#include "bpf_util.h"
51#include "bpf/bpf.h"
52#include "bpf/libbpf.h"
53
54bool outFlag = true;
55int minRate = 1000; /* cgroup rate limit in Mbps */
56int rate = 1000; /* can grow if rate conserving is enabled */
57int dur = 1;
58bool stats_flag;
59bool loopback_flag;
60bool debugFlag;
61bool work_conserving_flag;
62
63static void Usage(void);
64static void read_trace_pipe2(void);
65static void do_error(char *msg, bool errno_flag);
66
67#define DEBUGFS "/sys/kernel/debug/tracing/"
68
69struct bpf_object *obj;
70int bpfprog_fd;
71int cgroup_storage_fd;
72
73static void read_trace_pipe2(void)
74{
75 int trace_fd;
76 FILE *outf;
77 char *outFname = "hbm_out.log";
78
79 trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
80 if (trace_fd < 0) {
81 printf("Error opening trace_pipe\n");
82 return;
83 }
84
85// Future support of ingress
86// if (!outFlag)
87// outFname = "hbm_in.log";
88 outf = fopen(outFname, "w");
89
90 if (outf == NULL)
91 printf("Error creating %s\n", outFname);
92
93 while (1) {
94 static char buf[4097];
95 ssize_t sz;
96
97 sz = read(trace_fd, buf, sizeof(buf) - 1);
98 if (sz > 0) {
99 buf[sz] = 0;
100 puts(buf);
101 if (outf != NULL) {
102 fprintf(outf, "%s\n", buf);
103 fflush(outf);
104 }
105 }
106 }
107}
108
/*
 * Print "ERROR: <msg>" on stdout and terminate the process with status 1.
 * When @errno_flag is true the current errno value is appended.
 */
static void do_error(char *msg, bool errno_flag)
{
	if (!errno_flag)
		printf("ERROR: %s\n", msg);
	else
		printf("ERROR: %s, errno: %d\n", msg, errno);

	exit(1);
}
117
118static int prog_load(char *prog)
119{
120 struct bpf_prog_load_attr prog_load_attr = {
121 .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
122 .file = prog,
123 .expected_attach_type = BPF_CGROUP_INET_EGRESS,
124 };
125 int map_fd;
126 struct bpf_map *map;
127
128 int ret = 0;
129
130 if (access(prog, O_RDONLY) < 0) {
131 printf("Error accessing file %s: %s\n", prog, strerror(errno));
132 return 1;
133 }
134 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd))
135 ret = 1;
136 if (!ret) {
137 map = bpf_object__find_map_by_name(obj, "queue_stats");
138 map_fd = bpf_map__fd(map);
139 if (map_fd < 0) {
140 printf("Map not found: %s\n", strerror(map_fd));
141 ret = 1;
142 }
143 }
144
145 if (ret) {
146 printf("ERROR: load_bpf_file failed for: %s\n", prog);
147 printf(" Output from verifier:\n%s\n------\n", bpf_log_buf);
148 ret = -1;
149 } else {
150 ret = map_fd;
151 }
152
153 return ret;
154}
155
156static int run_bpf_prog(char *prog, int cg_id)
157{
158 int map_fd;
159 int rc = 0;
160 int key = 0;
161 int cg1 = 0;
162 int type = BPF_CGROUP_INET_EGRESS;
163 char cg_dir[100];
164 struct hbm_queue_stats qstats = {0};
165
166 sprintf(cg_dir, "/hbm%d", cg_id);
167 map_fd = prog_load(prog);
168 if (map_fd == -1)
169 return 1;
170
171 if (setup_cgroup_environment()) {
172 printf("ERROR: setting cgroup environment\n");
173 goto err;
174 }
175 cg1 = create_and_get_cgroup(cg_dir);
176 if (!cg1) {
177 printf("ERROR: create_and_get_cgroup\n");
178 goto err;
179 }
180 if (join_cgroup(cg_dir)) {
181 printf("ERROR: join_cgroup\n");
182 goto err;
183 }
184
185 qstats.rate = rate;
186 qstats.stats = stats_flag ? 1 : 0;
187 qstats.loopback = loopback_flag ? 1 : 0;
188 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) {
189 printf("ERROR: Could not update map element\n");
190 goto err;
191 }
192
193 if (!outFlag)
194 type = BPF_CGROUP_INET_INGRESS;
195 if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) {
196 printf("ERROR: bpf_prog_attach fails!\n");
197 log_err("Attaching prog");
198 goto err;
199 }
200
201 if (work_conserving_flag) {
202 struct timeval t0, t_last, t_new;
203 FILE *fin;
204 unsigned long long last_eth_tx_bytes, new_eth_tx_bytes;
205 signed long long last_cg_tx_bytes, new_cg_tx_bytes;
206 signed long long delta_time, delta_bytes, delta_rate;
207 int delta_ms;
208#define DELTA_RATE_CHECK 10000 /* in us */
209#define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */
210
211 bpf_map_lookup_elem(map_fd, &key, &qstats);
212 if (gettimeofday(&t0, NULL) < 0)
213 do_error("gettimeofday failed", true);
214 t_last = t0;
215 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r");
216 if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1)
217 do_error("fscanf fails", false);
218 fclose(fin);
219 last_cg_tx_bytes = qstats.bytes_total;
220 while (true) {
221 usleep(DELTA_RATE_CHECK);
222 if (gettimeofday(&t_new, NULL) < 0)
223 do_error("gettimeofday failed", true);
224 delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 +
225 (t_new.tv_usec - t0.tv_usec)/1000;
226 if (delta_ms > dur * 1000)
227 break;
228 delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 +
229 (t_new.tv_usec - t_last.tv_usec);
230 if (delta_time == 0)
231 continue;
232 t_last = t_new;
233 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes",
234 "r");
235 if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1)
236 do_error("fscanf fails", false);
237 fclose(fin);
238 printf(" new_eth_tx_bytes:%llu\n",
239 new_eth_tx_bytes);
240 bpf_map_lookup_elem(map_fd, &key, &qstats);
241 new_cg_tx_bytes = qstats.bytes_total;
242 delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes;
243 last_eth_tx_bytes = new_eth_tx_bytes;
244 delta_rate = (delta_bytes * 8000000) / delta_time;
245 printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps",
246 delta_ms, delta_rate/1000000000.0,
247 rate/1000.0);
248 if (delta_rate < RATE_THRESHOLD) {
249 /* can increase cgroup rate limit, but first
250 * check if we are using the current limit.
251 * Currently increasing by 6.25%, unknown
252 * if that is the optimal rate.
253 */
254 int rate_diff100;
255
256 delta_bytes = new_cg_tx_bytes -
257 last_cg_tx_bytes;
258 last_cg_tx_bytes = new_cg_tx_bytes;
259 delta_rate = (delta_bytes * 8000000) /
260 delta_time;
261 printf(" rate:%.3fGbps",
262 delta_rate/1000000000.0);
263 rate_diff100 = (((long long)rate)*1000000 -
264 delta_rate) * 100 /
265 (((long long) rate) * 1000000);
266 printf(" rdiff:%d", rate_diff100);
267 if (rate_diff100 <= 3) {
268 rate += (rate >> 4);
269 if (rate > RATE_THRESHOLD / 1000000)
270 rate = RATE_THRESHOLD / 1000000;
271 qstats.rate = rate;
272 printf(" INC\n");
273 } else {
274 printf("\n");
275 }
276 } else {
277 /* Need to decrease cgroup rate limit.
278 * Currently decreasing by 12.5%, unknown
279 * if that is optimal
280 */
281 printf(" DEC\n");
282 rate -= (rate >> 3);
283 if (rate < minRate)
284 rate = minRate;
285 qstats.rate = rate;
286 }
287 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY))
288 do_error("update map element fails", false);
289 }
290 } else {
291 sleep(dur);
292 }
293 // Get stats!
294 if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) {
295 char fname[100];
296 FILE *fout;
297
298 if (!outFlag)
299 sprintf(fname, "hbm.%d.in", cg_id);
300 else
301 sprintf(fname, "hbm.%d.out", cg_id);
302 fout = fopen(fname, "w");
303 fprintf(fout, "id:%d\n", cg_id);
304 fprintf(fout, "ERROR: Could not lookup queue_stats\n");
305 } else if (stats_flag && qstats.lastPacketTime >
306 qstats.firstPacketTime) {
307 long long delta_us = (qstats.lastPacketTime -
308 qstats.firstPacketTime)/1000;
309 unsigned int rate_mbps = ((qstats.bytes_total -
310 qstats.bytes_dropped) * 8 /
311 delta_us);
312 double percent_pkts, percent_bytes;
313 char fname[100];
314 FILE *fout;
315
316// Future support of ingress
317// if (!outFlag)
318// sprintf(fname, "hbm.%d.in", cg_id);
319// else
320 sprintf(fname, "hbm.%d.out", cg_id);
321 fout = fopen(fname, "w");
322 fprintf(fout, "id:%d\n", cg_id);
323 fprintf(fout, "rate_mbps:%d\n", rate_mbps);
324 fprintf(fout, "duration:%.1f secs\n",
325 (qstats.lastPacketTime - qstats.firstPacketTime) /
326 1000000000.0);
327 fprintf(fout, "packets:%d\n", (int)qstats.pkts_total);
328 fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total /
329 1000000));
330 fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped);
331 fprintf(fout, "bytes_dropped_MB:%d\n",
332 (int)(qstats.bytes_dropped /
333 1000000));
334 // Marked Pkts and Bytes
335 percent_pkts = (qstats.pkts_marked * 100.0) /
336 (qstats.pkts_total + 1);
337 percent_bytes = (qstats.bytes_marked * 100.0) /
338 (qstats.bytes_total + 1);
339 fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts);
340 fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes);
341
342 // Dropped Pkts and Bytes
343 percent_pkts = (qstats.pkts_dropped * 100.0) /
344 (qstats.pkts_total + 1);
345 percent_bytes = (qstats.bytes_dropped * 100.0) /
346 (qstats.bytes_total + 1);
347 fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
348 fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
349 fclose(fout);
350 }
351
352 if (debugFlag)
353 read_trace_pipe2();
354 return rc;
355err:
356 rc = 1;
357
358 if (cg1)
359 close(cg1);
360 cleanup_cgroup_environment();
361
362 return rc;
363}
364
/* Print the command-line help for hbm on stdout. */
static void Usage(void)
{
	printf("This program loads a cgroup skb BPF program to enforce\n"
	       "cgroup output (egress) bandwidth limits.\n\n"
	       "USAGE: hbm [-o] [-d] [-l] [-n <id>] [-r <rate>] [-s]\n"
	       "           [-t <secs>] [-w] [-h] [prog]\n"
	       "  Where:\n"
	       "    -o         indicates egress direction (default)\n"
	       "    -d         print BPF trace debug buffer\n"
	       "    -l         also limit flows using loopback\n"
	       "    -n <#>     to create cgroup \"/hbm#\" and attach prog\n"
	       "               Default is /hbm1\n"
	       "    -r <rate>  Rate in Mbps\n"
	       "    -s         Update HBM stats\n"
	       "    -t <time>  Exit after specified seconds (default is 1)\n"
	       "    -w         Work conserving flag. cgroup can increase\n"
	       "               bandwidth beyond the rate limit specified\n"
	       "               while there is available bandwidth. Current\n"
	       "               implementation assumes there is only eth0\n"
	       "               but can be extended to support multiple NICs\n"
	       "    -h         print this info\n"
	       "    prog       BPF program file name. Name defaults to\n"
	       "               hbm_out_kern.o\n");
}
389
390int main(int argc, char **argv)
391{
392 char *prog = "hbm_out_kern.o";
393 int k;
394 int cg_id = 1;
395 char *optstring = "iodln:r:st:wh";
396
397 while ((k = getopt(argc, argv, optstring)) != -1) {
398 switch (k) {
399 case'o':
400 break;
401 case 'd':
402 debugFlag = true;
403 break;
404 case 'l':
405 loopback_flag = true;
406 break;
407 case 'n':
408 cg_id = atoi(optarg);
409 break;
410 case 'r':
411 minRate = atoi(optarg) * 1.024;
412 rate = minRate;
413 break;
414 case 's':
415 stats_flag = true;
416 break;
417 case 't':
418 dur = atoi(optarg);
419 break;
420 case 'w':
421 work_conserving_flag = true;
422 break;
423 case '?':
424 if (optopt == 'n' || optopt == 'r' || optopt == 't')
425 fprintf(stderr,
426 "Option -%c requires an argument.\n\n",
427 optopt);
428 case 'h':
429 // fallthrough
430 default:
431 Usage();
432 return 0;
433 }
434 }
435
436 if (optind < argc)
437 prog = argv[optind];
438 printf("HBM prog: %s\n", prog != NULL ? prog : "NULL");
439
440 return run_bpf_prog(prog, cg_id);
441}
diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h
new file mode 100644
index 000000000000..518e8147d084
--- /dev/null
+++ b/samples/bpf/hbm.h
@@ -0,0 +1,31 @@
1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * Copyright (c) 2019 Facebook
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public
7 * License as published by the Free Software Foundation.
8 *
9 * Include file for Host Bandwidth Management (HBM) programs
10 */
11struct hbm_vqueue {
12 struct bpf_spin_lock lock;
13 /* 4 byte hole */
14 unsigned long long lasttime; /* In ns */
15 int credit; /* In bytes */
16 unsigned int rate; /* In bytes per NS << 20 */
17};
18
/*
 * Configuration and counters exchanged between the hbm loader and the BPF
 * program through the "queue_stats" map.
 * NOTE(review): "unsigned long" members have the same size on both sides
 * only if the loader is built 64-bit - confirm for 32-bit hosts.
 */
struct hbm_queue_stats {
	unsigned long		rate;		/* in Mbps*/
	unsigned long		stats:1,	/* get HBM stats (marked, dropped,..) */
				loopback:1;	/* also limit flows using loopback */
	unsigned long long	pkts_marked;
	unsigned long long	bytes_marked;
	unsigned long long	pkts_dropped;
	unsigned long long	bytes_dropped;
	unsigned long long	pkts_total;
	unsigned long long	bytes_total;
	unsigned long long	firstPacketTime;
	unsigned long long	lastPacketTime;
};
diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h
new file mode 100644
index 000000000000..c5635d924193
--- /dev/null
+++ b/samples/bpf/hbm_kern.h
@@ -0,0 +1,137 @@
1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * Copyright (c) 2019 Facebook
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public
7 * License as published by the Free Software Foundation.
8 *
9 * Include file for sample Host Bandwidth Manager (HBM) BPF programs
10 */
11#define KBUILD_MODNAME "foo"
12#include <stddef.h>
13#include <stdbool.h>
14#include <uapi/linux/bpf.h>
15#include <uapi/linux/if_ether.h>
16#include <uapi/linux/if_packet.h>
17#include <uapi/linux/ip.h>
18#include <uapi/linux/ipv6.h>
19#include <uapi/linux/in.h>
20#include <uapi/linux/tcp.h>
21#include <uapi/linux/filter.h>
22#include <uapi/linux/pkt_cls.h>
23#include <net/ipv6.h>
24#include <net/inet_ecn.h>
25#include "bpf_endian.h"
26#include "bpf_helpers.h"
27#include "hbm.h"
28
/* Verdicts returned by the sample's cgroup skb program (_hbm_out_cg()):
 * DROP_PKT when the packet is to be dropped, ALLOW_PKT otherwise.
 */
29#define DROP_PKT 0
30#define ALLOW_PKT 1
31#define TCP_ECN_OK 1 /* NOTE(review): not referenced by the code visible here */
32
33#define HBM_DEBUG 0 // Set to 1 to enable debugging
34#if HBM_DEBUG
/* Debug build: copy the format string into a local array and hand it,
 * together with its size and any extra arguments, to bpf_trace_printk().
 */
35#define bpf_printk(fmt, ...) \
36({ \
37 char ____fmt[] = fmt; \
38 bpf_trace_printk(____fmt, sizeof(____fmt), \
39 ##__VA_ARGS__); \
40})
41#else
/* Non-debug build: bpf_printk() expands to nothing, so its arguments are
 * never evaluated.
 */
42#define bpf_printk(fmt, ...)
43#endif
44
/* Credit and threshold parameters of the virtual queue, all in bytes.
 * The thresholds are compared against the negated credit, i.e. they say
 * how far below zero the credit may go before packets are marked/dropped.
 */
45#define INITIAL_CREDIT_PACKETS 100
46#define MAX_BYTES_PER_PACKET 1500
47#define MARK_THRESH (40 * MAX_BYTES_PER_PACKET) /* credit < -MARK_THRESH: flag ECN-capable packets as congested */
48#define DROP_THRESH (80 * 5 * MAX_BYTES_PER_PACKET) /* credit < -DROP_THRESH: drop regardless of packet size */
49#define LARGE_PKT_DROP_THRESH (DROP_THRESH - (15 * MAX_BYTES_PER_PACKET)) /* drop threshold for packets longer than LARGE_PKT_THRESH */
50#define MARK_REGION_SIZE (LARGE_PKT_DROP_THRESH - MARK_THRESH) /* NOTE(review): not referenced by the code visible here */
51#define LARGE_PKT_THRESH 120 /* packet length (bytes) above which a packet counts as large */
52#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET) /* cap applied when credit is refilled */
53#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET) /* credit assigned by hbm_init_vqueue() */
54
55// Credit earned over delta ns; rate is in bytes per ns << 20 (fixed point), product computed as u64
56#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
57
/* Cgroup-storage map holding one struct hbm_vqueue; the BPF program
 * obtains it with bpf_get_local_storage().
 */
58struct bpf_map_def SEC("maps") queue_state = {
59 .type = BPF_MAP_TYPE_CGROUP_STORAGE,
60 .key_size = sizeof(struct bpf_cgroup_storage_key),
61 .value_size = sizeof(struct hbm_vqueue),
62};
63BPF_ANNOTATE_KV_PAIR(queue_state, struct bpf_cgroup_storage_key,
64 struct hbm_vqueue);
65
/* Single-entry array map holding one struct hbm_queue_stats; the BPF
 * program looks it up with key 0.
 */
66struct bpf_map_def SEC("maps") queue_stats = {
67 .type = BPF_MAP_TYPE_ARRAY,
68 .key_size = sizeof(u32),
69 .value_size = sizeof(struct hbm_queue_stats),
70 .max_entries = 1,
71};
72BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats);
73
/* Packet classification filled in by hbm_get_pkt_info(). */
74struct hbm_pkt_info {
75 bool is_ip; /* IP version field is 4 or 6 */
76 bool is_tcp; /* IPv4 protocol / IPv6 next header equals 6 */
77 short ecn; /* header ECN bits masked with INET_ECN_MASK; 0 for non-IP */
78};
79
80static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
81 struct hbm_pkt_info *pkti)
82{
83 struct iphdr iph;
84 struct ipv6hdr *ip6h;
85
86 bpf_skb_load_bytes(skb, 0, &iph, 12);
87 if (iph.version == 6) {
88 ip6h = (struct ipv6hdr *)&iph;
89 pkti->is_ip = true;
90 pkti->is_tcp = (ip6h->nexthdr == 6);
91 pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK;
92 } else if (iph.version == 4) {
93 pkti->is_ip = true;
94 pkti->is_tcp = (iph.protocol == 6);
95 pkti->ecn = iph.tos & INET_ECN_MASK;
96 } else {
97 pkti->is_ip = false;
98 pkti->is_tcp = false;
99 pkti->ecn = 0;
100 }
101}
102
103static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate)
104{
105 bpf_printk("Initializing queue_state, rate:%d\n", rate * 128);
106 qdp->lasttime = bpf_ktime_get_ns();
107 qdp->credit = INIT_CREDIT;
108 qdp->rate = rate * 128;
109}
110
111static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
112 int len,
113 unsigned long long curtime,
114 bool congestion_flag,
115 bool drop_flag)
116{
117 if (qsp != NULL) {
118 // Following is needed for work conserving
119 __sync_add_and_fetch(&(qsp->bytes_total), len);
120 if (qsp->stats) {
121 // Optionally update statistics
122 if (qsp->firstPacketTime == 0)
123 qsp->firstPacketTime = curtime;
124 qsp->lastPacketTime = curtime;
125 __sync_add_and_fetch(&(qsp->pkts_total), 1);
126 if (congestion_flag || drop_flag) {
127 __sync_add_and_fetch(&(qsp->pkts_marked), 1);
128 __sync_add_and_fetch(&(qsp->bytes_marked), len);
129 }
130 if (drop_flag) {
131 __sync_add_and_fetch(&(qsp->pkts_dropped), 1);
132 __sync_add_and_fetch(&(qsp->bytes_dropped),
133 len);
134 }
135 }
136 }
137}
diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c
new file mode 100644
index 000000000000..f806863d0b79
--- /dev/null
+++ b/samples/bpf/hbm_out_kern.c
@@ -0,0 +1,157 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2019 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * Sample Host Bandwidth Manager (HBM) BPF program.
9 *
10 * A cgroup skb BPF egress program to limit cgroup output bandwidth.
11 * It uses a modified virtual token bucket queue to limit average
12 * egress bandwidth. The implementation uses credits instead of tokens.
13 * Negative credits imply that queueing would have happened (this is
14 * a virtual queue, so no queueing is done by it. However, queueing may
15 * occur at the actual qdisc, which is not used for rate limiting).
16 *
17 * This implementation uses 3 thresholds, one to start marking packets and
18 * the other two to drop packets:
19 * CREDIT
20 * - <--------------------------|------------------------> +
21 * | | | 0
22 * | Large pkt |
23 * | drop thresh |
24 * Small pkt drop Mark threshold
25 * thresh
26 *
27 * The effect of marking depends on the type of packet:
28 * a) If the packet is ECN enabled and it is a TCP packet, then the packet
29 * is ECN marked.
30 * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
31 * to reduce the congestion window. The current implementation uses a linear
32 * distribution (0% probability at marking threshold, 100% probability
33 * at drop threshold).
34 * c) If the packet is not a TCP packet, then it is dropped.
35 *
36 * If the credit is below the drop threshold, the packet is dropped. If it
37 * is a TCP packet, then it also calls tcp_cwr since packets dropped by
38 * a cgroup skb BPF program do not automatically trigger a call to
39 * tcp_cwr in the current kernel code.
40 *
41 * This BPF program actually uses 2 drop thresholds, one threshold
42 * for larger packets (>= 120 bytes) and another for smaller packets. This
43 * protects smaller packets such as SYNs, ACKs, etc.
44 *
45 * The default bandwidth limit is set at 1Gbps but this can be changed by
46 * a user program through a shared BPF map. In addition, by default this BPF
47 * program does not limit connections using loopback. This behavior can be
48 * overwritten by the user program. There is also an option to calculate
49 * some statistics, such as percent of packets marked or dropped, which
50 * the user program can access.
51 *
52 * A later patch provides such a program (hbm.c)
53 */
54
55#include "hbm_kern.h"
56
57SEC("cgroup_skb/egress")
58int _hbm_out_cg(struct __sk_buff *skb)
59{
60 struct hbm_pkt_info pkti;
61 int len = skb->len;
62 unsigned int queue_index = 0;
63 unsigned long long curtime;
64 int credit;
65 signed long long delta = 0, zero = 0;
66 int max_credit = MAX_CREDIT;
67 bool congestion_flag = false;
68 bool drop_flag = false;
69 bool cwr_flag = false;
70 struct hbm_vqueue *qdp;
71 struct hbm_queue_stats *qsp = NULL;
72 int rv = ALLOW_PKT;
73
74 qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
75 if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
76 return ALLOW_PKT;
77
78 hbm_get_pkt_info(skb, &pkti);
79
80 // We may want to account for the length of headers in len
81 // calculation, like ETH header + overhead, specially if it
82 // is a gso packet. But I am not doing it right now.
83
84 qdp = bpf_get_local_storage(&queue_state, 0);
85 if (!qdp)
86 return ALLOW_PKT;
87 else if (qdp->lasttime == 0)
88 hbm_init_vqueue(qdp, 1024);
89
90 curtime = bpf_ktime_get_ns();
91
92 // Begin critical section
93 bpf_spin_lock(&qdp->lock);
94 credit = qdp->credit;
95 delta = curtime - qdp->lasttime;
96 /* delta < 0 implies that another process with a curtime greater
97 * than ours beat us to the critical section and already added
98 * the new credit, so we should not add it ourselves
99 */
100 if (delta > 0) {
101 qdp->lasttime = curtime;
102 credit += CREDIT_PER_NS(delta, qdp->rate);
103 if (credit > MAX_CREDIT)
104 credit = MAX_CREDIT;
105 }
106 credit -= len;
107 qdp->credit = credit;
108 bpf_spin_unlock(&qdp->lock);
109 // End critical section
110
111 // Check if we should update rate
112 if (qsp != NULL && (qsp->rate * 128) != qdp->rate) {
113 qdp->rate = qsp->rate * 128;
114 bpf_printk("Updating rate: %d (1sec:%llu bits)\n",
115 (int)qdp->rate,
116 CREDIT_PER_NS(1000000000, qdp->rate) * 8);
117 }
118
119 // Set flags (drop, congestion, cwr)
120 // Dropping => we are congested, so ignore congestion flag
121 if (credit < -DROP_THRESH ||
122 (len > LARGE_PKT_THRESH &&
123 credit < -LARGE_PKT_DROP_THRESH)) {
124 // Very congested, set drop flag
125 drop_flag = true;
126 } else if (credit < 0) {
127 // Congested, set congestion flag
128 if (pkti.ecn) {
129 if (credit < -MARK_THRESH)
130 congestion_flag = true;
131 else
132 congestion_flag = false;
133 } else {
134 congestion_flag = true;
135 }
136 }
137
138 if (congestion_flag) {
139 if (!bpf_skb_ecn_set_ce(skb)) {
140 if (len > LARGE_PKT_THRESH) {
141 // Problem if too many small packets?
142 drop_flag = true;
143 }
144 }
145 }
146
147 if (drop_flag)
148 rv = DROP_PKT;
149
150 hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag);
151
152 if (rv == DROP_PKT)
153 __sync_add_and_fetch(&(qdp->credit), len);
154
155 return rv;
156}
/* License string placed in the "license" section via SEC(). */
157char _license[] SEC("license") = "GPL";
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2e308e90ffea..3c38ac9a92a7 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2359,6 +2359,13 @@ union bpf_attr {
2359 * Return 2359 * Return
2360 * A **struct bpf_tcp_sock** pointer on success, or NULL in 2360 * A **struct bpf_tcp_sock** pointer on success, or NULL in
2361 * case of failure. 2361 * case of failure.
2362 *
2363 * int bpf_skb_ecn_set_ce(struct sk_buff *skb)
2364 * Description
2365 * Sets ECN of IP header to ce (congestion encountered) if
2366 * current value is ect (ECN capable). Works with IPv6 and IPv4.
2367 * Return
2368 * 1 if set, 0 if not set.
2362 */ 2369 */
2363#define __BPF_FUNC_MAPPER(FN) \ 2370#define __BPF_FUNC_MAPPER(FN) \
2364 FN(unspec), \ 2371 FN(unspec), \
@@ -2457,7 +2464,8 @@ union bpf_attr {
2457 FN(spin_lock), \ 2464 FN(spin_lock), \
2458 FN(spin_unlock), \ 2465 FN(spin_unlock), \
2459 FN(sk_fullsock), \ 2466 FN(sk_fullsock), \
2460 FN(tcp_sock), 2467 FN(tcp_sock), \
2468 FN(skb_ecn_set_ce),
2461 2469
2462/* integer value in 'imm' field of BPF_CALL instruction selects which helper 2470/* integer value in 'imm' field of BPF_CALL instruction selects which helper
2463 * function eBPF program intends to call 2471 * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 026bea831e03..c9433a496d54 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -180,6 +180,8 @@ static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) =
180 (void *) BPF_FUNC_sk_fullsock; 180 (void *) BPF_FUNC_sk_fullsock;
181static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = 181static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) =
182 (void *) BPF_FUNC_tcp_sock; 182 (void *) BPF_FUNC_tcp_sock;
/* Helper stub bound to BPF_FUNC_skb_ecn_set_ce. Per the uapi description it
 * sets the IP header's ECN field to CE when the current value is ECT and
 * returns 1 if set, 0 if not set.
 */
183static int (*bpf_skb_ecn_set_ce)(void *ctx) =
184 (void *) BPF_FUNC_skb_ecn_set_ce;
183 185
184/* llvm builtin functions that eBPF C program may use to 186/* llvm builtin functions that eBPF C program may use to
185 * emit BPF_LD_ABS and BPF_LD_IND instructions 187 * emit BPF_LD_ABS and BPF_LD_IND instructions