aboutsummaryrefslogtreecommitdiffstats
path: root/samples
diff options
context:
space:
mode:
authorbrakmo <brakmo@fb.com>2019-03-01 15:38:48 -0500
committerAlexei Starovoitov <ast@kernel.org>2019-03-02 13:48:27 -0500
commit187d0738ff351f725a58be3d606d3a7fc8db8aed (patch)
tree23f8914204b589b663a8e76e086b16bab0bfcf81 /samples
parent5cce85c640ccc9d9aab8b05c77d7d076a44d4db2 (diff)
bpf: Sample HBM BPF program to limit egress bw
A cgroup skb BPF program to limit cgroup output bandwidth. It uses a modified virtual token bucket queue to limit average egress bandwidth. The implementation uses credits instead of tokens. Negative credits imply that queueing would have happened (this is a virtual queue, so no queueing is done by it; however, queueing may occur at the actual qdisc, which is not used for rate limiting).

This implementation uses 3 thresholds, one to start marking packets and the other two to drop packets:

                               CREDIT
        - <--------------------------|------------------------> +
              |    |          |      0
              |  Large pkt    |
              |  drop thresh  |
   Small pkt drop             Mark threshold
       thresh

The effect of marking depends on the type of packet:
a) If the packet is ECN enabled, then the packet is ECN CE marked. The current mark threshold is tuned for DCTCP.
b) Else, it is dropped if it is a large packet.

If the credit is below the drop threshold, the packet is dropped. Note that dropping a packet through the BPF program does not trigger CWR (Congestion Window Reduction) in TCP packets. A future patch will add support for triggering CWR.

This BPF program actually uses 2 drop thresholds, one threshold for larger packets (> 120 bytes) and another for smaller packets. This protects smaller packets such as SYNs, ACKs, etc.

The default bandwidth limit is set at 1Gbps but this can be changed by a user program through a shared BPF map. In addition, by default this BPF program does not limit connections using loopback. This behavior can be overwritten by the user program. There is also an option to calculate some statistics, such as percent of packets marked or dropped, which the user program can access.

A later patch provides such a program (hbm.c).

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'samples')
-rw-r--r--samples/bpf/Makefile2
-rw-r--r--samples/bpf/hbm.h31
-rw-r--r--samples/bpf/hbm_kern.h137
-rw-r--r--samples/bpf/hbm_out_kern.c157
4 files changed, 327 insertions, 0 deletions
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 0c62ac39c697..e1bdc96486f6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -164,6 +164,7 @@ always += xdp_adjust_tail_kern.o
164always += xdp_fwd_kern.o 164always += xdp_fwd_kern.o
165always += task_fd_query_kern.o 165always += task_fd_query_kern.o
166always += xdp_sample_pkts_kern.o 166always += xdp_sample_pkts_kern.o
167always += hbm_out_kern.o
167 168
168KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include 169KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
169KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ 170KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -263,6 +264,7 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF)
263$(src)/*.c: verify_target_bpf $(LIBBPF) 264$(src)/*.c: verify_target_bpf $(LIBBPF)
264 265
265$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h 266$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
267$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
266 268
267# asm/sysreg.h - inline assembly used by it is incompatible with llvm. 269# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
268# But, there is no easy way to fix it, so just exclude it since it is 270# But, there is no easy way to fix it, so just exclude it since it is
diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h
new file mode 100644
index 000000000000..518e8147d084
--- /dev/null
+++ b/samples/bpf/hbm.h
@@ -0,0 +1,31 @@
1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * Copyright (c) 2019 Facebook
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public
7 * License as published by the Free Software Foundation.
8 *
9 * Include file for Host Bandwidth Management (HBM) programs
10 */
11struct hbm_vqueue {
12 struct bpf_spin_lock lock;
13 /* 4 byte hole */
14 unsigned long long lasttime; /* In ns */
15 int credit; /* In bytes */
16 unsigned int rate; /* In bytes per NS << 20 */
17};
18
/*
 * Configuration and optional statistics shared between the BPF program
 * and the user program through the queue_stats map.
 */
struct hbm_queue_stats {
	/* Requested rate, in Mbps */
	unsigned long rate;
	/* get HBM stats (marked, dropped,..) */
	unsigned long stats:1;
	/* also limit flows using loopback */
	unsigned long loopback:1;
	/* Packet/byte counters, only updated when the stats bit is set
	 * (bytes_total is always updated).
	 */
	unsigned long long pkts_marked;
	unsigned long long bytes_marked;
	unsigned long long pkts_dropped;
	unsigned long long bytes_dropped;
	unsigned long long pkts_total;
	unsigned long long bytes_total;
	/* Timestamps of the first and the most recent accounted packet */
	unsigned long long firstPacketTime;
	unsigned long long lastPacketTime;
};
diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h
new file mode 100644
index 000000000000..c5635d924193
--- /dev/null
+++ b/samples/bpf/hbm_kern.h
@@ -0,0 +1,137 @@
1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * Copyright (c) 2019 Facebook
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public
7 * License as published by the Free Software Foundation.
8 *
9 * Include file for sample Host Bandwidth Manager (HBM) BPF programs
10 */
11#define KBUILD_MODNAME "foo"
12#include <stddef.h>
13#include <stdbool.h>
14#include <uapi/linux/bpf.h>
15#include <uapi/linux/if_ether.h>
16#include <uapi/linux/if_packet.h>
17#include <uapi/linux/ip.h>
18#include <uapi/linux/ipv6.h>
19#include <uapi/linux/in.h>
20#include <uapi/linux/tcp.h>
21#include <uapi/linux/filter.h>
22#include <uapi/linux/pkt_cls.h>
23#include <net/ipv6.h>
24#include <net/inet_ecn.h>
25#include "bpf_endian.h"
26#include "bpf_helpers.h"
27#include "hbm.h"
28
/* Verdicts returned by the HBM program */
#define DROP_PKT	0
#define ALLOW_PKT	1
#define TCP_ECN_OK	1

/* Debug tracing: bpf_printk() expands to nothing unless HBM_DEBUG is 1 */
#define HBM_DEBUG 0  // Set to 1 to enable debugging
#if HBM_DEBUG
#define bpf_printk(fmt, ...)					\
({								\
	char ____fmt[] = fmt;					\
	bpf_trace_printk(____fmt, sizeof(____fmt),		\
			 ##__VA_ARGS__);			\
})
#else
#define bpf_printk(fmt, ...)
#endif

/*
 * Credit thresholds, all expressed in bytes and derived from a nominal
 * MAX_BYTES_PER_PACKET. The program compares the (possibly negative)
 * credit against the negated *_THRESH values.
 */
#define INITIAL_CREDIT_PACKETS	100
#define MAX_BYTES_PER_PACKET	1500
#define MARK_THRESH		(40 * MAX_BYTES_PER_PACKET)
#define DROP_THRESH		(80 * 5 * MAX_BYTES_PER_PACKET)
#define LARGE_PKT_DROP_THRESH	(DROP_THRESH - (15 * MAX_BYTES_PER_PACKET))
#define MARK_REGION_SIZE	(LARGE_PKT_DROP_THRESH - MARK_THRESH)
#define LARGE_PKT_THRESH	120
#define MAX_CREDIT		(100 * MAX_BYTES_PER_PACKET)
#define INIT_CREDIT		(INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET)

// rate in bytes per ns << 20
// Credit (in bytes) earned over "delta" ns at the given scaled rate
#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
57
58struct bpf_map_def SEC("maps") queue_state = {
59 .type = BPF_MAP_TYPE_CGROUP_STORAGE,
60 .key_size = sizeof(struct bpf_cgroup_storage_key),
61 .value_size = sizeof(struct hbm_vqueue),
62};
63BPF_ANNOTATE_KV_PAIR(queue_state, struct bpf_cgroup_storage_key,
64 struct hbm_vqueue);
65
66struct bpf_map_def SEC("maps") queue_stats = {
67 .type = BPF_MAP_TYPE_ARRAY,
68 .key_size = sizeof(u32),
69 .value_size = sizeof(struct hbm_queue_stats),
70 .max_entries = 1,
71};
72BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats);
73
/* Per-packet classification filled in by hbm_get_pkt_info() */
struct hbm_pkt_info {
	bool is_ip;	/* packet carries an IPv4 or IPv6 header */
	bool is_tcp;	/* IP protocol / next header is TCP */
	short ecn;	/* header ECN bits masked with INET_ECN_MASK; 0 if not IP */
};
79
80static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
81 struct hbm_pkt_info *pkti)
82{
83 struct iphdr iph;
84 struct ipv6hdr *ip6h;
85
86 bpf_skb_load_bytes(skb, 0, &iph, 12);
87 if (iph.version == 6) {
88 ip6h = (struct ipv6hdr *)&iph;
89 pkti->is_ip = true;
90 pkti->is_tcp = (ip6h->nexthdr == 6);
91 pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK;
92 } else if (iph.version == 4) {
93 pkti->is_ip = true;
94 pkti->is_tcp = (iph.protocol == 6);
95 pkti->ecn = iph.tos & INET_ECN_MASK;
96 } else {
97 pkti->is_ip = false;
98 pkti->is_tcp = false;
99 pkti->ecn = 0;
100 }
101}
102
103static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate)
104{
105 bpf_printk("Initializing queue_state, rate:%d\n", rate * 128);
106 qdp->lasttime = bpf_ktime_get_ns();
107 qdp->credit = INIT_CREDIT;
108 qdp->rate = rate * 128;
109}
110
111static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
112 int len,
113 unsigned long long curtime,
114 bool congestion_flag,
115 bool drop_flag)
116{
117 if (qsp != NULL) {
118 // Following is needed for work conserving
119 __sync_add_and_fetch(&(qsp->bytes_total), len);
120 if (qsp->stats) {
121 // Optionally update statistics
122 if (qsp->firstPacketTime == 0)
123 qsp->firstPacketTime = curtime;
124 qsp->lastPacketTime = curtime;
125 __sync_add_and_fetch(&(qsp->pkts_total), 1);
126 if (congestion_flag || drop_flag) {
127 __sync_add_and_fetch(&(qsp->pkts_marked), 1);
128 __sync_add_and_fetch(&(qsp->bytes_marked), len);
129 }
130 if (drop_flag) {
131 __sync_add_and_fetch(&(qsp->pkts_dropped), 1);
132 __sync_add_and_fetch(&(qsp->bytes_dropped),
133 len);
134 }
135 }
136 }
137}
diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c
new file mode 100644
index 000000000000..f806863d0b79
--- /dev/null
+++ b/samples/bpf/hbm_out_kern.c
@@ -0,0 +1,157 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2019 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * Sample Host Bandwidth Manager (HBM) BPF program.
9 *
10 * A cgroup skb BPF egress program to limit cgroup output bandwidth.
11 * It uses a modified virtual token bucket queue to limit average
12 * egress bandwidth. The implementation uses credits instead of tokens.
13 * Negative credits imply that queueing would have happened (this is
14 * a virtual queue, so no queueing is done by it; however, queueing may
15 * occur at the actual qdisc, which is not used for rate limiting).
16 *
17 * This implementation uses 3 thresholds, one to start marking packets and
18 * the other two to drop packets:
19 *                               CREDIT
20 *        - <--------------------------|------------------------> +
21 *              |    |          |      0
22 *              |  Large pkt    |
23 *              |  drop thresh  |
24 *   Small pkt drop             Mark threshold
25 *       thresh
26 *
27 * The effect of marking depends on the type of packet:
28 * a) If the packet is ECN enabled, then the packet is ECN CE marked
29 * once the credit falls below the mark threshold. The current mark
30 * threshold is tuned for DCTCP.
31 * b) Else, the packet is dropped if it is a large packet; small
32 * packets are let through.
33 *
34 * If the credit is below the drop threshold, the packet is dropped.
35 * Note that dropping a packet through a cgroup skb BPF program does
36 * not trigger CWR (Congestion Window Reduction) in TCP, and this
37 * version of the program does not call tcp_cwr itself.
38 *
39 * A future patch will add support for triggering CWR.
40 *
41 * This BPF program actually uses 2 drop thresholds, one threshold
42 * for larger packets (> 120 bytes) and another for smaller packets. This
43 * protects smaller packets such as SYNs, ACKs, etc.
44 *
45 * The default bandwidth limit is set at 1Gbps but this can be changed by
46 * a user program through a shared BPF map. In addition, by default this BPF
47 * program does not limit connections using loopback. This behavior can be
48 * overwritten by the user program. There is also an option to calculate
49 * some statistics, such as percent of packets marked or dropped, which
50 * the user program can access.
51 *
52 * A later patch provides such a program (hbm.c).
53 */
54
55#include "hbm_kern.h"
56
57SEC("cgroup_skb/egress")
58int _hbm_out_cg(struct __sk_buff *skb)
59{
60 struct hbm_pkt_info pkti;
61 int len = skb->len;
62 unsigned int queue_index = 0;
63 unsigned long long curtime;
64 int credit;
65 signed long long delta = 0, zero = 0;
66 int max_credit = MAX_CREDIT;
67 bool congestion_flag = false;
68 bool drop_flag = false;
69 bool cwr_flag = false;
70 struct hbm_vqueue *qdp;
71 struct hbm_queue_stats *qsp = NULL;
72 int rv = ALLOW_PKT;
73
74 qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
75 if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
76 return ALLOW_PKT;
77
78 hbm_get_pkt_info(skb, &pkti);
79
80 // We may want to account for the length of headers in len
81 // calculation, like ETH header + overhead, specially if it
82 // is a gso packet. But I am not doing it right now.
83
84 qdp = bpf_get_local_storage(&queue_state, 0);
85 if (!qdp)
86 return ALLOW_PKT;
87 else if (qdp->lasttime == 0)
88 hbm_init_vqueue(qdp, 1024);
89
90 curtime = bpf_ktime_get_ns();
91
92 // Begin critical section
93 bpf_spin_lock(&qdp->lock);
94 credit = qdp->credit;
95 delta = curtime - qdp->lasttime;
96 /* delta < 0 implies that another process with a curtime greater
97 * than ours beat us to the critical section and already added
98 * the new credit, so we should not add it ourselves
99 */
100 if (delta > 0) {
101 qdp->lasttime = curtime;
102 credit += CREDIT_PER_NS(delta, qdp->rate);
103 if (credit > MAX_CREDIT)
104 credit = MAX_CREDIT;
105 }
106 credit -= len;
107 qdp->credit = credit;
108 bpf_spin_unlock(&qdp->lock);
109 // End critical section
110
111 // Check if we should update rate
112 if (qsp != NULL && (qsp->rate * 128) != qdp->rate) {
113 qdp->rate = qsp->rate * 128;
114 bpf_printk("Updating rate: %d (1sec:%llu bits)\n",
115 (int)qdp->rate,
116 CREDIT_PER_NS(1000000000, qdp->rate) * 8);
117 }
118
119 // Set flags (drop, congestion, cwr)
120 // Dropping => we are congested, so ignore congestion flag
121 if (credit < -DROP_THRESH ||
122 (len > LARGE_PKT_THRESH &&
123 credit < -LARGE_PKT_DROP_THRESH)) {
124 // Very congested, set drop flag
125 drop_flag = true;
126 } else if (credit < 0) {
127 // Congested, set congestion flag
128 if (pkti.ecn) {
129 if (credit < -MARK_THRESH)
130 congestion_flag = true;
131 else
132 congestion_flag = false;
133 } else {
134 congestion_flag = true;
135 }
136 }
137
138 if (congestion_flag) {
139 if (!bpf_skb_ecn_set_ce(skb)) {
140 if (len > LARGE_PKT_THRESH) {
141 // Problem if too many small packets?
142 drop_flag = true;
143 }
144 }
145 }
146
147 if (drop_flag)
148 rv = DROP_PKT;
149
150 hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag);
151
152 if (rv == DROP_PKT)
153 __sync_add_and_fetch(&(qdp->credit), len);
154
155 return rv;
156}
157char _license[] SEC("license") = "GPL";