author		Daniel Borkmann <daniel@iogearbox.net>	2015-03-01 06:31:48 -0500
committer	David S. Miller <davem@davemloft.net>	2015-03-01 14:05:19 -0500
commit		e2e9b6541dd4b31848079da80fe2253daaafb549
tree		dd4de9c9faa662e188285fa5db20663859139f55 /net/sched/cls_bpf.c
parent		24701ecea76b0b93bd9667486934ec310825f558
cls_bpf: add initial eBPF support for programmable classifiers
This work extends the "classic" BPF programmable tc classifier by extending its scope also to native eBPF code!

This allows user space to implement its own custom, 'safe' C-like classifiers (or whatever other frontend language LLVM et al may provide in future), which can then be compiled with the LLVM eBPF backend to an eBPF ELF file. The result can be loaded into the kernel via iproute2's tc. In the kernel, the programs can be JITed on major archs and thus run at native performance.

A simple, minimal toy example to demonstrate the workflow:

  #include <linux/ip.h>
  #include <linux/if_ether.h>
  #include <linux/bpf.h>

  #include "tc_bpf_api.h"

  __section("classify")
  int cls_main(struct sk_buff *skb)
  {
          return (0x800 << 16) | load_byte(skb, ETH_HLEN +
                  __builtin_offsetof(struct iphdr, tos));
  }

  char __license[] __section("license") = "GPL";

The classifier can then be compiled into eBPF opcodes and loaded via tc, for example:

  clang -O2 -emit-llvm -c cls.c -o - | llc -march=bpf -filetype=obj -o cls.o
  tc filter add dev em1 parent 1: bpf cls.o [...]

As has been demonstrated, the scope can even reach up to a fully fledged flow dissector (similar to samples/bpf/sockex2_kern.c).

For tc, maps are allowed to be used, but from kernel context only; in other words, eBPF code can keep state across filter invocations. In future, we may perhaps reattach to those maps from a different application, e.g. to read out collected statistics/state.

Similarly to socket filters, we may extend functionality for eBPF classifiers over time depending on the use cases. For that purpose, cls_bpf programs use the BPF_PROG_TYPE_SCHED_CLS program type, so we can allow additional functions/accessors (e.g. an ABI-compatible offset translation to skb fields/metadata). For initial cls_bpf support, we allow the same set of helper functions as eBPF socket filters, but we could diverge at some point in time without problem.

I was wondering whether cls_bpf and act_bpf could share C programs; I can imagine that at some point, we introduce i) further common handlers for both (or even beyond their scope), and/or, if truly needed, ii) some restricted function space for each of them. Both can be abstracted easily through struct bpf_verifier_ops in future. The context of cls_bpf versus act_bpf is slightly different, though: a cls_bpf program will return a specific classid, whereas act_bpf returns a drop/non-drop code and may in future also mangle skbs. That said, we can surely have a "classify" and an "action" section in a single object file, or, given the mentioned constraint, add the possibility of a shared section.

The workflow for getting native eBPF running from tc [1] is as follows: for f_bpf, I've added slightly modified ELF parser code from Alexei's kernel sample, which reads out the LLVM-compiled object, sets up maps (and dynamically fixes up map fds) if any, and loads the eBPF instructions all centrally through the bpf syscall. The resulting fd from the loaded program itself is then passed down to cls_bpf, which looks up struct bpf_prog from the fd store and holds a reference, so that it stays available also beyond the tc program's lifetime. On tc filter destruction, it will then drop its reference. Moreover, I've also added the optional possibility to annotate an eBPF filter with a name (e.g. the path to the object file, or something else if preferred), so that when tc dumps currently installed filters, some more context can be given to an admin for a given instance (as opposed to just the file descriptor number).
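To make that central load step concrete, a minimal userspace sketch might look as follows. This is modeled on samples/bpf/libbpf.c and is not code from this patch; prog_load() and bpf_log_buf are assumed names. The returned fd is what tc then hands down to cls_bpf:

  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/bpf.h>

  static char bpf_log_buf[65536];

  static int prog_load(const struct bpf_insn *insns, int insn_cnt)
  {
          union bpf_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.prog_type = BPF_PROG_TYPE_SCHED_CLS;
          attr.insns     = (__u64)(unsigned long)insns;
          attr.insn_cnt  = insn_cnt;
          attr.license   = (__u64)(unsigned long)"GPL";
          attr.log_buf   = (__u64)(unsigned long)bpf_log_buf;
          attr.log_size  = sizeof(bpf_log_buf);
          attr.log_level = 1;

          /* fd of the verified (and possibly JITed) program, or -1 */
          return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
  }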
Last but not least, bpf_prog_get() and bpf_prog_put() needed to be exported, so that eBPF can be used from cls_bpf built as a module. Thanks to 60a3b2253c41 ("net: bpf: make eBPF interpreter images read-only"), I think this is of no concern, since anything wanting to alter eBPF opcodes after the verification stage would crash the kernel.

  [1] http://git.breakpoint.cc/cgit/dborkman/iproute2.git/log/?h=ebpf

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
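As an illustration of the "keep state across filter invocations" point in the maps paragraph above, a stateful classifier might look like the following sketch. It is not part of this patch and assumes the map definition and helper declaration conventions used by samples/bpf (struct bpf_map_def, the "maps" section, bpf_map_lookup_elem()); names are illustrative:

  /* counts packets across invocations in an eBPF array map */
  struct bpf_map_def __section("maps") cnt_map = {
          .type        = BPF_MAP_TYPE_ARRAY,
          .key_size    = sizeof(__u32),
          .value_size  = sizeof(__u64),
          .max_entries = 1,
  };

  __section("classify")
  int cls_stateful(struct sk_buff *skb)
  {
          __u32 key = 0;
          __u64 *cnt;

          /* map state persists between filter invocations */
          cnt = bpf_map_lookup_elem(&cnt_map, &key);
          if (cnt)
                  (*cnt)++;

          return 0x800 << 16;     /* classid 800:0 */
  }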
Diffstat (limited to 'net/sched/cls_bpf.c')
-rw-r--r--	net/sched/cls_bpf.c	206
1 file changed, 154 insertions(+), 52 deletions(-)
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 5f3ee9e4b5bf..6f7ed8f8e6ee 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -16,6 +16,8 @@
 #include <linux/types.h>
 #include <linux/skbuff.h>
 #include <linux/filter.h>
+#include <linux/bpf.h>
+
 #include <net/rtnetlink.h>
 #include <net/pkt_cls.h>
 #include <net/sock.h>
@@ -24,6 +26,8 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
 MODULE_DESCRIPTION("TC BPF based classifier");
 
+#define CLS_BPF_NAME_LEN	256
+
 struct cls_bpf_head {
 	struct list_head plist;
 	u32 hgen;
@@ -32,18 +36,24 @@ struct cls_bpf_head {
 
 struct cls_bpf_prog {
 	struct bpf_prog *filter;
-	struct sock_filter *bpf_ops;
-	struct tcf_exts exts;
-	struct tcf_result res;
 	struct list_head link;
+	struct tcf_result res;
+	struct tcf_exts exts;
 	u32 handle;
-	u16 bpf_num_ops;
+	union {
+		u32 bpf_fd;
+		u16 bpf_num_ops;
+	};
+	struct sock_filter *bpf_ops;
+	const char *bpf_name;
 	struct tcf_proto *tp;
 	struct rcu_head rcu;
 };
 
 static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
 	[TCA_BPF_CLASSID]	= { .type = NLA_U32 },
+	[TCA_BPF_FD]		= { .type = NLA_U32 },
+	[TCA_BPF_NAME]		= { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN },
 	[TCA_BPF_OPS_LEN]	= { .type = NLA_U16 },
 	[TCA_BPF_OPS]		= { .type = NLA_BINARY,
 				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
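For illustration only (not part of the patch): with the policy above, a userspace loader filling the new eBPF attributes into a tc netlink request might look roughly like this; the helper name and the use of libmnl are assumptions, as iproute2's f_bpf does this with its own helpers:

  #include <libmnl/libmnl.h>
  #include <linux/rtnetlink.h>
  #include <linux/pkt_cls.h>

  static void cls_bpf_put_ebpf_opts(struct nlmsghdr *nlh, int prog_fd,
                                    const char *name, __u32 classid)
  {
          struct nlattr *opts = mnl_attr_nest_start(nlh, TCA_OPTIONS);

          mnl_attr_put_u32(nlh, TCA_BPF_FD, prog_fd);
          if (name)       /* optional, per the policy above */
                  mnl_attr_put_strz(nlh, TCA_BPF_NAME, name);
          mnl_attr_put_u32(nlh, TCA_BPF_CLASSID, classid);

          mnl_attr_nest_end(nlh, opts);
  }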
@@ -76,6 +86,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	return -1;
 }
 
+static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
+{
+	return !prog->bpf_ops;
+}
+
 static int cls_bpf_init(struct tcf_proto *tp)
 {
 	struct cls_bpf_head *head;
@@ -94,8 +109,12 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
 {
 	tcf_exts_destroy(&prog->exts);
 
-	bpf_prog_destroy(prog->filter);
+	if (cls_bpf_is_ebpf(prog))
+		bpf_prog_put(prog->filter);
+	else
+		bpf_prog_destroy(prog->filter);
 
+	kfree(prog->bpf_name);
 	kfree(prog->bpf_ops);
 	kfree(prog);
 }
@@ -114,6 +133,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
 	list_del_rcu(&prog->link);
 	tcf_unbind_filter(tp, &prog->res);
 	call_rcu(&prog->rcu, __cls_bpf_delete_prog);
+
 	return 0;
 }
 
@@ -151,69 +171,121 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
 	return ret;
 }
 
-static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
-				   struct cls_bpf_prog *prog,
-				   unsigned long base, struct nlattr **tb,
-				   struct nlattr *est, bool ovr)
+static int cls_bpf_prog_from_ops(struct nlattr **tb,
+				 struct cls_bpf_prog *prog, u32 classid)
 {
 	struct sock_filter *bpf_ops;
-	struct tcf_exts exts;
-	struct sock_fprog_kern tmp;
+	struct sock_fprog_kern fprog_tmp;
 	struct bpf_prog *fp;
 	u16 bpf_size, bpf_num_ops;
-	u32 classid;
 	int ret;
 
-	if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID])
-		return -EINVAL;
-
-	tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE);
-	ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr);
-	if (ret < 0)
-		return ret;
-
-	classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
 	bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
-	if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) {
-		ret = -EINVAL;
-		goto errout;
-	}
+	if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0)
+		return -EINVAL;
 
 	bpf_size = bpf_num_ops * sizeof(*bpf_ops);
-	if (bpf_size != nla_len(tb[TCA_BPF_OPS])) {
-		ret = -EINVAL;
-		goto errout;
-	}
+	if (bpf_size != nla_len(tb[TCA_BPF_OPS]))
+		return -EINVAL;
 
 	bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
-	if (bpf_ops == NULL) {
-		ret = -ENOMEM;
-		goto errout;
-	}
+	if (bpf_ops == NULL)
+		return -ENOMEM;
 
 	memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);
 
-	tmp.len = bpf_num_ops;
-	tmp.filter = bpf_ops;
+	fprog_tmp.len = bpf_num_ops;
+	fprog_tmp.filter = bpf_ops;
 
-	ret = bpf_prog_create(&fp, &tmp);
-	if (ret)
-		goto errout_free;
+	ret = bpf_prog_create(&fp, &fprog_tmp);
+	if (ret < 0) {
+		kfree(bpf_ops);
+		return ret;
+	}
 
-	prog->bpf_num_ops = bpf_num_ops;
 	prog->bpf_ops = bpf_ops;
+	prog->bpf_num_ops = bpf_num_ops;
+	prog->bpf_name = NULL;
+
 	prog->filter = fp;
 	prog->res.classid = classid;
 
+	return 0;
+}
+
+static int cls_bpf_prog_from_efd(struct nlattr **tb,
+				 struct cls_bpf_prog *prog, u32 classid)
+{
+	struct bpf_prog *fp;
+	char *name = NULL;
+	u32 bpf_fd;
+
+	bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);
+
+	fp = bpf_prog_get(bpf_fd);
+	if (IS_ERR(fp))
+		return PTR_ERR(fp);
+
+	if (fp->type != BPF_PROG_TYPE_SCHED_CLS) {
+		bpf_prog_put(fp);
+		return -EINVAL;
+	}
+
+	if (tb[TCA_BPF_NAME]) {
+		name = kmemdup(nla_data(tb[TCA_BPF_NAME]),
+			       nla_len(tb[TCA_BPF_NAME]),
+			       GFP_KERNEL);
+		if (!name) {
+			bpf_prog_put(fp);
+			return -ENOMEM;
+		}
+	}
+
+	prog->bpf_ops = NULL;
+	prog->bpf_fd = bpf_fd;
+	prog->bpf_name = name;
+
+	prog->filter = fp;
+	prog->res.classid = classid;
+
+	return 0;
+}
+
+static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
+				   struct cls_bpf_prog *prog,
+				   unsigned long base, struct nlattr **tb,
+				   struct nlattr *est, bool ovr)
+{
+	struct tcf_exts exts;
+	bool is_bpf, is_ebpf;
+	u32 classid;
+	int ret;
+
+	is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
+	is_ebpf = tb[TCA_BPF_FD];
+
+	if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) ||
+	    !tb[TCA_BPF_CLASSID])
+		return -EINVAL;
+
+	tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE);
+	ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr);
+	if (ret < 0)
+		return ret;
+
+	classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
+
+	ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog, classid) :
+		       cls_bpf_prog_from_efd(tb, prog, classid);
+	if (ret < 0) {
+		tcf_exts_destroy(&exts);
+		return ret;
+	}
+
 	tcf_bind_filter(tp, &prog->res, base);
 	tcf_exts_change(tp, &prog->exts, &exts);
 
 	return 0;
-errout_free:
-	kfree(bpf_ops);
-errout:
-	tcf_exts_destroy(&exts);
-	return ret;
 }
 
 static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
@@ -297,11 +369,43 @@ errout:
 	return ret;
 }
 
+static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog,
+				 struct sk_buff *skb)
+{
+	struct nlattr *nla;
+
+	if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops))
+		return -EMSGSIZE;
+
+	nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops *
+			  sizeof(struct sock_filter));
+	if (nla == NULL)
+		return -EMSGSIZE;
+
+	memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));
+
+	return 0;
+}
+
+static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
+				  struct sk_buff *skb)
+{
+	if (nla_put_u32(skb, TCA_BPF_FD, prog->bpf_fd))
+		return -EMSGSIZE;
+
+	if (prog->bpf_name &&
+	    nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 			struct sk_buff *skb, struct tcmsg *tm)
 {
 	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh;
-	struct nlattr *nest, *nla;
+	struct nlattr *nest;
+	int ret;
 
 	if (prog == NULL)
 		return skb->len;
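Correspondingly, a hypothetical dump-side consumer could read back either the classic ops length or the eBPF fd/name attributes emitted by the helpers above. A sketch assuming libmnl (not from this patch; iproute2 uses its own attribute parsers):

  #include <stdio.h>
  #include <libmnl/libmnl.h>
  #include <linux/pkt_cls.h>

  static int cls_bpf_attr_cb(const struct nlattr *attr, void *data)
  {
          switch (mnl_attr_get_type(attr)) {
          case TCA_BPF_FD:
                  printf("prog fd: %u\n", mnl_attr_get_u32(attr));
                  break;
          case TCA_BPF_NAME:
                  printf("prog name: %s\n", mnl_attr_get_str(attr));
                  break;
          case TCA_BPF_OPS_LEN:
                  printf("classic ops: %u insns\n", mnl_attr_get_u16(attr));
                  break;
          }
          return MNL_CB_OK;
  }

  /* usage: mnl_attr_parse_nested(opts_attr, cls_bpf_attr_cb, NULL); */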
@@ -314,16 +418,14 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 
 	if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
 		goto nla_put_failure;
-	if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops))
-		goto nla_put_failure;
 
-	nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops *
-			  sizeof(struct sock_filter));
-	if (nla == NULL)
+	if (cls_bpf_is_ebpf(prog))
+		ret = cls_bpf_dump_ebpf_info(prog, skb);
+	else
+		ret = cls_bpf_dump_bpf_info(prog, skb);
+	if (ret)
 		goto nla_put_failure;
 
-	memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));
-
 	if (tcf_exts_dump(skb, &prog->exts) < 0)
 		goto nla_put_failure;
 