path: root/tools/include
author    David S. Miller <davem@davemloft.net>  2018-04-26 21:19:50 -0400
committer David S. Miller <davem@davemloft.net>  2018-04-26 21:19:50 -0400
commit    79741a38b4a2538a68342c45b813ecb9dd648ee8 (patch)
tree      bd744350673c8e3a912525b4733ab8e0ae24cdfd /tools/include
parent    cb586c63e3fc5b227c51fd8c4cb40b34d3750645 (diff)
parent    c0885f61bbb6a89c35397d3a8fe49c35822cde81 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says:

====================
pull-request: bpf-next 2018-04-27

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Add extensive BPF helper description into include/uapi/linux/bpf.h and a
   new script bpf_helpers_doc.py which allows for generating a man page out
   of it. Thus, every helper in BPF now comes with proper function signature,
   detailed description and return code explanation, from Quentin.

2) Migrate the BPF collect metadata tunnel tests from BPF samples over to
   the BPF selftests and further extend them with v6 vxlan, geneve and ipip
   tests, simplify the ipip tests, improve documentation and convert to
   bpf_ntoh*() / bpf_hton*() api, from William.

3) Currently, helpers that expect ARG_PTR_TO_MAP_{KEY,VALUE} can only access
   stack and packet memory. Extend this to allow such helpers to also use map
   values, which enabled use cases where value from a first lookup can be
   directly used as a key for a second lookup, from Paul.

4) Add a new helper bpf_skb_get_xfrm_state() for tc BPF programs in order to
   retrieve XFRM state information containing SPI, peer address and reqid
   values, from Eyal.

5) Various optimizations in nfp driver's BPF JIT in order to turn ADD and SUB
   instructions with negative immediate into the opposite operation with a
   positive immediate such that nfp can better fit small immediates into
   instructions. Savings in instruction count up to 4% have been observed,
   from Jakub.

6) Add the BPF prog's gpl_compatible flag to struct bpf_prog_info and add
   support for dumping this through bpftool, from Jiri.

7) Move the BPF sockmap samples over into BPF selftests instead since sockmap
   was rather a series of tests than sample anyway and this way this can be
   run from automated bots, from John.

8) Follow-up fix for bpf_adjust_tail() helper in order to make it work with
   generic XDP, from Nikita.

9) Some follow-up cleanups to BTF, namely, removing unused defines from BTF
   uapi header and renaming 'name' struct btf_* members into name_off to make
   it more clear they are offsets into string section, from Martin.

10) Remove test_sock_addr from TEST_GEN_PROGS in BPF selftests since not run
    directly but invoked from test_sock_addr.sh, from Yonghong.

11) Remove redundant ret assignment in sample BPF loader, from Wang.

12) Add couple of missing files to BPF selftest's gitignore, from Anders.

There are two trivial merge conflicts while pulling:

1) Remove samples/sockmap/Makefile since all sockmap tests have been moved
   to selftests.
2) Add both hunks from tools/testing/selftests/bpf/.gitignore to the file
   since git should ignore all of them.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'tools/include')
-rw-r--r--   tools/include/uapi/linux/bpf.h   1784
-rw-r--r--   tools/include/uapi/linux/btf.h   8
2 files changed, 1399 insertions, 393 deletions
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c8383a289f7b..da77a9388947 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -377,403 +377,1396 @@ union bpf_attr {
377 };
378} __attribute__((aligned(8)));
379
380/* The description below is an attempt at providing documentation to eBPF
381 * developers about the multiple available eBPF helper functions. It can be
382 * parsed and used to produce a manual page. The workflow is the following,
383 * and requires the rst2man utility:
384 *
385 *     $ ./scripts/bpf_helpers_doc.py \
386 *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
387 *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
388 *     $ man /tmp/bpf-helpers.7
389 *
390 * Note that in order to produce this external documentation, some RST
391 * formatting is used in the descriptions to get "bold" and "italics" in
392 * manual pages. Also note that the few trailing white spaces are
393 * intentional, removing them would break paragraphs for rst2man.
394 *
395 * Start of BPF helper function descriptions:
396 *
397 * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
398 * Description
399 * Perform a lookup in *map* for an entry associated to *key*.
400 * Return
401 * Map value associated to *key*, or **NULL** if no entry was
402 * found.
403 *
404 * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
405 * Description
406 * Add or update the value of the entry associated to *key* in
407 * *map* with *value*. *flags* is one of:
408 *
409 * **BPF_NOEXIST**
410 * The entry for *key* must not exist in the map.
411 * **BPF_EXIST**
412 * The entry for *key* must already exist in the map.
413 * **BPF_ANY**
414 * No condition on the existence of the entry for *key*.
415 *
416 * Flag value **BPF_NOEXIST** cannot be used for maps of types
417 * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all
418 * elements always exist), the helper would return an error.
419 * Return
420 * 0 on success, or a negative error in case of failure.
421 *
422 * int bpf_map_delete_elem(struct bpf_map *map, const void *key)
423 * Description
424 * Delete entry with *key* from *map*.
425 * Return
426 * 0 on success, or a negative error in case of failure.
427 *
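
For illustration only (not part of the original header): a minimal eBPF C sketch of how the three map helpers above are typically combined, assuming the SEC()/struct bpf_map_def conventions and helper wrappers from the kernel's samples/selftests bpf_helpers.h; the map and program names are made up.

    #include <uapi/linux/bpf.h>
    #include "bpf_helpers.h"

    /* Hypothetical per-protocol packet counter. */
    struct bpf_map_def SEC("maps") count_map = {
            .type        = BPF_MAP_TYPE_HASH,
            .key_size    = sizeof(__u32),
            .value_size  = sizeof(__u64),
            .max_entries = 256,
    };

    SEC("classifier")
    int count_proto(struct __sk_buff *skb)
    {
            __u32 key = skb->protocol;
            __u64 init = 1, *value;

            value = bpf_map_lookup_elem(&count_map, &key);
            if (value)
                    __sync_fetch_and_add(value, 1); /* entry exists: bump it */
            else    /* first packet for this protocol: create the entry */
                    bpf_map_update_elem(&count_map, &key, &init, BPF_NOEXIST);
            return 0;       /* TC_ACT_OK */
    }
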
428 * int bpf_probe_read(void *dst, u32 size, const void *src)
429 * Description
430 * For tracing programs, safely attempt to read *size* bytes from
431 * address *src* and store the data in *dst*.
432 * Return
433 * 0 on success, or a negative error in case of failure.
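
A hedged sketch of bpf_probe_read() from a tracing program, assuming the PT_REGS_PARM2() macro from the samples/selftests bpf_helpers.h; the probed function and buffer size are only examples.

    SEC("kprobe/do_sys_open")
    int trace_open(struct pt_regs *ctx)
    {
            /* Second argument of do_sys_open() is the filename pointer. */
            void *filename = (void *)PT_REGS_PARM2(ctx);
            char buf[64] = {};

            /* Safely copy the first bytes the pointer refers to. */
            bpf_probe_read(buf, sizeof(buf), filename);
            return 0;
    }
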
434 *
435 * u64 bpf_ktime_get_ns(void)
436 * Description
437 * Return the time elapsed since system boot, in nanoseconds.
438 * Return
439 * Current *ktime*.
440 *
441 * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
442 * Description
443 * This helper is a "printk()-like" facility for debugging. It
444 * prints a message defined by format *fmt* (of size *fmt_size*)
445 * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
446 * available. It can take up to three additional **u64**
447 * arguments (as for eBPF helpers, the total number of arguments is
448 * limited to five).
449 *
450 * Each time the helper is called, it appends a line to the trace.
451 * The format of the trace is customizable, and the exact output
452 * one will get depends on the options set in
453 * *\/sys/kernel/debug/tracing/trace_options* (see also the
454 * *README* file under the same directory). However, it usually
455 * defaults to something like:
456 *
457 * ::
458 *
459 *     telnet-470   [001] .N.. 419421.045894: 0x00000001: <formatted msg>
460 *
461 * In the above:
462 *
463 * * ``telnet`` is the name of the current task.
464 * * ``470`` is the PID of the current task.
465 * * ``001`` is the CPU number on which the task is
466 * running.
467 * * In ``.N..``, each character refers to a set of
468 * options (whether irqs are enabled, scheduling
469 * options, whether hard/softirqs are running, level of
470 * preempt_disabled respectively). **N** means that
471 * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
472 * are set.
473 * * ``419421.045894`` is a timestamp.
474 * * ``0x00000001`` is a fake value used by BPF for the
475 * instruction pointer register.
476 * * ``<formatted msg>`` is the message formatted with
477 * *fmt*.
478 *
479 * The conversion specifiers supported by *fmt* are similar, but
480 * more limited than for printk(). They are **%d**, **%i**,
481 * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
482 * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
483 * of field, padding with zeroes, etc.) is available, and the
484 * helper will return **-EINVAL** (but print nothing) if it
485 * encounters an unknown specifier.
486 *
487 * Also, note that **bpf_trace_printk**\ () is slow, and should
488 * only be used for debugging purposes. For this reason, a notice
489 * block (spanning several lines) is printed to kernel logs and
490 * states that the helper should not be used "for production use"
491 * the first time this helper is used (or more precisely, when
492 * **trace_printk**\ () buffers are allocated). For passing values
493 * to user space, perf events should be preferred.
494 * Return
495 * The number of bytes written to the buffer, or a negative error
496 * in case of failure.
497 *
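
A small, illustrative fragment (from a hypothetical sk_buff-based program) showing the calling convention described above; the message text and fields are arbitrary.

    /* The format string must reside in BPF-visible memory, hence the
     * local array; at most three u64 arguments may follow it. */
    char fmt[] = "ifindex %d len %d\n";

    bpf_trace_printk(fmt, sizeof(fmt), skb->ifindex, skb->len);
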
498 * u32 bpf_get_prandom_u32(void)
499 * Description
500 * Get a pseudo-random number.
501 *
502 * From a security point of view, this helper uses its own
503 * pseudo-random internal state, and cannot be used to infer the
504 * seed of other random functions in the kernel. However, it is
505 * essential to note that the generator used by the helper is not
506 * cryptographically secure.
507 * Return
508 * A random 32-bit unsigned value.
509 *
510 * u32 bpf_get_smp_processor_id(void)
511 * Description
512 * Get the SMP (symmetric multiprocessing) processor id. Note that
513 * all programs run with preemption disabled, which means that the
514 * SMP processor id is stable during all the execution of the
515 * program.
516 * Return
517 * The SMP id of the processor running the program.
518 *
519 * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
520 * Description
521 * Store *len* bytes from address *from* into the packet
522 * associated to *skb*, at *offset*. *flags* are a combination of
523 * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the
524 * checksum for the packet after storing the bytes) and
525 * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
526 * **->swhash** and *skb*\ **->l4hash** to 0).
527 *
528 * A call to this helper is susceptible to change the underlying
529 * packet buffer. Therefore, at load time, all checks on pointers
530 * previously done by the verifier are invalidated and must be
531 * performed again, if the helper is used in combination with
532 * direct packet access.
533 * Return
534 * 0 on success, or a negative error in case of failure.
535 *
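
For illustration, a fragment from a hypothetical tc classifier that overwrites the destination MAC address (offset 0 from the MAC header) and asks for the checksum to be recomputed; ETH_ALEN is assumed from <linux/if_ether.h>.

    unsigned char new_dmac[ETH_ALEN] = { 0x02, 0, 0, 0, 0, 0x01 };

    if (bpf_skb_store_bytes(skb, 0, new_dmac, sizeof(new_dmac),
                            BPF_F_RECOMPUTE_CSUM) < 0)
            return 2;       /* TC_ACT_SHOT */
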
536 * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
537 * Description
538 * Recompute the layer 3 (e.g. IP) checksum for the packet
539 * associated to *skb*. Computation is incremental, so the helper
540 * must know the former value of the header field that was
541 * modified (*from*), the new value of this field (*to*), and the
542 * number of bytes (2 or 4) for this field, stored in *size*.
543 * Alternatively, it is possible to store the difference between
544 * the previous and the new values of the header field in *to*, by
545 * setting *from* and *size* to 0. For both methods, *offset*
546 * indicates the location of the IP checksum within the packet.
547 *
548 * This helper works in combination with **bpf_csum_diff**\ (),
549 * which does not update the checksum in-place, but offers more
550 * flexibility and can handle sizes larger than 2 or 4 for the
551 * checksum to update.
552 *
553 * A call to this helper is susceptible to change the underlying
554 * packet buffer. Therefore, at load time, all checks on pointers
555 * previously done by the verifier are invalidated and must be
556 * performed again, if the helper is used in combination with
557 * direct packet access.
558 * Return
559 * 0 on success, or a negative error in case of failure.
560 *
561 * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
562 * Description
563 * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the
564 * packet associated to *skb*. Computation is incremental, so the
565 * helper must know the former value of the header field that was
566 * modified (*from*), the new value of this field (*to*), and the
567 * number of bytes (2 or 4) for this field, stored on the lowest
568 * four bits of *flags*. Alternatively, it is possible to store
569 * the difference between the previous and the new values of the
570 * header field in *to*, by setting *from* and the four lowest
571 * bits of *flags* to 0. For both methods, *offset* indicates the
572 * location of the IP checksum within the packet. In addition to
573 * the size of the field, *flags* can be ORed with actual flag
574 * values. With **BPF_F_MARK_MANGLED_0**, a null checksum is left
575 * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and
576 * for updates resulting in a null checksum the value is set to
577 * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates
578 * the checksum is to be computed against a pseudo-header.
579 *
580 * This helper works in combination with **bpf_csum_diff**\ (),
581 * which does not update the checksum in-place, but offers more
582 * flexibility and can handle sizes larger than 2 or 4 for the
583 * checksum to update.
584 *
585 * A call to this helper is susceptible to change the underlying
586 * packet buffer. Therefore, at load time, all checks on pointers
587 * previously done by the verifier are invalidated and must be
588 * performed again, if the helper is used in combination with
589 * direct packet access.
590 * Return
591 * 0 on success, or a negative error in case of failure.
592 *
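
To make the incremental-checksum workflow concrete, here is a hedged sketch (tc program, untagged IPv4/TCP, no IP options, headers assumed linear) that rewrites the destination address and patches both checksums; it assumes <linux/if_ether.h>, <linux/ip.h>, <linux/tcp.h>, <stddef.h> for offsetof(), and the bpf_htonl() wrapper from the selftests' bpf_endian.h. The new address is a placeholder.

    const int dst_off = ETH_HLEN + offsetof(struct iphdr, daddr);
    const int l3_csum = ETH_HLEN + offsetof(struct iphdr, check);
    const int l4_csum = ETH_HLEN + sizeof(struct iphdr) +
                        offsetof(struct tcphdr, check);
    __be32 old_ip, new_ip = bpf_htonl(0x0a000002);  /* 10.0.0.2 */

    bpf_skb_load_bytes(skb, dst_off, &old_ip, sizeof(old_ip));
    /* The address feeds the TCP pseudo-header, hence BPF_F_PSEUDO_HDR. */
    bpf_l4_csum_replace(skb, l4_csum, old_ip, new_ip,
                        BPF_F_PSEUDO_HDR | sizeof(new_ip));
    bpf_l3_csum_replace(skb, l3_csum, old_ip, new_ip, sizeof(new_ip));
    bpf_skb_store_bytes(skb, dst_off, &new_ip, sizeof(new_ip), 0);
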
593 * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
594 * Description
595 * This special helper is used to trigger a "tail call", or in
596 * other words, to jump into another eBPF program. The same stack
597 * frame is used (but values on stack and in registers for the
598 * caller are not accessible to the callee). This mechanism allows
599 * for program chaining, either for raising the maximum number of
600 * available eBPF instructions, or to execute given programs in
601 * conditional blocks. For security reasons, there is an upper
602 * limit to the number of successive tail calls that can be
603 * performed.
604 *
605 * Upon call of this helper, the program attempts to jump into a
606 * program referenced at index *index* in *prog_array_map*, a
607 * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes
608 * *ctx*, a pointer to the context.
609 *
610 * If the call succeeds, the kernel immediately runs the first
611 * instruction of the new program. This is not a function call,
612 * and it never returns to the previous program. If the call
613 * fails, then the helper has no effect, and the caller continues
614 * to run its subsequent instructions. A call can fail if the
615 * destination program for the jump does not exist (i.e. *index*
616 * is greater than or equal to the number of entries in *prog_array_map*), or
617 * if the maximum number of tail calls has been reached for this
618 * chain of programs. This limit is defined in the kernel by the
619 * macro **MAX_TAIL_CALL_CNT** (not accessible to user space),
620 * which is currently set to 32.
621 * Return
622 * 0 on success, or a negative error in case of failure.
623 *
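
A minimal sketch of the chaining mechanism, assuming the bpf_helpers.h map/SEC() conventions; index 2 and the map name are arbitrary.

    struct bpf_map_def SEC("maps") jmp_table = {
            .type        = BPF_MAP_TYPE_PROG_ARRAY,
            .key_size    = sizeof(__u32),
            .value_size  = sizeof(__u32),
            .max_entries = 8,
    };

    SEC("classifier")
    int dispatch(struct __sk_buff *skb)
    {
            /* On success this never returns; on failure (empty slot or
             * tail-call limit reached) execution simply falls through. */
            bpf_tail_call(skb, &jmp_table, 2);
            return 0;       /* TC_ACT_OK */
    }
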
624 * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
625 * Description
626 * Clone and redirect the packet associated to *skb* to another
627 * net device of index *ifindex*. Both ingress and egress
628 * interfaces can be used for redirection. The **BPF_F_INGRESS**
629 * value in *flags* is used to make the distinction (ingress path
630 * is selected if the flag is present, egress path otherwise).
631 * This is the only flag supported for now.
632 *
633 * In comparison with **bpf_redirect**\ () helper,
634 * **bpf_clone_redirect**\ () has the associated cost of
635 * duplicating the packet buffer, but this can be executed out of
636 * the eBPF program. Conversely, **bpf_redirect**\ () is more
637 * efficient, but it is handled through an action code where the
638 * redirection happens only after the eBPF program has returned.
639 *
640 * A call to this helper is susceptible to change the underlying
641 * packet buffer. Therefore, at load time, all checks on pointers
642 * previously done by the verifier are invalidated and must be
643 * performed again, if the helper is used in combination with
644 * direct packet access.
645 * Return
646 * 0 on success, or a negative error in case of failure.
647 *
648 * u64 bpf_get_current_pid_tgid(void)
649 * Return
650 * A 64-bit integer containing the current tgid and pid, and
651 * created as such:
652 * *current_task*\ **->tgid << 32 \|**
653 * *current_task*\ **->pid**.
654 *
655 * u64 bpf_get_current_uid_gid(void)
656 * Return
657 * A 64-bit integer containing the current GID and UID, and
658 * created as such: *current_gid* **<< 32 \|** *current_uid*.
659 *
660 * int bpf_get_current_comm(char *buf, u32 size_of_buf)
661 * Description
662 * Copy the **comm** attribute of the current task into *buf* of
663 * *size_of_buf*. The **comm** attribute contains the name of
664 * the executable (excluding the path) for the current task. The
665 * *size_of_buf* must be strictly positive. On success, the
666 * helper makes sure that the *buf* is NUL-terminated. On failure,
667 * it is filled with zeroes.
668 * Return
669 * 0 on success, or a negative error in case of failure.
670 *
671 * u32 bpf_get_cgroup_classid(struct sk_buff *skb)
672 * Description
673 * Retrieve the classid for the current task, i.e. for the net_cls
674 * cgroup to which *skb* belongs.
675 *
676 * This helper can be used on TC egress path, but not on ingress.
677 *
678 * The net_cls cgroup provides an interface to tag network packets
679 * based on a user-provided identifier for all traffic coming from
680 * the tasks belonging to the related cgroup. See also the related
681 * kernel documentation, available from the Linux sources in file
682 * *Documentation/cgroup-v1/net_cls.txt*.
683 *
684 * The Linux kernel has two versions for cgroups: there are
685 * cgroups v1 and cgroups v2. Both are available to users, who can
686 * use a mixture of them, but note that the net_cls cgroup is for
687 * cgroup v1 only. This makes it incompatible with BPF programs
688 * run on cgroups, which is a cgroup-v2-only feature (a socket can
689 * only hold data for one version of cgroups at a time).
690 *
691 * This helper is only available if the kernel was compiled with
692 * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
693 * "**y**" or to "**m**".
694 * Return
695 * The classid, or 0 for the default unconfigured classid.
696 *
697 * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
698 * Description
699 * Push a *vlan_tci* (VLAN tag control information) of protocol
700 * *vlan_proto* to the packet associated to *skb*, then update
701 * the checksum. Note that if *vlan_proto* is different from
702 * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
703 * be **ETH_P_8021Q**.
704 *
705 * A call to this helper is susceptible to change the underlying
706 * packet buffer. Therefore, at load time, all checks on pointers
707 * previously done by the verifier are invalidated and must be
708 * performed again, if the helper is used in combination with
709 * direct packet access.
710 * Return
711 * 0 on success, or a negative error in case of failure.
712 *
713 * int bpf_skb_vlan_pop(struct sk_buff *skb)
714 * Description
715 * Pop a VLAN header from the packet associated to *skb*.
716 *
717 * A call to this helper is susceptible to change the underlying
718 * packet buffer. Therefore, at load time, all checks on pointers
719 * previously done by the verifier are invalidated and must be
720 * performed again, if the helper is used in combination with
721 * direct packet access.
722 * Return
723 * 0 on success, or a negative error in case of failure.
724 *
725 * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
726 * Description
727 * Get tunnel metadata. This helper takes a pointer *key* to an
728 * empty **struct bpf_tunnel_key** of **size**, that will be
729 * filled with tunnel metadata for the packet associated to *skb*.
730 * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
731 * indicates that the tunnel is based on IPv6 protocol instead of
732 * IPv4.
733 *
734 * The **struct bpf_tunnel_key** is an object that generalizes the
735 * principal parameters used by various tunneling protocols into a
736 * single struct. This way, it can be used to easily make a
737 * decision based on the contents of the encapsulation header,
738 * "summarized" in this struct. In particular, it holds the IP
739 * address of the remote end (IPv4 or IPv6, depending on the case)
740 * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also,
741 * this struct exposes the *key*\ **->tunnel_id**, which is
742 * generally mapped to a VNI (Virtual Network Identifier), making
743 * it programmable together with the **bpf_skb_set_tunnel_key**\
744 * () helper.
745 *
746 * Let's imagine that the following code is part of a program
747 * attached to the TC ingress interface, on one end of a GRE
748 * tunnel, and is supposed to filter out all messages coming from
749 * remote ends with IPv4 address other than 10.0.0.1:
750 *
751 * ::
752 *
753 *     int ret;
754 *     struct bpf_tunnel_key key = {};
755 *
756 *     ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
757 *     if (ret < 0)
758 *             return TC_ACT_SHOT;     // drop packet
759 *
760 *     if (key.remote_ipv4 != 0x0a000001)
761 *             return TC_ACT_SHOT;     // drop packet
762 *
763 *     return TC_ACT_OK;               // accept packet
764 *
765 * This interface can also be used with all encapsulation devices
766 * that can operate in "collect metadata" mode: instead of having
767 * one network device per specific configuration, the "collect
768 * metadata" mode only requires a single device where the
769 * configuration can be extracted from this helper.
770 *
771 * This can be used together with various tunnels such as VXLan,
772 * Geneve, GRE or IP in IP (IPIP).
773 * Return
774 * 0 on success, or a negative error in case of failure.
775 *
776 * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
777 * Description
778 * Populate tunnel metadata for packet associated to *skb*. The
779 * tunnel metadata is set to the contents of *key*, of *size*. The
780 * *flags* can be set to a combination of the following values:
781 *
782 * **BPF_F_TUNINFO_IPV6**
783 * Indicate that the tunnel is based on IPv6 protocol
784 * instead of IPv4.
785 * **BPF_F_ZERO_CSUM_TX**
786 * For IPv4 packets, add a flag to tunnel metadata
787 * indicating that checksum computation should be skipped
788 * and checksum set to zeroes.
789 * **BPF_F_DONT_FRAGMENT**
790 * Add a flag to tunnel metadata indicating that the
791 * packet should not be fragmented.
792 * **BPF_F_SEQ_NUMBER**
793 * Add a flag to tunnel metadata indicating that a
794 * sequence number should be added to tunnel header before
795 * sending the packet. This flag was added for GRE
796 * encapsulation, but might be used with other protocols
797 * as well in the future.
798 *
799 * Here is a typical usage on the transmit path:
800 *
801 * ::
802 *
803 * struct bpf_tunnel_key key;
804 * populate key ...
805 * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
806 * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
807 *
808 * See also the description of the **bpf_skb_get_tunnel_key**\ ()
809 * helper for additional information.
810 * Return
811 * 0 on success, or a negative error in case of failure.
812 *
813 * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags)
814 * Description
815 * Read the value of a perf event counter. This helper relies on a
816 * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of
817 * the perf event counter is selected when *map* is updated with
818 * perf event file descriptors. The *map* is an array whose size
819 * is the number of available CPUs, and each cell contains a value
820 * relative to one CPU. The value to retrieve is indicated by
821 * *flags*, that contains the index of the CPU to look up, masked
822 * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
823 * **BPF_F_CURRENT_CPU** to indicate that the value for the
824 * current CPU should be retrieved.
825 *
826 * Note that before Linux 4.13, only hardware perf events can be
827 * retrieved.
828 *
829 * Also, be aware that the newer helper
830 * **bpf_perf_event_read_value**\ () is recommended over
831 * **bpf_perf_event_read**\ () in general. The latter has some ABI
832 * quirks where error and counter value are used as a return code
833 * (which is wrong to do since ranges may overlap). This issue is
834 * fixed with bpf_perf_event_read_value(), which at the same time
835 * provides more features over the **bpf_perf_event_read**\ ()
836 * interface. Please refer to the description of
837 * **bpf_perf_event_read_value**\ () for details.
838 * Return
839 * The value of the perf event counter read from the map, or a
840 * negative error code in case of failure.
841 *
842 * int bpf_redirect(u32 ifindex, u64 flags)
843 * Description
844 * Redirect the packet to another net device of index *ifindex*.
845 * This helper is somewhat similar to **bpf_clone_redirect**\
846 * (), except that the packet is not cloned, which provides
847 * increased performance.
848 *
849 * Except for XDP, both ingress and egress interfaces can be used
850 * for redirection. The **BPF_F_INGRESS** value in *flags* is used
851 * to make the distinction (ingress path is selected if the flag
852 * is present, egress path otherwise). Currently, XDP only
853 * supports redirection to the egress interface, and accepts no
854 * flag at all.
855 *
856 * The same effect can be attained with the more generic
857 * **bpf_redirect_map**\ (), which requires specific maps to be
858 * used but offers better performance.
859 * Return
860 * For XDP, the helper returns **XDP_REDIRECT** on success or
861 * **XDP_ABORTED** on error. For other program types, the values
862 * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
863 * error.
864 *
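
A two-line XDP sketch of the action-code model described above; ifindex 3 is a placeholder.

    SEC("xdp")
    int xdp_tx_mirror(struct xdp_md *ctx)
    {
            /* The verdict is acted upon after the program returns. */
            return bpf_redirect(3 /* ifindex */, 0);
    }
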
865 * u32 bpf_get_route_realm(struct sk_buff *skb)
866 * Description
867 * Retrieve the realm of the route, that is to say the
868 * **tclassid** field of the destination for the *skb*. The
869 * identifier retrieved is a user-provided tag, similar to the
870 * one used with the net_cls cgroup (see description for
871 * **bpf_get_cgroup_classid**\ () helper), but here this tag is
872 * held by a route (a destination entry), not by a task.
873 *
874 * Retrieving this identifier works with the clsact TC egress hook
875 * (see also **tc-bpf(8)**), or alternatively on conventional
876 * classful egress qdiscs, but not on TC ingress path. In case of
877 * clsact TC egress hook, this has the advantage that, internally,
878 * the destination entry has not been dropped yet in the transmit
879 * path. Therefore, the destination entry does not need to be
880 * artificially held via **netif_keep_dst**\ () for a classful
881 * qdisc until the *skb* is freed.
882 *
883 * This helper is available only if the kernel was compiled with
884 * **CONFIG_IP_ROUTE_CLASSID** configuration option.
885 * Return
886 * The realm of the route for the packet associated to *skb*, or 0
887 * if none was found.
888 *
889 * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
890 * Description
891 * Write raw *data* blob into a special BPF perf event held by
892 * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
893 * event must have the following attributes: **PERF_SAMPLE_RAW**
894 * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
895 * **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
896 *
897 * The *flags* are used to indicate the index in *map* for which
898 * the value must be put, masked with **BPF_F_INDEX_MASK**.
899 * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
900 * to indicate that the index of the current CPU core should be
901 * used.
902 *
903 * The value to write, of *size*, is passed through eBPF stack and
904 * pointed by *data*.
905 *
906 * The context of the program *ctx* needs also be passed to the
907 * helper.
908 *
909 * In user space, a program willing to read the values needs to
910 * call **perf_event_open**\ () on the perf event (either for
911 * one or for all CPUs) and to store the file descriptor into the
912 * *map*. This must be done before the eBPF program can send data
913 * into it. An example is available in file
914 * *samples/bpf/trace_output_user.c* in the Linux kernel source
915 * tree (the eBPF program counterpart is in
916 * *samples/bpf/trace_output_kern.c*).
917 *
918 * **bpf_perf_event_output**\ () achieves better performance
919 * than **bpf_trace_printk**\ () for sharing data with user
920 * space, and is much better suited for streaming data from eBPF
921 * programs.
922 *
923 * Note that this helper is not restricted to tracing use cases
924 * and can be used with programs attached to TC or XDP as well,
925 * where it allows for passing data to user space listeners. Data
926 * can be:
927 *
928 * * Only custom structs,
929 * * Only the packet payload, or
930 * * A combination of both.
931 * Return
932 * 0 on success, or a negative error in case of failure.
933 *
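
A hedged sketch of the eBPF side of the streaming path described above (tracing program; map and record layout invented for the example, bpf_helpers.h conventions assumed). The user-space side must perf_event_open() one event per CPU and store the file descriptors into the map before records can be received.

    struct bpf_map_def SEC("maps") events = {
            .type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
            .key_size    = sizeof(int),
            .value_size  = sizeof(__u32),
            .max_entries = 64,      /* >= number of possible CPUs */
    };

    struct event {
            __u32 pid;      /* thread group id, i.e. user-space PID */
            __u64 ts;
    };

    SEC("kprobe/kfree_skb")
    int push_event(struct pt_regs *ctx)
    {
            struct event e = {
                    .pid = bpf_get_current_pid_tgid() >> 32,
                    .ts  = bpf_ktime_get_ns(),
            };

            /* Emit the record on the current CPU's ring buffer. */
            bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
                                  &e, sizeof(e));
            return 0;
    }
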
934 * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
935 * Description
936 * This helper was provided as an easy way to load data from a
937 * packet. It can be used to load *len* bytes from *offset* from
938 * the packet associated to *skb*, into the buffer pointed by
939 * *to*.
940 *
941 * Since Linux 4.7, usage of this helper has mostly been replaced
942 * by "direct packet access", enabling packet data to be
943 * manipulated with *skb*\ **->data** and *skb*\ **->data_end**
944 * pointing respectively to the first byte of packet data and to
945 * the byte after the last byte of packet data. However, it
946 * remains useful if one wishes to read large quantities of data
947 * at once from a packet into the eBPF stack.
948 * Return
949 * 0 on success, or a negative error in case of failure.
950 *
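
An illustrative fragment: copying the first Ethernet plus minimal IPv4 header bytes onto the stack, which also works when those bytes live in a non-linear part of the skb; the 34-byte size is an assumption of this example.

    unsigned char hdrs[34];     /* 14 (Ethernet) + 20 (minimal IPv4) */

    if (bpf_skb_load_bytes(skb, 0, hdrs, sizeof(hdrs)) < 0)
            return 0;       /* packet too short: let it pass */
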
951 * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
952 * Description
953 * Walk a user or a kernel stack and return its id. To achieve
954 * this, the helper needs *ctx*, which is a pointer to the context
955 * on which the tracing program is executed, and a pointer to a
956 * *map* of type **BPF_MAP_TYPE_STACK_TRACE**.
957 *
958 * The last argument, *flags*, holds the number of stack frames to
959 * skip (from 0 to 255), masked with
960 * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
961 * a combination of the following flags:
962 *
963 * **BPF_F_USER_STACK**
964 * Collect a user space stack instead of a kernel stack.
965 * **BPF_F_FAST_STACK_CMP**
966 * Compare stacks by hash only.
967 * **BPF_F_REUSE_STACKID**
968 * If two different stacks hash into the same *stackid*,
969 * discard the old one.
970 *
971 * The stack id retrieved is a 32-bit integer handle which
972 * can be further combined with other data (including other stack
973 * ids) and used as a key into maps. This can be useful for
974 * generating a variety of graphs (such as flame graphs or off-cpu
975 * graphs).
976 *
977 * For walking a stack, this helper is an improvement over
978 * **bpf_probe_read**\ (), which can be used with unrolled loops
979 * but is not efficient and consumes a lot of eBPF instructions.
980 * Instead, **bpf_get_stackid**\ () can collect up to
981 * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
982 * this limit can be controlled with the **sysctl** program, and
983 * that it should be manually increased in order to profile long
984 * user stacks (such as stacks for Java programs). To do so, use:
985 *
986 * ::
987 *
988 * # sysctl kernel.perf_event_max_stack=<new value>
989 *
990 * Return
991 * The positive or null stack id on success, or a negative error
992 * in case of failure.
993 *
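
A short sketch of collecting kernel stacks into a **BPF_MAP_TYPE_STACK_TRACE** map from a tracing program; bpf_helpers.h conventions are assumed, the probed symbol is an arbitrary example, and PERF_MAX_STACK_DEPTH comes from <uapi/linux/perf_event.h>.

    struct bpf_map_def SEC("maps") stack_traces = {
            .type        = BPF_MAP_TYPE_STACK_TRACE,
            .key_size    = sizeof(__u32),
            .value_size  = PERF_MAX_STACK_DEPTH * sizeof(__u64),
            .max_entries = 1024,
    };

    SEC("kprobe/try_to_wake_up")
    int record_stack(struct pt_regs *ctx)
    {
            /* A negative id means the walk failed or the map is full;
             * otherwise the id can be used as a key into other maps,
             * e.g. to count how often each unique stack is seen. */
            bpf_get_stackid(ctx, &stack_traces, BPF_F_FAST_STACK_CMP);
            return 0;
    }
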
994 * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed)
995 * Description
996 * Compute a checksum difference, from the raw buffer pointed by
997 * *from*, of length *from_size* (that must be a multiple of 4),
998 * towards the raw buffer pointed by *to*, of size *to_size*
999 * (same remark). An optional *seed* can be added to the value
1000 * (this can be cascaded, the seed may come from a previous call
1001 * to the helper).
1002 *
1003 * This is flexible enough to be used in several ways:
1004 *
1005 * * With *from_size* == 0, *to_size* > 0 and *seed* set to
1006 * checksum, it can be used when pushing new data.
1007 * * With *from_size* > 0, *to_size* == 0 and *seed* set to
1008 * checksum, it can be used when removing data from a packet.
1009 * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
1010 * can be used to compute a diff. Note that *from_size* and
1011 * *to_size* do not need to be equal.
1012 *
1013 * This helper can be used in combination with
1014 * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to
1015 * which one can feed in the difference computed with
1016 * **bpf_csum_diff**\ ().
1017 * Return
1018 * The checksum result, or a negative error code in case of
1019 * failure.
1020 *
1021 * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
1022 * Description
1023 * Retrieve tunnel options metadata for the packet associated to
1024 * *skb*, and store the raw tunnel option data to the buffer *opt*
1025 * of *size*.
1026 *
1027 * This helper can be used with encapsulation devices that can
1028 * operate in "collect metadata" mode (please refer to the related
1029 * note in the description of **bpf_skb_get_tunnel_key**\ () for
1030 * more details). A particular example where this can be used is
1031 * in combination with the Geneve encapsulation protocol, where it
1032 * allows for pushing (with the **bpf_skb_set_tunnel_opt**\ () helper)
1033 * and retrieving arbitrary TLVs (Type-Length-Value headers) from
1034 * the eBPF program. This allows for full customization of these
1035 * headers.
1036 * Return
1037 * The size of the option data retrieved.
1038 *
1039 * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
1040 * Description
1041 * Set tunnel options metadata for the packet associated to *skb*
1042 * to the option data contained in the raw buffer *opt* of *size*.
1043 *
1044 * See also the description of the **bpf_skb_get_tunnel_opt**\ ()
1045 * helper for additional information.
1046 * Return
1047 * 0 on success, or a negative error in case of failure.
1048 *
1049 * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
1050 * Description
1051 * Change the protocol of the *skb* to *proto*. Currently
1052 * supported are transition from IPv4 to IPv6, and from IPv6 to
1053 * IPv4. The helper takes care of the groundwork for the
1054 * transition, including resizing the socket buffer. The eBPF
1055 * program is expected to fill the new headers, if any, via
1056 * **skb_store_bytes**\ () and to recompute the checksums with
1057 * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
1058 * (). The main case for this helper is to perform NAT64
1059 * operations out of an eBPF program.
1060 *
1061 * Internally, the GSO type is marked as dodgy so that headers are
1062 * checked and segments are recalculated by the GSO/GRO engine.
1063 * The size for GSO target is adapted as well.
1064 *
1065 * All values for *flags* are reserved for future usage, and must
1066 * be left at zero.
1067 *
1068 * A call to this helper is susceptible to change the underlying
1069 * packet buffer. Therefore, at load time, all checks on pointers
1070 * previously done by the verifier are invalidated and must be
1071 * performed again, if the helper is used in combination with
1072 * direct packet access.
1073 * Return
1074 * 0 on success, or a negative error in case of failure.
1075 *
1076 * int bpf_skb_change_type(struct sk_buff *skb, u32 type)
1077 * Description
1078 * Change the packet type for the packet associated to *skb*. This
1079 * comes down to setting *skb*\ **->pkt_type** to *type*, except
1080 * the eBPF program does not have write access to *skb*\
1081 * **->pkt_type** besides this helper. Using a helper here allows
1082 * for graceful handling of errors.
1083 *
1084 * The major use case is to change incoming *skb*s to
1085 * **PACKET_HOST** in a programmatic way instead of having to
1086 * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for
1087 * example.
1088 *
1089 * Note that *type* only allows certain values. At this time, they
1090 * are:
1091 *
1092 * **PACKET_HOST**
1093 * Packet is for us.
1094 * **PACKET_BROADCAST**
1095 * Send packet to all.
1096 * **PACKET_MULTICAST**
1097 * Send packet to group.
1098 * **PACKET_OTHERHOST**
1099 * Send packet to someone else.
1100 * Return
1101 * 0 on success, or a negative error in case of failure.
1102 *
1103 * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
1104 * Description
1105 * Check whether *skb* is a descendant of the cgroup2 held by
1106 * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
1107 * Return
1108 * The return value depends on the result of the test, and can be:
1109 *
1110 * * 0, if the *skb* failed the cgroup2 descendant test.
1111 * * 1, if the *skb* succeeded the cgroup2 descendant test.
1112 * * A negative error code, if an error occurred.
1113 *
1114 * u32 bpf_get_hash_recalc(struct sk_buff *skb)
1115 * Description
1116 * Retrieve the hash of the packet, *skb*\ **->hash**. If it is
1117 * not set, in particular if the hash was cleared due to mangling,
1118 * recompute this hash. Later accesses to the hash can be done
1119 * directly with *skb*\ **->hash**.
1120 *
1121 * Calling **bpf_set_hash_invalid**\ (), changing a packet
1122 * prototype with **bpf_skb_change_proto**\ (), or calling
1123 * **bpf_skb_store_bytes**\ () with the
1124 * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear
1125 * the hash and to trigger a new computation for the next call to
1126 * **bpf_get_hash_recalc**\ ().
1127 * Return
1128 * The 32-bit hash.
1129 *
1130 * u64 bpf_get_current_task(void)
1131 * Return
1132 * A pointer to the current task struct.
1133 *
1134 * int bpf_probe_write_user(void *dst, const void *src, u32 len)
1135 * Description
1136 * Attempt in a safe way to write *len* bytes from the buffer
1137 * *src* to *dst* in memory. It only works for threads that are in
1138 * user context, and *dst* must be a valid user space address.
1139 *
1140 * This helper should not be used to implement any kind of
1141 * security mechanism because of TOC-TOU attacks, but rather to
1142 * debug, divert, and manipulate execution of semi-cooperative
1143 * processes.
1144 *
1145 * Keep in mind that this feature is meant for experiments, and it
1146 * has a risk of crashing the system and running programs.
1147 * Therefore, when an eBPF program using this helper is attached,
1148 * a warning including PID and process name is printed to kernel
1149 * logs.
1150 * Return
1151 * 0 on success, or a negative error in case of failure.
1152 *
1153 * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
1154 * Description
1155 * Check whether the probe is being run in the context of a given
1156 * subset of the cgroup2 hierarchy. The cgroup2 to test is held by
1157 * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
1158 * Return
1159 * The return value depends on the result of the test, and can be:
1160 *
1161 * * 0, if current task belongs to the cgroup2.
1162 * * 1, if current task does not belong to the cgroup2.
1163 * * A negative error code, if an error occurred.
1164 *
1165 * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
1166 * Description
1167 * Resize (trim or grow) the packet associated to *skb* to the
1168 * new *len*. The *flags* are reserved for future usage, and must
1169 * be left at zero.
1170 *
1171 * The basic idea is that the helper performs the needed work to
1172 * change the size of the packet, then the eBPF program rewrites
1173 * the rest via helpers like **bpf_skb_store_bytes**\ (),
1174 * **bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ ()
1175 * and others. This helper is a slow path utility intended for
1176 * replies with control messages. And because it is targeted for
1177 * slow path, the helper itself can afford to be slow: it
1178 * implicitly linearizes, unclones and drops offloads from the
1179 * *skb*.
1180 *
1181 * A call to this helper is susceptible to change the underlying
1182 * packet buffer. Therefore, at load time, all checks on pointers
1183 * previously done by the verifier are invalidated and must be
1184 * performed again, if the helper is used in combination with
1185 * direct packet access.
1186 * Return
1187 * 0 on success, or a negative error in case of failure.
1188 *
1189 * int bpf_skb_pull_data(struct sk_buff *skb, u32 len)
1190 * Description
1191 * Pull in non-linear data in case the *skb* is non-linear and not
1192 * all of *len* are part of the linear section. Make *len* bytes
1193 * from *skb* readable and writable. If a zero value is passed for
1194 * *len*, then the whole length of the *skb* is pulled.
1195 *
1196 * This helper is only needed for reading and writing with direct
1197 * packet access.
1198 *
1199 * For direct packet access, testing that offsets to access
1200 * are within packet boundaries (test on *skb*\ **->data_end**) is
1201 * susceptible to fail if offsets are invalid, or if the requested
1202 * data is in non-linear parts of the *skb*. On failure the
1203 * program can just bail out, or in the case of a non-linear
1204 * buffer, use a helper to make the data available. The
1205 * **bpf_skb_load_bytes**\ () helper is a first solution to access
1206 * the data. Another one consists in using **bpf_skb_pull_data**\ ()
1207 * to pull in the non-linear parts once, then retesting and
1208 * eventually accessing the data.
1209 *
1210 * At the same time, this also makes sure the *skb* is uncloned,
1211 * which is a necessary condition for direct write. As this needs
1212 * to be an invariant for the write part only, the verifier
1213 * detects writes and adds a prologue that is calling
1214 * **bpf_skb_pull_data()** to effectively unclone the *skb* from
1215 * the very beginning in case it is indeed cloned.
1216 *
1217 * A call to this helper is susceptible to change the underlying
1218 * packet buffer. Therefore, at load time, all checks on pointers
1219 * previously done by the verifier are invalidated and must be
1220 * performed again, if the helper is used in combination with
1221 * direct packet access.
1222 * Return
1223 * 0 on success, or a negative error in case of failure.
1224 *
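
The re-test pattern described above, as a hedged fragment from a tc program that wants 34 contiguous bytes before using direct packet access (the size is again only an example):

    void *data     = (void *)(long)skb->data;
    void *data_end = (void *)(long)skb->data_end;

    if (data + 34 > data_end) {
            /* Headers may sit in the non-linear area: pull them in,
             * then reload the pointers and test the bounds again. */
            if (bpf_skb_pull_data(skb, 34) < 0)
                    return 0;       /* TC_ACT_OK */
            data     = (void *)(long)skb->data;
            data_end = (void *)(long)skb->data_end;
            if (data + 34 > data_end)
                    return 0;
    }
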
1225 * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum)
1226 * Description
1227 * Add the checksum *csum* into *skb*\ **->csum** in case the
1228 * driver has supplied a checksum for the entire packet into that
1229 * field. Return an error otherwise. This helper is intended to be
1230 * used in combination with **bpf_csum_diff**\ (), in particular
1231 * when the checksum needs to be updated after data has been
1232 * written into the packet through direct packet access.
1233 * Return
1234 * The checksum on success, or a negative error code in case of
1235 * failure.
1236 *
1237 * void bpf_set_hash_invalid(struct sk_buff *skb)
1238 * Description
1239 * Invalidate the current *skb*\ **->hash**. It can be used after
1240 * mangling on headers through direct packet access, in order to
1241 * indicate that the hash is outdated and to trigger a
1242 * recalculation the next time the kernel tries to access this
1243 * hash or when the **bpf_get_hash_recalc**\ () helper is called.
1244 *
1245 * int bpf_get_numa_node_id(void)
1246 * Description
1247 * Return the id of the current NUMA node. The primary use case
1248 * for this helper is the selection of sockets for the local NUMA
1249 * node, when the program is attached to sockets using the
1250 * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**),
1251 * but the helper is also available to other eBPF program types,
1252 * similarly to **bpf_get_smp_processor_id**\ ().
1253 * Return
1254 * The id of current NUMA node.
1255 *
1256 * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
1257 * Description
1258 * Grows headroom of packet associated to *skb* and adjusts the
1259 * offset of the MAC header accordingly, adding *len* bytes of
1260 * space. It automatically extends and reallocates memory as
1261 * required.
1262 *
1263 * This helper can be used on a layer 3 *skb* to push a MAC header
1264 * for redirection into a layer 2 device.
1265 *
1266 * All values for *flags* are reserved for future usage, and must
1267 * be left at zero.
1268 *
1269 * A call to this helper is susceptible to change the underlying
1270 * packet buffer. Therefore, at load time, all checks on pointers
1271 * previously done by the verifier are invalidated and must be
1272 * performed again, if the helper is used in combination with
1273 * direct packet access.
1274 * Return
1275 * 0 on success, or a negative error in case of failure.
1276 *
1277 * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
1278 * Description
1279 * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
1280 * it is possible to use a negative value for *delta*. This helper
1281 * can be used to prepare the packet for pushing or popping
1282 * headers.
1283 *
1284 * A call to this helper is susceptible to change the underlying
1285 * packet buffer. Therefore, at load time, all checks on pointers
1286 * previously done by the verifier are invalidated and must be
1287 * performed again, if the helper is used in combination with
1288 * direct packet access.
1289 * Return
1290 * 0 on success, or a negative error in case of failure.
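
A hedged XDP sketch that pops an assumed 8-byte outer header by moving the start of packet data forward; <linux/if_ether.h> is assumed for struct ethhdr, and the data pointers must be re-derived after the call.

    SEC("xdp")
    int strip_outer(struct xdp_md *ctx)
    {
            if (bpf_xdp_adjust_head(ctx, 8))
                    return XDP_ABORTED;

            void *data     = (void *)(long)ctx->data;
            void *data_end = (void *)(long)ctx->data_end;

            if (data + sizeof(struct ethhdr) > data_end)
                    return XDP_DROP;
            return XDP_PASS;
    }
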
1291 *
1292 * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
1293 * Description
1294 * Copy a NUL terminated string from an unsafe address
1295 * *unsafe_ptr* to *dst*. The *size* should include the
1296 * terminating NUL byte. In case the string length is smaller than
1297 * *size*, the target is not padded with further NUL bytes. If the
1298 * string length is larger than *size*, just *size*-1 bytes are
1299 * copied and the last byte is set to NUL.
1300 *
1301 * On success, the length of the copied string is returned. This
1302 * makes this helper useful in tracing programs for reading
1303 * strings, and more importantly to get its length at runtime. See
1304 * the following snippet:
1305 *
1306 * ::
1307 *
1308 * SEC("kprobe/sys_open")
1309 * void bpf_sys_open(struct pt_regs *ctx)
1310 * {
1311 * char buf[PATHLEN]; // PATHLEN is defined to 256
1312 * int res = bpf_probe_read_str(buf, sizeof(buf),
1313 * ctx->di);
1314 *
1315 * // Consume buf, for example push it to
1316 * // userspace via bpf_perf_event_output(); we
1317 * // can use res (the string length) as event
1318 * // size, after checking its boundaries.
1319 * }
1320 *
1321 * In comparison, using **bpf_probe_read()** helper here instead
1322 * to read the string would require estimating the length at
1323 * compile time, and would often result in copying more memory
1324 * than necessary.
1325 *
1326 * Another useful use case is when parsing individual process
1327 * arguments or individual environment variables navigating
1328 * *current*\ **->mm->arg_start** and *current*\
1329 * **->mm->env_start**: using this helper and the return value,
1330 * one can quickly iterate at the right offset of the memory area.
1331 * Return
1332 * On success, the strictly positive length of the string,
1333 * including the trailing NUL character. On error, a negative
1334 * value.
1335 *
1336 * u64 bpf_get_socket_cookie(struct sk_buff *skb)
1337 * Description
1338 * If the **struct sk_buff** pointed by *skb* has a known socket,
1339 * retrieve the cookie (generated by the kernel) of this socket.
1340 * If no cookie has been set yet, generate a new cookie. Once
1341 * generated, the socket cookie remains stable for the life of the
1342 * socket. This helper can be useful for monitoring per socket
1343 * networking traffic statistics as it provides a unique socket
1344 * identifier per namespace.
1345 * Return
1346 * An 8-byte long non-decreasing number on success, or 0 if the
1347 * socket field is missing inside *skb*.
1348 *
1349 * u32 bpf_get_socket_uid(struct sk_buff *skb)
1350 * Return
1351 * The owner UID of the socket associated to *skb*. If the socket
1352 * is **NULL**, or if it is not a full socket (i.e. if it is a
1353 * time-wait or a request socket instead), **overflowuid** value
1354 * is returned (note that **overflowuid** might also be the actual
1355 * UID value for the socket).
1356 *
1357 * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
1358 * Description
1359 * Set the full hash for *skb* (set the field *skb*\ **->hash**)
1360 * to value *hash*.
1361 * Return
1362 * 0
1363 *
1364 * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
1365 * Description
1366 * Emulate a call to **setsockopt()** on the socket associated to
1367 * *bpf_socket*, which must be a full socket. The *level* at
1368 * which the option resides and the name *optname* of the option
1369 * must be specified, see **setsockopt(2)** for more information.
1370 * The option value of length *optlen* is pointed by *optval*.
1371 *
1372 * This helper actually implements a subset of **setsockopt()**.
1373 * It supports the following *level*\ s:
1374 *
1375 * * **SOL_SOCKET**, which supports the following *optname*\ s:
1376 * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
1377 * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
1378 * * **IPPROTO_TCP**, which supports the following *optname*\ s:
1379 * **TCP_CONGESTION**, **TCP_BPF_IW**,
1380 * **TCP_BPF_SNDCWND_CLAMP**.
1381 * * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
1382 * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
1383 * Return
1384 * 0 on success, or a negative error in case of failure.
1385 *
1386 * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
1387 * Description
1388 * Grow or shrink the room for data in the packet associated to
1389 * *skb* by *len_diff*, and according to the selected *mode*.
1390 *
1391 * There is a single supported mode at this time:
1392 *
1393 * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
1394 * (room space is added or removed below the layer 3 header).
1395 *
1396 * All values for *flags* are reserved for future usage, and must
1397 * be left at zero.
1398 *
1399 * A call to this helper is susceptible to change the underlying
1400 * packet buffer. Therefore, at load time, all checks on pointers
1401 * previously done by the verifier are invalidated and must be
1402 * performed again, if the helper is used in combination with
1403 * direct packet access.
1404 * Return
1405 * 0 on success, or a negative error in case of failure.
1406 *
1407 * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
1408 * Description
1409 * Redirect the packet to the endpoint referenced by *map* at
1410 * index *key*. Depending on its type, this *map* can contain
1411 * references to net devices (for forwarding packets through other
1412 * ports), or to CPUs (for redirecting XDP frames to another CPU;
1413 * but this is only implemented for native XDP (with driver
1414 * support) as of this writing).
1415 *
1416 * All values for *flags* are reserved for future usage, and must
1417 * be left at zero.
1418 *
1419 * When used to redirect packets to net devices, this helper
1420 * provides a significant performance increase over **bpf_redirect**\ ().
1421 * This is due to various implementation details of the underlying
1422 * mechanisms, one of which is the fact that **bpf_redirect_map**\
1423 * () tries to send packets as a "bulk" to the device.
1424 * Return
1425 * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
1426 *
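A minimal sketch of the net-device case, assuming a **BPF_MAP_TYPE_DEVMAP** defined with the struct bpf_map_def convention used in the kernel samples; the map name and index are illustrative::

        struct bpf_map_def SEC("maps") tx_ports = {
                .type        = BPF_MAP_TYPE_DEVMAP,
                .key_size    = sizeof(__u32),
                .value_size  = sizeof(__u32),   /* ifindex */
                .max_entries = 64,
        };

        SEC("xdp")
        int xdp_redirect_port0(struct xdp_md *ctx)
        {
                /* Transmit every frame through the device stored at index 0;
                 * the helper's return value (XDP_REDIRECT or XDP_ABORTED) is
                 * returned as the program verdict.
                 */
                return bpf_redirect_map(&tx_ports, 0, 0);
        }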
1427 * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
1428 * Description
1429 * Redirect the packet to the socket referenced by *map* (of type
1430 * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
1431 * egress interfaces can be used for redirection. The
1432 * **BPF_F_INGRESS** value in *flags* is used to make the
1433 * distinction (ingress path is selected if the flag is present,
1434 * egress path otherwise). This is the only flag supported for now.
1435 * Return
1436 * **SK_PASS** on success, or **SK_DROP** on error.
1437 *
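A sketch of a stream verdict program; note that, as called from C programs (for example in the kernel's sockmap tests), the helper also takes the *skb* as its first argument. The sockmap definition and index are illustrative::

        struct bpf_map_def SEC("maps") sock_map = {
                .type        = BPF_MAP_TYPE_SOCKMAP,
                .key_size    = sizeof(int),
                .value_size  = sizeof(int),
                .max_entries = 2,
        };

        SEC("sk_skb/stream_verdict")
        int prog_verdict(struct __sk_buff *skb)
        {
                /* Deliver the data to the socket stored at index 1, on its
                 * ingress path.
                 */
                return bpf_sk_redirect_map(skb, &sock_map, 1, BPF_F_INGRESS);
        }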
1438 * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
1439 * Description
1440 * Add an entry to, or update a *map* referencing sockets. The
1441 * *skops* is used as a new value for the entry associated to
1442 * *key*. *flags* is one of:
1443 *
1444 * **BPF_NOEXIST**
1445 * The entry for *key* must not exist in the map.
1446 * **BPF_EXIST**
1447 * The entry for *key* must already exist in the map.
1448 * **BPF_ANY**
1449 * No condition on the existence of the entry for *key*.
1450 *
1451 * If the *map* has eBPF programs (parser and verdict), those will
1452 * be inherited by the socket being added. If the socket is
1453 * already attached to eBPF programs, this results in an error.
1454 * Return
1455 * 0 on success, or a negative error in case of failure.
1456 *
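A sketch of the usual pattern for populating such a map from a **BPF_PROG_TYPE_SOCK_OPS** program, reusing the illustrative sock_map defined in the previous sketch::

        SEC("sockops")
        int prog_add_sock(struct bpf_sock_ops *skops)
        {
                __u32 key = 0;

                switch (skops->op) {
                case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
                case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
                        /* Store this established socket at index 0, failing
                         * if an entry already exists there.
                         */
                        bpf_sock_map_update(skops, &sock_map, &key, BPF_NOEXIST);
                        break;
                }
                return 0;
        }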
1457 * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
1458 * Description
1459 * Adjust the address pointed by *xdp_md*\ **->data_meta** by
1460 * *delta* (which can be positive or negative). Note that this
1461 * operation modifies the address stored in *xdp_md*\ **->data**,
1462 * so the latter must be loaded only after the helper has been
1463 * called.
1464 *
1465 * The use of *xdp_md*\ **->data_meta** is optional and programs
1466 * are not required to use it. The rationale is that when the
1467 * packet is processed with XDP (e.g. as DoS filter), it is
1468 * possible to push further meta data along with it before passing
1469 * to the stack, and to give the guarantee that an ingress eBPF
1470 * program attached as a TC classifier on the same device can pick
1471 * this up for further post-processing. Since TC works with socket
1472 * buffers, it remains possible to set from XDP the **mark** or
1473 * **priority** pointers, or other pointers for the socket buffer.
1474 * Having this scratch space generic and programmable allows for
1475 * more flexibility as the user is free to store whatever meta
1476 * data they need.
1477 *
1478 * A call to this helper is susceptible to change the underlying
1479 * packet buffer. Therefore, at load time, all checks on pointers
1480 * previously done by the verifier are invalidated and must be
1481 * performed again, if the helper is used in combination with
1482 * direct packet access.
1483 * Return
1484 * 0 on success, or a negative error in case of failure.
1485 *
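A minimal sketch that reserves 4 bytes of metadata and stores a mark there for a later TC classifier to read, assuming the usual includes; the stored value is illustrative::

        SEC("xdp")
        int xdp_store_meta(struct xdp_md *ctx)
        {
                __u32 *meta;

                /* Reserve 4 bytes of metadata in front of the packet data. */
                if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
                        return XDP_PASS;

                /* Pointers must be re-derived and bounds-checked again. */
                meta = (void *)(long)ctx->data_meta;
                if ((void *)(meta + 1) > (void *)(long)ctx->data)
                        return XDP_PASS;

                *meta = 0x42;
                return XDP_PASS;
        }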
1486 * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
1487 * Description
1488 * Read the value of a perf event counter, and store it into *buf*
1489 * of size *buf_size*. This helper relies on a *map* of type
1490 * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event
1491 * counter is selected when *map* is updated with perf event file
1492 * descriptors. The *map* is an array whose size is the number of
1493 * available CPUs, and each cell contains a value relative to one
1494 * CPU. The value to retrieve is indicated by *flags*, that
1495 * contains the index of the CPU to look up, masked with
1496 * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
1497 * **BPF_F_CURRENT_CPU** to indicate that the value for the
1498 * current CPU should be retrieved.
1499 *
1500 * This helper behaves in a way close to
1501 * **bpf_perf_event_read**\ () helper, save that instead of
1502 * just returning the value observed, it fills the *buf*
1503 * structure. This allows for additional data to be retrieved: in
1504 * particular, the enabled and running times (in *buf*\
1505 * **->enabled** and *buf*\ **->running**, respectively) are
1506 * copied. In general, **bpf_perf_event_read_value**\ () is
1507 * recommended over **bpf_perf_event_read**\ (), which has some
1508 * ABI issues and provides fewer functionalities.
1509 *
1510 * These values are interesting, because hardware PMU (Performance
1511 * Monitoring Unit) counters are limited resources. When there are
1512 * more PMU based perf events opened than available counters,
1513 * the kernel will multiplex these events so that each event gets a
1514 * certain percentage (but not all) of the PMU time. When
1515 * multiplexing happens, the number of samples or the counter value
1516 * will not reflect the full activity, as it would without
1517 * multiplexing. This makes comparison between different runs difficult.
1518 * Typically, the counter value should be normalized before
1519 * comparing to other experiments. The usual normalization is done
1520 * as follows.
1521 *
1522 * ::
1523 *
1524 * normalized_counter = counter * t_enabled / t_running
1525 *
1526 * Where t_enabled is the time enabled for the event and t_running
1527 * is the time running for the event since the last normalization. The
1528 * enabled and running times are accumulated since the perf event
1529 * open. To achieve scaling factor between two invocations of an
1530 * eBPF program, users can use the CPU id as the key (which is
1531 * typical for perf array usage model) to remember the previous
1532 * value and do the calculation inside the eBPF program.
1533 * Return
1534 * 0 on success, or a negative error in case of failure.
1535 *
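A sketch assuming a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** whose entries were filled from user space with hardware perf event file descriptors, one per CPU; names and the kprobe target are illustrative::

        struct bpf_map_def SEC("maps") cpu_counters = {
                .type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
                .key_size    = sizeof(int),
                .value_size  = sizeof(__u32),
                .max_entries = 64,      /* >= number of possible CPUs */
        };

        SEC("kprobe/sys_write")
        int count_writes(struct pt_regs *ctx)
        {
                struct bpf_perf_event_value val = {};

                /* Read the counter opened for the CPU we are running on. */
                if (bpf_perf_event_read_value(&cpu_counters, BPF_F_CURRENT_CPU,
                                              &val, sizeof(val)))
                        return 0;

                /* val.counter, val.enabled and val.running are now usable,
                 * e.g. for the counter * enabled / running normalization.
                 */
                return 0;
        }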
1536 * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
1537 * Description
1538 * For an eBPF program attached to a perf event, retrieve the
1539 * value of the event counter associated to *ctx* and store it in
1540 * the structure pointed by *buf* and of size *buf_size*. Enabled
1541 * and running times are also stored in the structure (see
1542 * description of helper **bpf_perf_event_read_value**\ () for
1543 * more details).
1544 * Return
1545 * 0 on success, or a negative error in case of failure.
1546 *
1547 * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
1548 * Description
1549 * Emulate a call to **getsockopt()** on the socket associated to
1550 * *bpf_socket*, which must be a full socket. The *level* at
1551 * which the option resides and the name *optname* of the option
1552 * must be specified, see **getsockopt(2)** for more information.
1553 * The retrieved value is stored in the structure pointed by
1554 * *optval* and of length *optlen*.
1555 *
1556 * This helper actually implements a subset of **getsockopt()**.
1557 * It supports the following *level*\ s:
1558 *
1559 * * **IPPROTO_TCP**, which supports *optname*
1560 * **TCP_CONGESTION**.
1561 * * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
1562 * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
1563 * Return
1564 * 0 on success, or a negative error in case of failure.
1565 *
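A sketch from a **BPF_PROG_TYPE_SOCK_OPS** program reading the congestion control algorithm of a newly established connection; the includes providing **IPPROTO_TCP** and **TCP_CONGESTION** are assumed::

        SEC("sockops")
        int check_cong(struct bpf_sock_ops *skops)
        {
                char cc[16] = {};

                if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB &&
                    !bpf_getsockopt(skops, IPPROTO_TCP, TCP_CONGESTION,
                                    cc, sizeof(cc))) {
                        /* cc now holds the algorithm name, e.g. "cubic";
                         * it could be exported through a map.
                         */
                }
                return 0;
        }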
1566 * int bpf_override_return(struct pt_regs *regs, u64 rc)
1567 * Description
1568 * Used for error injection, this helper uses kprobes to override
1569 * the return value of the probed function, and to set it to *rc*.
1570 * The first argument is the context *regs* on which the kprobe
1571 * works.
1572 *
1573 * This helper works by setting the PC (program counter)
1574 * to an override function which is run in place of the original
1575 * probed function. This means the probed function is not run at
1576 * all. The replacement function just returns with the required
1577 * value.
1578 *
1579 * This helper has security implications, and thus is subject to
1580 * restrictions. It is only available if the kernel was compiled
1581 * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
1582 * option, and in this case it only works on functions tagged with
1583 * **ALLOW_ERROR_INJECTION** in the kernel code.
1584 *
1585 * Also, the helper is only available for the architectures having
1586 * the **CONFIG_FUNCTION_ERROR_INJECTION** option. As of this writing,
1587 * x86 architecture is the only one to support this feature.
1588 * Return
1589 * 0
1590 *
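A sketch of error injection from a kprobe program, assuming a kernel built with **CONFIG_BPF_KPROBE_OVERRIDE** and a probed function tagged for error injection; the probed function name is illustrative::

        SEC("kprobe/open_ctree")
        int override_ret(struct pt_regs *ctx)
        {
                /* Make the probed function return -ENOMEM (-12) without
                 * executing its body.
                 */
                bpf_override_return(ctx, -12);
                return 0;
        }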
1591 * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
1592 * Description
1593 * Attempt to set the value of the **bpf_sock_ops_cb_flags** field
1594 * for the full TCP socket associated to *bpf_sock* to
1595 * *argval*.
1596 *
1597 * The primary use of this field is to determine if there should
1598 * be calls to eBPF programs of type
1599 * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
1600 * code. A program of the same type can change its value, per
1601 * connection and as necessary, when the connection is
1602 * established. This field is directly accessible for reading, but
1603 * this helper must be used for updates in order to return an
1604 * error if an eBPF program tries to set a callback that is not
1605 * supported in the current kernel.
1606 *
1607 * The supported callback values that *argval* can combine are:
1608 *
1609 * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
1610 * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
1611 * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
1612 *
1613 * Here are some examples of where one could call such eBPF
1614 * program:
1615 *
1616 * * When RTO fires.
1617 * * When a packet is retransmitted.
1618 * * When the connection terminates.
1619 * * When a packet is sent.
1620 * * When a packet is received.
1621 * Return
1622 * Code **-EINVAL** if the socket is not a full TCP socket;
1623 * otherwise, a positive number containing the bits that could not
1624 * be set is returned (which comes down to 0 if all bits were set
1625 * as required).
1626 *
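A sketch enabling the three callback types above once a connection is established; later invocations of the same program will then also see the RTO, retransmission and state-change events::

        SEC("sockops")
        int enable_cb(struct bpf_sock_ops *skops)
        {
                if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
                    skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
                        bpf_sock_ops_cb_flags_set(skops,
                                                  BPF_SOCK_OPS_RTO_CB_FLAG |
                                                  BPF_SOCK_OPS_RETRANS_CB_FLAG |
                                                  BPF_SOCK_OPS_STATE_CB_FLAG);
                return 0;
        }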
1627 * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
1628 * Description
1629 * This helper is used in programs implementing policies at the
1630 * socket level. If the message *msg* is allowed to pass (i.e. if
1631 * the verdict eBPF program returns **SK_PASS**), redirect it to
1632 * the socket referenced by *map* (of type
1633 * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
1634 * egress interfaces can be used for redirection. The
1635 * **BPF_F_INGRESS** value in *flags* is used to make the
1636 * distinction (ingress path is selected if the flag is present,
1637 * egress path otherwise). This is the only flag supported for now.
1638 * Return
1639 * **SK_PASS** on success, or **SK_DROP** on error.
1640 *
1641 * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
1642 * Description
1643 * For socket policies, apply the verdict of the eBPF program to
1644 * the next *bytes* (number of bytes) of message *msg*.
1645 *
1646 * For example, this helper can be used in the following cases:
1647 *
1648 * * A single **sendmsg**\ () or **sendfile**\ () system call
1649 * contains multiple logical messages that the eBPF program is
1650 * supposed to read and for which it should apply a verdict.
1651 * * An eBPF program only cares to read the first *bytes* of a
1652 * *msg*. If the message has a large payload, then setting up
1653 * and calling the eBPF program repeatedly for all bytes, even
1654 * though the verdict is already known, would create unnecessary
1655 * overhead.
1656 *
1657 * When called from within an eBPF program, the helper sets a
1658 * counter internal to the BPF infrastructure, that is used to
1659 * apply the last verdict to the next *bytes*. If *bytes* is
1660 * smaller than the current data being processed from a
1661 * **sendmsg**\ () or **sendfile**\ () system call, the first
1662 * *bytes* will be sent and the eBPF program will be re-run with
1663 * the pointer for start of data pointing to byte number *bytes*
1664 * **+ 1**. If *bytes* is larger than the current data being
1665 * processed, then the eBPF verdict will be applied to multiple
1666 * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are
1667 * consumed.
1668 *
1669 * Note that if a socket closes with the internal counter holding
1670 * a non-zero value, this is not a problem because data is not
1671 * being buffered for *bytes* and is sent as it is received.
1672 * Return
1673 * 0
1674 *
1675 * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
1676 * Description
1677 * For socket policies, prevent the execution of the verdict eBPF
1678 * program for message *msg* until *bytes* (byte number) have been
1679 * accumulated.
1680 *
1681 * This can be used when one needs a specific number of bytes
1682 * before a verdict can be assigned, even if the data spans
1683 * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme
1684 * case would be a user calling **sendmsg**\ () repeatedly with
1685 * 1-byte long message segments. Obviously, this is bad for
1686 * performance, but it is still valid. If the eBPF program needs
1687 * *bytes* bytes to validate a header, this helper can be used to
1688 * prevent the eBPF program from being called again until *bytes* have
1689 * been accumulated.
1690 * Return
1691 * 0
1692 *
1693 * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
1694 * Description
1695 * For socket policies, pull in non-linear data from user space
1696 * for *msg* and set pointers *msg*\ **->data** and *msg*\
1697 * **->data_end** to *start* and *end* byte offsets into *msg*,
1698 * respectively.
1699 *
1700 * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
1701 * *msg* it can only parse data that the (**data**, **data_end**)
1702 * pointers have already consumed. For **sendmsg**\ () hooks this
1703 * is likely the first scatterlist element. But for calls relying
1704 * on the **sendpage** handler (e.g. **sendfile**\ ()) this will
1705 * be the range (**0**, **0**) because the data is shared with
1706 * user space and by default the objective is to avoid allowing
1707 * user space to modify data while (or after) eBPF verdict is
1708 * being decided. This helper can be used to pull in data and to
1709 * set the start and end pointer to given values. Data will be
1710 * copied if necessary (i.e. if data was not linear and if start
1711 * and end pointers do not point to the same chunk).
1712 *
1713 * A call to this helper is susceptible to change the underlying
1714 * packet buffer. Therefore, at load time, all checks on pointers
1715 * previously done by the verifier are invalidated and must be
1716 * performed again, if the helper is used in combination with
1717 * direct packet access.
1718 *
1719 * All values for *flags* are reserved for future usage, and must
1720 * be left at zero.
1721 * Return
1722 * 0 on success, or a negative error in case of failure.
1723 *
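Taken together, a **BPF_PROG_TYPE_SK_MSG** verdict program typically combines the three helpers above; a minimal sketch, assuming a hypothetical 12-byte application header and the usual includes::

        SEC("sk_msg")
        int prog_msg_verdict(struct sk_msg_md *msg)
        {
                void *data, *data_end;

                /* Wait until at least 12 bytes have been queued ... */
                bpf_msg_cork_bytes(msg, 12);
                /* ... and make them visible through data/data_end. */
                if (bpf_msg_pull_data(msg, 0, 12, 0))
                        return SK_DROP;

                data = (void *)(long)msg->data;
                data_end = (void *)(long)msg->data_end;
                if (data + 12 > data_end)
                        return SK_DROP;

                /* Parse the header here, then apply the verdict to the next
                 * 64 KiB of payload without re-running the program.
                 */
                bpf_msg_apply_bytes(msg, 65536);
                return SK_PASS;
        }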
1724 * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
1725 * Description
1726 * Bind the socket associated to *ctx* to the address pointed by
1727 * *addr*, of length *addr_len*. This allows for making outgoing
1728 * connection from the desired IP address, which can be useful for
1729 * example when all processes inside a cgroup should use one
1730 * single IP address on a host that has multiple IP addresses configured.
1731 *
1732 * This helper works for IPv4 and IPv6, TCP and UDP sockets. The
1733 * domain (*addr*\ **->sa_family**) must be **AF_INET** (or
1734 * **AF_INET6**). Looking for a free port to bind to can be
1735 * expensive, therefore binding to port is not permitted by the
1736 * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
1737 * must be set to zero.
1738 * Return
1739 * 0 on success, or a negative error in case of failure.
1740 *
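A sketch of a cgroup *sock_addr* program forcing outgoing IPv4 connections to bind to a fixed source address; the section name, the address and the <netinet/in.h>/bpf_endian.h includes are assumptions::

        SEC("cgroup/connect4")
        int bind_to_src(struct bpf_sock_addr *ctx)
        {
                struct sockaddr_in sa = {};

                sa.sin_family = AF_INET;
                sa.sin_addr.s_addr = bpf_htonl(0xac100164); /* 172.16.1.100 */
                /* sin_port is left at zero: the helper refuses to pick one. */

                if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)))
                        return 0;       /* deny the connect() */

                return 1;               /* allow */
        }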
1741 * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
1742 * Description
1743 * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
1744 * only possible to shrink the packet as of this writing,
1745 * therefore *delta* must be a negative integer.
1746 *
1747 * A call to this helper is susceptible to change the underlying
1748 * packet buffer. Therefore, at load time, all checks on pointers
1749 * previously done by the verifier are invalidated and must be
1750 * performed again, if the helper is used in combination with
1751 * direct packet access.
1752 * Return
1753 * 0 on success, or a negative error in case of failure.
1754 *
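A minimal sketch that truncates large frames, assuming the usual includes; the sizes are illustrative::

        SEC("xdp")
        int xdp_trim(struct xdp_md *ctx)
        {
                void *data = (void *)(long)ctx->data;
                void *data_end = (void *)(long)ctx->data_end;

                if (data + 68 <= data_end) {
                        /* Chop 4 bytes off the tail (delta must be negative). */
                        if (bpf_xdp_adjust_tail(ctx, -4))
                                return XDP_ABORTED;
                }
                return XDP_PASS;
        }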
1755 * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags)
1756 * Description
1757 * Retrieve the XFRM state (IP transform framework, see also
1758 * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*.
1759 *
1760 * The retrieved value is stored in the **struct bpf_xfrm_state**
1761 * pointed by *xfrm_state* and of length *size*.
1762 *
1763 * All values for *flags* are reserved for future usage, and must
1764 * be left at zero.
1765 *
1766 * This helper is available only if the kernel was compiled with
1767 * **CONFIG_XFRM** configuration option.
1768 * Return
1769 * 0 on success, or a negative error in case of failure.
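A sketch of a tc classifier matching on the *reqid* of the first state on the security path, assuming **CONFIG_XFRM** and the usual <linux/pkt_cls.h> include; the reqid value and mark are illustrative::

        SEC("classifier")
        int match_reqid(struct __sk_buff *skb)
        {
                struct bpf_xfrm_state xs = {};

                /* Look at the first XFRM state attached to the skb. */
                if (bpf_skb_get_xfrm_state(skb, 0, &xs, sizeof(xs), 0))
                        return TC_ACT_OK;

                if (xs.reqid == 1)
                        skb->mark = 1;  /* e.g. steer with fwmark rules */

                return TC_ACT_OK;
        }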
1770 */
1771#define __BPF_FUNC_MAPPER(FN)	\
1772	FN(unspec),		\
@@ -841,7 +1834,8 @@ union bpf_attr {
1834	FN(msg_cork_bytes),	\
1835	FN(msg_pull_data),	\
1836	FN(bind),		\
844 FN(xdp_adjust_tail), 1837 FN(xdp_adjust_tail), \
1838 FN(skb_get_xfrm_state),
1839
1840/* integer value in 'imm' field of BPF_CALL instruction selects which helper
1841 * function eBPF program intends to call
@@ -947,6 +1941,19 @@ struct bpf_tunnel_key {
1941	__u32 tunnel_label;
1942};
1943
1944/* user accessible mirror of in-kernel xfrm_state.
1945 * new fields can only be added to the end of this structure
1946 */
1947struct bpf_xfrm_state {
1948 __u32 reqid;
1949 __u32 spi; /* Stored in network byte order */
1950 __u16 family;
1951 union {
1952 __u32 remote_ipv4; /* Stored in network byte order */
1953 __u32 remote_ipv6[4]; /* Stored in network byte order */
1954 };
1955};
1956
1957/* Generic BPF return codes which all BPF program types may support.
1958 * The values are binary compatible with their TC_ACT_* counter-part to
1959 * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
@@ -1037,6 +2044,7 @@ struct bpf_prog_info {
2044	__aligned_u64 map_ids;
2045	char name[BPF_OBJ_NAME_LEN];
2046	__u32 ifindex;
2047 __u32 gpl_compatible:1;
2048	__u64 netns_dev;
2049	__u64 netns_ino;
2050} __attribute__((aligned(8)));
diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
index 74a30b1090df..bcb56ee47014 100644
--- a/tools/include/uapi/linux/btf.h
+++ b/tools/include/uapi/linux/btf.h
@@ -6,9 +6,7 @@
6#include <linux/types.h>
7
8#define BTF_MAGIC	0xeB9F
9#define BTF_MAGIC_SWAP 0x9FeB
9#define BTF_VERSION	1
11#define BTF_FLAGS_COMPR 0x01
10
11struct btf_header {
12	__u16 magic;
@@ -43,7 +41,7 @@ struct btf_header {
41#define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET)
42
43struct btf_type {
46 __u32 name; 44 __u32 name_off;
45	/* "info" bits arrangement
46	 * bits 0-15: vlen (e.g. # of struct's members)
47	 * bits 16-23: unused
@@ -105,7 +103,7 @@ struct btf_type {
103	 * info in "struct btf_type").
104	 */
105struct btf_enum {
108 __u32 name; 106 __u32 name_off;
107	__s32 val;
108};
109
@@ -122,7 +120,7 @@ struct btf_array {
120	 * "struct btf_type").
121	 */
122struct btf_member {
125 __u32 name; 123 __u32 name_off;
124	__u32 type;
125	__u32 offset;	/* offset in bits */
126};