aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/networking/packet_mmap.txt
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/networking/packet_mmap.txt')
-rw-r--r--Documentation/networking/packet_mmap.txt368
1 files changed, 362 insertions, 6 deletions
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 94444b152fbc..23dd80e82b8e 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -685,14 +685,342 @@ int main(int argc, char **argp)
685} 685}
686 686
687------------------------------------------------------------------------------- 687-------------------------------------------------------------------------------
688+ AF_PACKET TPACKET_V3 example
689-------------------------------------------------------------------------------
690
691AF_PACKET's TPACKET_V3 ring buffer can be configured to use non-static frame
692sizes by doing its own memory management. It is based on blocks where polling
693works on a per block basis instead of per ring as in TPACKET_V2 and its predecessor.
694
695It is said that TPACKET_V3 brings the following benefits:
696 *) ~15 - 20% reduction in CPU-usage
697 *) ~20% increase in packet capture rate
698 *) ~2x increase in packet density
699 *) Port aggregation analysis
700 *) Non static frame size to capture entire packet payload
701
702So it seems to be a good candidate to be used with packet fanout.
703
704Minimal example code by Daniel Borkmann based on Chetan Loke's lolpcap (compile
705it with gcc -Wall -O2 blob.c, and try things like "./a.out eth0", etc.):
706
707#include <stdio.h>
708#include <stdlib.h>
709#include <stdint.h>
710#include <string.h>
711#include <assert.h>
712#include <net/if.h>
713#include <arpa/inet.h>
714#include <netdb.h>
715#include <poll.h>
716#include <unistd.h>
717#include <signal.h>
718#include <inttypes.h>
719#include <sys/socket.h>
720#include <sys/mman.h>
721#include <linux/if_packet.h>
722#include <linux/if_ether.h>
723#include <linux/ip.h>
724
/* Ring geometry: 64 blocks of 4 MiB each; the frame count follows from them. */
#define BLOCK_SIZE		(1 << 22)
#define FRAME_SIZE		2048

#define NUM_BLOCKS		64
#define NUM_FRAMES		((BLOCK_SIZE * NUM_BLOCKS) / FRAME_SIZE)

/* Values passed as tp_retire_blk_tov and tp_sizeof_priv in the ring request. */
#define BLOCK_RETIRE_TOV_IN_MS	64
#define BLOCK_PRIV_AREA_SZ	13

/* Round x up to the next multiple of 8. */
#define ALIGN_8(x)		(((x) + 7) & ~7)

/* Field accessors for a struct block_desc pointer x. */
#define BLOCK_STATUS(x)		((x)->h1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->h1.num_pkts)
#define BLOCK_O2FP(x)		((x)->h1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->h1.blk_len)
#define BLOCK_SNUM(x)		((x)->h1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
/* Address located offset_to_priv bytes past the start of block x. */
#define BLOCK_PRIV(x)		((void *) ((uint8_t *) (x) + BLOCK_O2PRIV(x)))
/* 8-byte aligned descriptor size, without and with a private area of sz_pri bytes. */
#define BLOCK_HDR_LEN		(ALIGN_8(sizeof(struct block_desc)))
#define BLOCK_PLUS_PRIV(sz_pri)	(BLOCK_HDR_LEN + ALIGN_8((sz_pri)))

/* Branch prediction hints; only defined when not already provided. */
#ifndef likely
# define likely(x)		__builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
# define unlikely(x)		__builtin_expect(!!(x), 0)
#endif
752
753struct block_desc {
754 uint32_t version;
755 uint32_t offset_to_priv;
756 struct tpacket_hdr_v1 h1;
757};
758
759struct ring {
760 struct iovec *rd;
761 uint8_t *map;
762 struct tpacket_req3 req;
763};
764
/* Running totals; walk_block() adds each processed block's counts to them. */
static unsigned long packets_total = 0, bytes_total = 0;

/*
 * Set to 1 by sighandler() and tested by the capture loop in main().
 * It must be volatile: the object is modified asynchronously from a signal
 * handler, and volatile sig_atomic_t is the type the C standard guarantees
 * can be safely written there and observed by the interrupted code.
 */
static volatile sig_atomic_t sigint = 0;

/* Signal handler: records that a signal arrived; the number is not used. */
void sighandler(int num)
{
	(void) num;
	sigint = 1;
}
772
773static int setup_socket(struct ring *ring, char *netdev)
774{
775 int err, i, fd, v = TPACKET_V3;
776 struct sockaddr_ll ll;
777
778 fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
779 if (fd < 0) {
780 perror("socket");
781 exit(1);
782 }
783
784 err = setsockopt(fd, SOL_PACKET, PACKET_VERSION, &v, sizeof(v));
785 if (err < 0) {
786 perror("setsockopt");
787 exit(1);
788 }
789
790 memset(&ring->req, 0, sizeof(ring->req));
791 ring->req.tp_block_size = BLOCK_SIZE;
792 ring->req.tp_frame_size = FRAME_SIZE;
793 ring->req.tp_block_nr = NUM_BLOCKS;
794 ring->req.tp_frame_nr = NUM_FRAMES;
795 ring->req.tp_retire_blk_tov = BLOCK_RETIRE_TOV_IN_MS;
796 ring->req.tp_sizeof_priv = BLOCK_PRIV_AREA_SZ;
797 ring->req.tp_feature_req_word |= TP_FT_REQ_FILL_RXHASH;
798
799 err = setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &ring->req,
800 sizeof(ring->req));
801 if (err < 0) {
802 perror("setsockopt");
803 exit(1);
804 }
805
806 ring->map = mmap(NULL, ring->req.tp_block_size * ring->req.tp_block_nr,
807 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED,
808 fd, 0);
809 if (ring->map == MAP_FAILED) {
810 perror("mmap");
811 exit(1);
812 }
813
814 ring->rd = malloc(ring->req.tp_block_nr * sizeof(*ring->rd));
815 assert(ring->rd);
816 for (i = 0; i < ring->req.tp_block_nr; ++i) {
817 ring->rd[i].iov_base = ring->map + (i * ring->req.tp_block_size);
818 ring->rd[i].iov_len = ring->req.tp_block_size;
819 }
820
821 memset(&ll, 0, sizeof(ll));
822 ll.sll_family = PF_PACKET;
823 ll.sll_protocol = htons(ETH_P_ALL);
824 ll.sll_ifindex = if_nametoindex(netdev);
825 ll.sll_hatype = 0;
826 ll.sll_pkttype = 0;
827 ll.sll_halen = 0;
828
829 err = bind(fd, (struct sockaddr *) &ll, sizeof(ll));
830 if (err < 0) {
831 perror("bind");
832 exit(1);
833 }
834
835 return fd;
836}
837
838#ifdef __checked
/* Sequence number of the last block that passed the check (0 before any). */
static uint64_t prev_block_seq_num = 0;

/*
 * Verify that pbd carries the sequence number immediately following the
 * previously checked block. On a mismatch a diagnostic is printed and the
 * process exits with status 1; otherwise the new number is remembered.
 */
void assert_block_seq_num(struct block_desc *pbd)
{
	const uint64_t expected = prev_block_seq_num + 1;
	const uint64_t actual = BLOCK_SNUM(pbd);

	if (unlikely(expected != actual)) {
		printf("prev_block_seq_num:%"PRIu64", expected seq:%"PRIu64" != "
		       "actual seq:%"PRIu64"\n", prev_block_seq_num,
		       expected, actual);
		exit(1);
	}

	prev_block_seq_num = actual;
}
852
853static void assert_block_len(struct block_desc *pbd, uint32_t bytes, int block_num)
854{
855 if (BLOCK_NUM_PKTS(pbd)) {
856 if (unlikely(bytes != BLOCK_LEN(pbd))) {
857 printf("block:%u with %upackets, expected len:%u != actual len:%u\n",
858 block_num, BLOCK_NUM_PKTS(pbd), bytes, BLOCK_LEN(pbd));
859 exit(1);
860 }
861 } else {
862 if (unlikely(BLOCK_LEN(pbd) != BLOCK_PLUS_PRIV(BLOCK_PRIV_AREA_SZ))) {
863 printf("block:%u, expected len:%lu != actual len:%u\n",
864 block_num, BLOCK_HDR_LEN, BLOCK_LEN(pbd));
865 exit(1);
866 }
867 }
868}
869
870static void assert_block_header(struct block_desc *pbd, const int block_num)
871{
872 uint32_t block_status = BLOCK_STATUS(pbd);
873
874 if (unlikely((block_status & TP_STATUS_USER) == 0)) {
875 printf("block:%u, not in TP_STATUS_USER\n", block_num);
876 exit(1);
877 }
878
879 assert_block_seq_num(pbd);
880}
881#else
/* No-op variant of the block header check: performs no validation. */
static inline void assert_block_header(struct block_desc *pbd, const int block_num)
{
	(void) pbd;
	(void) block_num;
}

/* No-op variant of the block length check: performs no validation. */
static void assert_block_len(struct block_desc *pbd, uint32_t bytes, int block_num)
{
	(void) pbd;
	(void) bytes;
	(void) block_num;
}
888#endif
889
890static void display(struct tpacket3_hdr *ppd)
891{
892 struct ethhdr *eth = (struct ethhdr *) ((uint8_t *) ppd + ppd->tp_mac);
893 struct iphdr *ip = (struct iphdr *) ((uint8_t *) eth + ETH_HLEN);
894
895 if (eth->h_proto == htons(ETH_P_IP)) {
896 struct sockaddr_in ss, sd;
897 char sbuff[NI_MAXHOST], dbuff[NI_MAXHOST];
898
899 memset(&ss, 0, sizeof(ss));
900 ss.sin_family = PF_INET;
901 ss.sin_addr.s_addr = ip->saddr;
902 getnameinfo((struct sockaddr *) &ss, sizeof(ss),
903 sbuff, sizeof(sbuff), NULL, 0, NI_NUMERICHOST);
904
905 memset(&sd, 0, sizeof(sd));
906 sd.sin_family = PF_INET;
907 sd.sin_addr.s_addr = ip->daddr;
908 getnameinfo((struct sockaddr *) &sd, sizeof(sd),
909 dbuff, sizeof(dbuff), NULL, 0, NI_NUMERICHOST);
910
911 printf("%s -> %s, ", sbuff, dbuff);
912 }
913
914 printf("rxhash: 0x%x\n", ppd->hv1.tp_rxhash);
915}
916
917static void walk_block(struct block_desc *pbd, const int block_num)
918{
919 int num_pkts = BLOCK_NUM_PKTS(pbd), i;
920 unsigned long bytes = 0;
921 unsigned long bytes_with_padding = BLOCK_PLUS_PRIV(BLOCK_PRIV_AREA_SZ);
922 struct tpacket3_hdr *ppd;
923
924 assert_block_header(pbd, block_num);
925
926 ppd = (struct tpacket3_hdr *) ((uint8_t *) pbd + BLOCK_O2FP(pbd));
927 for (i = 0; i < num_pkts; ++i) {
928 bytes += ppd->tp_snaplen;
929 if (ppd->tp_next_offset)
930 bytes_with_padding += ppd->tp_next_offset;
931 else
932 bytes_with_padding += ALIGN_8(ppd->tp_snaplen + ppd->tp_mac);
933
934 display(ppd);
935
936 ppd = (struct tpacket3_hdr *) ((uint8_t *) ppd + ppd->tp_next_offset);
937 __sync_synchronize();
938 }
939
940 assert_block_len(pbd, bytes_with_padding, block_num);
941
942 packets_total += num_pkts;
943 bytes_total += bytes;
944}
945
946void flush_block(struct block_desc *pbd)
947{
948 BLOCK_STATUS(pbd) = TP_STATUS_KERNEL;
949 __sync_synchronize();
950}
951
952static void teardown_socket(struct ring *ring, int fd)
953{
954 munmap(ring->map, ring->req.tp_block_size * ring->req.tp_block_nr);
955 free(ring->rd);
956 close(fd);
957}
958
959int main(int argc, char **argp)
960{
961 int fd, err;
962 socklen_t len;
963 struct ring ring;
964 struct pollfd pfd;
965 unsigned int block_num = 0;
966 struct block_desc *pbd;
967 struct tpacket_stats_v3 stats;
968
969 if (argc != 2) {
970 fprintf(stderr, "Usage: %s INTERFACE\n", argp[0]);
971 return EXIT_FAILURE;
972 }
973
974 signal(SIGINT, sighandler);
975
976 memset(&ring, 0, sizeof(ring));
977 fd = setup_socket(&ring, argp[argc - 1]);
978 assert(fd > 0);
979
980 memset(&pfd, 0, sizeof(pfd));
981 pfd.fd = fd;
982 pfd.events = POLLIN | POLLERR;
983 pfd.revents = 0;
984
985 while (likely(!sigint)) {
986 pbd = (struct block_desc *) ring.rd[block_num].iov_base;
987retry_block:
988 if ((BLOCK_STATUS(pbd) & TP_STATUS_USER) == 0) {
989 poll(&pfd, 1, -1);
990 goto retry_block;
991 }
992
993 walk_block(pbd, block_num);
994 flush_block(pbd);
995 block_num = (block_num + 1) % NUM_BLOCKS;
996 }
997
998 len = sizeof(stats);
999 err = getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len);
1000 if (err < 0) {
1001 perror("getsockopt");
1002 exit(1);
1003 }
1004
1005 fflush(stdout);
1006 printf("\nReceived %u packets, %lu bytes, %u dropped, freeze_q_cnt: %u\n",
1007 stats.tp_packets, bytes_total, stats.tp_drops,
1008 stats.tp_freeze_q_cnt);
1009
1010 teardown_socket(&ring, fd);
1011 return 0;
1012}
1013
1014-------------------------------------------------------------------------------
688+ PACKET_TIMESTAMP 1015+ PACKET_TIMESTAMP
689------------------------------------------------------------------------------- 1016-------------------------------------------------------------------------------
690 1017
691The PACKET_TIMESTAMP setting determines the source of the timestamp in 1018The PACKET_TIMESTAMP setting determines the source of the timestamp in
692the packet meta information. If your NIC is capable of timestamping 1019the packet meta information for mmap(2)ed RX_RING and TX_RINGs. If your
693packets in hardware, you can request those hardware timestamps to used. 1020NIC is capable of timestamping packets in hardware, you can request those
694Note: you may need to enable the generation of hardware timestamps with 1021hardware timestamps to be used. Note: you may need to enable the generation
695SIOCSHWTSTAMP. 1022of hardware timestamps with SIOCSHWTSTAMP (see related information from
1023Documentation/networking/timestamping.txt).
696 1024
697PACKET_TIMESTAMP accepts the same integer bit field as 1025PACKET_TIMESTAMP accepts the same integer bit field as
698SO_TIMESTAMPING. However, only the SOF_TIMESTAMPING_SYS_HARDWARE 1026SO_TIMESTAMPING. However, only the SOF_TIMESTAMPING_SYS_HARDWARE
@@ -704,8 +1032,36 @@ SOF_TIMESTAMPING_RAW_HARDWARE if both bits are set.
704 req |= SOF_TIMESTAMPING_SYS_HARDWARE; 1032 req |= SOF_TIMESTAMPING_SYS_HARDWARE;
705 setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, (void *) &req, sizeof(req)) 1033 setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, (void *) &req, sizeof(req))
706 1034
707If PACKET_TIMESTAMP is not set, a software timestamp generated inside 1035For the mmap(2)ed ring buffers, such timestamps are stored in the
708the networking stack is used (the behavior before this setting was added). 1036tpacket{,2,3}_hdr structure's tp_sec and tp_{n,u}sec members. To determine
1037what kind of timestamp has been reported, the tp_status field is binary |'ed
1038with the following possible bits ...
1039
1040 TP_STATUS_TS_SYS_HARDWARE
1041 TP_STATUS_TS_RAW_HARDWARE
1042 TP_STATUS_TS_SOFTWARE
1043
1044... that are equivalent to its SOF_TIMESTAMPING_* counterparts. For the
1045RX_RING, if none of those 3 are set (i.e. PACKET_TIMESTAMP is not set),
1046then this means that a software fallback was invoked *within* PF_PACKET's
1047processing code (less precise).
1048
1049Getting timestamps for the TX_RING works as follows: i) fill the ring frames,
1050ii) call sendto() e.g. in blocking mode, iii) wait for the status of the relevant
1051frames to be updated, i.e. for the frames to be handed back to the application,
1052iv) walk through the frames to pick up the individual hw/sw timestamps.
1053
1054Only (!) if transmit timestamping is enabled, then these bits are combined
1055with binary | with TP_STATUS_AVAILABLE, so you must check for that in your
1056application (e.g. !(tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING))
1057in a first step to see if the frame belongs to the application, and then
1058one can extract the type of timestamp in a second step from tp_status)!
1059
1060If you do not need transmit timestamps and have left them disabled, checking for
1061TP_STATUS_AVAILABLE or TP_STATUS_WRONG_FORMAT is sufficient. If in the
1062TX_RING part only TP_STATUS_AVAILABLE is set, then the tp_sec and tp_{n,u}sec
1063members do not contain a valid value. For TX_RINGs, by default no timestamp
1064is generated!
709 1065
710See include/linux/net_tstamp.h and Documentation/networking/timestamping 1066See include/linux/net_tstamp.h and Documentation/networking/timestamping
711for more information on hardware timestamps. 1067for more information on hardware timestamps.