aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/networking
diff options
context:
space:
mode:
authorDaniel Borkmann <dborkman@redhat.com>2013-03-29 01:36:29 -0400
committerDavid S. Miller <davem@davemloft.net>2013-03-30 17:40:27 -0400
commit4eb06148250f92e1e58bf069c309dac173e8b5f7 (patch)
tree15c737c159a14b6abe6e5b8148bebc4d9b6913d6 /Documentation/networking
parente052f7e64daae6aa7a7ccd003b3c285d99755afb (diff)
doc: packet: add minimal TPACKET_V3 example code
Lost in space for a long time, but it finally came back to us from some ancient code tombs. This patch adds a minimal runnable example of Linux' packet mmap(2) from Chetan Loke's TPACKET_V3. Special thanks to David S. Miller, and also Eric Leblond and Victor Julien! Cc: Eric Leblond <eric@regit.org> Cc: Victor Julien <victor@inliniac.net> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'Documentation/networking')
-rw-r--r--Documentation/networking/packet_mmap.txt327
1 files changed, 327 insertions, 0 deletions
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 94444b152fbc..65efb85e49de 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -685,6 +685,333 @@ int main(int argc, char **argp)
685} 685}
686 686
687------------------------------------------------------------------------------- 687-------------------------------------------------------------------------------
688+ AF_PACKET TPACKET_V3 example
689-------------------------------------------------------------------------------
690
691AF_PACKET's TPACKET_V3 ring buffer can be configured to use non-static frame
692sizes by doing it's own memory management. It is based on blocks where polling
693works on a per block basis instead of per ring as in TPACKET_V2 and predecessor.
694
695It is said that TPACKET_V3 brings the following benefits:
696 *) ~15 - 20% reduction in CPU-usage
697 *) ~20% increase in packet capture rate
698 *) ~2x increase in packet density
699 *) Port aggregation analysis
700 *) Non static frame size to capture entire packet payload
701
702So it seems to be a good candidate to be used with packet fanout.
703
704Minimal example code by Daniel Borkmann based on Chetan Loke's lolpcap (compile
705it with gcc -Wall -O2 blob.c, and try things like "./a.out eth0", etc.):
706
707#include <stdio.h>
708#include <stdlib.h>
709#include <stdint.h>
710#include <string.h>
711#include <assert.h>
712#include <net/if.h>
713#include <arpa/inet.h>
714#include <netdb.h>
715#include <poll.h>
716#include <unistd.h>
717#include <signal.h>
718#include <inttypes.h>
719#include <sys/socket.h>
720#include <sys/mman.h>
721#include <linux/if_packet.h>
722#include <linux/if_ether.h>
723#include <linux/ip.h>
724
725#define BLOCK_SIZE (1 << 22)
726#define FRAME_SIZE 2048
727
728#define NUM_BLOCKS 64
729#define NUM_FRAMES ((BLOCK_SIZE * NUM_BLOCKS) / FRAME_SIZE)
730
731#define BLOCK_RETIRE_TOV_IN_MS 64
732#define BLOCK_PRIV_AREA_SZ 13
733
734#define ALIGN_8(x) (((x) + 8 - 1) & ~(8 - 1))
735
736#define BLOCK_STATUS(x) ((x)->h1.block_status)
737#define BLOCK_NUM_PKTS(x) ((x)->h1.num_pkts)
738#define BLOCK_O2FP(x) ((x)->h1.offset_to_first_pkt)
739#define BLOCK_LEN(x) ((x)->h1.blk_len)
740#define BLOCK_SNUM(x) ((x)->h1.seq_num)
741#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
742#define BLOCK_PRIV(x) ((void *) ((uint8_t *) (x) + BLOCK_O2PRIV(x)))
743#define BLOCK_HDR_LEN (ALIGN_8(sizeof(struct block_desc)))
744#define BLOCK_PLUS_PRIV(sz_pri) (BLOCK_HDR_LEN + ALIGN_8((sz_pri)))
745
746#ifndef likely
747# define likely(x) __builtin_expect(!!(x), 1)
748#endif
749#ifndef unlikely
750# define unlikely(x) __builtin_expect(!!(x), 0)
751#endif
752
753struct block_desc {
754 uint32_t version;
755 uint32_t offset_to_priv;
756 struct tpacket_hdr_v1 h1;
757};
758
759struct ring {
760 struct iovec *rd;
761 uint8_t *map;
762 struct tpacket_req3 req;
763};
764
765static unsigned long packets_total = 0, bytes_total = 0;
766static sig_atomic_t sigint = 0;
767
768void sighandler(int num)
769{
770 sigint = 1;
771}
772
773static int setup_socket(struct ring *ring, char *netdev)
774{
775 int err, i, fd, v = TPACKET_V3;
776 struct sockaddr_ll ll;
777
778 fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
779 if (fd < 0) {
780 perror("socket");
781 exit(1);
782 }
783
784 err = setsockopt(fd, SOL_PACKET, PACKET_VERSION, &v, sizeof(v));
785 if (err < 0) {
786 perror("setsockopt");
787 exit(1);
788 }
789
790 memset(&ring->req, 0, sizeof(ring->req));
791 ring->req.tp_block_size = BLOCK_SIZE;
792 ring->req.tp_frame_size = FRAME_SIZE;
793 ring->req.tp_block_nr = NUM_BLOCKS;
794 ring->req.tp_frame_nr = NUM_FRAMES;
795 ring->req.tp_retire_blk_tov = BLOCK_RETIRE_TOV_IN_MS;
796 ring->req.tp_sizeof_priv = BLOCK_PRIV_AREA_SZ;
797 ring->req.tp_feature_req_word |= TP_FT_REQ_FILL_RXHASH;
798
799 err = setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &ring->req,
800 sizeof(ring->req));
801 if (err < 0) {
802 perror("setsockopt");
803 exit(1);
804 }
805
806 ring->map = mmap(NULL, ring->req.tp_block_size * ring->req.tp_block_nr,
807 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED,
808 fd, 0);
809 if (ring->map == MAP_FAILED) {
810 perror("mmap");
811 exit(1);
812 }
813
814 ring->rd = malloc(ring->req.tp_block_nr * sizeof(*ring->rd));
815 assert(ring->rd);
816 for (i = 0; i < ring->req.tp_block_nr; ++i) {
817 ring->rd[i].iov_base = ring->map + (i * ring->req.tp_block_size);
818 ring->rd[i].iov_len = ring->req.tp_block_size;
819 }
820
821 memset(&ll, 0, sizeof(ll));
822 ll.sll_family = PF_PACKET;
823 ll.sll_protocol = htons(ETH_P_ALL);
824 ll.sll_ifindex = if_nametoindex(netdev);
825 ll.sll_hatype = 0;
826 ll.sll_pkttype = 0;
827 ll.sll_halen = 0;
828
829 err = bind(fd, (struct sockaddr *) &ll, sizeof(ll));
830 if (err < 0) {
831 perror("bind");
832 exit(1);
833 }
834
835 return fd;
836}
837
838#ifdef __checked
839static uint64_t prev_block_seq_num = 0;
840
841void assert_block_seq_num(struct block_desc *pbd)
842{
843 if (unlikely(prev_block_seq_num + 1 != BLOCK_SNUM(pbd))) {
844 printf("prev_block_seq_num:%"PRIu64", expected seq:%"PRIu64" != "
845 "actual seq:%"PRIu64"\n", prev_block_seq_num,
846 prev_block_seq_num + 1, (uint64_t) BLOCK_SNUM(pbd));
847 exit(1);
848 }
849
850 prev_block_seq_num = BLOCK_SNUM(pbd);
851}
852
853static void assert_block_len(struct block_desc *pbd, uint32_t bytes, int block_num)
854{
855 if (BLOCK_NUM_PKTS(pbd)) {
856 if (unlikely(bytes != BLOCK_LEN(pbd))) {
857 printf("block:%u with %upackets, expected len:%u != actual len:%u\n",
858 block_num, BLOCK_NUM_PKTS(pbd), bytes, BLOCK_LEN(pbd));
859 exit(1);
860 }
861 } else {
862 if (unlikely(BLOCK_LEN(pbd) != BLOCK_PLUS_PRIV(BLOCK_PRIV_AREA_SZ))) {
863 printf("block:%u, expected len:%lu != actual len:%u\n",
864 block_num, BLOCK_HDR_LEN, BLOCK_LEN(pbd));
865 exit(1);
866 }
867 }
868}
869
870static void assert_block_header(struct block_desc *pbd, const int block_num)
871{
872 uint32_t block_status = BLOCK_STATUS(pbd);
873
874 if (unlikely((block_status & TP_STATUS_USER) == 0)) {
875 printf("block:%u, not in TP_STATUS_USER\n", block_num);
876 exit(1);
877 }
878
879 assert_block_seq_num(pbd);
880}
881#else
882static inline void assert_block_header(struct block_desc *pbd, const int block_num)
883{
884}
885static void assert_block_len(struct block_desc *pbd, uint32_t bytes, int block_num)
886{
887}
888#endif
889
890static void display(struct tpacket3_hdr *ppd)
891{
892 struct ethhdr *eth = (struct ethhdr *) ((uint8_t *) ppd + ppd->tp_mac);
893 struct iphdr *ip = (struct iphdr *) ((uint8_t *) eth + ETH_HLEN);
894
895 if (eth->h_proto == htons(ETH_P_IP)) {
896 struct sockaddr_in ss, sd;
897 char sbuff[NI_MAXHOST], dbuff[NI_MAXHOST];
898
899 memset(&ss, 0, sizeof(ss));
900 ss.sin_family = PF_INET;
901 ss.sin_addr.s_addr = ip->saddr;
902 getnameinfo((struct sockaddr *) &ss, sizeof(ss),
903 sbuff, sizeof(sbuff), NULL, 0, NI_NUMERICHOST);
904
905 memset(&sd, 0, sizeof(sd));
906 sd.sin_family = PF_INET;
907 sd.sin_addr.s_addr = ip->daddr;
908 getnameinfo((struct sockaddr *) &sd, sizeof(sd),
909 dbuff, sizeof(dbuff), NULL, 0, NI_NUMERICHOST);
910
911 printf("%s -> %s, ", sbuff, dbuff);
912 }
913
914 printf("rxhash: 0x%x\n", ppd->hv1.tp_rxhash);
915}
916
917static void walk_block(struct block_desc *pbd, const int block_num)
918{
919 int num_pkts = BLOCK_NUM_PKTS(pbd), i;
920 unsigned long bytes = 0;
921 unsigned long bytes_with_padding = BLOCK_PLUS_PRIV(BLOCK_PRIV_AREA_SZ);
922 struct tpacket3_hdr *ppd;
923
924 assert_block_header(pbd, block_num);
925
926 ppd = (struct tpacket3_hdr *) ((uint8_t *) pbd + BLOCK_O2FP(pbd));
927 for (i = 0; i < num_pkts; ++i) {
928 bytes += ppd->tp_snaplen;
929 if (ppd->tp_next_offset)
930 bytes_with_padding += ppd->tp_next_offset;
931 else
932 bytes_with_padding += ALIGN_8(ppd->tp_snaplen + ppd->tp_mac);
933
934 display(ppd);
935
936 ppd = (struct tpacket3_hdr *) ((uint8_t *) ppd + ppd->tp_next_offset);
937 __sync_synchronize();
938 }
939
940 assert_block_len(pbd, bytes_with_padding, block_num);
941
942 packets_total += num_pkts;
943 bytes_total += bytes;
944}
945
946void flush_block(struct block_desc *pbd)
947{
948 BLOCK_STATUS(pbd) = TP_STATUS_KERNEL;
949 __sync_synchronize();
950}
951
952static void teardown_socket(struct ring *ring, int fd)
953{
954 munmap(ring->map, ring->req.tp_block_size * ring->req.tp_block_nr);
955 free(ring->rd);
956 close(fd);
957}
958
959int main(int argc, char **argp)
960{
961 int fd, err;
962 socklen_t len;
963 struct ring ring;
964 struct pollfd pfd;
965 unsigned int block_num = 0;
966 struct block_desc *pbd;
967 struct tpacket_stats_v3 stats;
968
969 if (argc != 2) {
970 fprintf(stderr, "Usage: %s INTERFACE\n", argp[0]);
971 return EXIT_FAILURE;
972 }
973
974 signal(SIGINT, sighandler);
975
976 memset(&ring, 0, sizeof(ring));
977 fd = setup_socket(&ring, argp[argc - 1]);
978 assert(fd > 0);
979
980 memset(&pfd, 0, sizeof(pfd));
981 pfd.fd = fd;
982 pfd.events = POLLIN | POLLERR;
983 pfd.revents = 0;
984
985 while (likely(!sigint)) {
986 pbd = (struct block_desc *) ring.rd[block_num].iov_base;
987retry_block:
988 if ((BLOCK_STATUS(pbd) & TP_STATUS_USER) == 0) {
989 poll(&pfd, 1, -1);
990 goto retry_block;
991 }
992
993 walk_block(pbd, block_num);
994 flush_block(pbd);
995 block_num = (block_num + 1) % NUM_BLOCKS;
996 }
997
998 len = sizeof(stats);
999 err = getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len);
1000 if (err < 0) {
1001 perror("getsockopt");
1002 exit(1);
1003 }
1004
1005 fflush(stdout);
1006 printf("\nReceived %u packets, %lu bytes, %u dropped, freeze_q_cnt: %u\n",
1007 stats.tp_packets, bytes_total, stats.tp_drops,
1008 stats.tp_freeze_q_cnt);
1009
1010 teardown_socket(&ring, fd);
1011 return 0;
1012}
1013
1014-------------------------------------------------------------------------------
688+ PACKET_TIMESTAMP 1015+ PACKET_TIMESTAMP
689------------------------------------------------------------------------------- 1016-------------------------------------------------------------------------------
690 1017