about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Daniel Borkmann <daniel@iogearbox.net> 2018-10-16 20:30:32 -0400
committer: Daniel Borkmann <daniel@iogearbox.net> 2018-10-16 20:30:33 -0400
commit: 44d520eb17cd5fd7a3ac28b9d872e655c3920021 (patch)
tree: 91c572d5fd4344b131082e7a1e5e033245dc21b8
parent: 3f4c3127d332000530349db4843deece27fe5e0c (diff)
parent: 753fb2ee09345e0730e610b2ee3a01964fe22a63 (diff)
Merge branch 'bpf-sk-msg-peek'
John Fastabend says:

====================
This adds support for the MSG_PEEK flag when redirecting into an ingress
psock sk_msg queue.

The first patch adds some base support to the helpers, then the feature,
and finally we add an option for the test suite to do a duplicate MSG_PEEK
call on every recv to test the feature.

With duplicate MSG_PEEK call all tests continue to PASS.
====================

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
-rw-r--r--  include/linux/skmsg.h  13
-rw-r--r--  include/net/tcp.h  2
-rw-r--r--  net/ipv4/tcp_bpf.c  42
-rw-r--r--  net/tls/tls_sw.c  3
-rw-r--r--  tools/testing/selftests/bpf/test_sockmap.c  167
5 files changed, 153 insertions, 74 deletions
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 31df0d9fa536..22347b08e1f8 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -187,18 +187,21 @@ static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src)
187 sk_msg_init(src); 187 sk_msg_init(src);
188} 188}
189 189
190static inline bool sk_msg_full(const struct sk_msg *msg)
191{
192 return (msg->sg.end == msg->sg.start) && msg->sg.size;
193}
194
190static inline u32 sk_msg_elem_used(const struct sk_msg *msg) 195static inline u32 sk_msg_elem_used(const struct sk_msg *msg)
191{ 196{
197 if (sk_msg_full(msg))
198 return MAX_MSG_FRAGS;
199
192 return msg->sg.end >= msg->sg.start ? 200 return msg->sg.end >= msg->sg.start ?
193 msg->sg.end - msg->sg.start : 201 msg->sg.end - msg->sg.start :
194 msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start); 202 msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start);
195} 203}
196 204
197static inline bool sk_msg_full(const struct sk_msg *msg)
198{
199 return (msg->sg.end == msg->sg.start) && msg->sg.size;
200}
201
202static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) 205static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
203{ 206{
204 return &msg->sg.data[which]; 207 return &msg->sg.data[which];
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3600ae0f25c3..14fdd7ce9992 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2089,7 +2089,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
2089int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, 2089int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
2090 int nonblock, int flags, int *addr_len); 2090 int nonblock, int flags, int *addr_len);
2091int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, 2091int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
2092 struct msghdr *msg, int len); 2092 struct msghdr *msg, int len, int flags);
2093 2093
2094/* Call BPF_SOCK_OPS program that returns an int. If the return value 2094/* Call BPF_SOCK_OPS program that returns an int. If the return value
2095 * is < 0, then the BPF op failed (for example if the loaded BPF 2095 * is < 0, then the BPF op failed (for example if the loaded BPF
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index f9d3cf185827..b7918d4caa30 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -39,17 +39,19 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
39} 39}
40 40
41int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, 41int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
42 struct msghdr *msg, int len) 42 struct msghdr *msg, int len, int flags)
43{ 43{
44 struct iov_iter *iter = &msg->msg_iter; 44 struct iov_iter *iter = &msg->msg_iter;
45 int peek = flags & MSG_PEEK;
45 int i, ret, copied = 0; 46 int i, ret, copied = 0;
47 struct sk_msg *msg_rx;
48
49 msg_rx = list_first_entry_or_null(&psock->ingress_msg,
50 struct sk_msg, list);
46 51
47 while (copied != len) { 52 while (copied != len) {
48 struct scatterlist *sge; 53 struct scatterlist *sge;
49 struct sk_msg *msg_rx;
50 54
51 msg_rx = list_first_entry_or_null(&psock->ingress_msg,
52 struct sk_msg, list);
53 if (unlikely(!msg_rx)) 55 if (unlikely(!msg_rx))
54 break; 56 break;
55 57
@@ -70,22 +72,30 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
70 } 72 }
71 73
72 copied += copy; 74 copied += copy;
73 sge->offset += copy; 75 if (likely(!peek)) {
74 sge->length -= copy; 76 sge->offset += copy;
75 sk_mem_uncharge(sk, copy); 77 sge->length -= copy;
76 msg_rx->sg.size -= copy; 78 sk_mem_uncharge(sk, copy);
77 if (!sge->length) { 79 msg_rx->sg.size -= copy;
78 i++; 80
79 if (i == MAX_SKB_FRAGS) 81 if (!sge->length) {
80 i = 0; 82 sk_msg_iter_var_next(i);
81 if (!msg_rx->skb) 83 if (!msg_rx->skb)
82 put_page(page); 84 put_page(page);
85 }
86 } else {
87 sk_msg_iter_var_next(i);
83 } 88 }
84 89
85 if (copied == len) 90 if (copied == len)
86 break; 91 break;
87 } while (i != msg_rx->sg.end); 92 } while (i != msg_rx->sg.end);
88 93
94 if (unlikely(peek)) {
95 msg_rx = list_next_entry(msg_rx, list);
96 continue;
97 }
98
89 msg_rx->sg.start = i; 99 msg_rx->sg.start = i;
90 if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { 100 if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
91 list_del(&msg_rx->list); 101 list_del(&msg_rx->list);
@@ -93,6 +103,8 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
93 consume_skb(msg_rx->skb); 103 consume_skb(msg_rx->skb);
94 kfree(msg_rx); 104 kfree(msg_rx);
95 } 105 }
106 msg_rx = list_first_entry_or_null(&psock->ingress_msg,
107 struct sk_msg, list);
96 } 108 }
97 109
98 return copied; 110 return copied;
@@ -115,7 +127,7 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
115 return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); 127 return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
116 lock_sock(sk); 128 lock_sock(sk);
117msg_bytes_ready: 129msg_bytes_ready:
118 copied = __tcp_bpf_recvmsg(sk, psock, msg, len); 130 copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
119 if (!copied) { 131 if (!copied) {
120 int data, err = 0; 132 int data, err = 0;
121 long timeo; 133 long timeo;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index a525fc4c2a4b..5cd88ba8acd1 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1478,7 +1478,8 @@ int tls_sw_recvmsg(struct sock *sk,
1478 skb = tls_wait_data(sk, psock, flags, timeo, &err); 1478 skb = tls_wait_data(sk, psock, flags, timeo, &err);
1479 if (!skb) { 1479 if (!skb) {
1480 if (psock) { 1480 if (psock) {
1481 int ret = __tcp_bpf_recvmsg(sk, psock, msg, len); 1481 int ret = __tcp_bpf_recvmsg(sk, psock,
1482 msg, len, flags);
1482 1483
1483 if (ret > 0) { 1484 if (ret > 0) {
1484 copied += ret; 1485 copied += ret;
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 7cb69ce6dfa2..cbd1c0be8680 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -80,6 +80,7 @@ int txmsg_end;
80int txmsg_ingress; 80int txmsg_ingress;
81int txmsg_skb; 81int txmsg_skb;
82int ktls; 82int ktls;
83int peek_flag;
83 84
84static const struct option long_options[] = { 85static const struct option long_options[] = {
85 {"help", no_argument, NULL, 'h' }, 86 {"help", no_argument, NULL, 'h' },
@@ -102,6 +103,7 @@ static const struct option long_options[] = {
102 {"txmsg_ingress", no_argument, &txmsg_ingress, 1 }, 103 {"txmsg_ingress", no_argument, &txmsg_ingress, 1 },
103 {"txmsg_skb", no_argument, &txmsg_skb, 1 }, 104 {"txmsg_skb", no_argument, &txmsg_skb, 1 },
104 {"ktls", no_argument, &ktls, 1 }, 105 {"ktls", no_argument, &ktls, 1 },
106 {"peek", no_argument, &peek_flag, 1 },
105 {0, 0, NULL, 0 } 107 {0, 0, NULL, 0 }
106}; 108};
107 109
@@ -352,33 +354,40 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt,
352 return 0; 354 return 0;
353} 355}
354 356
355static int msg_loop(int fd, int iov_count, int iov_length, int cnt, 357static void msg_free_iov(struct msghdr *msg)
356 struct msg_stats *s, bool tx,
357 struct sockmap_options *opt)
358{ 358{
359 struct msghdr msg = {0}; 359 int i;
360 int err, i, flags = MSG_NOSIGNAL; 360
361 for (i = 0; i < msg->msg_iovlen; i++)
362 free(msg->msg_iov[i].iov_base);
363 free(msg->msg_iov);
364 msg->msg_iov = NULL;
365 msg->msg_iovlen = 0;
366}
367
368static int msg_alloc_iov(struct msghdr *msg,
369 int iov_count, int iov_length,
370 bool data, bool xmit)
371{
372 unsigned char k = 0;
361 struct iovec *iov; 373 struct iovec *iov;
362 unsigned char k; 374 int i;
363 bool data_test = opt->data_test;
364 bool drop = opt->drop_expected;
365 375
366 iov = calloc(iov_count, sizeof(struct iovec)); 376 iov = calloc(iov_count, sizeof(struct iovec));
367 if (!iov) 377 if (!iov)
368 return errno; 378 return errno;
369 379
370 k = 0;
371 for (i = 0; i < iov_count; i++) { 380 for (i = 0; i < iov_count; i++) {
372 unsigned char *d = calloc(iov_length, sizeof(char)); 381 unsigned char *d = calloc(iov_length, sizeof(char));
373 382
374 if (!d) { 383 if (!d) {
375 fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count); 384 fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count);
376 goto out_errno; 385 goto unwind_iov;
377 } 386 }
378 iov[i].iov_base = d; 387 iov[i].iov_base = d;
379 iov[i].iov_len = iov_length; 388 iov[i].iov_len = iov_length;
380 389
381 if (data_test && tx) { 390 if (data && xmit) {
382 int j; 391 int j;
383 392
384 for (j = 0; j < iov_length; j++) 393 for (j = 0; j < iov_length; j++)
@@ -386,9 +395,60 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
386 } 395 }
387 } 396 }
388 397
389 msg.msg_iov = iov; 398 msg->msg_iov = iov;
390 msg.msg_iovlen = iov_count; 399 msg->msg_iovlen = iov_count;
391 k = 0; 400
401 return 0;
402unwind_iov:
403 for (i--; i >= 0 ; i--)
404 free(msg->msg_iov[i].iov_base);
405 return -ENOMEM;
406}
407
408static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz)
409{
410 int i, j, bytes_cnt = 0;
411 unsigned char k = 0;
412
413 for (i = 0; i < msg->msg_iovlen; i++) {
414 unsigned char *d = msg->msg_iov[i].iov_base;
415
416 for (j = 0;
417 j < msg->msg_iov[i].iov_len && size; j++) {
418 if (d[j] != k++) {
419 fprintf(stderr,
420 "detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
421 i, j, d[j], k - 1, d[j+1], k);
422 return -EIO;
423 }
424 bytes_cnt++;
425 if (bytes_cnt == chunk_sz) {
426 k = 0;
427 bytes_cnt = 0;
428 }
429 size--;
430 }
431 }
432 return 0;
433}
434
435static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
436 struct msg_stats *s, bool tx,
437 struct sockmap_options *opt)
438{
439 struct msghdr msg = {0}, msg_peek = {0};
440 int err, i, flags = MSG_NOSIGNAL;
441 bool drop = opt->drop_expected;
442 bool data = opt->data_test;
443
444 err = msg_alloc_iov(&msg, iov_count, iov_length, data, tx);
445 if (err)
446 goto out_errno;
447 if (peek_flag) {
448 err = msg_alloc_iov(&msg_peek, iov_count, iov_length, data, tx);
449 if (err)
450 goto out_errno;
451 }
392 452
393 if (tx) { 453 if (tx) {
394 clock_gettime(CLOCK_MONOTONIC, &s->start); 454 clock_gettime(CLOCK_MONOTONIC, &s->start);
@@ -408,19 +468,12 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
408 } 468 }
409 clock_gettime(CLOCK_MONOTONIC, &s->end); 469 clock_gettime(CLOCK_MONOTONIC, &s->end);
410 } else { 470 } else {
411 int slct, recv, max_fd = fd; 471 int slct, recvp = 0, recv, max_fd = fd;
412 int fd_flags = O_NONBLOCK; 472 int fd_flags = O_NONBLOCK;
413 struct timeval timeout; 473 struct timeval timeout;
414 float total_bytes; 474 float total_bytes;
415 int bytes_cnt = 0;
416 int chunk_sz;
417 fd_set w; 475 fd_set w;
418 476
419 if (opt->sendpage)
420 chunk_sz = iov_length * cnt;
421 else
422 chunk_sz = iov_length * iov_count;
423
424 fcntl(fd, fd_flags); 477 fcntl(fd, fd_flags);
425 total_bytes = (float)iov_count * (float)iov_length * (float)cnt; 478 total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
426 err = clock_gettime(CLOCK_MONOTONIC, &s->start); 479 err = clock_gettime(CLOCK_MONOTONIC, &s->start);
@@ -452,6 +505,19 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
452 goto out_errno; 505 goto out_errno;
453 } 506 }
454 507
508 errno = 0;
509 if (peek_flag) {
510 flags |= MSG_PEEK;
511 recvp = recvmsg(fd, &msg_peek, flags);
512 if (recvp < 0) {
513 if (errno != EWOULDBLOCK) {
514 clock_gettime(CLOCK_MONOTONIC, &s->end);
515 goto out_errno;
516 }
517 }
518 flags = 0;
519 }
520
455 recv = recvmsg(fd, &msg, flags); 521 recv = recvmsg(fd, &msg, flags);
456 if (recv < 0) { 522 if (recv < 0) {
457 if (errno != EWOULDBLOCK) { 523 if (errno != EWOULDBLOCK) {
@@ -463,27 +529,23 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
463 529
464 s->bytes_recvd += recv; 530 s->bytes_recvd += recv;
465 531
466 if (data_test) { 532 if (data) {
467 int j; 533 int chunk_sz = opt->sendpage ?
468 534 iov_length * cnt :
469 for (i = 0; i < msg.msg_iovlen; i++) { 535 iov_length * iov_count;
470 unsigned char *d = iov[i].iov_base; 536
471 537 errno = msg_verify_data(&msg, recv, chunk_sz);
472 for (j = 0; 538 if (errno) {
473 j < iov[i].iov_len && recv; j++) { 539 perror("data verify msg failed\n");
474 if (d[j] != k++) { 540 goto out_errno;
475 errno = -EIO; 541 }
476 fprintf(stderr, 542 if (recvp) {
477 "detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n", 543 errno = msg_verify_data(&msg_peek,
478 i, j, d[j], k - 1, d[j+1], k); 544 recvp,
479 goto out_errno; 545 chunk_sz);
480 } 546 if (errno) {
481 bytes_cnt++; 547 perror("data verify msg_peek failed\n");
482 if (bytes_cnt == chunk_sz) { 548 goto out_errno;
483 k = 0;
484 bytes_cnt = 0;
485 }
486 recv--;
487 } 549 }
488 } 550 }
489 } 551 }
@@ -491,14 +553,12 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
491 clock_gettime(CLOCK_MONOTONIC, &s->end); 553 clock_gettime(CLOCK_MONOTONIC, &s->end);
492 } 554 }
493 555
494 for (i = 0; i < iov_count; i++) 556 msg_free_iov(&msg);
495 free(iov[i].iov_base); 557 msg_free_iov(&msg_peek);
496 free(iov); 558 return err;
497 return 0;
498out_errno: 559out_errno:
499 for (i = 0; i < iov_count; i++) 560 msg_free_iov(&msg);
500 free(iov[i].iov_base); 561 msg_free_iov(&msg_peek);
501 free(iov);
502 return errno; 562 return errno;
503} 563}
504 564
@@ -565,9 +625,10 @@ static int sendmsg_test(struct sockmap_options *opt)
565 } 625 }
566 if (opt->verbose) 626 if (opt->verbose)
567 fprintf(stdout, 627 fprintf(stdout,
568 "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n", 628 "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s %s\n",
569 s.bytes_sent, sent_Bps, sent_Bps/giga, 629 s.bytes_sent, sent_Bps, sent_Bps/giga,
570 s.bytes_recvd, recvd_Bps, recvd_Bps/giga); 630 s.bytes_recvd, recvd_Bps, recvd_Bps/giga,
631 peek_flag ? "(peek_msg)" : "");
571 if (err && txmsg_cork) 632 if (err && txmsg_cork)
572 err = 0; 633 err = 0;
573 exit(err ? 1 : 0); 634 exit(err ? 1 : 0);
@@ -999,6 +1060,8 @@ static void test_options(char *options)
999 strncat(options, "skb,", OPTSTRING); 1060 strncat(options, "skb,", OPTSTRING);
1000 if (ktls) 1061 if (ktls)
1001 strncat(options, "ktls,", OPTSTRING); 1062 strncat(options, "ktls,", OPTSTRING);
1063 if (peek_flag)
1064 strncat(options, "peek,", OPTSTRING);
1002} 1065}
1003 1066
1004static int __test_exec(int cgrp, int test, struct sockmap_options *opt) 1067static int __test_exec(int cgrp, int test, struct sockmap_options *opt)