author	Willem de Bruijn <willemb@google.com>	2014-08-04 22:11:49 -0400
committer	David S. Miller <davem@davemloft.net>	2014-08-05 19:35:54 -0400
commit	4ed2d765dfaccff5ebdac68e2064b59125033a3b
tree	7ba9b411995d2d365b25fe49eeff95075fec4d0d
parent	e7fd2885385157d46c85f282fc6d7d297db43e1f
net-timestamp: TCP timestamping
TCP timestamping extends SO_TIMESTAMPING to bytestreams.

Bytestreams do not have a 1:1 relationship between send() buffers and network packets. The feature interprets a send call on a bytestream as a request for a timestamp for the last byte in that send() buffer. The choice corresponds to a request for a timestamp when all bytes in the buffer have been sent. That assumption depends on in-order kernel transmission. This is the common case. That said, it is possible to construct a traffic shaping tree that would result in reordering. The guarantee is strong, then, but not ironclad.

This implementation supports send and sendpages (splice). GSO replaces one large packet with multiple smaller packets. This patch also copies the option into the correct smaller packet.

This patch does not yet support timestamping on data in an initial TCP Fast Open SYN, because that takes a very different data path.

If ID generation in ee_data is enabled, bytestream timestamps return a byte offset, instead of the packet counter for datagrams.

The implementation supports a single timestamp per packet. It silently replaces requests for previous timestamps. To avoid missing timestamps, flush the TCP queue by disabling Nagle, cork and autocork. Missing timestamps can be detected by offset when the ee_data ID is enabled.

Implementation details:

- On GSO, the timestamping code can be included in the main loop. I moved it into its own loop to reduce the impact on the common case to a single branch.

- To avoid leaking the absolute seqno to userspace, the offset returned in ee_data must always be relative. It is an offset between an skb and sk field. The first is always set (also for GSO & ACK). The second must also never be uninitialized. Only allow the ID option on sockets in the ESTABLISHED state, for which the seqno is available. Never reset it to zero (instead, move it to the current seqno when reenabling the option).

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
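For reference, a minimal userspace sketch of the interface this enables (illustration only, not part of the patch; assumes struct scm_timestamping is exposed by <linux/errqueue.h> and that fd is a connected IPv4 TCP socket):

	/* Request software tx timestamps with byte-offset IDs on a
	 * connected TCP socket, then drain one report from the error
	 * queue. fd is assumed to be an ESTABLISHED IPv4 TCP socket.
	 */
	#include <errno.h>
	#include <stdio.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/errqueue.h>
	#include <linux/net_tstamp.h>

	static int enable_tx_timestamps(int fd)
	{
		/* OPT_ID must be enabled while the socket is ESTABLISHED;
		 * the net/core/sock.c hunk below rejects it with EINVAL
		 * otherwise. */
		int val = SOF_TIMESTAMPING_TX_SOFTWARE |
			  SOF_TIMESTAMPING_SOFTWARE |
			  SOF_TIMESTAMPING_OPT_ID;

		return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
				  &val, sizeof(val));
	}

	static void read_one_timestamp(int fd)
	{
		char control[512];
		struct msghdr msg = {
			.msg_control = control,
			.msg_controllen = sizeof(control),
		};
		struct cmsghdr *cm;

		/* If nonblocking, poll() for POLLERR first; the tcp_poll()
		 * hunk below raises POLLERR while the error queue is
		 * non-empty. */
		if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
			return;

		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
			if (cm->cmsg_level == SOL_SOCKET &&
			    cm->cmsg_type == SCM_TIMESTAMPING) {
				struct scm_timestamping *tss =
					(struct scm_timestamping *)CMSG_DATA(cm);

				printf("sw tstamp %ld.%09ld\n",
				       (long)tss->ts[0].tv_sec,
				       tss->ts[0].tv_nsec);
			} else if (cm->cmsg_level == SOL_IP &&
				   cm->cmsg_type == IP_RECVERR) {
				struct sock_extended_err *serr =
					(struct sock_extended_err *)CMSG_DATA(cm);

				if (serr->ee_errno == ENOMSG &&
				    serr->ee_origin == SO_EE_ORIGIN_TIMESTAMPING)
					/* with OPT_ID on TCP, ee_data is the
					 * byte offset of the timestamped byte */
					printf("byte offset %u\n", serr->ee_data);
			}
		}
	}

Each completed send() buffer yields one such report; with SOF_TIMESTAMPING_OPT_ID the returned ee_data counts bytes since the option was enabled, so gaps in the offsets reveal replaced (missed) timestamps.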
-rw-r--r--net/core/skbuff.c5
-rw-r--r--net/core/sock.c13
-rw-r--r--net/ipv4/tcp.c22
-rw-r--r--net/ipv4/tcp_offload.c18
4 files changed, 52 insertions, 6 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 9705c0732aab..3dec0293a7c5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3522,8 +3522,11 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
 	serr->ee.ee_errno = ENOMSG;
 	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
 	serr->ee.ee_info = tstype;
-	if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+	if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
 		serr->ee.ee_data = skb_shinfo(skb)->tskey;
+		if (sk->sk_protocol == IPPROTO_TCP)
+			serr->ee.ee_data -= sk->sk_tskey;
+	}
 
 	err = sock_queue_err_skb(sk, skb);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 1e0f1c63ad6b..2714811afbd8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -849,8 +849,17 @@ set_rcvbuf:
 			break;
 		}
 		if (val & SOF_TIMESTAMPING_OPT_ID &&
-		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID))
-			sk->sk_tskey = 0;
+		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
+			if (sk->sk_protocol == IPPROTO_TCP) {
+				if (sk->sk_state != TCP_ESTABLISHED) {
+					ret = -EINVAL;
+					break;
+				}
+				sk->sk_tskey = tcp_sk(sk)->snd_una;
+			} else {
+				sk->sk_tskey = 0;
+			}
+		}
 		sk->sk_tsflags = val;
 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 			sock_enable_timestamp(sk,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9d2118e5fbc7..744af67a5989 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -426,6 +426,15 @@ void tcp_init_sock(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
+void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	sock_tx_timestamp(sk, &shinfo->tx_flags);
+	if (shinfo->tx_flags & SKBTX_ANY_SW_TSTAMP)
+		shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+}
+
 /*
  *	Wait for a TCP event.
  *
@@ -523,7 +532,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	}
 	/* This barrier is coupled with smp_wmb() in tcp_reset() */
 	smp_rmb();
-	if (sk->sk_err)
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
 		mask |= POLLERR;
 
 	return mask;
@@ -959,8 +968,10 @@ new_segment:
 
 		copied += copy;
 		offset += copy;
-		if (!(size -= copy))
+		if (!(size -= copy)) {
+			tcp_tx_timestamp(sk, skb);
 			goto out;
+		}
 
 		if (skb->len < size_goal || (flags & MSG_OOB))
 			continue;
@@ -1252,8 +1263,10 @@ new_segment:
 
 			from += copy;
 			copied += copy;
-			if ((seglen -= copy) == 0 && iovlen == 0)
+			if ((seglen -= copy) == 0 && iovlen == 0) {
+				tcp_tx_timestamp(sk, skb);
 				goto out;
+			}
 
 			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
 				continue;
@@ -1617,6 +1630,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	struct sk_buff *skb;
 	u32 urg_hole = 0;
 
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return ip_recv_error(sk, msg, len, addr_len);
+
 	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
 	    (sk->sk_state == TCP_ESTABLISHED))
 		sk_busy_loop(sk, nonblock);
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 55046ecd083e..f597119fc4e7 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -14,6 +14,21 @@
 #include <net/tcp.h>
 #include <net/protocol.h>
 
+void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, unsigned int seq,
+		    unsigned int mss)
+{
+	while (skb) {
+		if (ts_seq < (__u64) seq + mss) {
+			skb_shinfo(skb)->tx_flags = SKBTX_SW_TSTAMP;
+			skb_shinfo(skb)->tskey = ts_seq;
+			return;
+		}
+
+		skb = skb->next;
+		seq += mss;
+	}
+}
+
 struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 				netdev_features_t features)
 {
@@ -91,6 +106,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 	th = tcp_hdr(skb);
 	seq = ntohl(th->seq);
 
+	if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
+		tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
+
 	newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
 					       (__force u32)delta));
 
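For intuition, tcp_gso_tstamp() above walks the segment list and lands the request on the one segment whose sequence range covers the timestamped byte. A standalone sketch of the same arithmetic (hypothetical helper, not from the patch):

	/* 0-based index of the GSO segment covering tskey, mirroring the
	 * ts_seq < seq + mss test in tcp_gso_tstamp() above. */
	#include <stdio.h>

	static unsigned int tstamp_segment(unsigned int tskey,
					   unsigned int seq, unsigned int mss)
	{
		return (tskey - seq) / mss;
	}

	int main(void)
	{
		/* A 4000-byte write starting at seq 1001 with mss 1460 is
		 * split into segments covering [1001,2460], [2461,3920] and
		 * [3921,5000]; the last byte, seq 5000, selects the third. */
		printf("%u\n", tstamp_segment(5000, 1001, 1460)); /* 2 */
		return 0;
	}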