author     Pavel Emelyanov <xemul@parallels.com>      2012-04-18 23:41:01 -0400
committer  David S. Miller <davem@davemloft.net>      2012-04-21 15:52:25 -0400
commit     c0e88ff0f256958401778ff692da4b8891acb5a9
tree       6766354e0c54da93550ffeea082f4dae77bc2c7c   /net/ipv4/tcp.c
parent     ee9952831cfd0bbe834f4a26489d7dce74582e37
tcp: Repair socket queues
Reading queues under repair mode is done with the recvmsg call. The
queue-under-repair, set by the TCP_REPAIR_QUEUE option, is used to determine
which queue should be read, so both the send and the receive queue can be read
this way. The caller must pass the MSG_PEEK flag.

Writing to queues is done with the sendmsg call and, yet again, the
repair-queue option can be used to push data into the receive queue.

When putting an skb into the receive queue, a zeroed TCP header is appended to
its head to satisfy the tcp_hdr(skb)->syn and ->fin checks done by tcp_recvmsg
after repair. Both of these flags are therefore zero.

A fin cannot be met in the queue while reading the source socket, since repair
only works for closed/established sockets and queueing a fin packet always
changes their state.

A syn in the queue would denote that the respective skb's seq is "off-by-one"
compared to the actual payload length. Thus, at rcv queue refill we can simply
drop this flag and set the skb's sequences to precise values.

When repair mode is turned off, the write queue seqs are updated so that the
whole queue is considered to be 'already sent, waiting for ACKs'
(write_seq = snd_nxt <= snd_una). From the protocol POV the send queue looks
like it was sent, but the data between write_seq and snd_nxt was lost in the
network.

This helps to avoid another sockoption for setting the snd_nxt sequence.
Leaving the whole queue in a 'not yet sent' state (as it would be after plain
sendmsg-s) would not allow any ACKs to be received from the peer, since the
ack_seq would lie beyond snd_nxt. Even the ACK for a window probe would then
be dropped and the connection would stay 'locked' with a zero peer window.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
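For context, here is a minimal userspace sketch of how a checkpoint/restore
tool could drive this interface. The helper names, the fallback #defines and
the error handling are illustrative assumptions, not part of the patch; only
the TCP_REPAIR/TCP_REPAIR_QUEUE socket options and the MSG_PEEK requirement
come from the changelog above and the parent commit ("tcp: Initial repair
mode").

/*
 * Illustrative sketch only -- not part of this patch.  The TCP_REPAIR*
 * constants are assumed to come from the parent commit and are re-defined
 * below only in case the installed headers predate them.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_REPAIR
#define TCP_REPAIR		19
#define TCP_REPAIR_QUEUE	20
#define TCP_QUEUE_SEQ		21
#define TCP_NO_QUEUE		0
#define TCP_RECV_QUEUE		1
#define TCP_SEND_QUEUE		2
#endif

/* Select which queue subsequent recv()/send() calls operate on. */
static int select_repair_queue(int sk, int queue)
{
	return setsockopt(sk, IPPROTO_TCP, TCP_REPAIR_QUEUE,
			  &queue, sizeof(queue));
}

/*
 * Dump one queue of a socket that is already in repair mode (TCP_REPAIR
 * set to 1).  Reading is an ordinary recv() -- it ends up in tcp_recvmsg()
 * just like recvmsg() -- and MSG_PEEK is mandatory, so the queue contents
 * are left in place.  MSG_DONTWAIT keeps an empty queue from blocking.
 */
static ssize_t dump_queue(int sk, int queue, void *buf, size_t len)
{
	if (select_repair_queue(sk, queue))
		return -1;

	return recv(sk, buf, len, MSG_PEEK | MSG_DONTWAIT);
}

/*
 * Refill one queue of the restored socket: a plain send() while the repair
 * queue points at TCP_RECV_QUEUE pushes the data straight into the receive
 * queue; pointing it at TCP_SEND_QUEUE queues the data so that it is
 * treated as already-sent payload once repair mode is turned off.
 */
static ssize_t restore_queue(int sk, int queue, const void *buf, size_t len)
{
	if (select_repair_queue(sk, queue))
		return -1;

	return send(sk, buf, len, 0);
}

A dump pass would call dump_queue() for TCP_RECV_QUEUE and TCP_SEND_QUEUE on
the source socket; a restore pass would call restore_queue() on the
re-created socket before clearing TCP_REPAIR.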
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--   net/ipv4/tcp.c | 89
1 file changed, 86 insertions(+), 3 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e38d6f240321..47e2f4972f79 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -912,6 +912,39 @@ static inline int select_size(const struct sock *sk, bool sg)
 	return tmp;
 }
 
+static int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
+{
+	struct sk_buff *skb;
+	struct tcp_skb_cb *cb;
+	struct tcphdr *th;
+
+	skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
+	if (!skb)
+		goto err;
+
+	th = (struct tcphdr *)skb_put(skb, sizeof(*th));
+	skb_reset_transport_header(skb);
+	memset(th, 0, sizeof(*th));
+
+	if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
+		goto err_free;
+
+	cb = TCP_SKB_CB(skb);
+
+	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
+	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
+
+	tcp_queue_rcv(sk, skb, sizeof(*th));
+
+	return size;
+
+err_free:
+	kfree_skb(skb);
+err:
+	return -ENOMEM;
+}
+
 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t size)
 {
@@ -933,6 +966,19 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 		goto out_err;
 
+	if (unlikely(tp->repair)) {
+		if (tp->repair_queue == TCP_RECV_QUEUE) {
+			copied = tcp_send_rcvq(sk, msg, size);
+			goto out;
+		}
+
+		err = -EINVAL;
+		if (tp->repair_queue == TCP_NO_QUEUE)
+			goto out_err;
+
+		/* 'common' sending to sendq */
+	}
+
 	/* This should be in poll */
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
@@ -1089,7 +1135,7 @@ new_segment:
 		if ((seglen -= copy) == 0 && iovlen == 0)
 			goto out;
 
-		if (skb->len < max || (flags & MSG_OOB))
+		if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
 			continue;
 
 		if (forced_push(tp)) {
@@ -1102,7 +1148,7 @@ new_segment:
 wait_for_sndbuf:
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
-		if (copied)
+		if (copied && likely(!tp->repair))
 			tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 
 		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
@@ -1113,7 +1159,7 @@ wait_for_memory:
 	}
 
 out:
-	if (copied)
+	if (copied && likely(!tp->repair))
 		tcp_push(sk, flags, mss_now, tp->nonagle);
 	release_sock(sk);
 	return copied;
@@ -1187,6 +1233,24 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
 	return -EAGAIN;
 }
 
+static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
+{
+	struct sk_buff *skb;
+	int copied = 0, err = 0;
+
+	/* XXX -- need to support SO_PEEK_OFF */
+
+	skb_queue_walk(&sk->sk_write_queue, skb) {
+		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
+		if (err)
+			break;
+
+		copied += skb->len;
+	}
+
+	return err ?: copied;
+}
+
 /* Clean up the receive buffer for full frames taken by the user,
  * then send an ACK if necessary. COPIED is the number of bytes
  * tcp_recvmsg has given to the user so far, it speeds up the
@@ -1432,6 +1496,21 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (flags & MSG_OOB)
 		goto recv_urg;
 
+	if (unlikely(tp->repair)) {
+		err = -EPERM;
+		if (!(flags & MSG_PEEK))
+			goto out;
+
+		if (tp->repair_queue == TCP_SEND_QUEUE)
+			goto recv_sndq;
+
+		err = -EINVAL;
+		if (tp->repair_queue == TCP_NO_QUEUE)
+			goto out;
+
+		/* 'common' recv queue MSG_PEEK-ing */
+	}
+
 	seq = &tp->copied_seq;
 	if (flags & MSG_PEEK) {
 		peek_seq = tp->copied_seq;
@@ -1783,6 +1862,10 @@ out:
 recv_urg:
 	err = tcp_recv_urg(sk, msg, len, flags);
 	goto out;
+
+recv_sndq:
+	err = tcp_peek_sndq(sk, msg, len);
+	goto out;
 }
 EXPORT_SYMBOL(tcp_recvmsg);
 