aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Florian Westphal <fw@strlen.de> 2014-09-26 16:37:35 -0400
committer David S. Miller <davem@davemloft.net> 2014-09-29 00:13:10 -0400
commit 9890092e46b2996bb85f7f973e69424cb5c07bc0 (patch)
tree 4cc1000c27918b9fcf63806bb5ae02cee36feeda
parent 7354c8c389d18719dd71cc810da70b0921d66694 (diff)
net: tcp: more detailed ACK events and events for CE marked packets
DataCenter TCP (DCTCP) determines cwnd growth based on ECN information and ACK properties, e.g. an ACK that updates the window is treated differently than a DUPACK.

DCTCP also needs to know whether an ACK was a delayed ACK. Furthermore, DCTCP implements a CE state machine that keeps track of the CE markings of incoming packets.

Therefore, extend the congestion control framework to provide these event types, so that DCTCP can be properly implemented as a normal congestion algorithm module outside of the core stack.

Joint work with Daniel Borkmann and Glenn Judd.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/net/tcp.h 9
-rw-r--r-- net/ipv4/tcp_input.c 22
-rw-r--r-- net/ipv4/tcp_output.c 4
3 files changed, 30 insertions, 5 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7ec6a28305c0..1f57c5363492 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -763,10 +763,17 @@ enum tcp_ca_event {
763 CA_EVENT_CWND_RESTART, /* congestion window restart */ 763 CA_EVENT_CWND_RESTART, /* congestion window restart */
764 CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ 764 CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */
765 CA_EVENT_LOSS, /* loss timeout */ 765 CA_EVENT_LOSS, /* loss timeout */
766 CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */
767 CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */
768 CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */
769 CA_EVENT_NON_DELAYED_ACK,
766}; 770};
767 771
772/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
768enum tcp_ca_ack_event_flags { 773enum tcp_ca_ack_event_flags {
769 CA_ACK_SLOWPATH = (1 << 0), 774 CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */
775 CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */
776 CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */
770}; 777};
771 778
772/* 779/*
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8a38774cc66e..fc133178c787 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -233,14 +233,21 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
233 tcp_enter_quickack_mode((struct sock *)tp); 233 tcp_enter_quickack_mode((struct sock *)tp);
234 break; 234 break;
235 case INET_ECN_CE: 235 case INET_ECN_CE:
236 if (tcp_ca_needs_ecn((struct sock *)tp))
237 tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
238
236 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { 239 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
237 /* Better not delay acks, sender can have a very low cwnd */ 240 /* Better not delay acks, sender can have a very low cwnd */
238 tcp_enter_quickack_mode((struct sock *)tp); 241 tcp_enter_quickack_mode((struct sock *)tp);
239 tp->ecn_flags |= TCP_ECN_DEMAND_CWR; 242 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
240 } 243 }
241 /* fallinto */ 244 tp->ecn_flags |= TCP_ECN_SEEN;
245 break;
242 default: 246 default:
247 if (tcp_ca_needs_ecn((struct sock *)tp))
248 tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
243 tp->ecn_flags |= TCP_ECN_SEEN; 249 tp->ecn_flags |= TCP_ECN_SEEN;
250 break;
244 } 251 }
245} 252}
246 253
@@ -3429,10 +3436,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3429 tp->snd_una = ack; 3436 tp->snd_una = ack;
3430 flag |= FLAG_WIN_UPDATE; 3437 flag |= FLAG_WIN_UPDATE;
3431 3438
3432 tcp_in_ack_event(sk, 0); 3439 tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
3433 3440
3434 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); 3441 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
3435 } else { 3442 } else {
3443 u32 ack_ev_flags = CA_ACK_SLOWPATH;
3444
3436 if (ack_seq != TCP_SKB_CB(skb)->end_seq) 3445 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
3437 flag |= FLAG_DATA; 3446 flag |= FLAG_DATA;
3438 else 3447 else
@@ -3444,10 +3453,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3444 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, 3453 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3445 &sack_rtt_us); 3454 &sack_rtt_us);
3446 3455
3447 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) 3456 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) {
3448 flag |= FLAG_ECE; 3457 flag |= FLAG_ECE;
3458 ack_ev_flags |= CA_ACK_ECE;
3459 }
3460
3461 if (flag & FLAG_WIN_UPDATE)
3462 ack_ev_flags |= CA_ACK_WIN_UPDATE;
3449 3463
3450 tcp_in_ack_event(sk, CA_ACK_SLOWPATH); 3464 tcp_in_ack_event(sk, ack_ev_flags);
3451 } 3465 }
3452 3466
3453 /* We passed data and got it acked, remove any soft error 3467 /* We passed data and got it acked, remove any soft error
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 20e73271d75c..124f9e4e4594 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3130,6 +3130,8 @@ void tcp_send_delayed_ack(struct sock *sk)
3130 int ato = icsk->icsk_ack.ato; 3130 int ato = icsk->icsk_ack.ato;
3131 unsigned long timeout; 3131 unsigned long timeout;
3132 3132
3133 tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
3134
3133 if (ato > TCP_DELACK_MIN) { 3135 if (ato > TCP_DELACK_MIN) {
3134 const struct tcp_sock *tp = tcp_sk(sk); 3136 const struct tcp_sock *tp = tcp_sk(sk);
3135 int max_ato = HZ / 2; 3137 int max_ato = HZ / 2;
@@ -3186,6 +3188,8 @@ void tcp_send_ack(struct sock *sk)
3186 if (sk->sk_state == TCP_CLOSE) 3188 if (sk->sk_state == TCP_CLOSE)
3187 return; 3189 return;
3188 3190
3191 tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
3192
3189 /* We are not putting this on the write queue, so 3193 /* We are not putting this on the write queue, so
3190 * tcp_transmit_skb() will set the ownership to this 3194 * tcp_transmit_skb() will set the ownership to this
3191 * sock. 3195 * sock.