author     David S. Miller <davem@davemloft.net>    2014-09-29 00:13:17 -0400
committer  David S. Miller <davem@davemloft.net>    2014-09-29 00:13:17 -0400
commit     a11238ec28d40f56f8b939f6f125694dba3adb70 (patch)
tree       3a13df46a74af91d928dc4ac5150c2815ee42207
parent     53dfd501819a6e9c3a7d56cac1ddaf03fe90800d (diff)
parent     e3118e8359bb7c59555aca60c725106e6d78c5ce (diff)
Merge branch 'dctcp'
Daniel Borkmann says:

====================
net: tcp: DCTCP congestion control algorithm

This patch series adds support for the DataCenter TCP (DCTCP) congestion
control algorithm. Please see individual patches for the details.

The last patch adds DCTCP as a congestion control module, and previous ones
add needed infrastructure to extend the congestion control framework.

Joint work between Florian Westphal, Daniel Borkmann and Glenn Judd.

v2 -> v3:
 - No changes anywhere, just a resend as requested by Dave
 - Added Stephen's ACK
v1 -> v2:
 - Rebased to latest net-next
 - Addressed Eric's feedback, thanks!
 - Update stale comment wrt. DCTCP ECN usage
 - Don't call INET_ECN_xmit for every packet
 - Add dctcp ss/inetdiag support to expose internal stats to userspace
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--   Documentation/networking/dctcp.txt   |  43
-rw-r--r--   include/net/tcp.h                    |  78
-rw-r--r--   include/uapi/linux/inet_diag.h       |  13
-rw-r--r--   net/ipv4/Kconfig                     |  26
-rw-r--r--   net/ipv4/Makefile                    |   1
-rw-r--r--   net/ipv4/tcp.c                       |   6
-rw-r--r--   net/ipv4/tcp_cong.c                  |  46
-rw-r--r--   net/ipv4/tcp_dctcp.c                 | 344
-rw-r--r--   net/ipv4/tcp_input.c                 |  32
-rw-r--r--   net/ipv4/tcp_minisocks.c             |   5
-rw-r--r--   net/ipv4/tcp_output.c                |  30
-rw-r--r--   net/ipv4/tcp_westwood.c              |  28
12 files changed, 574 insertions(+), 78 deletions(-)
diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.txt
new file mode 100644
index 000000000000..0d5dfbc89ec9
--- /dev/null
+++ b/Documentation/networking/dctcp.txt
@@ -0,0 +1,43 @@
1DCTCP (DataCenter TCP)
2----------------------
3
4DCTCP is an enhancement to the TCP congestion control algorithm for data
5center networks and leverages Explicit Congestion Notification (ECN) in
6the data center network to provide multi-bit feedback to the end hosts.
7
8To enable it on end hosts:
9
10 sysctl -w net.ipv4.tcp_congestion_control=dctcp
11
12All switches in the data center network running DCTCP must support ECN
13marking and be configured for marking when reaching defined switch buffer
14thresholds. The default ECN marking threshold heuristic for DCTCP on
15switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps,
16but might need further careful tweaking.
17
18For more details, see the documents below:
19
20Paper:
21
22The algorithm is further described in detail in the following two
23SIGCOMM/SIGMETRICS papers:
24
25 i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
26 Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
27 "Data Center TCP (DCTCP)", Data Center Networks session
28 Proc. ACM SIGCOMM, New Delhi, 2010.
29 http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
30 http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192
31
32ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
33 "Analysis of DCTCP: Stability, Convergence, and Fairness"
34 Proc. ACM SIGMETRICS, San Jose, 2011.
35 http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
36
37IETF informational draft:
38
39 http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00
40
41DCTCP site:
42
43 http://simula.stanford.edu/~alizade/Site/DCTCP.html
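
Aside (not part of this series): besides the system-wide sysctl shown in the new documentation above, congestion control can also be selected per socket through the long-standing TCP_CONGESTION socket option, which is how an application could opt only its intra-datacenter connections into DCTCP. The following user-space sketch is illustrative only and assumes a kernel with this series applied and the dctcp module available; an unprivileged process can normally only pick an algorithm that is listed in net.ipv4.tcp_allowed_congestion_control (or it needs CAP_NET_ADMIN).

/* Illustrative user-space sketch: select DCTCP on one socket via the
 * existing TCP_CONGESTION socket option and read back what the kernel
 * actually chose. Not part of this patch series.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

int main(void)
{
        char name[16] = "dctcp";
        socklen_t len = sizeof(name);
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return 1;

        if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name)) < 0)
                perror("setsockopt(TCP_CONGESTION)");

        /* Read back the congestion control algorithm now in use. */
        if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
                printf("congestion control: %.*s\n", (int)len, name);

        close(fd);
        return 0;
}
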
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 02a9a2c366bf..1f57c5363492 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -733,23 +733,6 @@ struct tcp_skb_cb {
733 733
734#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) 734#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
735 735
736/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
737 *
738 * If we receive a SYN packet with these bits set, it means a network is
739 * playing bad games with TOS bits. In order to avoid possible false congestion
740 * notifications, we disable TCP ECN negociation.
741 */
742static inline void
743TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
744 struct net *net)
745{
746 const struct tcphdr *th = tcp_hdr(skb);
747
748 if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
749 INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
750 inet_rsk(req)->ecn_ok = 1;
751}
752
753/* Due to TSO, an SKB can be composed of multiple actual 736/* Due to TSO, an SKB can be composed of multiple actual
754 * packets. To keep these tracked properly, we use this. 737 * packets. To keep these tracked properly, we use this.
755 */ 738 */
@@ -780,8 +763,17 @@ enum tcp_ca_event {
780 CA_EVENT_CWND_RESTART, /* congestion window restart */ 763 CA_EVENT_CWND_RESTART, /* congestion window restart */
781 CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ 764 CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */
782 CA_EVENT_LOSS, /* loss timeout */ 765 CA_EVENT_LOSS, /* loss timeout */
783 CA_EVENT_FAST_ACK, /* in sequence ack */ 766 CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */
784 CA_EVENT_SLOW_ACK, /* other ack */ 767 CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */
768 CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */
769 CA_EVENT_NON_DELAYED_ACK,
770};
771
772/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
773enum tcp_ca_ack_event_flags {
774 CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */
775 CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */
776 CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */
785}; 777};
786 778
787/* 779/*
@@ -791,7 +783,10 @@ enum tcp_ca_event {
791#define TCP_CA_MAX 128 783#define TCP_CA_MAX 128
792#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) 784#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
793 785
786/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
794#define TCP_CONG_NON_RESTRICTED 0x1 787#define TCP_CONG_NON_RESTRICTED 0x1
788/* Requires ECN/ECT set on all packets */
789#define TCP_CONG_NEEDS_ECN 0x2
795 790
796struct tcp_congestion_ops { 791struct tcp_congestion_ops {
797 struct list_head list; 792 struct list_head list;
@@ -810,6 +805,8 @@ struct tcp_congestion_ops {
810 void (*set_state)(struct sock *sk, u8 new_state); 805 void (*set_state)(struct sock *sk, u8 new_state);
811 /* call when cwnd event occurs (optional) */ 806 /* call when cwnd event occurs (optional) */
812 void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); 807 void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
808 /* call when ack arrives (optional) */
809 void (*in_ack_event)(struct sock *sk, u32 flags);
813 /* new value of cwnd after loss (optional) */ 810 /* new value of cwnd after loss (optional) */
814 u32 (*undo_cwnd)(struct sock *sk); 811 u32 (*undo_cwnd)(struct sock *sk);
815 /* hook for packet ack accounting (optional) */ 812 /* hook for packet ack accounting (optional) */
@@ -824,6 +821,7 @@ struct tcp_congestion_ops {
824int tcp_register_congestion_control(struct tcp_congestion_ops *type); 821int tcp_register_congestion_control(struct tcp_congestion_ops *type);
825void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); 822void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
826 823
824void tcp_assign_congestion_control(struct sock *sk);
827void tcp_init_congestion_control(struct sock *sk); 825void tcp_init_congestion_control(struct sock *sk);
828void tcp_cleanup_congestion_control(struct sock *sk); 826void tcp_cleanup_congestion_control(struct sock *sk);
829int tcp_set_default_congestion_control(const char *name); 827int tcp_set_default_congestion_control(const char *name);
@@ -835,11 +833,17 @@ int tcp_set_congestion_control(struct sock *sk, const char *name);
835int tcp_slow_start(struct tcp_sock *tp, u32 acked); 833int tcp_slow_start(struct tcp_sock *tp, u32 acked);
836void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w); 834void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
837 835
838extern struct tcp_congestion_ops tcp_init_congestion_ops;
839u32 tcp_reno_ssthresh(struct sock *sk); 836u32 tcp_reno_ssthresh(struct sock *sk);
840void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); 837void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
841extern struct tcp_congestion_ops tcp_reno; 838extern struct tcp_congestion_ops tcp_reno;
842 839
840static inline bool tcp_ca_needs_ecn(const struct sock *sk)
841{
842 const struct inet_connection_sock *icsk = inet_csk(sk);
843
844 return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
845}
846
843static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) 847static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
844{ 848{
845 struct inet_connection_sock *icsk = inet_csk(sk); 849 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -857,6 +861,40 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
857 icsk->icsk_ca_ops->cwnd_event(sk, event); 861 icsk->icsk_ca_ops->cwnd_event(sk, event);
858} 862}
859 863
864/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
865 *
866 * If we receive a SYN packet with these bits set, it means a
867 * network is playing bad games with TOS bits. In order to
868 * avoid possible false congestion notifications, we disable
869 * TCP ECN negotiation.
870 *
871 * Exception: tcp_ca wants ECN. This is required for DCTCP
872 * congestion control; it requires setting ECT on all packets,
873 * including SYN. We invert the test in this case: If our
874 * local socket wants ECN, but peer only set ece/cwr (but not
875 * ECT in IP header), it's probably a non-DCTCP aware sender.
876 */
877static inline void
878TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
879 const struct sock *listen_sk)
880{
881 const struct tcphdr *th = tcp_hdr(skb);
882 const struct net *net = sock_net(listen_sk);
883 bool th_ecn = th->ece && th->cwr;
884 bool ect, need_ecn;
885
886 if (!th_ecn)
887 return;
888
889 ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
890 need_ecn = tcp_ca_needs_ecn(listen_sk);
891
892 if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
893 inet_rsk(req)->ecn_ok = 1;
894 else if (ect && need_ecn)
895 inet_rsk(req)->ecn_ok = 1;
896}
897
860/* These functions determine how the current flow behaves in respect of SACK 898/* These functions determine how the current flow behaves in respect of SACK
861 * handling. SACK is negotiated with the peer, and therefore it can vary 899 * handling. SACK is negotiated with the peer, and therefore it can vary
862 * between different flows. 900 * between different flows.
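
As a reading aid for the hunk above: the request is marked ecn_ok either in the classic RFC 3168 case (listener does not need ECN, the SYN carries ECE+CWR but no ECT, and net.ipv4.tcp_ecn is enabled) or in the DCTCP-style case (the listener's congestion control sets TCP_CONG_NEEDS_ECN and the SYN carries ECE+CWR with ECT). The sketch below merely restates the new TCP_ECN_create_request() decision for clarity; it is not additional kernel code.

#include <stdbool.h>

/* Illustrative-only restatement of the ecn_ok decision in the hunk above. */
bool syn_grants_ecn(bool th_ecn, bool ect, bool ca_needs_ecn, bool sysctl_tcp_ecn)
{
        if (!th_ecn)                    /* SYN did not carry ECE+CWR: no ECN at all */
                return false;
        if (ca_needs_ecn)               /* e.g. DCTCP listener: ECT must also be set */
                return ect;
        return !ect && sysctl_tcp_ecn;  /* classic RFC 3168: ECT on a SYN is bogus */
}
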
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index bbde90fa5838..d65c0a09efd3 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -110,10 +110,10 @@ enum {
110 INET_DIAG_TCLASS, 110 INET_DIAG_TCLASS,
111 INET_DIAG_SKMEMINFO, 111 INET_DIAG_SKMEMINFO,
112 INET_DIAG_SHUTDOWN, 112 INET_DIAG_SHUTDOWN,
113 INET_DIAG_DCTCPINFO,
113}; 114};
114 115
115#define INET_DIAG_MAX INET_DIAG_SHUTDOWN 116#define INET_DIAG_MAX INET_DIAG_DCTCPINFO
116
117 117
118/* INET_DIAG_MEM */ 118/* INET_DIAG_MEM */
119 119
@@ -133,5 +133,14 @@ struct tcpvegas_info {
133 __u32 tcpv_minrtt; 133 __u32 tcpv_minrtt;
134}; 134};
135 135
136/* INET_DIAG_DCTCPINFO */
137
138struct tcp_dctcp_info {
139 __u16 dctcp_enabled;
140 __u16 dctcp_ce_state;
141 __u32 dctcp_alpha;
142 __u32 dctcp_ab_ecn;
143 __u32 dctcp_ab_tot;
144};
136 145
137#endif /* _UAPI_INET_DIAG_H_ */ 146#endif /* _UAPI_INET_DIAG_H_ */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 84f710b7472a..69fb37854449 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -570,6 +570,27 @@ config TCP_CONG_ILLINOIS
570 For further details see: 570 For further details see:
571 http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html 571 http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
572 572
573config TCP_CONG_DCTCP
574 tristate "DataCenter TCP (DCTCP)"
575 default n
576 ---help---
577 DCTCP leverages Explicit Congestion Notification (ECN) in the network to
578 provide multi-bit feedback to the end hosts. It is designed to provide:
579
580 - High burst tolerance (incast due to partition/aggregate),
581 - Low latency (short flows, queries),
582 - High throughput (continuous data updates, large file transfers) with
583 commodity, shallow-buffered switches.
584
585 All switches in the data center network running DCTCP must support
586 ECN marking and be configured for marking when reaching defined switch
587 buffer thresholds. The default ECN marking threshold heuristic for
588 DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
589 (~100KB) at 10Gbps, but might need further careful tweaking.
590
591 For further details see:
592 http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
593
573choice 594choice
574 prompt "Default TCP congestion control" 595 prompt "Default TCP congestion control"
575 default DEFAULT_CUBIC 596 default DEFAULT_CUBIC
@@ -598,9 +619,11 @@ choice
598 config DEFAULT_WESTWOOD 619 config DEFAULT_WESTWOOD
599 bool "Westwood" if TCP_CONG_WESTWOOD=y 620 bool "Westwood" if TCP_CONG_WESTWOOD=y
600 621
622 config DEFAULT_DCTCP
623 bool "DCTCP" if TCP_CONG_DCTCP=y
624
601 config DEFAULT_RENO 625 config DEFAULT_RENO
602 bool "Reno" 626 bool "Reno"
603
604endchoice 627endchoice
605 628
606endif 629endif
@@ -620,6 +643,7 @@ config DEFAULT_TCP_CONG
620 default "westwood" if DEFAULT_WESTWOOD 643 default "westwood" if DEFAULT_WESTWOOD
621 default "veno" if DEFAULT_VENO 644 default "veno" if DEFAULT_VENO
622 default "reno" if DEFAULT_RENO 645 default "reno" if DEFAULT_RENO
646 default "dctcp" if DEFAULT_DCTCP
623 default "cubic" 647 default "cubic"
624 648
625config TCP_MD5SIG 649config TCP_MD5SIG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index d78d404c596f..d8105787c199 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
43obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o 43obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
44obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o 44obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
45obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o 45obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
46obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
46obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o 47obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
47obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o 48obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
48obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o 49obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 87289e51be00..cf5e508e1ef5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -405,7 +405,7 @@ void tcp_init_sock(struct sock *sk)
405 405
406 tp->reordering = sysctl_tcp_reordering; 406 tp->reordering = sysctl_tcp_reordering;
407 tcp_enable_early_retrans(tp); 407 tcp_enable_early_retrans(tp);
408 icsk->icsk_ca_ops = &tcp_init_congestion_ops; 408 tcp_assign_congestion_control(sk);
409 409
410 tp->tsoffset = 0; 410 tp->tsoffset = 0;
411 411
@@ -3258,8 +3258,6 @@ void __init tcp_init(void)
3258 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3258 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3259 3259
3260 tcp_metrics_init(); 3260 tcp_metrics_init();
3261 3261 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
3262 tcp_register_congestion_control(&tcp_reno);
3263
3264 tcp_tasklet_init(); 3262 tcp_tasklet_init();
3265} 3263}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 80248f56c89f..a6c8a5775624 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
74EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); 74EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
75 75
76/* Assign choice of congestion control. */ 76/* Assign choice of congestion control. */
77void tcp_init_congestion_control(struct sock *sk) 77void tcp_assign_congestion_control(struct sock *sk)
78{ 78{
79 struct inet_connection_sock *icsk = inet_csk(sk); 79 struct inet_connection_sock *icsk = inet_csk(sk);
80 struct tcp_congestion_ops *ca; 80 struct tcp_congestion_ops *ca;
81 81
82 /* if no choice made yet assign the current value set as default */ 82 rcu_read_lock();
83 if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { 83 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
84 rcu_read_lock(); 84 if (likely(try_module_get(ca->owner))) {
85 list_for_each_entry_rcu(ca, &tcp_cong_list, list) { 85 icsk->icsk_ca_ops = ca;
86 if (try_module_get(ca->owner)) { 86 goto out;
87 icsk->icsk_ca_ops = ca;
88 break;
89 }
90
91 /* fallback to next available */
92 } 87 }
93 rcu_read_unlock(); 88 /* Fallback to next available. The last really
89 * guaranteed fallback is Reno from this list.
90 */
94 } 91 }
92out:
93 rcu_read_unlock();
94
95 /* Clear out private data before diag gets it and
96 * the ca has not been initialized.
97 */
98 if (ca->get_info)
99 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
100}
101
102void tcp_init_congestion_control(struct sock *sk)
103{
104 const struct inet_connection_sock *icsk = inet_csk(sk);
95 105
96 if (icsk->icsk_ca_ops->init) 106 if (icsk->icsk_ca_ops->init)
97 icsk->icsk_ca_ops->init(sk); 107 icsk->icsk_ca_ops->init(sk);
@@ -345,15 +355,3 @@ struct tcp_congestion_ops tcp_reno = {
345 .ssthresh = tcp_reno_ssthresh, 355 .ssthresh = tcp_reno_ssthresh,
346 .cong_avoid = tcp_reno_cong_avoid, 356 .cong_avoid = tcp_reno_cong_avoid,
347}; 357};
348
349/* Initial congestion control used (until SYN)
350 * really reno under another name so we can tell difference
351 * during tcp_set_default_congestion_control
352 */
353struct tcp_congestion_ops tcp_init_congestion_ops = {
354 .name = "",
355 .owner = THIS_MODULE,
356 .ssthresh = tcp_reno_ssthresh,
357 .cong_avoid = tcp_reno_cong_avoid,
358};
359EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
new file mode 100644
index 000000000000..b504371af742
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.c
@@ -0,0 +1,344 @@
1/* DataCenter TCP (DCTCP) congestion control.
2 *
3 * http://simula.stanford.edu/~alizade/Site/DCTCP.html
4 *
5 * This is an implementation of DCTCP over Reno, an enhancement to the
6 * TCP congestion control algorithm designed for data centers. DCTCP
7 * leverages Explicit Congestion Notification (ECN) in the network to
8 * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
9 * the following three data center transport requirements:
10 *
11 * - High burst tolerance (incast due to partition/aggregate)
12 * - Low latency (short flows, queries)
13 * - High throughput (continuous data updates, large file transfers)
14 * with commodity shallow buffered switches
15 *
16 * The algorithm is described in detail in the following two papers:
17 *
18 * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
19 * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
20 * "Data Center TCP (DCTCP)", Data Center Networks session
21 * Proc. ACM SIGCOMM, New Delhi, 2010.
22 * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
23 *
24 * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
25 * "Analysis of DCTCP: Stability, Convergence, and Fairness"
26 * Proc. ACM SIGMETRICS, San Jose, 2011.
27 * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
28 *
29 * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
30 *
31 * Authors:
32 *
33 * Daniel Borkmann <dborkman@redhat.com>
34 * Florian Westphal <fw@strlen.de>
35 * Glenn Judd <glenn.judd@morganstanley.com>
36 *
37 * This program is free software; you can redistribute it and/or modify
38 * it under the terms of the GNU General Public License as published by
39 * the Free Software Foundation; either version 2 of the License, or (at
40 * your option) any later version.
41 */
42
43#include <linux/module.h>
44#include <linux/mm.h>
45#include <net/tcp.h>
46#include <linux/inet_diag.h>
47
48#define DCTCP_MAX_ALPHA 1024U
49
50struct dctcp {
51 u32 acked_bytes_ecn;
52 u32 acked_bytes_total;
53 u32 prior_snd_una;
54 u32 prior_rcv_nxt;
55 u32 dctcp_alpha;
56 u32 next_seq;
57 u32 ce_state;
58 u32 delayed_ack_reserved;
59};
60
61static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
62module_param(dctcp_shift_g, uint, 0644);
63MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
64
65static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
66module_param(dctcp_alpha_on_init, uint, 0644);
67MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
68
69static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
70module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
71MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
72 "parameter for clamping alpha on loss");
73
74static struct tcp_congestion_ops dctcp_reno;
75
76static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
77{
78 ca->next_seq = tp->snd_nxt;
79
80 ca->acked_bytes_ecn = 0;
81 ca->acked_bytes_total = 0;
82}
83
84static void dctcp_init(struct sock *sk)
85{
86 const struct tcp_sock *tp = tcp_sk(sk);
87
88 if ((tp->ecn_flags & TCP_ECN_OK) ||
89 (sk->sk_state == TCP_LISTEN ||
90 sk->sk_state == TCP_CLOSE)) {
91 struct dctcp *ca = inet_csk_ca(sk);
92
93 ca->prior_snd_una = tp->snd_una;
94 ca->prior_rcv_nxt = tp->rcv_nxt;
95
96 ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
97
98 ca->delayed_ack_reserved = 0;
99 ca->ce_state = 0;
100
101 dctcp_reset(tp, ca);
102 return;
103 }
104
105 /* No ECN support? Fall back to Reno. Also need to clear
106 * ECT from sk since it is set during 3WHS for DCTCP.
107 */
108 inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
109 INET_ECN_dontxmit(sk);
110}
111
112static u32 dctcp_ssthresh(struct sock *sk)
113{
114 const struct dctcp *ca = inet_csk_ca(sk);
115 struct tcp_sock *tp = tcp_sk(sk);
116
117 return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
118}
119
120/* Minimal DCTCP CE state machine:
121 *
122 * S: 0 <- last pkt was non-CE
123 * 1 <- last pkt was CE
124 */
125
126static void dctcp_ce_state_0_to_1(struct sock *sk)
127{
128 struct dctcp *ca = inet_csk_ca(sk);
129 struct tcp_sock *tp = tcp_sk(sk);
130
131 /* State has changed from CE=0 to CE=1 and delayed
132 * ACK has not been sent yet.
133 */
134 if (!ca->ce_state && ca->delayed_ack_reserved) {
135 u32 tmp_rcv_nxt;
136
137 /* Save current rcv_nxt. */
138 tmp_rcv_nxt = tp->rcv_nxt;
139
140 /* Generate previous ack with CE=0. */
141 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
142 tp->rcv_nxt = ca->prior_rcv_nxt;
143
144 tcp_send_ack(sk);
145
146 /* Recover current rcv_nxt. */
147 tp->rcv_nxt = tmp_rcv_nxt;
148 }
149
150 ca->prior_rcv_nxt = tp->rcv_nxt;
151 ca->ce_state = 1;
152
153 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
154}
155
156static void dctcp_ce_state_1_to_0(struct sock *sk)
157{
158 struct dctcp *ca = inet_csk_ca(sk);
159 struct tcp_sock *tp = tcp_sk(sk);
160
161 /* State has changed from CE=1 to CE=0 and delayed
162 * ACK has not been sent yet.
163 */
164 if (ca->ce_state && ca->delayed_ack_reserved) {
165 u32 tmp_rcv_nxt;
166
167 /* Save current rcv_nxt. */
168 tmp_rcv_nxt = tp->rcv_nxt;
169
170 /* Generate previous ack with CE=1. */
171 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
172 tp->rcv_nxt = ca->prior_rcv_nxt;
173
174 tcp_send_ack(sk);
175
176 /* Recover current rcv_nxt. */
177 tp->rcv_nxt = tmp_rcv_nxt;
178 }
179
180 ca->prior_rcv_nxt = tp->rcv_nxt;
181 ca->ce_state = 0;
182
183 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
184}
185
186static void dctcp_update_alpha(struct sock *sk, u32 flags)
187{
188 const struct tcp_sock *tp = tcp_sk(sk);
189 struct dctcp *ca = inet_csk_ca(sk);
190 u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
191
192 /* If ack did not advance snd_una, count dupack as MSS size.
193 * If ack did update window, do not count it at all.
194 */
195 if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
196 acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
197 if (acked_bytes) {
198 ca->acked_bytes_total += acked_bytes;
199 ca->prior_snd_una = tp->snd_una;
200
201 if (flags & CA_ACK_ECE)
202 ca->acked_bytes_ecn += acked_bytes;
203 }
204
205 /* Expired RTT */
206 if (!before(tp->snd_una, ca->next_seq)) {
207 /* Avoid a zero denominator in the division below. */
208 if (ca->acked_bytes_total == 0)
209 ca->acked_bytes_total = 1;
210
211 /* alpha = (1 - g) * alpha + g * F */
212 ca->dctcp_alpha = ca->dctcp_alpha -
213 (ca->dctcp_alpha >> dctcp_shift_g) +
214 (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) /
215 ca->acked_bytes_total;
216
217 if (ca->dctcp_alpha > DCTCP_MAX_ALPHA)
218 /* Clamp dctcp_alpha to max. */
219 ca->dctcp_alpha = DCTCP_MAX_ALPHA;
220
221 dctcp_reset(tp, ca);
222 }
223}
224
225static void dctcp_state(struct sock *sk, u8 new_state)
226{
227 if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
228 struct dctcp *ca = inet_csk_ca(sk);
229
230 /* If this extension is enabled, we clamp dctcp_alpha to
231 * max on packet loss; the motivation is that dctcp_alpha
232 * is an indicator of the extent of congestion and packet
233 * loss is an indicator of extreme congestion; setting
234 * this in practice turned out to be beneficial, and
235 * effectively assumes total congestion which reduces the
236 * window by half.
237 */
238 ca->dctcp_alpha = DCTCP_MAX_ALPHA;
239 }
240}
241
242static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
243{
244 struct dctcp *ca = inet_csk_ca(sk);
245
246 switch (ev) {
247 case CA_EVENT_DELAYED_ACK:
248 if (!ca->delayed_ack_reserved)
249 ca->delayed_ack_reserved = 1;
250 break;
251 case CA_EVENT_NON_DELAYED_ACK:
252 if (ca->delayed_ack_reserved)
253 ca->delayed_ack_reserved = 0;
254 break;
255 default:
256 /* Don't care for the rest. */
257 break;
258 }
259}
260
261static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
262{
263 switch (ev) {
264 case CA_EVENT_ECN_IS_CE:
265 dctcp_ce_state_0_to_1(sk);
266 break;
267 case CA_EVENT_ECN_NO_CE:
268 dctcp_ce_state_1_to_0(sk);
269 break;
270 case CA_EVENT_DELAYED_ACK:
271 case CA_EVENT_NON_DELAYED_ACK:
272 dctcp_update_ack_reserved(sk, ev);
273 break;
274 default:
275 /* Don't care for the rest. */
276 break;
277 }
278}
279
280static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
281{
282 const struct dctcp *ca = inet_csk_ca(sk);
283
284 /* Fill it also in case of VEGASINFO due to req struct limits.
285 * We can still correctly retrieve it later.
286 */
287 if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
288 ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
289 struct tcp_dctcp_info info;
290
291 memset(&info, 0, sizeof(info));
292 if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
293 info.dctcp_enabled = 1;
294 info.dctcp_ce_state = (u16) ca->ce_state;
295 info.dctcp_alpha = ca->dctcp_alpha;
296 info.dctcp_ab_ecn = ca->acked_bytes_ecn;
297 info.dctcp_ab_tot = ca->acked_bytes_total;
298 }
299
300 nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
301 }
302}
303
304static struct tcp_congestion_ops dctcp __read_mostly = {
305 .init = dctcp_init,
306 .in_ack_event = dctcp_update_alpha,
307 .cwnd_event = dctcp_cwnd_event,
308 .ssthresh = dctcp_ssthresh,
309 .cong_avoid = tcp_reno_cong_avoid,
310 .set_state = dctcp_state,
311 .get_info = dctcp_get_info,
312 .flags = TCP_CONG_NEEDS_ECN,
313 .owner = THIS_MODULE,
314 .name = "dctcp",
315};
316
317static struct tcp_congestion_ops dctcp_reno __read_mostly = {
318 .ssthresh = tcp_reno_ssthresh,
319 .cong_avoid = tcp_reno_cong_avoid,
320 .get_info = dctcp_get_info,
321 .owner = THIS_MODULE,
322 .name = "dctcp-reno",
323};
324
325static int __init dctcp_register(void)
326{
327 BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
328 return tcp_register_congestion_control(&dctcp);
329}
330
331static void __exit dctcp_unregister(void)
332{
333 tcp_unregister_congestion_control(&dctcp);
334}
335
336module_init(dctcp_register);
337module_exit(dctcp_unregister);
338
339MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
340MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
341MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");
342
343MODULE_LICENSE("GPL v2");
344MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
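
For readers tracing the fixed-point arithmetic in dctcp_update_alpha() and dctcp_ssthresh() above: alpha is kept scaled by 2^10 (DCTCP_MAX_ALPHA == 1024), g defaults to 1/16 (dctcp_shift_g == 4), and the window reduction cwnd * alpha/2 becomes a right shift by 11 because of the extra factor of 1024. The stand-alone sketch below (not kernel code) reproduces the two computations with sample numbers so the scaling can be checked by hand.

/* Stand-alone illustration of the fixed-point math used in tcp_dctcp.c.
 * alpha is scaled by 1024 (DCTCP_MAX_ALPHA); g = 1/2^shift_g.
 * The sample byte counts and cwnd are made up for demonstration.
 */
#include <stdio.h>
#include <stdint.h>

#define DCTCP_MAX_ALPHA 1024U

int main(void)
{
        unsigned int shift_g = 4;            /* g = 1/16, the module default    */
        uint32_t alpha = DCTCP_MAX_ALPHA;    /* dctcp_alpha_on_init default     */
        uint32_t acked_bytes_ecn = 30000;    /* bytes acked with ECE this RTT   */
        uint32_t acked_bytes_total = 120000; /* all bytes acked this RTT        */
        uint32_t cwnd = 100;

        /* alpha = (1 - g) * alpha + g * F, with F = ecn/total scaled by 1024 */
        alpha = alpha - (alpha >> shift_g) +
                (acked_bytes_ecn << (10U - shift_g)) / acked_bytes_total;
        if (alpha > DCTCP_MAX_ALPHA)
                alpha = DCTCP_MAX_ALPHA;

        /* ssthresh = cwnd * (1 - alpha/2): >> 11 divides by 2 * 1024 */
        uint32_t ssthresh = cwnd - ((cwnd * alpha) >> 11U);
        if (ssthresh < 2)
                ssthresh = 2;

        printf("alpha = %u/1024, ssthresh = %u\n", alpha, ssthresh);
        return 0;
}
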
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5073eefa6fae..fc133178c787 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -233,14 +233,21 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
233 tcp_enter_quickack_mode((struct sock *)tp); 233 tcp_enter_quickack_mode((struct sock *)tp);
234 break; 234 break;
235 case INET_ECN_CE: 235 case INET_ECN_CE:
236 if (tcp_ca_needs_ecn((struct sock *)tp))
237 tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
238
236 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { 239 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
237 /* Better not delay acks, sender can have a very low cwnd */ 240 /* Better not delay acks, sender can have a very low cwnd */
238 tcp_enter_quickack_mode((struct sock *)tp); 241 tcp_enter_quickack_mode((struct sock *)tp);
239 tp->ecn_flags |= TCP_ECN_DEMAND_CWR; 242 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
240 } 243 }
241 /* fallinto */ 244 tp->ecn_flags |= TCP_ECN_SEEN;
245 break;
242 default: 246 default:
247 if (tcp_ca_needs_ecn((struct sock *)tp))
248 tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
243 tp->ecn_flags |= TCP_ECN_SEEN; 249 tp->ecn_flags |= TCP_ECN_SEEN;
250 break;
244 } 251 }
245} 252}
246 253
@@ -3362,6 +3369,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3362 } 3369 }
3363} 3370}
3364 3371
3372static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
3373{
3374 const struct inet_connection_sock *icsk = inet_csk(sk);
3375
3376 if (icsk->icsk_ca_ops->in_ack_event)
3377 icsk->icsk_ca_ops->in_ack_event(sk, flags);
3378}
3379
3365/* This routine deals with incoming acks, but not outgoing ones. */ 3380/* This routine deals with incoming acks, but not outgoing ones. */
3366static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) 3381static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3367{ 3382{
@@ -3421,10 +3436,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3421 tp->snd_una = ack; 3436 tp->snd_una = ack;
3422 flag |= FLAG_WIN_UPDATE; 3437 flag |= FLAG_WIN_UPDATE;
3423 3438
3424 tcp_ca_event(sk, CA_EVENT_FAST_ACK); 3439 tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
3425 3440
3426 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); 3441 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
3427 } else { 3442 } else {
3443 u32 ack_ev_flags = CA_ACK_SLOWPATH;
3444
3428 if (ack_seq != TCP_SKB_CB(skb)->end_seq) 3445 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
3429 flag |= FLAG_DATA; 3446 flag |= FLAG_DATA;
3430 else 3447 else
@@ -3436,10 +3453,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3436 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, 3453 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3437 &sack_rtt_us); 3454 &sack_rtt_us);
3438 3455
3439 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) 3456 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) {
3440 flag |= FLAG_ECE; 3457 flag |= FLAG_ECE;
3458 ack_ev_flags |= CA_ACK_ECE;
3459 }
3460
3461 if (flag & FLAG_WIN_UPDATE)
3462 ack_ev_flags |= CA_ACK_WIN_UPDATE;
3441 3463
3442 tcp_ca_event(sk, CA_EVENT_SLOW_ACK); 3464 tcp_in_ack_event(sk, ack_ev_flags);
3443 } 3465 }
3444 3466
3445 /* We passed data and got it acked, remove any soft error 3467 /* We passed data and got it acked, remove any soft error
@@ -5944,7 +5966,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
5944 goto drop_and_free; 5966 goto drop_and_free;
5945 5967
5946 if (!want_cookie || tmp_opt.tstamp_ok) 5968 if (!want_cookie || tmp_opt.tstamp_ok)
5947 TCP_ECN_create_request(req, skb, sock_net(sk)); 5969 TCP_ECN_create_request(req, skb, sk);
5948 5970
5949 if (want_cookie) { 5971 if (want_cookie) {
5950 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); 5972 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a058f411d3a6..47b73506b77e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -451,9 +451,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
451 newtp->snd_cwnd = TCP_INIT_CWND; 451 newtp->snd_cwnd = TCP_INIT_CWND;
452 newtp->snd_cwnd_cnt = 0; 452 newtp->snd_cwnd_cnt = 0;
453 453
454 if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && 454 if (!try_module_get(newicsk->icsk_ca_ops->owner))
455 !try_module_get(newicsk->icsk_ca_ops->owner)) 455 tcp_assign_congestion_control(newsk);
456 newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
457 456
458 tcp_set_ca_state(newsk, TCP_CA_Open); 457 tcp_set_ca_state(newsk, TCP_CA_Open);
459 tcp_init_xmit_timers(newsk); 458 tcp_init_xmit_timers(newsk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4d92703df4c6..86a0216fcaa1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -318,11 +318,15 @@ static u16 tcp_select_window(struct sock *sk)
318} 318}
319 319
320/* Packet ECN state for a SYN-ACK */ 320/* Packet ECN state for a SYN-ACK */
321static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) 321static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb)
322{ 322{
323 const struct tcp_sock *tp = tcp_sk(sk);
324
323 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; 325 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
324 if (!(tp->ecn_flags & TCP_ECN_OK)) 326 if (!(tp->ecn_flags & TCP_ECN_OK))
325 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; 327 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
328 else if (tcp_ca_needs_ecn(sk))
329 INET_ECN_xmit(sk);
326} 330}
327 331
328/* Packet ECN state for a SYN. */ 332/* Packet ECN state for a SYN. */
@@ -331,17 +335,24 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
331 struct tcp_sock *tp = tcp_sk(sk); 335 struct tcp_sock *tp = tcp_sk(sk);
332 336
333 tp->ecn_flags = 0; 337 tp->ecn_flags = 0;
334 if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { 338 if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
339 tcp_ca_needs_ecn(sk)) {
335 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; 340 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
336 tp->ecn_flags = TCP_ECN_OK; 341 tp->ecn_flags = TCP_ECN_OK;
342 if (tcp_ca_needs_ecn(sk))
343 INET_ECN_xmit(sk);
337 } 344 }
338} 345}
339 346
340static __inline__ void 347static __inline__ void
341TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) 348TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th,
349 struct sock *sk)
342{ 350{
343 if (inet_rsk(req)->ecn_ok) 351 if (inet_rsk(req)->ecn_ok) {
344 th->ece = 1; 352 th->ece = 1;
353 if (tcp_ca_needs_ecn(sk))
354 INET_ECN_xmit(sk);
355 }
345} 356}
346 357
347/* Set up ECN state for a packet on a ESTABLISHED socket that is about to 358/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
@@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
362 tcp_hdr(skb)->cwr = 1; 373 tcp_hdr(skb)->cwr = 1;
363 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; 374 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
364 } 375 }
365 } else { 376 } else if (!tcp_ca_needs_ecn(sk)) {
366 /* ACK or retransmitted segment: clear ECT|CE */ 377 /* ACK or retransmitted segment: clear ECT|CE */
367 INET_ECN_dontxmit(sk); 378 INET_ECN_dontxmit(sk);
368 } 379 }
@@ -2789,7 +2800,7 @@ int tcp_send_synack(struct sock *sk)
2789 } 2800 }
2790 2801
2791 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; 2802 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
2792 TCP_ECN_send_synack(tcp_sk(sk), skb); 2803 TCP_ECN_send_synack(sk, skb);
2793 } 2804 }
2794 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2805 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2795} 2806}
@@ -2848,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2848 memset(th, 0, sizeof(struct tcphdr)); 2859 memset(th, 0, sizeof(struct tcphdr));
2849 th->syn = 1; 2860 th->syn = 1;
2850 th->ack = 1; 2861 th->ack = 1;
2851 TCP_ECN_make_synack(req, th); 2862 TCP_ECN_make_synack(req, th, sk);
2852 th->source = htons(ireq->ir_num); 2863 th->source = htons(ireq->ir_num);
2853 th->dest = ireq->ir_rmt_port; 2864 th->dest = ireq->ir_rmt_port;
2854 /* Setting of flags are superfluous here for callers (and ECE is 2865 /* Setting of flags are superfluous here for callers (and ECE is
@@ -3119,6 +3130,8 @@ void tcp_send_delayed_ack(struct sock *sk)
3119 int ato = icsk->icsk_ack.ato; 3130 int ato = icsk->icsk_ack.ato;
3120 unsigned long timeout; 3131 unsigned long timeout;
3121 3132
3133 tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
3134
3122 if (ato > TCP_DELACK_MIN) { 3135 if (ato > TCP_DELACK_MIN) {
3123 const struct tcp_sock *tp = tcp_sk(sk); 3136 const struct tcp_sock *tp = tcp_sk(sk);
3124 int max_ato = HZ / 2; 3137 int max_ato = HZ / 2;
@@ -3175,6 +3188,8 @@ void tcp_send_ack(struct sock *sk)
3175 if (sk->sk_state == TCP_CLOSE) 3188 if (sk->sk_state == TCP_CLOSE)
3176 return; 3189 return;
3177 3190
3191 tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
3192
3178 /* We are not putting this on the write queue, so 3193 /* We are not putting this on the write queue, so
3179 * tcp_transmit_skb() will set the ownership to this 3194 * tcp_transmit_skb() will set the ownership to this
3180 * sock. 3195 * sock.
@@ -3196,6 +3211,7 @@ void tcp_send_ack(struct sock *sk)
3196 skb_mstamp_get(&buff->skb_mstamp); 3211 skb_mstamp_get(&buff->skb_mstamp);
3197 tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); 3212 tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
3198} 3213}
3214EXPORT_SYMBOL_GPL(tcp_send_ack);
3199 3215
3200/* This routine sends a packet with an out of date sequence 3216/* This routine sends a packet with an out of date sequence
3201 * number. It assumes the other end will try to ack it. 3217 * number. It assumes the other end will try to ack it.
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 81911a92356c..bb63fba47d47 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -220,32 +220,35 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
220 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); 220 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
221} 221}
222 222
223static void tcp_westwood_ack(struct sock *sk, u32 ack_flags)
224{
225 if (ack_flags & CA_ACK_SLOWPATH) {
226 struct westwood *w = inet_csk_ca(sk);
227
228 westwood_update_window(sk);
229 w->bk += westwood_acked_count(sk);
230
231 update_rtt_min(w);
232 return;
233 }
234
235 westwood_fast_bw(sk);
236}
237
223static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) 238static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
224{ 239{
225 struct tcp_sock *tp = tcp_sk(sk); 240 struct tcp_sock *tp = tcp_sk(sk);
226 struct westwood *w = inet_csk_ca(sk); 241 struct westwood *w = inet_csk_ca(sk);
227 242
228 switch (event) { 243 switch (event) {
229 case CA_EVENT_FAST_ACK:
230 westwood_fast_bw(sk);
231 break;
232
233 case CA_EVENT_COMPLETE_CWR: 244 case CA_EVENT_COMPLETE_CWR:
234 tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); 245 tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
235 break; 246 break;
236
237 case CA_EVENT_LOSS: 247 case CA_EVENT_LOSS:
238 tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); 248 tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
239 /* Update RTT_min when next ack arrives */ 249 /* Update RTT_min when next ack arrives */
240 w->reset_rtt_min = 1; 250 w->reset_rtt_min = 1;
241 break; 251 break;
242
243 case CA_EVENT_SLOW_ACK:
244 westwood_update_window(sk);
245 w->bk += westwood_acked_count(sk);
246 update_rtt_min(w);
247 break;
248
249 default: 252 default:
250 /* don't care */ 253 /* don't care */
251 break; 254 break;
@@ -274,6 +277,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {
274 .ssthresh = tcp_reno_ssthresh, 277 .ssthresh = tcp_reno_ssthresh,
275 .cong_avoid = tcp_reno_cong_avoid, 278 .cong_avoid = tcp_reno_cong_avoid,
276 .cwnd_event = tcp_westwood_event, 279 .cwnd_event = tcp_westwood_event,
280 .in_ack_event = tcp_westwood_ack,
277 .get_info = tcp_westwood_info, 281 .get_info = tcp_westwood_info,
278 .pkts_acked = tcp_westwood_pkts_acked, 282 .pkts_acked = tcp_westwood_pkts_acked,
279 283